In [1]:
import pandas as pd
import numpy as np

from joblib import dump, load
from zipfile import ZipFile, ZIP_DEFLATED
import glob
import itertools
from uuid import uuid4
import json
from os import path
from datetime import datetime

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import normalize

# Функции

In [2]:
# Create an empty model registry on first run so that save_model() always
# finds a valid JSON object to load. An existing registry is left untouched.
if not path.isfile('models.json'):
  # The context manager closes the handle even if the write fails.
  with open('models.json', 'w') as f:
    f.write('{}')

def save_model(model, x, y, description=None):
  """Persist a fitted model and record its scores in ``models.json``.

  The model is dumped to ``<uuid4>.joblib`` in the current working directory.
  An entry keyed by the same UUID is then added to ``models.json`` holding the
  F1, recall and precision of ``model.predict(x)`` against ``y``, plus
  ``description`` extended with a ``'time'`` timestamp.

  Args:
    model: Fitted estimator exposing ``predict``.
    x: Samples to score the model on.
    y: True labels matching ``x``.
    description: Optional mapping of extra metadata stored with the scores.
      It is copied, never modified. ``None`` means no extra metadata.

  Raises:
    FileNotFoundError: If ``models.json`` does not exist.
    json.JSONDecodeError: If ``models.json`` does not contain valid JSON.
  """
  # Predict once and reuse the result for all three metrics.
  predicted = model.predict(x)
  f1 = f1_score(y, predicted)
  recall = recall_score(y, predicted)
  precision = precision_score(y, predicted)

  model_id = str(uuid4())
  dump(model, f'{model_id}.joblib')

  # The registry is written with ensure_ascii=False, so pin the encoding
  # instead of relying on the platform default.
  with open('models.json', 'r', encoding='utf-8') as registry_file:
    registry = json.load(registry_file)

  registry[model_id] = {
      'f1_score': f1,
      'recall_score': recall,
      'precision_score': precision,
      'description': dict(description or {}, **{'time': str(datetime.now())})}

  with open('models.json', 'w', encoding='utf-8') as registry_file:
    json.dump(registry, registry_file, ensure_ascii=False, indent=4)


def grid_search(model, params, x_train, y_train, x_test, y_test,
                f1_min=0.9, recall_min=0.9, precision_min=0.9, description=None):
  """Fit ``model`` for every parameter combination and save the good ones.

  Each combination from the Cartesian product of the value lists in ``params``
  is applied with ``model.set_params``. The model is fitted on the training
  split and scored on the test split. When F1, recall and precision all reach
  their minimums, the test and train scores are printed and the model is
  stored through ``save_model``.

  NOTE(review): acceptance is decided on ``x_test``/``y_test`` themselves, so
  the scores recorded for saved models come from the same data that selected
  them.

  Args:
    model: Estimator exposing ``set_params``, ``fit`` and ``predict``. It is
      reconfigured and refitted in place on every iteration.
    params: Mapping of parameter name to an iterable of candidate values.
    x_train, y_train: Training samples and labels.
    x_test, y_test: Samples and labels used for scoring.
    f1_min, recall_min, precision_min: Inclusive lower bounds a candidate
      must reach on the test split to be saved.
    description: Optional mapping of extra metadata. The parameter values of
      the candidate are merged into a copy of it before saving.
  """
  base_description = dict(description or {})
  names = list(params.keys())
  for values in itertools.product(*(params[name] for name in names)):
    candidate = dict(zip(names, values))
    model.set_params(**candidate)
    model.fit(x_train, y_train)

    # One prediction pass over the test split feeds all three metrics.
    test_pred = model.predict(x_test)
    f = f1_score(y_test, test_pred)
    r = recall_score(y_test, test_pred)
    p = precision_score(y_test, test_pred)

    if f >= f1_min and r >= recall_min and p >= precision_min:
      # Train predictions are only needed for accepted candidates.
      train_pred = model.predict(x_train)

      print('\nTest f1 score:', f)
      print('Test recall score:', r)
      print('Test precision score:', p)
      print('\nTrain f1 score:', f1_score(y_train, train_pred))
      print('Train recall score:', recall_score(y_train, train_pred))
      print('Train precision score:', precision_score(y_train, train_pred))

      save_model(model, x_test, y_test, description=dict(base_description, **candidate))

# Подготовка

In [3]:
# Load one sheet of the workbook, indexed by the case-history column, and
# keep only rows without missing values. The context manager releases the
# workbook handle once the sheet has been parsed.
with pd.ExcelFile('Кардио1.xlsx') as workbook:
  data = workbook.parse('Суперфинал (3 регистра)', index_col='История болезни').dropna()
data

Unnamed: 0_level_0,Пол,Возраст,Мочевина (1-е сутки),Креатинин (1-е сутки),СКФ (CKD-EPI 2021),АСТ (1-е сутки),АЛТ (1-е сутки),CRP,Глюкоза (1-е сутки),Лейкоциты (1-е сутки),...,Лимфоциты (1-е сутки),Нейтрофильно-лимфоцитарное соотношение,Выжил?,Степень тяжести,"D-димер, ед.",АГ,СД,ИБС (с ХСН),ХОБЛ + астма,ХБП (3-5)
История болезни,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
103,жен,60,13.7,106.0,63.396226,42.00,47.00,7.31,4.7,16.40,...,0.90,16.333333,да,тяжелая,250.0,да,нет,нет,нет,нет
127,жен,64,15.3,103.0,52.000000,31.04,36.23,6.00,9.3,14.40,...,2.10,5.476190,да,тяжелая,2310.0,да,да,нет,нет,нет
136,жен,61,4.7,83.0,69.000000,40.00,14.00,244.00,13.1,10.00,...,1.20,7.166667,нет,тяжелая,1100.0,да,да,нет,нет,нет
147,муж,74,12.2,141.0,45.000000,24.85,30.14,8.04,11.9,14.50,...,2.30,4.956522,да,тяжелая,620.0,да,да,да,нет,нет
156,жен,82,3.7,55.0,89.000000,32.60,36.70,6.40,7.0,7.90,...,0.40,15.250000,да,тяжелая,155.0,нет,нет,нет,нет,нет
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
851394,жен,66,6.6,98.0,55.000000,20.00,12.90,25.00,5.5,10.30,...,2.20,3.136364,да,легкая,6254.0,да,да,да,нет,да
851832,муж,60,5.6,65.0,104.000000,22.00,50.00,15.30,5.9,12.21,...,1.25,7.384000,да,средняя,2018.0,да,нет,нет,нет,нет
852893,жен,86,16.2,154.0,28.000000,15.00,16.00,204.00,6.2,7.60,...,1.64,3.390244,нет,тяжелая,4054.0,да,нет,да,нет,да
855138,жен,88,6.5,71.0,70.000000,39.00,16.00,171.50,5.0,7.12,...,0.57,10.175439,нет,тяжелая,22563.0,да,нет,да,нет,нет


In [4]:
# Numeric codes for the text answers, listed per column. Each label is
# overwritten in place with its code; values not listed here are left as-is.
yes_no = {'нет': 0., 'да': 1.}
label_codes = {
  'Пол': {'муж': 0., 'жен': 1.},
  'Выжил?': yes_no,
  'АГ': yes_no,
  'СД': yes_no,
  'ИБС (с ХСН)': yes_no,
  'ХОБЛ + астма': yes_no,
  'ХБП (3-5)': yes_no,
  # Both spellings of the "severe" label appear in the lookups, so map both.
  'Степень тяжести': {'тяжелая': 1., 'тяжелый': 1., 'легкая': 0., 'средняя': 0.5},
}

for column, codes in label_codes.items():
  for label, code in codes.items():
    data.loc[data[column] == label, column] = code

In [5]:
# Every column now holds numeric codes or measurements; cast the whole frame
# to float64. This raises if any unencoded text value is left.
data = data.astype(np.float64)

In [6]:
# Separate the survival label from the features and hold out 10% of the rows
# for testing. The fixed seed keeps the split repeatable.
features = data.drop(columns='Выжил?')
target = data['Выжил?']
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=5)

# KNeighbors

In [7]:
# Cross-validated search over `n_neighbors` and `p`.
grid = GridSearchCV(KNeighborsClassifier(), {'n_neighbors': range(1, 20), 'p': range(1, 10)})
grid.fit(x_train, y_train)

print('\tKNeighbors')
print('\ngrid.best_params_:', grid.best_params_)

# With the default refit=True the search has already retrained the best
# configuration on all of x_train, so reuse it instead of fitting again.
model = grid.best_estimator_

# Predict each split once and share the result between the three metrics.
test_pred = model.predict(x_test)
train_pred = model.predict(x_train)

test_f1 = f1_score(y_test, test_pred)
test_recall = recall_score(y_test, test_pred)
test_precision = precision_score(y_test, test_pred)

print('\nTest f1 score:', test_f1)
print('Test recall score:', test_recall)
print('Test precision score:', test_precision)
print('\nTrain f1 score:', f1_score(y_train, train_pred))
print('Train recall score:', recall_score(y_train, train_pred))
print('Train precision score:', precision_score(y_train, train_pred))

# Only keep the model if every test metric is strictly above 0.85.
if test_f1 > 0.85 and test_recall > 0.85 and test_precision > 0.85:
  print('\nSave model')
  save_model(model, x_test, y_test, description={'model': 'KNeighborsClassifier'})

	KNeighbors

grid.best_params_: {'n_neighbors': 3, 'p': 8}

Test f1 score: 0.6000000000000001
Test recall score: 0.6428571428571429
Test precision score: 0.5625

Train f1 score: 0.7876712328767124
Train recall score: 0.782312925170068
Train precision score: 0.7931034482758621


# SVC

In [None]:
# Cross-validated search over the regularisation value `C` and the kernel.
grid = GridSearchCV(SVC(), {'C': range(1, 5), 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})
grid.fit(x_train, y_train)

print('\tSVC')
print('\ngrid.best_params_:', grid.best_params_)

# With the default refit=True the search has already retrained the best
# configuration on all of x_train, so reuse it instead of fitting again.
model = grid.best_estimator_

# Predict each split once and share the result between the three metrics.
test_pred = model.predict(x_test)
train_pred = model.predict(x_train)

test_f1 = f1_score(y_test, test_pred)
test_recall = recall_score(y_test, test_pred)
test_precision = precision_score(y_test, test_pred)

print('\nTest f1 score:', test_f1)
print('Test recall score:', test_recall)
print('Test precision score:', test_precision)
print('\nTrain f1 score:', f1_score(y_train, train_pred))
print('Train recall score:', recall_score(y_train, train_pred))
print('Train precision score:', precision_score(y_train, train_pred))

# Only keep the model if every test metric is strictly above 0.85.
if test_f1 > 0.85 and test_recall > 0.85 and test_precision > 0.85:
  print('\nSave model')
  save_model(model, x_test, y_test, description={'model': 'SVC'})

grid.best_params_: {'C': 2, 'kernel': 'linear'}
Train score: 0.7491289198606271
Test score: 0.59375


# DecisionTreeClassifier

In [8]:
print('\nDecisionTreeClassifier')

# Repeat the search ten times. Each repetition gets its own fixed seed so the
# run is reproducible, and the seed is stored with every saved model so the
# tree can be rebuilt later.
for i in range(10):
  print('\nI: ', i)
  model = DecisionTreeClassifier(random_state=i)
  grid_search(model, {'criterion': ['gini', 'entropy'], 'max_depth': range(1, 20)},
              x_train, y_train, x_test, y_test,
              f1_min=0.85, recall_min=0.85, precision_min=0.85,
              description={'model': 'DecisionTreeClassifier', 'random_state': i})


DecisionTreeClassifier

I:  0

I:  1

I:  2

I:  3

I:  4

I:  5

Test f1 score: 0.888888888888889
Test recall score: 0.8571428571428571
Test precision score: 0.9230769230769231

Train f1 score: 1.0
Train recall score: 1.0
Train precision score: 1.0

I:  6

I:  7

Test f1 score: 0.888888888888889
Test recall score: 0.8571428571428571
Test precision score: 0.9230769230769231

Train f1 score: 0.9965870307167235
Train recall score: 0.9931972789115646
Train precision score: 1.0

I:  8

I:  9

Test f1 score: 0.8571428571428571
Test recall score: 0.8571428571428571
Test precision score: 0.8571428571428571

Train f1 score: 1.0
Train recall score: 1.0
Train precision score: 1.0

Test f1 score: 0.888888888888889
Test recall score: 0.8571428571428571
Test precision score: 0.9230769230769231

Train f1 score: 1.0
Train recall score: 1.0
Train precision score: 1.0


# RandomForestClassifier

In [10]:
print('\nRandomForestClassifier')

# Single pass over the grid (raise the range to repeat with other seeds).
# The forest is seeded so the run is reproducible, and the seed is stored
# with every saved model so the forest can be rebuilt later.
for i in range(1):
  print('\nI: ', i)
  model = RandomForestClassifier(random_state=i)
  grid_search(model, {'n_estimators': range(1, 21), 'max_depth': range(1, 100)},
              x_train, y_train, x_test, y_test,
              f1_min=0.85, recall_min=0.85, precision_min=0.85,
              description={'model': 'RandomForestClassifier', 'random_state': i})


RandomForestClassifier

I:  0

Test f1 score: 0.888888888888889
Test recall score: 0.8571428571428571
Test precision score: 0.9230769230769231

Train f1 score: 0.9054054054054055
Train recall score: 0.9115646258503401
Train precision score: 0.8993288590604027

Test f1 score: 0.8571428571428571
Test recall score: 0.8571428571428571
Test precision score: 0.8571428571428571

Train f1 score: 0.9655172413793104
Train recall score: 0.9523809523809523
Train precision score: 0.9790209790209791

Test f1 score: 0.888888888888889
Test recall score: 0.8571428571428571
Test precision score: 0.9230769230769231

Train f1 score: 0.9664429530201342
Train recall score: 0.9795918367346939
Train precision score: 0.9536423841059603

Test f1 score: 0.8571428571428571
Test recall score: 0.8571428571428571
Test precision score: 0.8571428571428571

Train f1 score: 0.979310344827586
Train recall score: 0.9659863945578231
Train precision score: 0.993006993006993

Test f1 score: 0.8571428571428571
Test recall sc

# Сохранение моделей

In [11]:
# Bundle every dumped model and every JSON file found in the working
# directory into models.zip, model files first.
with ZipFile('models.zip', 'w') as archive:
  for pattern in ('*.joblib', '*.json'):
    for filename in glob.glob(pattern):
      # Stored under the same relative name, deflate-compressed.
      archive.write(filename, filename, ZIP_DEFLATED)