In [1]:
%matplotlib inline

In [74]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pickle

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, FunctionTransformer

from sklearn.linear_model import LogisticRegression

# Model Training and Improvement

In [3]:
# epssilon
EPS = 1e-10

In [4]:
diabetes_data = pd.read_csv("data/diabetic_data.csv")

In [5]:
diabetes_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [6]:
diabetes_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [7]:
diabetes_data.columns[diabetes_data.columns.str.contains("_id")]

Index(['encounter_id', 'admission_type_id', 'discharge_disposition_id',
       'admission_source_id'],
      dtype='object')

In [8]:
diabetes_data.pioglitazone.unique()

array(['No', 'Steady', 'Up', 'Down'], dtype=object)

In [9]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [10]:
pd.get_dummies(diabetes_data, drop_first = True).dtypes

encounter_id                     int64
patient_nbr                      int64
admission_type_id                int64
discharge_disposition_id         int64
admission_source_id              int64
                                 ...  
metformin-pioglitazone_Steady    uint8
change_No                        uint8
diabetesMed_Yes                  uint8
readmitted_>30                   uint8
readmitted_NO                    uint8
Length: 2438, dtype: object

In [11]:
diabetes_attributes = diabetes_data.drop(columns = "readmitted")
diabetes_target = diabetes_data.readmitted

In [12]:
diabetes_attributes_dummies = pd.get_dummies(diabetes_attributes)

In [13]:
logistic_regression = LogisticRegression()

In [14]:
logistic_regression.fit(diabetes_attributes_dummies, diabetes_target)

In [15]:
logistic_regression.coef_

array([[-1.96952335e-09, -2.05892693e-09, -4.34014319e-04, ...,
        -1.24877634e-04, -6.27839232e-05, -1.13112782e-04],
       [-2.92053042e-10,  4.61961230e-09, -8.83494551e-05, ...,
        -5.14875620e-05, -4.74378759e-05,  2.31595662e-05],
       [ 2.29806616e-09, -2.56065263e-09,  5.22363774e-04, ...,
         1.76365196e-04,  1.10221799e-04,  8.99532157e-05]])

__Да закачим цялята обработка на данните към модела и да я експортна заедно с него. Това са sklearn pipelines.__

In [16]:
scaler = MinMaxScaler()

In [17]:
diabetes_attributes_scaled = scaler.fit_transform(diabetes_attributes_dummies)

In [18]:
logistic_regression.fit(diabetes_attributes_scaled, diabetes_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
logistic_regression.score(diabetes_attributes_scaled, diabetes_target)

0.5950317394807696

In [20]:
logistic_regression.score(diabetes_attributes_dummies, diabetes_target)



0.5395908260126172

In [21]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", LogisticRegression())
])

In [22]:
pipeline

In [23]:
sample_data = diabetes_data.sample(5000, random_state=42)

In [24]:
sample_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
35956,110939484,19274094,Caucasian,Female,[70-80),?,1,1,6,11,...,No,Steady,No,No,No,No,No,No,Yes,NO
60927,170328306,65634327,Caucasian,Male,[50-60),?,1,1,1,1,...,No,No,No,No,No,No,No,No,Yes,NO
79920,245688426,100657359,Caucasian,Female,[60-70),?,3,6,1,4,...,No,No,No,No,No,No,No,No,Yes,NO
50078,150826224,83144448,Caucasian,Male,[30-40),?,2,1,1,12,...,No,No,No,No,No,No,No,No,Yes,>30
44080,135993852,65234214,AfricanAmerican,Female,[60-70),?,1,2,7,1,...,No,No,No,No,No,No,No,No,Yes,<30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35562,110096790,33502212,Caucasian,Male,[50-60),?,6,1,7,12,...,No,Down,No,No,No,No,No,Ch,Yes,NO
98563,402583472,141357506,AfricanAmerican,Male,[50-60),?,7,3,7,6,...,No,Steady,No,No,No,No,No,No,Yes,NO
88066,282442506,45759951,Caucasian,Female,[40-50),?,1,1,7,3,...,No,No,No,No,No,No,No,No,Yes,NO
55955,161139018,112510251,Caucasian,Female,[40-50),?,5,1,1,3,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [25]:
sample_attributes = sample_data.drop(columns = "readmitted")

In [26]:
sample_target = sample_data.readmitted

In [27]:
sample_attributes = pd.get_dummies(sample_attributes)

In [28]:
pipeline.fit(sample_attributes, sample_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
ohe = OneHotEncoder()

In [30]:
ohe.fit(sample_attributes)

In [31]:
ohe.feature_names_in_

array(['encounter_id', 'patient_nbr', 'admission_type_id', ...,
       'change_No', 'diabetesMed_No', 'diabetesMed_Yes'], dtype=object)

In [32]:
# ohe.categories_

In [33]:
sample_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [34]:
categorical_columns = sample_data.dtypes[sample_data.dtypes == np.object_].index.values

In [35]:
categorical_columns = categorical_columns[:-1]

In [36]:
sample_data.dtypes[sample_data.dtypes != np.object_].index.values[2:]

array(['admission_type_id', 'discharge_disposition_id',
       'admission_source_id', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_outpatient',
       'number_emergency', 'number_inpatient', 'number_diagnoses'],
      dtype=object)

In [37]:
numerical_columns = [
    'admission_type_id', 'discharge_disposition_id',
    'time_in_hospital', 'num_lab_procedures',
    'num_procedures', 'num_medications', 'number_outpatient',
    'number_emergency', 'number_inpatient', 'number_diagnoses'
]

In [38]:
categorical = ColumnTransformer([
    ("categorical", OneHotEncoder(), categorical_columns),
    ("numerical", MinMaxScaler(), numerical_columns)
], remainder = "passtrought")

In [39]:
categorical

In [40]:
FunctionTransformer(np.log10).transform(sample_attributes[numerical_columns])

  result = func(self.values, **kwargs)


Unnamed: 0,admission_type_id,discharge_disposition_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
35956,0.000000,0.000000,1.041393,1.832509,-inf,1.301030,-inf,-inf,-inf,0.698970
60927,0.000000,0.000000,0.000000,1.301030,-inf,0.845098,-inf,-inf,-inf,0.903090
79920,0.477121,0.778151,0.602060,1.322219,0.477121,1.361728,0.0,-inf,0.30103,0.845098
50078,0.301030,0.000000,1.079181,1.447158,-inf,1.278754,-inf,-inf,0.00000,0.845098
44080,0.000000,0.301030,0.000000,1.322219,-inf,0.778151,-inf,-inf,-inf,0.845098
...,...,...,...,...,...,...,...,...,...,...
35562,0.778151,0.000000,1.079181,1.812913,0.301030,1.278754,-inf,-inf,-inf,0.954243
98563,0.845098,0.477121,0.778151,1.785330,0.000000,1.146128,-inf,-inf,-inf,0.954243
88066,0.000000,0.000000,0.477121,1.447158,0.778151,0.778151,-inf,-inf,-inf,0.477121
55955,0.698970,0.000000,0.477121,1.431364,-inf,1.000000,0.0,0.0,0.00000,0.903090


In [41]:
log_scaler = FunctionTransformer(lambda x: np.log10(x + EPS))

In [42]:
number_processor = Pipeline([
    ("log_transformer", FunctionTransformer(np.log10)),
    ("minmax", MinMaxScaler())
])

In [43]:
number_processor

In [50]:
preprocessor = ColumnTransformer([
    ("categorical", OneHotEncoder(), categorical_columns),
    ("numerical", MinMaxScaler(), numerical_columns)
], remainder = "drop")

In [75]:
pipeline = Pipeline([
    ("preprocess", preprocessor),
    ("classifies", LogisticRegression())
])

pipeline e estimator.

In [76]:
pipeline.steps

[('preprocess',
  ColumnTransformer(transformers=[('categorical', OneHotEncoder(),
                                   array(['race', 'gender', 'age', 'weight', 'payer_code',
         'medical_specialty', 'diag_1', 'diag_2', 'diag_3', 'max_glu_serum',
         'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
         'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
         'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone',
         'acar...
         'glipizide-metformin', 'glimepiride-pioglitazone',
         'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
         'diabetesMed'], dtype=object)),
                                  ('numerical', MinMaxScaler(),
                                   ['admission_type_id',
                                    'discharge_disposition_id',
                                    'time_in_hospital', 'num_lab_procedures',
                                    'num_procedures', 'num_medications',
               

In [68]:
samples_attributes = sample_data.drop(columns = "readmitted")
sample_target = sample_data.readmitted

In [69]:
np.log10(0 + 1e-6)

-6.0

In [73]:
# pipeline.fit(sample_attributes, sample_target)

In [64]:
# pipeline.score(sample_attributes, sample_target)

### Така се запазва модел да живее в рам паметта на компютъра:

In [78]:
pickle.dump(pipeline, open("prediction_pipeline.pkl", "wb"))

In [79]:
pickle.dump(logistic_regression, open("prediction_log_regression.pkl", "wb"))

In [83]:
model = pickle.load(open("prediction_log_regression.pkl", "rb"))

In [84]:
model.coef_

array([[-0.19861529,  0.14901103, -0.12530781, ..., -0.02687513,
        -0.10804452,  0.0203764 ],
       [-0.71627926,  0.57149708,  0.16718931, ..., -0.00600077,
        -0.03499498,  0.03864166],
       [ 0.91489454, -0.72050811, -0.0418815 , ...,  0.0328759 ,
         0.14303949, -0.05901806]])

In [85]:
pipeline.steps[0][1].feature_names_in_

array(['encounter_id', 'patient_nbr', 'admission_type_id', ...,
       'change_No', 'diabetesMed_No', 'diabetesMed_Yes'], dtype=object)

In [86]:
pipeline.steps[0][1]

In [89]:
diabetes_data = pd.read_csv("data/diabetic_data.csv")

In [90]:
sample_data = diabetes_data.sample(5000, random_state = 12341234)
attributes = sample_data.drop(columns = ["readmitted"])
target = sample_data.readmitted

In [91]:
pipeline.fit(attributes, target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


variance - са външни фактори.

Най-просто казано __high bias__ има алгоритъм дори данните, които сме му показали. Очакваме да се справи по-добре на тях и той още бърка.

__High variance__ познава данните добре, но е хаотичен.