In [1]:
%matplotlib inline

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from sklearn.linear_model import LogisticRegression

# Model Training and Improvement

In [3]:
diabetes_data = pd.read_csv("data/diabetic_data.csv")

In [4]:
diabetes_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
diabetes_data.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [6]:
diabetes_data.columns[diabetes_data.columns.str.contains("_id")]

Index(['encounter_id', 'admission_type_id', 'discharge_disposition_id',
       'admission_source_id'],
      dtype='object')

In [7]:
diabetes_data.pioglitazone.unique()

array(['No', 'Steady', 'Up', 'Down'], dtype=object)

In [8]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [9]:
pd.get_dummies(diabetes_data, drop_first = True).dtypes

encounter_id                     int64
patient_nbr                      int64
admission_type_id                int64
discharge_disposition_id         int64
admission_source_id              int64
                                 ...  
metformin-pioglitazone_Steady    uint8
change_No                        uint8
diabetesMed_Yes                  uint8
readmitted_>30                   uint8
readmitted_NO                    uint8
Length: 2438, dtype: object

In [10]:
diabetes_attributes = diabetes_data.drop(columns = "readmitted")
diabetes_target = diabetes_data.readmitted

In [11]:
diabetes_attributes_dummies = pd.get_dummies(diabetes_attributes)

In [12]:
logistic_regression = LogisticRegression()

In [13]:
logistic_regression.fit(diabetes_attributes_dummies, diabetes_target)

LogisticRegression()

In [14]:
logistic_regression.coef_

array([[-3.31701395e-09, -3.86905107e-09, -1.15814534e-15, ...,
        -3.26369841e-16, -1.63318872e-16, -3.02903976e-16],
       [-1.50329991e-10,  4.51003945e-09, -1.47130836e-16, ...,
        -1.14848644e-16, -1.11876233e-16,  7.41720589e-17],
       [ 3.46734394e-09, -6.40988385e-10,  1.30527618e-15, ...,
         4.41218484e-16,  2.75195106e-16,  2.28731917e-16]])

__Да закачим цялята обработка на данните към модела и да я експортна заедно с него. Това са sklearn pipelines.__

In [15]:
scaler = MinMaxScaler()

In [16]:
diabetes_attributes_scaled = scaler.fit_transform(diabetes_attributes_dummies)

In [None]:
logistic_regression.fit(diabetes_attributes_scaled, diabetes_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
logistic_regression.score(diabetes_attributes_scaled, diabetes_target)

0.5950317394807696

In [19]:
logistic_regression.score(diabetes_attributes_dummies, diabetes_target)



0.5395908260126172

In [20]:
pipeline = Pipeline([
    ("scaler", MinMaxScaler()),
    ("model", LogisticRegression())
])

In [21]:
pipeline

Pipeline(steps=[('scaler', MinMaxScaler()), ('model', LogisticRegression())])

In [22]:
sample_data = diabetes_data.sample(5000, random_state=42)

In [23]:
sample_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
35956,110939484,19274094,Caucasian,Female,[70-80),?,1,1,6,11,...,No,Steady,No,No,No,No,No,No,Yes,NO
60927,170328306,65634327,Caucasian,Male,[50-60),?,1,1,1,1,...,No,No,No,No,No,No,No,No,Yes,NO
79920,245688426,100657359,Caucasian,Female,[60-70),?,3,6,1,4,...,No,No,No,No,No,No,No,No,Yes,NO
50078,150826224,83144448,Caucasian,Male,[30-40),?,2,1,1,12,...,No,No,No,No,No,No,No,No,Yes,>30
44080,135993852,65234214,AfricanAmerican,Female,[60-70),?,1,2,7,1,...,No,No,No,No,No,No,No,No,Yes,<30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35562,110096790,33502212,Caucasian,Male,[50-60),?,6,1,7,12,...,No,Down,No,No,No,No,No,Ch,Yes,NO
98563,402583472,141357506,AfricanAmerican,Male,[50-60),?,7,3,7,6,...,No,Steady,No,No,No,No,No,No,Yes,NO
88066,282442506,45759951,Caucasian,Female,[40-50),?,1,1,7,3,...,No,No,No,No,No,No,No,No,Yes,NO
55955,161139018,112510251,Caucasian,Female,[40-50),?,5,1,1,3,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [25]:
sample_attributes = sample_data.drop(columns = "readmitted")

In [27]:
sample_target = sample_data.readmitted

In [28]:
sample_attributes = pd.get_dummies(sample_attributes)

In [29]:
pipeline.fit(sample_attributes, sample_target)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('scaler', MinMaxScaler()), ('model', LogisticRegression())])