In [1]:
%matplotlib inline

In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC, SVC, OneClassSVM
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

# Tree and Ensamble Methods
## Classification: Live Demos

In [3]:
diabetes_data = pd.read_csv("diabetic_data.csv")

In [4]:
diabetes_data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
diabetes_data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,443847548,100162476,AfricanAmerican,Male,[70-80),?,1,3,7,3,...,No,Down,No,No,No,No,No,Ch,Yes,>30
101762,443847782,74694222,AfricanAmerican,Female,[80-90),?,1,4,5,5,...,No,Steady,No,No,No,No,No,No,Yes,NO
101763,443854148,41088789,Caucasian,Male,[70-80),?,1,1,7,1,...,No,Down,No,No,No,No,No,Ch,Yes,NO
101764,443857166,31693671,Caucasian,Female,[80-90),?,2,3,7,10,...,No,Up,No,No,No,No,No,Ch,Yes,NO


In [6]:
diabetes_data.dtypes

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [7]:
diabetes_data.shape

(101766, 50)

In [8]:
diabetes_target = diabetes_data["readmitted"]
diabetes_attributes = diabetes_data.drop("readmitted", axis = 1)

In [9]:
diabetes_attributes = pd.get_dummies(diabetes_attributes)

In [10]:
diabetes_attributes.shape

(101766, 2472)

In [11]:
diabetes_attributes_scaled = MinMaxScaler().fit_transform(diabetes_attributes)

In [14]:
all_data, _, all_targets, _ = train_test_split(
    diabetes_attributes_scaled,
    diabetes_target,
    train_size=0.1,
    random_state=42
)

In [15]:
attributes_train, attributes_test, targets_train, targets_test = train_test_split(
    all_data,
    all_targets,
    test_size=0.2,
    stratify=all_targets
)

In [16]:
attributes_train.shape

(8140, 2472)

In [29]:
svm = LinearSVC(C=10)

In [30]:
svm.fit(attributes_train, targets_train)



LinearSVC(C=10)

In [26]:
gaussian_svm = SVC(kernel="rbf", C = 10)

In [27]:
gaussian_svm.fit(attributes_train, targets_train)

SVC(C=10)

In [33]:
print(classification_report(targets_train, svm.predict(attributes_train)))

              precision    recall  f1-score   support

         <30       0.76      0.23      0.35       886
         >30       0.58      0.70      0.63      2894
          NO       0.74      0.74      0.74      4360

    accuracy                           0.67      8140
   macro avg       0.69      0.56      0.58      8140
weighted avg       0.69      0.67      0.66      8140



In [34]:
print(classification_report(targets_train, gaussian_svm.predict(attributes_train)))

              precision    recall  f1-score   support

         <30       0.99      0.43      0.60       886
         >30       0.85      0.81      0.83      2894
          NO       0.83      0.95      0.89      4360

    accuracy                           0.84      8140
   macro avg       0.89      0.73      0.77      8140
weighted avg       0.85      0.84      0.84      8140



In [35]:
print(classification_report(targets_test, svm.predict(attributes_test)))

              precision    recall  f1-score   support

         <30       0.16      0.05      0.08       222
         >30       0.42      0.49      0.45       724
          NO       0.61      0.61      0.61      1090

    accuracy                           0.51      2036
   macro avg       0.39      0.39      0.38      2036
weighted avg       0.49      0.51      0.50      2036



In [36]:
print(classification_report(targets_test, gaussian_svm.predict(attributes_test)))

              precision    recall  f1-score   support

         <30       0.23      0.03      0.06       222
         >30       0.45      0.40      0.42       724
          NO       0.59      0.74      0.66      1090

    accuracy                           0.54      2036
   macro avg       0.43      0.39      0.38      2036
weighted avg       0.50      0.54      0.51      2036



# KNN

In [106]:
knn = KNeighborsClassifier(n_neighbors=-1)

In [107]:
knn.fit(attributes_train, targets_train)

ValueError: Expected n_neighbors > 0. Got -1

In [83]:
knn.predict(attributes_train)

array(['NO', 'NO', 'NO', ..., 'NO', '>30', 'NO'], dtype=object)

In [84]:
print(classification_report(targets_train, knn.predict(attributes_train)))


              precision    recall  f1-score   support

         <30       0.41      0.58      0.48       886
         >30       0.72      0.63      0.67      2894
          NO       0.78      0.78      0.78      4360

    accuracy                           0.71      8140
   macro avg       0.64      0.66      0.64      8140
weighted avg       0.72      0.71      0.71      8140



In [85]:
print(classification_report(targets_test, knn.predict(attributes_test)))

              precision    recall  f1-score   support

         <30       0.15      0.22      0.18       222
         >30       0.38      0.32      0.35       724
          NO       0.57      0.58      0.57      1090

    accuracy                           0.45      2036
   macro avg       0.37      0.37      0.37      2036
weighted avg       0.46      0.45      0.45      2036



# One class SVM (anomaly hunting)

##### Example of non supervised learning as we don't have any built in metrics to score the model. We input a % of outliers (nu) and make the judgement for our selves of how well the model performed.

In [103]:
anomaly_detector = OneClassSVM(nu = 0.2) 

In [99]:
anomaly_detector.fit(attributes_train)

OneClassSVM(nu=0.2)

In [100]:
anomaly_detector.predict(attributes_train).sum()

4880

In [101]:
predictions = anomaly_detector.predict(attributes_train)

In [102]:
predictions.sum() / len(attributes_train) # Number of non outliers (ones)

0.5995085995085995