### Classifying and predicting early hospital readmissions:

Using supervised learning to classify whether diabetic patients are readmitted to hospital and, if they are, whether the readmission happens within 30 days or after 30 days.

Using the dataset from here: https://archive.ics.uci.edu/ml/datasets/Diabetes+130-US+hospitals+for+years+1999-2008

In [1]:
%pylab inline

import pandas as pd
import patsy as patsy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# SMOTE
from imblearn.over_sampling import SMOTE

# Undersampling
from imblearn.under_sampling import RandomUnderSampler

from sklearn import linear_model
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import pickle

%config InlineBackend.figure_format = 'svg'
sns.set_style("white")

Populating the interactive namespace from numpy and matplotlib


In [2]:
def _load_pickle(filename):
    """Return the object deserialized from the pickle file `filename`.

    The file is opened in binary mode and closed again before returning.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)


# Full feature matrix and labels.
x_liv = _load_pickle("x_liv.pkl")
y_liv = _load_pickle("y_liv.pkl")

# Train / test labels.
y_train_liv = _load_pickle("y_train_liv.pkl")
y_test_liv = _load_pickle("y_test_liv.pkl")

# Train / test features (unscaled, judging by the file names).
x_train_liv = _load_pickle("x_train_liv.pkl")
x_test_liv = _load_pickle("x_test_liv.pkl")

# Train / test / full features, scaled variants (judging by the file names).
x_train_scaled_liv = _load_pickle("x_train_scaled_liv.pkl")
x_test_scaled_liv = _load_pickle("x_test_scaled_liv.pkl")
x_scaled_liv = _load_pickle("x_scaled_liv.pkl")

# NOTE(review): presumably the patient table with ICD-9 codes -- confirm
# against the notebook that wrote this file.
patientdataICD9_liv = _load_pickle("patientdataICD9_liv.pkl")


In [3]:
# Short aliases for the rest of the notebook. The *scaled* feature matrices are
# used for the full set, the training split and the test split; the labels are
# the matching unscaled-independent label objects loaded from pickle.
x, y = x_scaled_liv, y_liv
x_train, y_train = x_train_scaled_liv, y_train_liv
x_test, y_test = x_test_scaled_liv, y_test_liv

In [4]:
# Binarize the target: every '>30' label is rewritten to 'NO' in the full,
# test and training label sets, using the same replacement for all three.
y, y_test, y_train = (
    labels.str.replace('>30','NO') for labels in (y, y_test, y_train)
)

In [None]:
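# The train/test split is now loaded from the pickle files instead of being
# recomputed here; the original call is kept below for reference only.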

# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42, stratify=y)

In [None]:
# Linear SVM on the scaled training split.
# class_weight="balanced" is used because the classes are imbalanced.
# random_state is pinned (same seed value used elsewhere in this notebook) so
# that refitting gives the same model and the printed metrics are reproducible.

linearSVMmodel = svm.LinearSVC(class_weight="balanced", random_state=42)
linearSVMmodel.fit(x_train, y_train)

# Evaluate on the held-out test split.
y_pred = linearSVMmodel.predict(x_test)

print("Accuracy: %.3f"% metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

# Previous values, from the three-class labels (<30 / >30 / NO) with scaling done before the split:

#Accuracy: 0.555
#             precision    recall  f1-score   support

#        <30       0.24      0.20      0.22      2839
#        >30       0.48      0.44      0.46      8887
#         NO       0.65      0.70      0.67     13716

#avg / total       0.54      0.55      0.55     25442




Accuracy: 0.677
              precision    recall  f1-score   support

         <30       0.19      0.56      0.28      2839
          NO       0.92      0.69      0.79     22190

   micro avg       0.68      0.68      0.68     25029
   macro avg       0.56      0.63      0.54     25029
weighted avg       0.84      0.68      0.73     25029



In [None]:
# SVC with its default kernel, class-weighted to compensate for the class
# imbalance. fit() returns the estimator itself, so it can be chained.
RBFSVMmodel = svm.SVC(class_weight="balanced").fit(x_train, y_train)

# Score the held-out test split and report accuracy plus per-class metrics.
y_pred = RBFSVMmodel.predict(x_test)

print(
    "Accuracy: %.3f"% metrics.accuracy_score(y_test, y_pred),
    metrics.classification_report(y_test, y_pred),
    sep="\n",
)



#### SMOTE:

In [None]:
# Oversample the training split only (the test split is left untouched).
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias used previously is no longer available in recent releases.
sm = SMOTE(random_state=42)
x_train_smote, y_train_smote = sm.fit_resample(x_train, y_train)

#### Linear SVM with SMOTE:

In [None]:
# Trains on the SMOTE-resampled data: x_train_smote, y_train_smote
# random_state is pinned so that refitting reproduces the same metrics.

linearSVMmodel = svm.LinearSVC(class_weight="balanced", random_state=42)
linearSVMmodel.fit(x_train_smote, y_train_smote)

# Evaluation is still done on the original (non-resampled) test split.
y_pred = linearSVMmodel.predict(x_test)

print("Accuracy: %.3f"% metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

#### Kernel RBF SVM with SMOTE:

In [None]:
# SVC (default kernel) fitted on the SMOTE-resampled training data;
# class_weight="balanced" is kept as in the other SVM cells.
RBFSVMmodel = svm.SVC(class_weight="balanced").fit(x_train_smote, y_train_smote)

# Predictions are made on the original test split.
y_pred = RBFSVMmodel.predict(x_test)

print(
    "Accuracy: %.3f"% metrics.accuracy_score(y_test, y_pred),
    metrics.classification_report(y_test, y_pred),
    sep="\n",
)

#### Random undersampling:

In [None]:
# Randomly undersample the training split only (the test split is untouched).
# `fit_resample` is the current imbalanced-learn API; the older `fit_sample`
# alias used previously is no longer available in recent releases.
rus = RandomUnderSampler(random_state=0)
x_train_undersampled, y_train_undersampled = rus.fit_resample(x_train, y_train)

#### Linear SVM with random undersampling:

In [None]:
# Trains on the undersampled data: x_train_undersampled, y_train_undersampled

# using class_weight="balanced" is not actually necessary here
# since the samples are equal now, but hopefully it's not hurting anything
# random_state is pinned so that refitting reproduces the same metrics.

linearSVMmodel = svm.LinearSVC(class_weight="balanced", random_state=42)
linearSVMmodel.fit(x_train_undersampled, y_train_undersampled)

# Evaluation is still done on the original (non-resampled) test split.
y_pred = linearSVMmodel.predict(x_test)

print("Accuracy: %.3f"% metrics.accuracy_score(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

#### Kernel RBF SVM with random undersampling:

In [None]:
# SVC (default kernel) fitted on the randomly undersampled training data;
# class_weight="balanced" is kept as in the other SVM cells.
RBFSVMmodel = svm.SVC(class_weight="balanced").fit(
    x_train_undersampled, y_train_undersampled
)

# Predictions are made on the original test split.
y_pred = RBFSVMmodel.predict(x_test)

print(
    "Accuracy: %.3f"% metrics.accuracy_score(y_test, y_pred),
    metrics.classification_report(y_test, y_pred),
    sep="\n",
)