In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.model_selection as MS


In [2]:
train = pd.read_csv("TRAIN.csv",index_col=0)
test = pd.read_csv("TEST.csv",index_col=0)

In [5]:
X = train.drop("readmitted", axis=1)
y = train["readmitted"]
X = test.drop("readmitted", axis=1)
y = test["readmitted"]

In [None]:
# ?Add others? Bagging or sth, work on previous work of M-B group

## Random Forest

In [6]:
# Without tampering probabilities nor any unbalanced method:
cv = MS.KFold(n_splits=10)
clf = RandomForestClassifier(n_estimators = 100)
y_pred = MS.cross_val_predict(clf, X, y, cv = cv)
print(classification_report(y, y_pred))
# Dismally low results, to be expected given the lack of examples of class 0.

              precision    recall  f1-score   support

           0       0.34      0.00      0.01      3409
           1       0.88      1.00      0.94     25951

   micro avg       0.88      0.88      0.88     29360
   macro avg       0.61      0.50      0.47     29360
weighted avg       0.82      0.88      0.83     29360



In [8]:
# Tampering with probablities:
clf = RandomForestClassifier(n_estimators = 100)
y_probs = MS.cross_val_predict(clf, X, y, cv = cv, method='predict_proba')

In [14]:
for thr in np.arange(0.1, 0.2, 0.02):
    y_pred = [0 if p>thr else 1 for p in y_probs[:,0]]
    print("Threshold: "+ str(thr) + "\n", classification_report(y, y_pred))

Threshold: 0.1
               precision    recall  f1-score   support

           0       0.15      0.67      0.24      3409
           1       0.92      0.50      0.65     25951

   micro avg       0.52      0.52      0.52     29360
   macro avg       0.53      0.58      0.45     29360
weighted avg       0.83      0.52      0.60     29360

Threshold: 0.12000000000000001
               precision    recall  f1-score   support

           0       0.16      0.57      0.25      3409
           1       0.91      0.61      0.73     25951

   micro avg       0.60      0.60      0.60     29360
   macro avg       0.54      0.59      0.49     29360
weighted avg       0.83      0.60      0.67     29360

Threshold: 0.14
               precision    recall  f1-score   support

           0       0.17      0.47      0.25      3409
           1       0.91      0.69      0.79     25951

   micro avg       0.67      0.67      0.67     29360
   macro avg       0.54      0.58      0.52     29360
weighted 

In [16]:
# Tampering with dataset distribution (SMOTE)
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=20)
X_n, y_n = sm.fit_sample(X, y)
print('Old dataset shape {}'.format(Counter(y)))
print('New dataset shape {}'.format(Counter(y_n)))

Old dataset shape Counter({1: 25951, 0: 3409})
New dataset shape Counter({1: 25951, 0: 25951})


In [17]:
# As a note, we've seen studies do something like this, but it is cheating:
clf = RandomForestClassifier(n_estimators = 100) 
y_pred = MS.cross_val_predict(clf, X_n, y_n, cv = cv)
print(classification_report(y_n, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.89      0.94     25951
           1       0.90      0.99      0.94     25951

   micro avg       0.94      0.94      0.94     51902
   macro avg       0.95      0.94      0.94     51902
weighted avg       0.95      0.94      0.94     51902



In [19]:
# What should be done instead:
from sklearn.model_selection import train_test_split
X2, X2_test, y2, y2_test = train_test_split(X, y, random_state=42,test_size=0.3)
sm = SMOTE(random_state=20)
X_n, y_n = sm.fit_sample(X2, y2)

In [25]:
clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X_n, y_n)
y_pred=clf.predict(X2_test)
print(classification_report(y2_test, y_pred))
# Not better than first try, this is the real outcome and not the one with CV_predictions

              precision    recall  f1-score   support

           0       0.23      0.01      0.02      1039
           1       0.88      1.00      0.94      7769

   micro avg       0.88      0.88      0.88      8808
   macro avg       0.56      0.50      0.48      8808
weighted avg       0.81      0.88      0.83      8808



In [26]:
# Using both techniques simultaneously
y_probs = clf.predict_proba(X2_test)

In [29]:
for thr in np.arange(0.1, 0.25, 0.02):
    y_pred = [0 if p>thr else 1 for p in y_probs[:,0]]
    print("Threshold: "+ str(thr) + "\n", classification_report(y2_test, y_pred))

Threshold: 0.1
               precision    recall  f1-score   support

           0       0.14      0.80      0.24      1039
           1       0.93      0.35      0.51      7769

   micro avg       0.40      0.40      0.40      8808
   macro avg       0.53      0.57      0.37      8808
weighted avg       0.84      0.40      0.48      8808

Threshold: 0.12000000000000001
               precision    recall  f1-score   support

           0       0.15      0.72      0.25      1039
           1       0.92      0.44      0.60      7769

   micro avg       0.48      0.48      0.48      8808
   macro avg       0.54      0.58      0.42      8808
weighted avg       0.83      0.48      0.56      8808

Threshold: 0.14
               precision    recall  f1-score   support

           0       0.16      0.64      0.25      1039
           1       0.92      0.54      0.68      7769

   micro avg       0.55      0.55      0.55      8808
   macro avg       0.54      0.59      0.47      8808
weighted 

In [None]:
# Best results are with Upsampling and probablities at 0.2 threshold.
