In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import sklearn.model_selection as MS


In [2]:
train = pd.read_csv("TRAIN.csv",index_col=0)
test = pd.read_csv("TEST.csv",index_col=0)

In [3]:
X = train.drop("readmitted", axis=1)
y = train["readmitted"]
X_test = test.drop("readmitted", axis=1)
y_test = test["readmitted"]

In [None]:
# ?Add others? Bagging or sth, work on previous work of M-B group

# Random Forest

In [4]:
# Without tampering probabilities nor any unbalanced method:
cv = MS.KFold(n_splits=10)
clf = RandomForestClassifier(n_estimators = 100)
y_pred = MS.cross_val_predict(clf, X, y, cv = cv)
print(classification_report(y, y_pred))
# Dismally low results, to be expected given the lack of examples of class 0.

              precision    recall  f1-score   support

           0       0.53      0.01      0.02      7755
           1       0.89      1.00      0.94     60750

   micro avg       0.89      0.89      0.89     68505
   macro avg       0.71      0.51      0.48     68505
weighted avg       0.85      0.89      0.84     68505



In [6]:
# Tampering with probablities:
clf = RandomForestClassifier(n_estimators = 100)
y_probs = MS.cross_val_predict(clf, X, y, cv = cv, method='predict_proba')

In [7]:
for thr in np.arange(0.1, 0.2, 0.02):
    y_pred = [0 if p>thr else 1 for p in y_probs[:,0]]
    print("Threshold: "+ str(thr) + "\n", classification_report(y, y_pred))

Threshold: 0.1
               precision    recall  f1-score   support

           0       0.15      0.66      0.24      7755
           1       0.92      0.52      0.67     60750

   micro avg       0.54      0.54      0.54     68505
   macro avg       0.54      0.59      0.46     68505
weighted avg       0.84      0.54      0.62     68505

Threshold: 0.12000000000000001
               precision    recall  f1-score   support

           0       0.16      0.56      0.25      7755
           1       0.92      0.62      0.74     60750

   micro avg       0.62      0.62      0.62     68505
   macro avg       0.54      0.59      0.50     68505
weighted avg       0.83      0.62      0.69     68505

Threshold: 0.14
               precision    recall  f1-score   support

           0       0.17      0.48      0.25      7755
           1       0.91      0.71      0.80     60750

   micro avg       0.68      0.68      0.68     68505
   macro avg       0.54      0.59      0.53     68505
weighted 

In [8]:
# Tampering with dataset distribution (SMOTE)
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state=20)
X_n, y_n = sm.fit_sample(X, y)
print('Old dataset shape {}'.format(Counter(y)))
print('New dataset shape {}'.format(Counter(y_n)))

Old dataset shape Counter({1: 60750, 0: 7755})
New dataset shape Counter({1: 60750, 0: 60750})


In [9]:
# As a note, we've seen studies do something like this, but it is cheating:
clf = RandomForestClassifier(n_estimators = 100) 
y_pred = MS.cross_val_predict(clf, X_n, y_n, cv = cv)
print(classification_report(y_n, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.89      0.94     60750
           1       0.90      0.99      0.95     60750

   micro avg       0.94      0.94      0.94    121500
   macro avg       0.95      0.94      0.94    121500
weighted avg       0.95      0.94      0.94    121500



In [10]:
# What should be done instead:
from sklearn.model_selection import train_test_split
X2, X2_test, y2, y2_test = train_test_split(X, y, random_state=42,test_size=0.3)
sm = SMOTE(random_state=20)
X_n, y_n = sm.fit_sample(X2, y2)

In [11]:
clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X_n, y_n)
y_pred=clf.predict(X2_test)
print(classification_report(y2_test, y_pred))
# Not better than first try, this is the real outcome and not the one with CV_predictions

              precision    recall  f1-score   support

           0       0.34      0.03      0.05      2279
           1       0.89      0.99      0.94     18273

   micro avg       0.89      0.89      0.89     20552
   macro avg       0.62      0.51      0.49     20552
weighted avg       0.83      0.89      0.84     20552



In [12]:
# Using both techniques simultaneously
y_probs = clf.predict_proba(X2_test)

In [13]:
for thr in np.arange(0.1, 0.25, 0.02):
    y_pred = [0 if p>thr else 1 for p in y_probs[:,0]]
    print("Threshold: "+ str(thr) + "\n", classification_report(y2_test, y_pred))
# Best results are with Upsampling and probablities at 0.2 threshold.

Threshold: 0.1
               precision    recall  f1-score   support

           0       0.13      0.75      0.23      2279
           1       0.93      0.40      0.56     18273

   micro avg       0.44      0.44      0.44     20552
   macro avg       0.53      0.57      0.39     20552
weighted avg       0.84      0.44      0.52     20552

Threshold: 0.12000000000000001
               precision    recall  f1-score   support

           0       0.14      0.67      0.23      2279
           1       0.92      0.49      0.64     18273

   micro avg       0.51      0.51      0.51     20552
   macro avg       0.53      0.58      0.44     20552
weighted avg       0.84      0.51      0.60     20552

Threshold: 0.14
               precision    recall  f1-score   support

           0       0.15      0.60      0.24      2279
           1       0.92      0.57      0.70     18273

   micro avg       0.57      0.57      0.57     20552
   macro avg       0.53      0.58      0.47     20552
weighted 

In [14]:
# Final Test Random forest score:
sm = SMOTE(random_state=20)
X_n, y_n = sm.fit_sample(X, y)
clf = RandomForestClassifier(n_estimators = 100)
clf.fit(X_n, y_n)
y_probs = clf.predict_proba(X_test)
thr = 0.2
y_pred = [0 if p>thr else 1 for p in y_probs[:,0]]
print("Random forest final report", classification_report(y_test, y_pred))

Random forest final report               precision    recall  f1-score   support

           0       0.19      0.39      0.26      3409
           1       0.91      0.78      0.84     25951

   micro avg       0.74      0.74      0.74     29360
   macro avg       0.55      0.59      0.55     29360
weighted avg       0.82      0.74      0.77     29360



# AdaBoost