In [85]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from scipy import stats
import numpy as np

# Read data

In [86]:
# Normalisation switch. The normalisation cell tests `norm == 'True'`, so the
# string 'True' enables it and any other value disables it.
norm = 'True'
# Load the data set from a CSV file in the working directory.
wcct = pd.read_csv('WCCT_data.csv')

In [87]:
# Preview the first five rows of the raw table.
wcct.head(n=5)

Unnamed: 0,VOLUNTEER,DAY,B.cells.plasma.STAT5+,Basophils,Bcells,Bcells.CSM,Bcells.NCSM,Bcells.plasma,CD66+,cMCs,...,Tcells.CD8+.CD161+,Tcells.CD8+.CD38+,Tcells.CD8+.CD38+Ki67+,Tcells.CD8+.Effector.CD38+,Tcells.CD8+.Effector.CD38+Ki67+,Tcells.CD8+.Memory,Tcells.CD8+.Memory.CD38+,Tcells.CD8+.Memory.CD38+Ki67+,Tcells.CD8+CD45RA+CD27-,label
0,101,Baseline 1,0.003108,1.71235,5.39499,1.103238,0.643297,0.046616,58.753327,25.943191,...,0.829759,0.180247,0.105662,0.015539,0.006215,4.583877,0.087016,0.065262,1.945429,1
1,101,Baseline 2,0.0,1.545919,6.057033,1.218394,0.768593,0.026202,54.366529,24.429014,...,0.790428,0.231451,0.152845,0.017468,0.008734,4.908511,0.061138,0.056771,1.248963,1
2,101,1,0.0,1.537663,5.168144,1.034428,0.571132,0.027958,66.719099,29.427271,...,0.635035,0.195703,0.1318,0.0,0.0,3.993929,0.095854,0.063903,1.282051,1
3,101,2,0.003162,1.479935,4.920469,0.961326,0.6451,0.044272,66.973242,26.581918,...,0.562881,0.25298,0.123328,0.006325,0.003162,4.003415,0.091705,0.050596,1.862568,1
4,101,3,0.0,1.613706,6.140787,1.069585,0.609415,0.04353,64.295463,27.734594,...,0.513028,0.195883,0.071513,0.006219,0.003109,3.867919,0.090169,0.049748,1.402276,1


# Preprocess data

In [88]:
def reverse_days(x):
    """Convert a raw DAY value into an integer day code.

    'Baseline 1' maps to -1 and 'Baseline 2' maps to 1. Any other value is
    passed through ``int`` and shifted up by one (so '1' becomes 2).

    Raises whatever ``int(x)`` raises when ``x`` is neither a baseline name
    nor convertible to an integer.
    """
    baseline_codes = (('Baseline 1', -1), ('Baseline 2', 1))
    for baseline_name, day_code in baseline_codes:
        if x == baseline_name:
            return day_code
    # Numbered days are shifted by one because 'Baseline 2' occupies code 1.
    return int(x) + 1

In [89]:
# Recode DAY to integers (Baseline 1 -> -1, Baseline 2 -> 1, day k -> k + 1).
# NOTE: not idempotent; running this cell twice shifts the numbered days again.
wcct['DAY'] = wcct['DAY'].apply(reverse_days)

In [90]:
# Optional baseline normalisation: subtract, per volunteer, the values of that
# volunteer's first row from every row of the same volunteer.
# `str(norm)` accepts both the string 'True' and the boolean True, so switching
# the flag to a real bool no longer silently skips this step.
if str(norm) == 'True':
    # Everything except the identifier, the time point and the class label is
    # treated as a measurement column to be normalised.
    feature_mask = ~wcct.columns.isin(['VOLUNTEER', 'DAY', 'label'])
    feature_names = list(wcct.columns[feature_mask])
    # First row of each volunteer, broadcast back to the shape of the frame.
    # NOTE(review): this assumes each volunteer's rows are ordered so that the
    # first one is the baseline sample (as in the preview for volunteer 101);
    # confirm for the whole file.
    baseline = wcct.groupby('VOLUNTEER')[feature_names].transform(lambda x: x.iloc[0])
    wcct.loc[:, feature_mask] = wcct.loc[:, feature_mask].subtract(baseline)

In [91]:
# Remove the time points that are not analysed. With the recoding done by
# reverse_days, -1 is 'Baseline 1'; 29 and 60 correspond to raw DAY values
# 28 and 59.
excluded_days = [-1, 29, 60]
keep_rows = ~wcct['DAY'].isin(excluded_days)
wcct = wcct[keep_rows]
wcct.head()

Unnamed: 0,VOLUNTEER,DAY,B.cells.plasma.STAT5+,Basophils,Bcells,Bcells.CSM,Bcells.NCSM,Bcells.plasma,CD66+,cMCs,...,Tcells.CD8+.CD161+,Tcells.CD8+.CD38+,Tcells.CD8+.CD38+Ki67+,Tcells.CD8+.Effector.CD38+,Tcells.CD8+.Effector.CD38+Ki67+,Tcells.CD8+.Memory,Tcells.CD8+.Memory.CD38+,Tcells.CD8+.Memory.CD38+Ki67+,Tcells.CD8+CD45RA+CD27-,label
1,101,1,-0.003108,-0.166431,0.662043,0.115156,0.125296,-0.020414,-4.386798,-1.514177,...,-0.039332,0.051204,0.047183,0.001929,0.002519,0.324634,-0.025878,-0.008491,-0.696466,1
2,101,2,-0.003108,-0.174687,-0.226846,-0.068811,-0.072165,-0.018658,7.965772,3.48408,...,-0.194725,0.015455,0.026137,-0.015539,-0.006215,-0.589948,0.008838,-0.001359,-0.663377,1
3,101,3,5.5e-05,-0.232415,-0.474521,-0.141913,0.001803,-0.002344,8.219915,0.638727,...,-0.266878,0.072733,0.017666,-0.009214,-0.003053,-0.580462,0.004689,-0.014666,-0.08286,1
4,101,4,-0.003108,-0.098644,0.745797,-0.033653,-0.033882,-0.003086,5.542136,1.791403,...,-0.316732,0.015636,-0.034149,-0.00932,-0.003106,-0.715959,0.003153,-0.015514,-0.543153,1
5,101,5,0.000222,-0.373733,1.464588,0.225389,0.095939,0.009992,5.695233,-0.27971,...,-0.220389,0.022876,-0.002436,-0.012209,-0.002886,-0.188419,-0.027078,-0.038623,-0.633451,1


In [92]:
def evaluate_preds(true, pred):
    """Score predicted scores against binary ground truth.

    Parameters
    ----------
    true : sequence of true class labels.
    pred : sequence of predicted scores, passed as-is to the ranking metrics
        and thresholded for the F1 score.

    Returns
    -------
    tuple ``(auc, pr, f_score)``: ``roc_auc_score``, ``average_precision_score``
    and ``f1_score`` in that order. The F1 score uses hard labels obtained
    with a strict ``> 0.5`` cut-off.
    """
    roc_auc = roc_auc_score(true, pred)
    avg_precision = average_precision_score(true, pred)
    # A score of exactly 0.5 is assigned to class 0.
    hard_labels = [int(score > 0.5) for score in pred]
    f1 = f1_score(true, hard_labels)
    return roc_auc, avg_precision, f1

## Random Forest

# Classification

In [93]:
# Per-day classification: for each day code, estimate how well a random forest
# separates the `label` classes from that day's measurements alone, using
# 5-fold stratified cross-validation repeated several times.
# Day codes are the shifted values produced by reverse_days, so `d - 1` is
# printed to report the original day number.
train_days = [2,3,4,5,6,7,8]
for d in train_days:
    # Select this day's rows once and split them into target and features.
    day_rows = wcct[wcct['DAY']==d]
    yw = day_rows['label']
    xw = day_rows.drop(['VOLUNTEER', 'DAY', 'label'], axis =1)
    aucs = []
    for j in range(3): # change 3 to 100 for bootstrapping
        # The folds are not shuffled, so every repeat sees the same splits;
        # repeats differ only through the forest's random seed.
        cv = StratifiedKFold(n_splits=5)
        # Seed the forest with the repeat index so the printed AUCs are
        # reproducible across kernel restarts.
        rf = RandomForestClassifier(random_state=j)
        for train, test in cv.split(xw, yw):
            rf.fit(xw.iloc[train], yw.iloc[train])
            # Probability of the second class in rf.classes_.
            test_preds = rf.predict_proba(xw.iloc[test])[:, 1]
            auc, pr, f_score = evaluate_preds(yw.iloc[test], test_preds)
            aucs.append(auc)
    # Mean ROC AUC over all folds of all repeats for this day.
    print(d-1, np.mean(aucs))

1 0.5277777777777778
2 0.7
3 0.7388888888888888
4 0.8527777777777779
5 0.836111111111111
6 0.913888888888889
7 0.8805555555555558


# Select shedders

In [94]:
# Keep only rows with label == 1 and drop volunteers 101, 109 and 304.
# NOTE(review): the reason for excluding these three volunteers is not stated
# in the notebook.
has_label_one = wcct['label'] == 1
is_dropped_volunteer = wcct['VOLUNTEER'].isin([101, 109, 304])
wcct = wcct[has_label_one & ~is_dropped_volunteer]

# Regression

In [95]:
# Leave-one-volunteer-out regression of the day code on the measurements.
# For each volunteer a forest is trained on all rows of the other volunteers
# and used to predict the DAY of the held-out volunteer's rows with day codes
# 1..8. True and predicted values are accumulated in `trus` / `preds`.
trus = []
preds = []
ids = wcct['VOLUNTEER'].unique()
eval_days = list(range(1,9))
for n, id_ in enumerate(ids):
    train = wcct[(wcct['VOLUNTEER']!=id_)]
    test = wcct[(wcct['VOLUNTEER']==id_) & (wcct['DAY'].isin(eval_days))]
    if test.empty:
        # Nothing to predict for this volunteer; predicting on an empty frame
        # would raise.
        continue
    # Stable sort keeps the same (day-ascending) order in which results were
    # appended when each day was predicted separately.
    test = test.sort_values('DAY', kind='stable')
    X_tr = train.drop(['VOLUNTEER','DAY','label'], axis =1)
    Y_tr = train['DAY']
    X_te = test.drop(['VOLUNTEER','DAY','label'], axis =1)
    Y_te = test['DAY']
    # The training set does not depend on the day being predicted, so one fit
    # per held-out volunteer is enough. Seeding makes the run reproducible.
    regr = RandomForestRegressor(random_state=n)
    regr.fit(X_tr, Y_tr)
    pr = regr.predict(X_te)
    trus.extend(Y_te)
    preds.extend(pr)

In [96]:
# Pearson correlation (and its p-value) between the true and predicted day
# codes collected by the regression loop.
pearson_result = stats.pearsonr(trus, preds)
correlation, p_value = pearson_result
(correlation, p_value)

(0.9189318765596359, 9.490663996782356e-53)