## Author: Min Shi
## Last updated: 4/29/2021

## Description:
The code was created to implement the scoring models to predict peripheral blood hematopoietic stem cell mobilization in allogeneic donors.

In [2]:
import numpy as np
import os
import matplotlib.pyplot as plt
from openpyxl import load_workbook
import csv
import pandas as pd
import sys
from matplotlib import pyplot
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# 1. Load the data

In [None]:
# Location of the input CSV. The second path component is a placeholder that
# has to be replaced with the real file name before running.
file = os.path.join('./', 'path of the input file')

# Short aliases for the two outcome columns used below.
column_aliases = {
    'Day 1 CD34 Absolute per uL': 'count',
    'Day 1 CD34/kg (x10^6)': 'optional_outcome',
}

# Read the table, index it by donor and apply the aliases.
dataset = pd.read_csv(file)
dataset = dataset.set_index('Patient ID')
dataset = dataset.rename(columns=column_aliases)

# Where the primary outcome ('count') is missing, fall back on the optional
# outcome: a value of at least 2.0 is recorded as a count of 100, a value
# below 2.0 as a count of 10. Rows where both outcomes are missing keep NaN.
missing_count = dataset['count'].isna()
fallback_high = dataset['optional_outcome'] >= 2.0
fallback_low = dataset['optional_outcome'] < 2.0
dataset.loc[missing_count & fallback_high, 'count'] = 100
dataset.loc[missing_count & fallback_low, 'count'] = 10

# The optional outcome has served its purpose and must not be used as a feature.
dataset = dataset.drop(columns=['optional_outcome'])


# 2. Data preprocessing
1. Map CD34+ counts to binary labels: Class 0 (count below 40) and Class 1 (count of 40 or above)

2. Normalize or scale features

In [None]:
# Drop every donor (row) that has at least one missing value. An explicit copy
# is taken so that the in-place edits below act on an independent frame and do
# not trigger pandas' chained-assignment warning.
dataset_remove_NaN = dataset.dropna(axis=0, how='any').copy()

# Binarize the outcome: count < 40 -> class 0, count >= 40 -> class 1.
dataset_remove_NaN['count'] = (dataset_remove_NaN['count'] >= 40).astype(np.int64)

# Alternative feature subsets that were tried (kept for reference):
# dataset_remove_NaN = dataset_remove_NaN[['count','Age','BMI','Sodium','Chloride','BUN','ALT','AST','WBC','RBC','Hgb','Hct', 'Platelet Ct',
#                    'MCV','MCH','MCHC','MPV','Neut Abs','Lymphocyte Abs']]
# dataset_remove_NaN = dataset_remove_NaN[['count','Age','BMI','WBC','RBC','Hct', 'Platelet Ct',
#                    'MCV','MCH']]
# dataset_remove_NaN = dataset_remove_NaN[['count','BMI','MCV','MCH']]

# Number of donors per class, keyed by class id in order of first appearance.
labels = list(dataset_remove_NaN['count'])
counter = {cls: labels.count(cls) for cls in dict.fromkeys(labels)}
print('Class distribution:', counter)

# Bar chart of the class distribution. The axes are created up front and every
# call goes through them, so no second, empty axes ends up on top of the bars,
# and the tick labels are derived from the classes that are actually present
# (a fixed 3-label list does not match 2 tick positions).
fig, ax = plt.subplots(figsize=(5.4, 4.3))
class_ids = sorted(counter)
ax.bar(class_ids, [counter[cls] for cls in class_ids])
ax.set_xticks(class_ids)
ax.set_xticklabels(['Class {}'.format(cls) for cls in class_ids])
ax.set_xlabel(
    'Distribution of Class 0 ({}) and Class 1 ({})'.format(counter.get(0, 0), counter.get(1, 0)),
    fontsize=14, labelpad=10)
ax.set_ylabel('Number of Donors', fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
pyplot.show()

# Min-max scale every feature column to [0, 1].
# NOTE(review): this assumes 'count' is the first column of the frame, which
# depends on the column order of the input CSV -- confirm.
# NOTE(review): the scaling statistics come from all donors, i.e. they are
# computed before any train/test split is made.
df_temp = dataset_remove_NaN.iloc[:, 1:]
col_min = df_temp.min()
col_range = df_temp.max() - col_min
# A constant column has zero range; divide by 1 instead so it maps to 0 rather
# than to NaN.
col_range = col_range.replace(0, 1)
dataset_remove_NaN.iloc[:, 1:] = (df_temp - col_min) / col_range

dataset_remove_NaN

In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter

# Donor identifiers (the frame's index) in row order.
donor_ids = np.array(dataset_remove_NaN.index)

# Split the frame into the label vector (first column) and the feature matrix
# (all remaining columns), and remember the feature names.
raw_inputs = dataset_remove_NaN.values
y, X = raw_inputs[:, 0], raw_inputs[:, 1:]
raw_features = np.array(dataset_remove_NaN.columns)[1:]

# Stratified 10-fold splitter shared by the model cells.
cv = StratifiedKFold(n_splits=10)

raw_features

In [None]:
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

# Label vector (first column), feature matrix (remaining columns) and the
# matching feature names.
raw_inputs = dataset_remove_NaN.values
raw_features = np.array(dataset_remove_NaN.columns)[1:]
y = raw_inputs[:, 0]
X = raw_inputs[:, 1:]

# Stratified 10-fold splitter shared by the model cells.
cv = StratifiedKFold(n_splits=10)

# Fit an extra-trees ensemble on all samples and read off the impurity-based
# feature importances, plus their spread across the individual trees.
clf = ExtraTreesClassifier(n_estimators=50, random_state=0).fit(X, y)
importances = clf.feature_importances_
std = np.array([tree.feature_importances_ for tree in clf.estimators_]).std(axis=0)

# Let SelectFromModel decide how many features to keep.
model = SelectFromModel(clf, prefit=True)
X_selected = model.transform(X)
feat_num = X_selected.shape[1]

# Indices of the `feat_num` most important features, least important first.
indx = np.argsort(importances, kind='stable')[-feat_num:].tolist()
print("{} selected features are :\n".format(feat_num), raw_features[indx], importances[indx],'\n')

# Bar chart of the selected importances with the per-tree spread as error bars.
positions = range(feat_num)
plt.figure()
plt.title("Feature importances", fontsize=13)
plt.bar(positions, importances[indx], color="r", yerr=std[indx], align="center")
plt.xticks(positions, raw_features[indx], rotation=90, fontsize=12)
plt.xlim([-1, feat_num])
plt.yticks(fontsize=12)
plt.show()

# T-SNE based Visualization of the two classes

In [None]:
# Project the feature matrix to 2-D with t-SNE and colour the points by class.
from sklearn.manifold import TSNE
import random

label_map = {}

# Distinct class values in ascending order.
groups = sorted(set(y))

embeddings = np.array(X)
labels = np.array(y)

# NOTE(review): newer scikit-learn releases renamed `n_iter` to `max_iter`;
# check the installed version before upgrading.
tsne = TSNE(perplexity=20, n_components=2, init='pca', n_iter=2000)
low_dim_embs = tsne.fit_transform(embeddings)

# Named colours; only the keys are used below, one per class, in this order.
colors = {
    'darkorange': '#FF8C00', 'deepskyblue': '#00BFFF', 'deeppink': '#FF1493',
    'darkgreen': '#006400', 'dimgray': '#696969', 'darkmagenta': '#8B008B',
    'darkred': '#8B0000', 'darkgray': '#A9A9A9', 'darkgoldenrod': '#B8860B',
    'aliceblue': '#F0F8FF', 'darkseagreen': '#8FBC8F', 'bisque': '#FFE4C4',
    'antiquewhite': '#FAEBD7', 'aquamarine': '#7FFFD4', 'azure': '#F0FFFF',
    'beige': '#F5F5DC', 'black': '#000000', 'blanchedalmond': '#FFEBCD',
    'blue': '#0000FF', 'blueviolet': '#8A2BE2', 'burlywood': '#DEB887',
    'cadetblue': '#5F9EA0', 'chartreuse': '#7FFF00', 'chocolate': '#D2691E',
    'coral': '#FF7F50', 'cornflowerblue': '#6495ED', 'crimson': '#DC143C',
    'cyan': '#00FFFF', 'darkblue': '#00008B', 'darkcyan': '#008B8B',
    'darkolivegreen': '#556B2F', 'darkorchid': '#9932CC', 'darksalmon': '#E9967A',
    'darkslateblue': '#483D8B', 'darkslategray': '#2F4F4F',
    'darkturquoise': '#00CED1', 'darkkhaki': '#BDB76B',
}
labs = ['Class 0', 'Class 1']

# One scatter series per class.
plt.figure(figsize=(4.5, 4.5))
for group, color_name in zip(groups, colors):
    members = labels == group
    plt.scatter(low_dim_embs[members, 0], low_dim_embs[members, 1],
                c=color_name, label=labs[int(group)], s=8)

plt.legend(fontsize=11, handletextpad=0.2)
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)
plt.show()

# 3. Methods to mitigate the class imbalance problem
1. Oversampling

2. SMOTE

3. Cost-Sensitive Penalty

## 3.1 SMOTE (Synthetic Minority Over-sampling Technique) 
https://arxiv.org/pdf/1106.1813.pdf

In [None]:
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# Oversample the minority class with SMOTE so both classes have equal size.
# NOTE(review): the resampling is applied to all samples, i.e. before the
# cross-validation split, so a training fold can contain synthetic samples
# that were interpolated from donors in the held-out fold.
oversample = SMOTE(random_state=42)
X_smote, y_smote = oversample.fit_resample(X, y)
counter = Counter(y_smote)
print('Class distribution:', counter)

# Bar chart of the resampled class distribution. The axes are created up front
# and every call goes through them, so no second, empty axes ends up on top of
# the bars, and the tick labels and counts in the axis label are derived from
# the data instead of being hard-coded.
fig, ax = plt.subplots(figsize=(5.4, 4.3))
class_ids = sorted(counter)
ax.bar(class_ids, [counter[cls] for cls in class_ids])
ax.set_xticks(class_ids)
ax.set_xticklabels(['Class {}'.format(int(cls)) for cls in class_ids])
ax.set_xlabel(
    'Distribution of Class 0 ({}) and Class 1 ({})'.format(counter[0], counter[1]),
    fontsize=14, labelpad=10)
ax.set_ylabel('Number of Donors', fontsize=14)
ax.tick_params(axis='both', which='major', labelsize=14)
pyplot.show()

In [1]:
# from imblearn.over_sampling import SMOTE
# from sklearn.preprocessing import LabelEncoder
# from collections import Counter

# # y_inputs = LabelEncoder().fit_transform(y_inputs)
# oversample = SMOTE(random_state=42)
# X_smote_selected, y_smote_selected = oversample.fit_resample(X_selected, y)
# counter = Counter(y_smote_selected)
# print('Class distribution:', counter)

# fig = plt.figure(figsize=(5.4, 4.3))
# pyplot.xlabel('Distribution of Class 0 (628) and Class 1 (628)', fontsize=14, labelpad=10)
# pyplot.ylabel('Number of Donors', fontsize=14)
# pyplot.bar(counter.keys(), counter.values())
# pyplot.xticks([r for r in range(len(counter.keys()))], ['Class 0', 'Class 1', '2'])
# ax = fig.add_subplot(111)
# ax.tick_params(axis='both', which='major', labelsize=14)
# pyplot.show()

In [97]:
# X_smote, y_smote = X, y
# counter = Counter(y_smote)
# print('Class distribution:', counter)

# fig = plt.figure(figsize=(5.4, 4.3))
# pyplot.xlabel('Distribution of Class 1 and Class 2', fontsize=14, labelpad=10)
# pyplot.ylabel('Number of Patients', fontsize=14)
# pyplot.bar(counter.keys(), counter.values())
# pyplot.xticks([r for r in range(len(counter.keys()))], ['0', '1', '2'])
# ax = fig.add_subplot(111)
# ax.tick_params(axis='both', which='major', labelsize=14)
# pyplot.show()

# 4. Build the prediction model
1. Decision Tree (DT)  

2. Logistic Regression (LR) 

3. Random Forest (RF) 

4. Feedforward Neural Networks (FNN)

5. Support Vector Machine (SVM) 

6. AdaBoost

7. GradientBoosting


## 4.1 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Decision tree, evaluated with the shared stratified splitter `cv`.
clf = DecisionTreeClassifier(random_state=0, max_depth=17, criterion='entropy', min_samples_split=4)

# Out-of-fold class probabilities for every row of the resampled data.
# `X_smote_selected` / `y_smote_selected` are only assigned in a cell that is
# commented out, so this cell uses `X_smote` / `y_smote`, the same inputs as
# the Random Forest, MLP, SVM and boosting cells.
# NOTE(review): the folds are drawn from data that was resampled before
# splitting, so training folds may contain synthetic neighbours of test donors.
scores = np.zeros((len(y_smote), len(np.unique(y_smote))))
for train, test in cv.split(X_smote, y_smote):
    clf.fit(X_smote[train], y_smote[train])
    scores[test] = clf.predict_proba(X_smote[test])

# Keep only the rows of real donors: row i of `map_scores` is the score of
# donor_ids[i]. Assumes the first len(donor_ids) rows of X_smote are the
# original samples in their original order -- TODO confirm for the sampler.
map_scores = scores[:len(donor_ids)]
y_predict = np.argmax(map_scores, axis=1)

# Threshold metrics use the predicted label; the ranking metrics (AUC, ROC)
# use the class-1 probability so the curve is not collapsed to a single point.
accuracy_dt = accuracy_score(y, y_predict)
confusion_mat_dt = confusion_matrix(y, y_predict)
f1_dt = f1_score(y, y_predict)
auc_dt = roc_auc_score(y, map_scores[:, 1])
fpr_dt, tpr_dt, _ = roc_curve(y, map_scores[:, 1])

print('Pred Accuray=%.3f'% accuracy_dt)
print('confusion_mat:\n', confusion_mat_dt)
print('F1=%.3f Auc=%.3f' % (f1_dt, auc_dt))

# ROC curve of this model.
pyplot.plot(fpr_dt, tpr_dt, marker='.', label='Decision Tree')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

# Save the per-donor prediction scores. The writer is created here and reused
# by the following model cells, one sheet per model.
df_dt = pd.DataFrame(map_scores, index=donor_ids,  columns=['class 0', 'Class 1'])

writer = pd.ExcelWriter("model_prediction_scores_new.xlsx")
df_dt.to_excel(writer, sheet_name='Decision Tree')
# NOTE(review): ExcelWriter.save() was deprecated in pandas 1.5 and removed in
# pandas 2.0; on newer pandas the workbook has to be written with close().
writer.save()



## 4.2 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression, evaluated with the shared stratified splitter `cv`.
# `multi_class` is left at its default: the target here is binary, and the
# explicit argument is deprecated in newer scikit-learn releases.
clf = LogisticRegression(random_state=0, max_iter=1000)

# Out-of-fold class probabilities for every row of the resampled data.
# `X_smote_selected` / `y_smote_selected` are only assigned in a cell that is
# commented out, so this cell uses `X_smote` / `y_smote`, the same inputs as
# the Random Forest, MLP, SVM and boosting cells.
scores = np.zeros((len(y_smote), len(np.unique(y_smote))))
for train, test in cv.split(X_smote, y_smote):
    clf.fit(X_smote[train], y_smote[train])
    scores[test] = clf.predict_proba(X_smote[test])

# Keep only the rows of real donors: row i of `map_scores` is the score of
# donor_ids[i]. Assumes the first len(donor_ids) rows of X_smote are the
# original samples in their original order -- TODO confirm for the sampler.
map_scores = scores[:len(donor_ids)]
y_predict = np.argmax(map_scores, axis=1)

# Threshold metrics use the predicted label; the ranking metrics (AUC, ROC)
# use the class-1 probability so the curve is not collapsed to a single point.
accuracy_lr = accuracy_score(y, y_predict)
confusion_mat_lr = confusion_matrix(y, y_predict)
f1_lr = f1_score(y, y_predict)
auc_lr = roc_auc_score(y, map_scores[:, 1])
fpr_lr, tpr_lr, _ = roc_curve(y, map_scores[:, 1])

print('Prediction accuray:', accuracy_lr)
print('confusion_mat:\n', confusion_mat_lr)
print('F1=%.3f Auc=%.3f' % (f1_lr, auc_lr))

# ROC curve of this model (the classifier is a logistic regression).
pyplot.plot(fpr_lr, tpr_lr, marker='.', label='Logistic Regression')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

# Save the per-donor prediction scores as a new sheet of the shared workbook.
# The sheet name is kept unchanged so existing readers of the file still work.
df_dt = pd.DataFrame(map_scores, index=donor_ids,  columns=['class 0', 'Class 1'])

df_dt.to_excel(writer, sheet_name='Linear Regression')
writer.save()



## 4.3 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Random forest, evaluated with the shared stratified splitter `cv`.
clf = RandomForestClassifier(max_depth=18, random_state=0)

# Out-of-fold class probabilities for every row of the resampled data.
scores = np.zeros((len(y_smote), len(np.unique(y_smote))))
for train, test in cv.split(X_smote, y_smote):
    clf.fit(X_smote[train], y_smote[train])
    scores[test] = clf.predict_proba(X_smote[test])

# Keep only the rows of real donors: row i of `map_scores` is the score of
# donor_ids[i]. Assumes the first len(donor_ids) rows of X_smote are the
# original samples in their original order -- TODO confirm for the sampler.
map_scores = scores[:len(donor_ids)]
y_predict = np.argmax(map_scores, axis=1)

# Threshold metrics use the predicted label; the ranking metrics (AUC, ROC)
# use the class-1 probability so the curve is not collapsed to a single point.
accuracy_rf = accuracy_score(y, y_predict)
confusion_mat_rf = confusion_matrix(y, y_predict)
f1_rf = f1_score(y, y_predict)
auc_rf = roc_auc_score(y, map_scores[:, 1])
fpr_rf, tpr_rf, _ = roc_curve(y, map_scores[:, 1])

print('Prediction accuray:', accuracy_rf)
print('confusion_mat:\n', confusion_mat_rf)
print('F1=%.3f Auc=%.3f' % (f1_rf, auc_rf))

# ROC curve of this model.
pyplot.plot(fpr_rf, tpr_rf, marker='.', label='Random Forest')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

# Save the per-donor prediction scores as a new sheet of the shared workbook.
df_dt = pd.DataFrame(map_scores, index=donor_ids,  columns=['class 0', 'Class 1'])

df_dt.to_excel(writer, sheet_name='Random Forest')
writer.save()


## 4.4 Feedforward Neural Networks (MLP)

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Single-hidden-layer feedforward network, evaluated with the shared
# stratified splitter `cv`.
clf = MLPClassifier(random_state=1, solver='adam', activation='relu', hidden_layer_sizes=(140,),
                    learning_rate_init=1e-2, learning_rate='adaptive', tol=1e-4, max_iter=2000)

# Out-of-fold class probabilities for every row of the resampled data.
scores = np.zeros((len(y_smote), len(np.unique(y_smote))))
for train, test in cv.split(X_smote, y_smote):
    clf.fit(X_smote[train], y_smote[train])
    scores[test] = clf.predict_proba(X_smote[test])

# Keep only the rows of real donors: row i of `map_scores` is the score of
# donor_ids[i]. Assumes the first len(donor_ids) rows of X_smote are the
# original samples in their original order -- TODO confirm for the sampler.
map_scores = scores[:len(donor_ids)]
y_predict = np.argmax(map_scores, axis=1)

# Threshold metrics use the predicted label; the ranking metrics (AUC, ROC)
# use the class-1 probability so the curve is not collapsed to a single point.
accuracy_mlp = accuracy_score(y, y_predict)
confusion_mat_mlp = confusion_matrix(y, y_predict)
f1_mlp = f1_score(y, y_predict)
auc_mlp = roc_auc_score(y, map_scores[:, 1])
fpr_mlp, tpr_mlp, _ = roc_curve(y, map_scores[:, 1])

print('Prediction accuray:', accuracy_mlp)
print('confusion_mat:\n', confusion_mat_mlp)
print('F1=%.3f Auc=%.3f' % (f1_mlp, auc_mlp))

# ROC curve of this model.
pyplot.plot(fpr_mlp, tpr_mlp, marker='.', label='Feedforward Neural Networks')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

# Save the per-donor prediction scores as a new sheet of the shared workbook.
df_dt = pd.DataFrame(map_scores, index=donor_ids,  columns=['class 0', 'Class 1'])

df_dt.to_excel(writer, sheet_name='Feedforward Neural Networks')
writer.save()



## 4.5 Support Vector Machine

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# RBF-kernel SVM behind a standard scaler, evaluated with the shared
# stratified splitter `cv`. probability=True is needed for predict_proba.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', kernel='rbf', C=2, probability=True))

# Out-of-fold class probabilities for every row of the resampled data.
scores = np.zeros((len(y_smote), len(np.unique(y_smote))))
for train, test in cv.split(X_smote, y_smote):
    clf.fit(X_smote[train], y_smote[train])
    scores[test] = clf.predict_proba(X_smote[test])

# Keep only the rows of real donors: row i of `map_scores` is the score of
# donor_ids[i]. Assumes the first len(donor_ids) rows of X_smote are the
# original samples in their original order -- TODO confirm for the sampler.
map_scores = scores[:len(donor_ids)]
y_predict = np.argmax(map_scores, axis=1)

# Threshold metrics use the predicted label; the ranking metrics (AUC, ROC)
# use the class-1 probability so the curve is not collapsed to a single point.
accuracy_svm = accuracy_score(y, y_predict)
confusion_mat_svm = confusion_matrix(y, y_predict)
f1_svm = f1_score(y, y_predict)
auc_svm = roc_auc_score(y, map_scores[:, 1])
fpr_svm, tpr_svm, _ = roc_curve(y, map_scores[:, 1])

print('Prediction accuray:', accuracy_svm)
print('confusion_mat:\n', confusion_mat_svm)
print('F1=%.3f Auc=%.3f' % (f1_svm, auc_svm))

# ROC curve of this model.
pyplot.plot(fpr_svm, tpr_svm, marker='.', label='Support Vector Machine')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

# Save the per-donor prediction scores as a new sheet of the shared workbook.
df_dt = pd.DataFrame(map_scores, index=donor_ids,  columns=['class 0', 'Class 1'])

df_dt.to_excel(writer, sheet_name='Support Vector Machine')
writer.save()





## 4.6 AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

# AdaBoost over random-forest base learners, evaluated with the shared
# stratified splitter `cv`. RandomForestClassifier is imported by the Random
# Forest cell.
clf_base = RandomForestClassifier(max_depth=18, random_state=0)

# The base-learner argument is called `estimator` in newer scikit-learn and
# `base_estimator` in older releases; try the new name first and fall back.
boost_params = dict(n_estimators=220, random_state=0, learning_rate=1e-3)
try:
    clf = AdaBoostClassifier(estimator=clf_base, **boost_params)
except TypeError:
    clf = AdaBoostClassifier(base_estimator=clf_base, **boost_params)

# Out-of-fold class probabilities for every row of the resampled data.
scores = np.zeros((len(y_smote), len(np.unique(y_smote))))
for train, test in cv.split(X_smote, y_smote):
    clf.fit(X_smote[train], y_smote[train])
    scores[test] = clf.predict_proba(X_smote[test])

# Keep only the rows of real donors: row i of `map_scores` is the score of
# donor_ids[i]. Assumes the first len(donor_ids) rows of X_smote are the
# original samples in their original order -- TODO confirm for the sampler.
map_scores = scores[:len(donor_ids)]
y_predict = np.argmax(map_scores, axis=1)

# Threshold metrics use the predicted label; the ranking metrics (AUC, ROC)
# use the class-1 probability so the curve is not collapsed to a single point.
accuracy_adaboost = accuracy_score(y, y_predict)
confusion_mat_adaboost = confusion_matrix(y, y_predict)
f1_adaboost = f1_score(y, y_predict)
auc_adaboost = roc_auc_score(y, map_scores[:, 1])
fpr_adaboost, tpr_adaboost, _ = roc_curve(y, map_scores[:, 1])

print('Prediction accuray:', accuracy_adaboost)
print('confusion_mat:\n', confusion_mat_adaboost)
print('F1=%.3f Auc=%.3f' % (f1_adaboost, auc_adaboost))

# ROC curve of this model.
pyplot.plot(fpr_adaboost, tpr_adaboost, marker='.', label='AdaBoost')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

# Save the per-donor prediction scores as a new sheet of the shared workbook.
df_dt = pd.DataFrame(map_scores, index=donor_ids,  columns=['class 0', 'Class 1'])

df_dt.to_excel(writer, sheet_name='AdaBoost')
writer.save()




## 4.7 GradientBoosting
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with depth-1 trees and exponential loss, evaluated with
# the shared stratified splitter `cv`.
clf = GradientBoostingClassifier(n_estimators=200, learning_rate=1e-3,
     max_depth=1, random_state=0, loss='exponential')

# Out-of-fold class probabilities for every row of the resampled data.
scores = np.zeros((len(y_smote), len(np.unique(y_smote))))
for train, test in cv.split(X_smote, y_smote):
    clf.fit(X_smote[train], y_smote[train])
    scores[test] = clf.predict_proba(X_smote[test])

# Keep only the rows of real donors: row i of `map_scores` is the score of
# donor_ids[i]. Assumes the first len(donor_ids) rows of X_smote are the
# original samples in their original order -- TODO confirm for the sampler.
map_scores = scores[:len(donor_ids)]
y_predict = np.argmax(map_scores, axis=1)

# Threshold metrics use the predicted label; the ranking metrics (AUC, ROC)
# use the class-1 probability so the curve is not collapsed to a single point.
accuracy_gradientboosting = accuracy_score(y, y_predict)
confusion_mat_gradientboosting = confusion_matrix(y, y_predict)
f1_gradientboosting = f1_score(y, y_predict)
auc_gradientboosting = roc_auc_score(y, map_scores[:, 1])
fpr_gradientboosting, tpr_gradientboosting, _ = roc_curve(y, map_scores[:, 1])

print('Prediction accuray:', accuracy_gradientboosting)
print('confusion_mat:\n', confusion_mat_gradientboosting)
print('F1=%.3f Auc=%.3f' % (f1_gradientboosting, auc_gradientboosting))

# ROC curve of this model.
pyplot.plot(fpr_gradientboosting, tpr_gradientboosting, marker='.', label='GradientBoosting')
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')
pyplot.legend()
pyplot.show()

# Save the per-donor prediction scores as a new sheet of the shared workbook.
df_dt = pd.DataFrame(map_scores, index=donor_ids,  columns=['class 0', 'Class 1'])

df_dt.to_excel(writer, sheet_name='GradientBoosting')
writer.save()



# 5. Plot ROC Curves 

In [None]:
# Overlay the ROC curves of all models in one figure. The second model is a
# logistic regression and is labelled as such.
pyplot.plot(fpr_dt, tpr_dt, marker='.', linestyle='--', label='Decision Tree')
pyplot.plot(fpr_lr, tpr_lr, marker='.', label='Logistic Regression')
pyplot.plot(fpr_rf, tpr_rf, marker='.', label='Random Forest')
pyplot.plot(fpr_svm, tpr_svm, marker='.', label='Support Vector Machine')
pyplot.plot(fpr_mlp, tpr_mlp, marker='.', label='Feedforward Neural Networks')
pyplot.plot(fpr_adaboost, tpr_adaboost, linestyle=':', marker='.', label='AdaBoost')
pyplot.plot(fpr_gradientboosting, tpr_gradientboosting, marker='.', label='GradientBoosting')

pyplot.xlabel('False Positive Rate', fontsize=12)
pyplot.ylabel('True Positive Rate', fontsize=12)

# Text summary of accuracy, F1 and AUC per model.
print('1. Decision Tree: Acc=%.3f, F1=%.3f, Auc=%.3f' % (accuracy_dt, f1_dt, auc_dt))
print('2. Logistic Regression: Acc=%.3f, F1=%.3f, Auc=%.3f' % (accuracy_lr, f1_lr, auc_lr))
print('3. Random Forest: Acc=%.3f, F1=%.3f, Auc=%.3f' % (accuracy_rf, f1_rf, auc_rf))
print('4. Support Vector Machine: Acc=%.3f, F1=%.3f, Auc=%.3f' % (accuracy_svm, f1_svm, auc_svm))
print('5. Feedforward Neural Networks: Acc=%.3f, F1=%.3f, Auc=%.3f' % (accuracy_mlp, f1_mlp, auc_mlp))
print('6. AdaBoost: Acc=%.3f, F1=%.3f, Auc=%.3f' % (accuracy_adaboost, f1_adaboost, auc_adaboost))
print('9. GradientBoosting: Acc=%.3f, F1=%.3f, Auc=%.3f' % (accuracy_gradientboosting, f1_gradientboosting, auc_gradientboosting))

# A single legend call; loc=0 lets matplotlib pick the best location.
pyplot.legend(loc=0, fontsize=11, handletextpad=0.2)
pyplot.xticks(fontsize=12)
pyplot.yticks(fontsize=12)

pyplot.show()

# 6. Plot Confusion Matrices

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="white")

# Tick labels for both axes. Kept in its own name: binding these strings to
# `y` would overwrite the label vector that the model cells evaluate against.
class_names = ['Class 0', 'Class 1']

# One heatmap per model, drawn in this order.
model_matrices = [
    ('Decision Tree', confusion_mat_dt),
    ('Logistic Regression', confusion_mat_lr),
    ('Random Forest', confusion_mat_rf),
    ('Support Vector Machine', confusion_mat_svm),
    ('Feedforward Neural Networks', confusion_mat_mlp),
    ('AdaBoost', confusion_mat_adaboost),
    ('GradientBoosting', confusion_mat_gradientboosting),
]

for model_name, matrix in model_matrices:
    plt.figure(figsize=(3, 3))
    sns.set(font_scale=1.2)
    # Fixed colour range (0..550) so the shading is comparable across models.
    ax = sns.heatmap(matrix, vmin=0, vmax=550, annot=True, square=True,
                     xticklabels=class_names, yticklabels=class_names,
                     cmap='RdPu', fmt="d", cbar=False)
    # Predicted classes along the top, actual classes down the left side.
    ax.xaxis.tick_top()
    ax.yaxis.tick_left()
    ax.set_xticklabels(class_names, fontsize=14)
    ax.set_yticklabels(class_names, fontsize=14)
    plt.title("Predicted", fontsize=14)
    plt.ylabel('Actual', fontsize=14)
    # The model name goes below the matrix.
    plt.xlabel(model_name, fontsize=14, labelpad=11)
    plt.yticks(rotation=0)
    plt.xticks(rotation=0)
    plt.show()



# 7. Plot Acc, F1 and AUC

In [None]:
# Accuracy, F1 and AUC of every model, in that order.
DT = [accuracy_dt, f1_dt, auc_dt]
LR = [accuracy_lr, f1_lr, auc_lr]
RF = [accuracy_rf, f1_rf, auc_rf]
SVM = [accuracy_svm, f1_svm, auc_svm]
FNN = [accuracy_mlp, f1_mlp, auc_mlp]
adaboost = [accuracy_adaboost, f1_adaboost, auc_adaboost]
gradboost = [accuracy_gradientboosting, f1_gradientboosting, auc_gradientboosting]

# (legend label, values, bar colour) per model, in drawing order.
series = [
    ('Decision Tree', DT, '#B8860B'),
    ('Logistic Regression', LR, '#FF7F00'),
    ('Random Forest', RF, 'green'),
    ('SVM', SVM, 'indigo'),
    ('FNN', FNN, '#1C86EE'),
    ('AdaBoost', adaboost, '#838B8B'),
    ('GradientBoosting', gradboost, '#DEB887'),
]

# Grouped bar chart: one group per metric, one bar per model. Only a single
# figure is created, so no empty extra figure is shown.
fig, ax = plt.subplots()
index = np.arange(3)
bar_width = 0.1
opacity = 0.8

for position, (model_label, values, bar_color) in enumerate(series):
    ax.bar(index + position * bar_width, values, bar_width,
           alpha=opacity, color=bar_color, label=model_label)

ax.set_xlabel('Comparisons on Different Metrics', fontsize=13)
ax.set_ylabel('Performance', fontsize=13)
# Centre each tick under its group of bars.
ax.set_xticks(index + (len(series) - 1) / 2 * bar_width)
ax.set_xticklabels(['Accuracy', 'F1', 'AUC'])
ax.tick_params(axis='both', labelsize=13)

ax.legend(fontsize=11, handletextpad=0.2, loc=1, bbox_to_anchor=(1, 0.6))
fig.tight_layout()
plt.show()