### Mini capstone project

In [None]:
import pandas as pd
import numpy as np
from google.colab import drive

In [None]:
# Column labels kept from the original cell; nothing in this cell reads them.
col_names = ['ID', 'Age Quantile', 'Covid Exam Result']

# Mount Google Drive so the workbook is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

# Relative path to the Excel workbook under the mounted drive
# (assumes the working directory is /content -- TODO confirm).
#h5FilePath="gdrive/My Drive/GL_Tensor_Flow_HandsOn_Collab/Mini_Capstone/HospitalAlbertEinstein_dataset.xlsx"
h5FilePath="gdrive/My Drive/Colab Notebooks/HospitalAlbertEinstein_dataset.xlsx"

# Hand the path straight to pandas: the earlier open(..., 'rb') handle was never
# closed, whereas read_excel opens and closes the file on its own.
diagnostic_df = pd.read_excel(h5FilePath)


In [None]:
diagnostic_df.info()

In [None]:
diagnostic_df.head()

In [None]:
diagnostic_df.shape

In [None]:
diagnostic_df.dtypes

In [None]:
diagnostic_df.isnull().sum()

***Deleting all the columns/features that are missing values for >= 95% of the patients***

In [None]:
# Keep only the columns that hold a value for more than 5% of the rows.
rows_count = diagnostic_df.shape[0]
non_null_per_column = diagnostic_df.notnull().sum(axis=0)
has_enough_values = non_null_per_column > (rows_count * 0.05)
diagnostic_df = diagnostic_df.loc[:, has_enough_values]
diagnostic_df.shape

***List of columns remaining after deleting the columns with 95% or more NaN values***

In [None]:
diagnostic_df.info()

In [None]:
diagnostic_df.isnull().sum()

In [None]:
diagnostic_df.shape

**Delete rows in which Hemoglobin is null or NaN**

In [None]:
diagnostic_df = diagnostic_df[diagnostic_df.Hemoglobin.notnull()]
diagnostic_df

In [None]:
diagnostic_df.isnull().sum()

In [None]:
# Correlate numeric (and boolean) columns only. The exam-result column is
# label-encoded only later in the notebook, so it is presumably still text here,
# and recent pandas versions raise instead of silently skipping such columns.
diagnostic_df.select_dtypes(include=['number', 'bool']).corr().T

In [None]:
diagnostic_df.describe().T

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize = (25,10))
# Restrict the correlation matrix to numeric (and boolean) columns: recent pandas
# versions raise on non-numeric columns instead of silently skipping them.
sns.heatmap(diagnostic_df.select_dtypes(include=['number', 'bool']).corr(), annot=True, linewidth=0.5)

In [None]:
sns.countplot(x ="SARS-Cov-2 exam result", data = diagnostic_df)

**There is disparity between count of Negative and Positive results**



1.   Strong positive correlation between **Hemoglobin and Hematocrit** (~90% correlation)
2.   Positive correlation between **Red Blood Cells and Hemoglobin/Hematocrit** (~80% correlation)
3.   Negative correlation between **Neutrophils and Lymphocytes** (~70% correlation)



**Hematocrit and Hemoglobin values are > 0 (positive) if diagnosed with Covid**

In [None]:
diagnostic_df.drop(['Hematocrit'], axis = 1, inplace=True)

In [None]:
diagnostic_df.groupby(['SARS-Cov-2 exam result','Platelets','Hemoglobin'])['SARS-Cov-2 exam result'].count()

In [None]:
diagnostic_df.isnull().sum()

**Based on the correlation table and heat map, below are the columns that need to be dropped**

In [None]:
# Columns removed before modelling. 'Strepto A' used to be listed twice; each
# label now appears exactly once.
columns_to_drop = [
    'Patient ID', 'Strepto A', 'Sodium', 'Potassium', 'Creatinine', 'Urea',
    'Influenza A, rapid test', 'Influenza B, rapid test', 'Parainfluenza 2', 'Metapneumovirus',
    'Proteina C reativa mg/dL', 'Bordetella pertussis', 'Inf A H1N1 2009', 'CoronavirusOC43',
    'CoronavirusNL63', 'Coronavirus229E', 'Parainfluenza 4', 'Adenovirus',
    'Influenza A', 'Influenza B', 'Parainfluenza 1', 'Rhinovirus/Enterovirus',
    'Coronavirus HKU1', 'Parainfluenza 3', 'Chlamydophila pneumoniae', 'Respiratory Syncytial Virus',
]
# One drop instead of three in-place mutations: either every listed column is
# removed or (on a missing label) the frame is left untouched.
diagnostic_df = diagnostic_df.drop(columns=columns_to_drop)

In [None]:
diagnostic_df.isnull().sum()

In [None]:
 diagnostic_df.dropna(axis=0, inplace=True)
 print(diagnostic_df.shape)

In [None]:
%matplotlib inline
sns.set(style="darkgrid")

In [None]:
# Mean Platelets per exam result (left) and number of patients per result (right).
# Each plot gets its own axes; before, the count plot was drawn onto the axes the
# pandas bar chart had just created, overlaying the two charts.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(14, 5))
diagnostic_df[['SARS-Cov-2 exam result','Platelets']].groupby(['SARS-Cov-2 exam result']).mean().plot.bar(ax=ax_mean)
# The column is passed as x=...: newer seaborn releases reject positional column names.
sns.countplot(x='SARS-Cov-2 exam result', hue='SARS-Cov-2 exam result', data=diagnostic_df, ax=ax_count)
plt.show()

In [None]:
# Mean Hemoglobin per exam result (left) and number of patients per result (right).
# Each plot gets its own axes; before, the count plot was drawn onto the axes the
# pandas bar chart had just created, overlaying the two charts.
fig, (ax_mean, ax_count) = plt.subplots(1, 2, figsize=(14, 5))
diagnostic_df[['SARS-Cov-2 exam result','Hemoglobin']].groupby(['SARS-Cov-2 exam result']).mean().plot.bar(ax=ax_mean)
# The column is passed as x=...: newer seaborn releases reject positional column names.
sns.countplot(x='SARS-Cov-2 exam result', hue='SARS-Cov-2 exam result', data=diagnostic_df, ax=ax_count)
plt.show()

In [None]:
# Four rows of three panels, one box plot per feature.
fig1, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
fig2, (ax4, ax5, ax6) = plt.subplots(1, 3, figsize=(20, 6))
fig3, (ax7, ax8, ax9) = plt.subplots(1, 3, figsize=(20, 6))
fig4, (ax10, ax11, ax12) = plt.subplots(1, 3, figsize=(20, 6))

# Axes paired with the column each one displays.
boxplot_panels = [
    (ax1, 'Patient age quantile'),
    (ax2, 'Hemoglobin'),
    (ax3, 'Platelets'),
    (ax4, 'Red blood Cells'),
    (ax5, 'Lymphocytes'),
    (ax6, 'Leukocytes'),
    (ax7, 'Basophils'),
    (ax8, 'Mean corpuscular hemoglobin (MCH)'),
    (ax9, 'Eosinophils'),
    (ax10, 'Mean corpuscular volume (MCV)'),
    (ax11, 'Monocytes'),
    (ax12, 'Red blood cell distribution width (RDW)'),
]

for panel_ax, column in boxplot_panels:
    # Pass the values as x=...: the meaning of the first positional argument
    # differs between seaborn releases, the keyword does not.
    sns.boxplot(x=diagnostic_df[column], ax=panel_ax)



In [None]:
diagnostic_df.boxplot(figsize=(20,20),grid=True, rot=60)

In [None]:
#sns.pairplot(diagnostic_df)

In [None]:
#sns.pairplot(diagnostic_df,hue = 'SARS-Cov-2 exam result',diag_kind = "kde",kind = "scatter",palette = "husl")

In [None]:
diagnostic_df.columns

Encoding the Categorical Data

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
diagnostic_df['SARS-Cov-2 exam result'] = label_encoder.fit_transform(diagnostic_df['SARS-Cov-2 exam result'])
diagnostic_df['SARS-Cov-2 exam result'].value_counts()

**Building the Models**

**Applying Logistic Regression**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report 

In [None]:
X = diagnostic_df.drop(['SARS-Cov-2 exam result'], axis = 1)
y = diagnostic_df['SARS-Cov-2 exam result']

In [None]:
array = diagnostic_df.values  
test_size = 0.30
seed = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)
##Under sampling and Over sampling

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import confusion_matrix, classification_report 

In [None]:
model_lg = LogisticRegression()
model_lg.fit(X_train, y_train)
y_predict_lg = model_lg.predict(X_test)
model_score_lg = model_lg.score(X_test, y_test)

In [None]:
print(model_score_lg)
print(confusion_matrix(y_test, y_predict_lg))

In [None]:
sns.heatmap(pd.DataFrame(confusion_matrix(y_test, y_predict_lg)), annot=True, cmap="YlGnBu" ,fmt='g')

In [None]:
print("Accuracy using Logistic Regression:",accuracy_score(y_test, y_predict_lg))

In [None]:
print("Precision using Logistic Regression:",precision_score(y_test, y_predict_lg))

In [None]:
print (classification_report(y_test, y_predict_lg))

In [None]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train == 1))) 
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train == 0))) 

***Using the SMOTE algorithm since the minority class of our target column is under-represented (class imbalance).***

In [None]:
from imblearn.over_sampling import SMOTE 
smote = SMOTE(random_state = 2)
# fit_resample is the sampler API used everywhere else in this notebook;
# fit_sample is not available in current imbalanced-learn releases.
# to_numpy() keeps the labels a plain array, as the former ravel() call did.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train.to_numpy())

In [None]:
print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape)) 
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape)) 
  
print("After OverSampling, counts of label '1': {}".format(sum(y_train_res == 1))) 
print("After OverSampling, counts of label '0': {}".format(sum(y_train_res == 0))) 

**Running Logistic Regression for the 2nd time after applying the SMOTE algorithm**

In [None]:
# Logistic regression trained on the SMOTE-oversampled training data.
lr_2 = LogisticRegression()
lr_2.fit(X_train_res, y_train_res.ravel())
y_predict_lg2 = lr_2.predict(X_test)
# Score the model fitted above; this previously re-scored model_lg (the model
# trained without oversampling), so the reported value never reflected lr_2.
model_score_lg2 = lr_2.score(X_test, y_test)

In [None]:
print(confusion_matrix(y_test, y_predict_lg2))
print(classification_report(y_test, y_predict_lg2))

Naive Bayes Model

In [None]:
from sklearn.naive_bayes import GaussianNB

model_gNB = GaussianNB()
model_gNB.fit(X_train, y_train)
y_predict_gNB = model_gNB.predict(X_test)

In [None]:
model_score_gNB = model_gNB.score(X_test, y_test)
print(model_score_gNB)

print(metrics.confusion_matrix(y_test, y_predict_gNB))

In [None]:
sns.heatmap(pd.DataFrame(metrics.confusion_matrix(y_test, y_predict_gNB)), annot=True, cmap="YlGnBu" ,fmt='g')

In [None]:
print("Accuracy using naive_bayes :",metrics.accuracy_score(y_test, y_predict_gNB))
print("Precision using naive_bayes:",metrics.precision_score(y_test, y_predict_gNB))

In [None]:
print (metrics.classification_report(y_test, y_predict_gNB))

Gradient Boosting model without upsampling

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1,2.56999479, 3.16075093, 4.35936352, 1.61271951]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=58, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=78)
    gb_clf.fit(X_train, y_train)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(gb_clf.score(X_test, y_test)))

## Gradient Boosting model with upsampling

In [None]:
# Learning rates to compare for the gradient boosting model fitted on oversampled data.
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1,2.56999479, 3.16075093, 4.35936352, 1.61271951]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=58, learning_rate=learning_rate, max_features=2, max_depth=2, random_state=78)
    gb_clf.fit(X_train_res, y_train_res)
    print("Learning rate: ", learning_rate)
    # Training accuracy is measured on the oversampled data the model was fitted
    # on; it used to be computed on the non-resampled X_train / y_train.
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_train_res, y_train_res)))
    print("Accuracy score (test): {0:.3f}".format(gb_clf.score(X_test, y_test)))

In [None]:
gb = GradientBoostingClassifier(n_estimators=58, learning_rate = 0.5, max_features=2, max_depth = 2, random_state = 0)
gb.fit(X_train_res, y_train_res)
predictions = gb.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))
print()
print("Classification Report")
print(classification_report(y_test, predictions))

Decision Tree Model


In [None]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline

In [None]:
def try_decisionTree(model, X, y, test_size=0.30, random_state=1):
  """Fit `model` on a train split of (X, y) and return its train/test accuracy.

  The data passed in is split inside this function, so a caller that hands over
  a resampled dataset is actually evaluated on it. Previously X and y were
  ignored and the notebook-level X_train / X_test were always used.

  The defaults match the notebook's main split (30% test, seed 1), so calls
  with the original X, y produce the same partition as before.

  NOTE: if X, y were resampled before being passed in, the held-out part also
  contains resampled rows; for an unbiased estimate resample the training part only.

  Args:
    model: estimator exposing fit(X, y) and score(X, y).
    X: feature table.
    y: target labels aligned with X.
    test_size: fraction of rows held out for scoring.
    random_state: seed for the train/test split.

  Returns:
    Tuple (train_accuracy, test_accuracy) as reported by model.score.
  """
  X_fit, X_holdout, y_fit, y_holdout = train_test_split(
      X, y, test_size=test_size, random_state=random_state)
  model.fit(X_fit, y_fit)
  return model.score(X_fit, y_fit), model.score(X_holdout, y_holdout)

In [None]:
model_dt = DecisionTreeClassifier(criterion = 'gini')
train_acc, test_acc = try_decisionTree(model_dt, X, y)
print('Train accuracy', train_acc)
print('Test accuracy', test_acc)

In [None]:
model_dt = DecisionTreeClassifier(criterion = 'entropy')
train_acc, test_acc = try_decisionTree(model_dt, X, y)
print('Train accuracy', train_acc)
print('Test accuracy', test_acc)

In [None]:
def evaluate_decision_tree(X, y):
  """Plot train and test accuracy of entropy decision trees for max_depth 2..8.

  Each tree uses min_samples_leaf=2 and is scored through try_decisionTree.

  Args:
    X: feature table forwarded to try_decisionTree.
    y: target labels forwarded to try_decisionTree.
  """
  depth_val = [2,3,4,5,6,7,8]
  dt_train_acc = []
  dt_test_acc = []
  for depth in depth_val:
    model_dt = DecisionTreeClassifier(criterion = 'entropy', max_depth=depth, min_samples_leaf=2)
    train_acc, test_acc = try_decisionTree(model_dt, X, y)
    dt_train_acc.append(train_acc)
    dt_test_acc.append(test_acc)
  # Red dashed = train, blue dashed = test; labelled so the figure is readable on its own.
  plt.plot(depth_val, dt_train_acc, 'r--', label='Train')
  plt.plot(depth_val, dt_test_acc, 'b--', label='Test')
  plt.xlabel('max_depth')
  plt.ylabel('Accuracy ')
  plt.legend()
  plt.show()

In [None]:
evaluate_decision_tree(X, y)

Applying SMOTE for oversampling

In [None]:
over_sampled = SMOTE()
X_smote, y_smote = over_sampled.fit_resample(X, y)
evaluate_decision_tree(X_smote, y_smote)

In [None]:
from imblearn.pipeline import Pipeline
over_sampled = SMOTE(sampling_strategy = 0.3)
under_sampled = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over_sampled), ('under', under_sampled)]

pipeline = Pipeline(steps=steps)
X_combined, y_combined = pipeline.fit_resample(X, y)
evaluate_decision_tree(X_combined, y_combined)

Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
def try_randomForest(model, X, y, test_size=0.30, random_state=1):
  """Fit `model` on a train split of (X, y) and return its train/test accuracy.

  The data passed in is split inside this function. Previously X and y were
  ignored and the notebook-level X_train / X_test were always used, so passing
  a resampled dataset had no effect.

  The defaults match the notebook's main split (30% test, seed 1), so calls
  with the original X, y produce the same partition as before.

  Args:
    model: estimator exposing fit(X, y) and score(X, y).
    X: feature table.
    y: target labels aligned with X.
    test_size: fraction of rows held out for scoring.
    random_state: seed for the train/test split.

  Returns:
    Tuple (train_accuracy, test_accuracy) as reported by model.score.
  """
  X_fit, X_holdout, y_fit, y_holdout = train_test_split(
      X, y, test_size=test_size, random_state=random_state)
  model.fit(X_fit, y_fit)
  return model.score(X_fit, y_fit), model.score(X_holdout, y_holdout)

In [None]:
def evaluate_random_forest(X, y, criteria):
  """Plot train and test accuracy of random forests with 2..8 estimators.

  Every forest uses max_depth=3 and min_samples_leaf=2 and is scored through
  try_decisionTree, which only relies on the estimator's fit/score methods.

  Args:
    X: feature table forwarded to the scoring helper.
    y: target labels forwarded to the scoring helper.
    criteria: split criterion handed to RandomForestClassifier.
  """
  estimators = [2,3,4,5,6,7,8]
  # One (train_accuracy, test_accuracy) pair per forest size, in order.
  accuracy_pairs = [
    try_decisionTree(
      RandomForestClassifier(n_estimators=n_trees, criterion=criteria, max_depth=3, min_samples_leaf=2),
      X, y)
    for n_trees in estimators
  ]
  rf_n_train_acc = [pair[0] for pair in accuracy_pairs]
  rf_n_test_acc = [pair[1] for pair in accuracy_pairs]
  plt.plot(estimators, rf_n_train_acc, 'r--', estimators, rf_n_test_acc, 'b--')
  plt.xlabel('Estimators')
  plt.ylabel('Accuracy ')
  plt.show()

In [None]:
evaluate_random_forest(X, y, 'gini')

In [None]:
evaluate_random_forest(X, y, 'entropy')

RandomForestClassifier is evaluated with the criteria gini and entropy. Of the two, entropy appears to give better accuracy than gini, so the further analysis uses criterion=entropy.

In [None]:
over_sampled = SMOTE()
X_smote, y_smote = over_sampled.fit_resample(X, y)
evaluate_random_forest(X_smote, y_smote, 'entropy')

In [None]:
over_sampled = SMOTE(sampling_strategy = 0.3)
under_sampled = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over_sampled), ('under', under_sampled)]

pipeline = Pipeline(steps=steps)
X_combined, y_combined = pipeline.fit_resample(X, y)
evaluate_random_forest(X_combined, y_combined, 'entropy')

In total, 3 types of datasets are used to evaluate RandomForestClassifier.
1.   Original dataset
2.   Dataset prepared using SMOTE (minority oversampling)
3.   Dataset prepared using SMOTE and RandomUnderSampler (minority oversampling and majority undersampling)

Out of all, the 2nd option (dataset prepared using SMOTE) has given better train and test accuracy with hyperparameters ***n_estimators=6, max_depth=3 and min_samples_leaf=2***

Final model building with above mentioned hyper parameters and dataset preparation technique

In [None]:
rf_model = RandomForestClassifier(n_estimators=6, criterion='entropy', max_depth=3, min_samples_leaf=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)
rf_model.fit(X_train, y_train)
y_train_predict = rf_model.predict(X_train)
y_test_predict = rf_model.predict(X_test)

In [None]:
print('Train accuracy - ', rf_model.score(X_train, y_train))
print(metrics.classification_report(y_train, y_train_predict))
print(metrics.confusion_matrix(y_train, y_train_predict))


In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
ns_probs = [0 for _ in range(len(y_test))]
lr_probs = rf_model.predict_proba(X_test)
lr_probs = lr_probs[:, 1]
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
print("AUROC" ,ns_auc )
print("AUROC" ,lr_auc )
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='RandomForest Model')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Random Forest: ROC AUC=%.3f' % (lr_auc))

In [None]:
print('\n\nTest accuracy - ', rf_model.score(X_test, y_test))
print(metrics.classification_report(y_test, y_test_predict))
print(metrics.confusion_matrix(y_test, y_test_predict))

In [None]:
rf_model = RandomForestClassifier(n_estimators=6, criterion='entropy', max_depth=3, min_samples_leaf=2)

# Hold out the test set BEFORE oversampling. Resampling the full dataset first
# let synthetic points built from test rows leak into training and put synthetic
# rows into the test set, which inflates the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Oversample the minority class on the training part only.
over_sampled = SMOTE(k_neighbors=3, n_jobs=2)
X_smote, y_smote = over_sampled.fit_resample(X_train, y_train)
X_train, y_train = X_smote, y_smote

rf_model.fit(X_train, y_train)
y_train_predict = rf_model.predict(X_train)
y_test_predict = rf_model.predict(X_test)

In [None]:
print('Train accuracy - ', rf_model.score(X_train, y_train))
print(metrics.classification_report(y_train, y_train_predict))
print(metrics.confusion_matrix(y_train, y_train_predict))

In [None]:
print('\n\nTest accuracy - ', rf_model.score(X_test, y_test))
print(metrics.classification_report(y_test, y_test_predict))
print(metrics.confusion_matrix(y_test, y_test_predict))

In [None]:
rf_model = RandomForestClassifier(n_estimators=6, criterion='entropy', max_depth=3, min_samples_leaf=2)

# Hold out the test set BEFORE resampling. Resampling the full dataset first
# let synthetic points built from test rows leak into training and changed the
# class balance of the test set, which distorts the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# SMOTE oversampling followed by random undersampling, on the training part only.
over_sampled = SMOTE(sampling_strategy=0.3)
under_sampled = RandomUnderSampler(sampling_strategy=0.5)
steps = [('over', over_sampled), ('under', under_sampled)]

pipeline = Pipeline(steps=steps)
X_combined, y_combined = pipeline.fit_resample(X_train, y_train)
X_train, y_train = X_combined, y_combined

rf_model.fit(X_train, y_train)
y_train_predict = rf_model.predict(X_train)
y_test_predict = rf_model.predict(X_test)
print('Train accuracy - ', rf_model.score(X_train, y_train))
print(metrics.classification_report(y_train, y_train_predict))
print(metrics.confusion_matrix(y_train, y_train_predict))
print('\n\nTest accuracy - ', rf_model.score(X_test, y_test))
print(metrics.classification_report(y_test, y_test_predict))
print(metrics.confusion_matrix(y_test, y_test_predict))

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
ns_probs = [0 for _ in range(len(y_test))]
lr_probs = rf_model.predict_proba(X_test)
lr_probs = lr_probs[:, 1]
ns_auc = roc_auc_score(y_test, ns_probs)
lr_auc = roc_auc_score(y_test, lr_probs)
print("AUROC" ,ns_auc )
print("AUROC" ,lr_auc )
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)
# plot the roc curve for the model
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='RandomForest Model')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
print('Random Forest: ROC AUC=%.3f' % (lr_auc))

ADA Boosting

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std

In [None]:
classifier = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=100),
    n_estimators=2000
)
classifier.fit(X_train, y_train)

In [None]:
predictionsada = classifier.predict(X_test)

In [None]:
confusion_matrix(y_test, predictionsada)

In [None]:
print(classification_report(y_test, predictionsada))

In [None]:
print("Accuracy score (training): {0:.3f}".format(classifier.score(X_train, y_train)))
print("Accuracy score (validation): {0:.3f}".format(classifier.score(X_test, y_test)))

In [None]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
n_scores = cross_val_score(classifier, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print(n_scores)

In [None]:
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
# get a list of models to evaluate
def get_models():
	"""Build the AdaBoost classifiers to compare.

	Returns:
		dict mapping the number of estimators (as a string) to an
		AdaBoostClassifier configured with that many estimators.
	"""
	# ensemble sizes to consider
	tree_counts = (10, 50, 100, 500, 1000, 5000)
	return {str(count): AdaBoostClassifier(n_estimators=count) for count in tree_counts}

In [None]:
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
	"""Cross-validate `model` on (X, y) and return the per-fold accuracy scores.

	Uses repeated stratified k-fold (10 splits, 3 repeats, seed 1) and runs the
	folds with n_jobs=-1.
	"""
	folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	return cross_val_score(model, X, y, scoring='accuracy', cv=folds, n_jobs=-1)

In [None]:
from matplotlib import pyplot
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
	# evaluate the model
	scores = evaluate_model(model, X, y)
	# store the results
	results.append(scores)
	names.append(name)
	# summarize the performance along the way
	print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))
# plot model performance for comparison
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()