<a href="https://colab.research.google.com/github/araldi/HS22_Big-Data-analysis-in-Biomedical-Research/blob/main/Week_09/15_Classification_part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predict breast cancer malignancy with supervised learning classification methods

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the raw CSV file (served from raw.githubusercontent.com)
dataset_url = 'https://raw.githubusercontent.com/araldi/HS21---Big-Data-Analysis-in-Biomedical-Research-376-1723-00L-/main/Week6/Logistic_regression_data.csv'

# Read the CSV straight from the URL into a DataFrame
breast = pd.read_csv(dataset_url)

In [None]:
# Summarize the columns, dtypes and non-null counts of the loaded table
breast.info()


## Exploratory data analysis

In [None]:
# Candidate feature columns: everything that is neither an int64 nor an object column.
feature_columns = breast.select_dtypes(exclude=['int64', 'object']).columns
# Leave the 'Unnamed: 32' column out of the feature list.
kept_columns = [col for col in feature_columns if 'Unnamed: 32' not in col]
# Cleaned feature names: spaces replaced with underscores.
features = [col.replace(' ', '_') for col in kept_columns]

# Pair each kept original name with its own cleaned name. Zipping against the
# unfiltered `feature_columns` would shift every name after 'Unnamed: 32' onto
# the wrong column whenever that column is not the last one.
breast.rename(columns=dict(zip(kept_columns, features)), inplace=True)

In [None]:
# in-depth exploration of the features:
# one panel per feature, overlaying the malignant (M) and benign (B) distributions
plt.figure(figsize=(20, 20))
m = breast['diagnosis'] == 'M'
b = breast['diagnosis'] == 'B'
groups = (('M', breast[m]), ('B', breast[b]))
for panel, feature in enumerate(features, start=1):
  plt.subplot(6, 5, panel)
  for group_label, group_rows in groups:
    plt.hist(x=feature, data=group_rows, label=group_label, bins=50, alpha=0.5)
  plt.xlabel(feature)
  plt.legend()
plt.show()
plt.close()

In [None]:
# test the correlation among features
feature_correlations = breast[features].corr()

plt.figure(figsize=(20, 20))
sns.heatmap(
    feature_correlations,
    annot=True,
    cmap='RdBu',
    vmin=-1,
    vmax=1,
)
plt.title('Correlation Matrix');

# !!!!!
# radius, perimeter and area are highly correlated --> careful with forest models, where correlation is highly problematic!

## Exercise

* Prepare the dataset for machine learning (remove features that might be problematic, convert features to binary/continuous, split into train/test sets with 25% test)
* Classify malignant or benign breast tumors with Random Forest and Gradient Boosted trees.

* Test different parameters (for Random Forest: criterion, depth, estimators, etc - for Gradient Boosted: loss,  depth, estimators, etc) to determine the best model.

* Determine feature importance
* Predict "malignant" or "benign" in the 25% split test set with the best fitting model.
* Create a confusion matrix for the predictions.





#### Prepare the dataset for CART

*   It is a good idea for CART to remove highly correlated features. Let's start with removing all perimeters and areas (which highly correlate with radius).
*   Split train-test datasets



In [None]:
# remove highly correlated features

In [None]:
# split train-test datasets

from sklearn.model_selection import train_test_split


#### Train Random forest models

Create different models to test different criteria.

In [None]:
from sklearn.ensemble import RandomForestClassifier
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

#model 1


In [None]:
#model 2


In [None]:
#model ...

#### Determine feature importance in the Random forest models

In [None]:
# Compare feature importance in the models above
 # use model.feature_importances_ to determine the importance of each feature
 # also create a plot with the variance of the feature importance in each of the estimators

# Model 1
# ....


#### Predict malignant and benign tumors with the trained models above in the test dataset

In [None]:
from sklearn import metrics

# use model.predict()

In [None]:
def getresults(test, pred_variable, pos_label="M"):
  """Compute precision, recall, accuracy and F1 for a set of binary predictions.

  Parameters
  ----------
  test : array-like
      True class labels.
  pred_variable : array-like
      Predicted class labels, in the same order as ``test``.
  pos_label : str, default "M"
      Label treated as the positive class for precision, recall and F1.
      Accuracy does not depend on it.

  Returns
  -------
  tuple
      Flat tuple alternating a text label and its score:
      ``('Precision:', p, 'Recall: ', r, 'Accuracy:', a, 'F1:', f1)``.
  """
  # the three class-specific scores share the same positive-class settings
  binary_kwargs = dict(pos_label=pos_label, average="binary")
  precision = metrics.precision_score(test, pred_variable, **binary_kwargs)
  recall = metrics.recall_score(test, pred_variable, **binary_kwargs)
  accuracy = metrics.accuracy_score(test, pred_variable)
  f1 = metrics.f1_score(test, pred_variable, **binary_kwargs)
  return 'Precision:', precision, 'Recall: ', recall, "Accuracy:", accuracy, "F1:", f1

In [None]:
# model 1
# TODO: fill in the arguments - getresults(test, pred_variable) needs the true
# test labels and the model 1 predictions; calling it empty raises a TypeError.
getresults()

In [None]:
# model 2
# TODO: fill in the arguments - getresults(test, pred_variable) needs the true
# test labels and the model 2 predictions; calling it empty raises a TypeError.
getresults()

In [None]:
# Draw confusion matrix for each model

from sklearn.metrics import ConfusionMatrixDisplay

# Model 1
# TODO: fill in the arguments before running; as written the call has none.
# The Model 2 cell below passes the estimator, the test feature values and the
# true test labels, in that order.
ConfusionMatrixDisplay.from_estimator()
plt.title("confusion matrix model 1")
plt.show()

In [None]:
# Model 2

# confusion matrix of rf2 on the held-out test rows
ConfusionMatrixDisplay.from_estimator(
    estimator=rf2,
    X=test_df[ml_features].values,
    y=test_df['diagnosis'],
    cmap='inferno',
    values_format='g',
)
plt.title("confusion matrix model 2")
plt.show()

####  Train Gradient boosted tree models



In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html



#### Determine feature importance in the Gradient boosted models

In [None]:
#there is no standard deviation in gradient boosted tree!

# use .feature_importances_ as above


#### Predict malignant and benign tumors with the GBT trained models above in the test dataset

In [None]:
# use .predict()

In [None]:
# evaluate the predictions
# TODO: fill in the arguments - getresults(test, pred_variable) needs the true
# test labels and the GBT predictions; calling it empty raises a TypeError.
getresults()

In [None]:
# Draw a confusion matrix


# GridSearchCV: an efficient way to tune hyperparameters

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# base classifier; the seed is fixed and everything else is left to the grid search
rf = RandomForestClassifier(random_state=2022)

# hyperparameter values to be tested (every combination is tried)
parameters_rf = {
    'min_samples_split': [2, 3, 4],
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5, 6, 8, None],
}

# GridSearchCV trains a random forest for each combination above and keeps the
# hyperparameters that give the best cross-validated score for the chosen scoring method
gridrf = GridSearchCV(
    estimator=rf,
    param_grid=parameters_rf,
    cv=10,
    scoring='accuracy',
)
gridrf.fit(train_df[ml_features], train_df['diagnosis']);

In [None]:
# Evaluate which hyperparameter gives the best predictions according to the scoring method chosen
def examinebestmodel(model_name):
    """Print the best score, best parameters and best estimator of a search object.

    ``model_name`` must expose ``best_score_``, ``best_params_`` and
    ``best_estimator_`` attributes; each is printed on its own line, in that order.
    """
    for attribute in ('best_score_', 'best_params_', 'best_estimator_'):
        print(getattr(model_name, attribute))

In [None]:
# Print the best score, parameters and estimator found by the random forest search
examinebestmodel(gridrf)

In [None]:
# predict the classification on the test dataset with the best estimator obtained from GridSearchCV
test_prediction_rf = gridrf.best_estimator_.predict(X=test_df[ml_features])

# compare the predictions with the true diagnoses
getresults(test=test_df['diagnosis'], pred_variable=test_prediction_rf)

In [None]:
#confusion matrix for the best model
ConfusionMatrixDisplay.from_estimator(
    estimator=gridrf.best_estimator_,
    X=test_df[ml_features],
    y=test_df['diagnosis'],
    cmap='inferno',
    values_format='g',
)
plt.show()

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# fixed seed so the search is reproducible, like the random forest search above
gbt = GradientBoostingClassifier(random_state=2022)

# hyperparameters to be tested
# 'log_loss' is the current name of the former 'deviance' loss (renamed in
# scikit-learn 1.1, old name removed in 1.3); with error_score='raise' the old
# name aborts the whole search on current versions.
loss = ['exponential', 'log_loss']
max_depth = [3, 4, 5, 6, 8, None]
criterion = ['squared_error', 'friedman_mse']

parameters_gbt = dict(loss=loss, criterion=criterion, max_depth=max_depth)

#training GBT model, finding best params
gridgbt = GridSearchCV(gbt, parameters_gbt, cv=10, scoring='accuracy', error_score='raise')
gridgbt.fit(train_df[ml_features], train_df['diagnosis']);

In [None]:
# Print the best score, parameters and estimator found by the gradient boosting search
examinebestmodel(gridgbt)

In [None]:
# predict on the test set with the best gradient boosting estimator from the search
test_prediction_gbt = gridgbt.best_estimator_.predict(X=test_df[ml_features])


In [None]:
# compare the gradient boosting predictions with the true diagnoses
getresults(test=test_df['diagnosis'], pred_variable=test_prediction_gbt)

In [None]:
#confusion matrix for gbt
ConfusionMatrixDisplay.from_estimator(
    estimator=gridgbt.best_estimator_,
    X=test_df[ml_features],
    y=test_df['diagnosis'],
    cmap='inferno',
    values_format='g',
)
plt.show()

# K-NN classifier

In this example, we will use GridSearchCV to find the best hyperparameters for the model

In [None]:
# show the feature columns that go into the model
breast.loc[:, features]

Divide train/test and scale (this time, scaling is important!)

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(breast, test_size=0.25, random_state=2022)
# work on independent copies so the column assignments below do not write into
# views of `breast` (avoids SettingWithCopyWarning)
train_df = train_df.copy()
test_df = test_df.copy()

# Fit the scaler on the training set only, then apply the SAME mean/std to the
# test set. Re-fitting on the test set would scale it with its own statistics,
# which leaks test information and puts train and test on different scales.
scaler = preprocessing.StandardScaler()
train_df[features] = scaler.fit_transform(train_df[features])
test_df[features] = scaler.transform(test_df[features])


#### Use GridSearchCV to find optimal hyperparameters for the model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html

# base classifier; its hyperparameters are set by the grid search
knn = KNeighborsClassifier()

# candidate values for each hyperparameter (every combination is tried)
parameters_knn = {
    'n_neighbors': [3, 5, 9, 13],
    'metric': ['manhattan', 'euclidean', 'chebyshev'],
    'algorithm': ['ball_tree', 'brute', 'auto', 'kd_tree'],
}

#training KNN model, finding best params
gridknn = GridSearchCV(
    estimator=knn,
    param_grid=parameters_knn,
    cv=10,
    verbose=1,
    scoring='accuracy',
)
gridknn.fit(train_df[features], train_df['diagnosis'])

In [None]:
# Print the best score, parameters and estimator found by the K-NN search
examinebestmodel(gridknn)

#### Predict malignant/benign on test set with the optimal hyperparameters

In [None]:
#predicting on test set with the best K-NN estimator, then scoring the predictions
test_prediction_knn = gridknn.best_estimator_.predict(X=test_df[features])
getresults(test=test_df['diagnosis'], pred_variable=test_prediction_knn)

In [None]:

#confusion matrix for KNN
ConfusionMatrixDisplay.from_estimator(
    estimator=gridknn.best_estimator_,
    X=test_df[features],
    y=test_df['diagnosis'],
    cmap='inferno',
    values_format='g',
)
plt.show()

# SVM

In [None]:
from sklearn.svm import SVC

# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

svm_model = SVC()

# C is the regularization parameter:
#   a large C finds a hyperplane with smaller margins, i.e. one that follows the
#   training data more closely (better fit)
#
# gamma is how far the influence of a single training example reaches:
#   high gamma: only points close to the plausible separation line are considered
#   low gamma: points far away from the plausible separation line are considered too
parameters_SVC = {
    'C': [1, 10, 100],
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'gamma': [1, 0.01, 0.0001],
}

# 10-fold cross-validated search over every combination, scored by accuracy
model_svm = GridSearchCV(
    estimator=svm_model,
    param_grid=parameters_SVC,
    cv=10,
    scoring='accuracy',
)
model_svm.fit(train_df[features], train_df['diagnosis'])



In [None]:
# Print the best score, parameters and estimator found by the SVM search
examinebestmodel(model_svm)

In [None]:
#predicting on test set with the best SVM estimator, then scoring the predictions
test_prediction_svm = model_svm.best_estimator_.predict(X=test_df[features])
getresults(test=test_df['diagnosis'], pred_variable=test_prediction_svm)

In [None]:
#confusion matrix for svm
ConfusionMatrixDisplay.from_estimator(
    estimator=model_svm.best_estimator_,
    X=test_df[features],
    y=test_df['diagnosis'],
    cmap='inferno',
    values_format='g',
)
plt.show()