# <font color='green'>1. Import needed packages</font>

Additional needed packages:
- imblearn
- tpot

In [45]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import seaborn as sns

# <font color='green'>2. Importing and understanding the data</font>

In [None]:
# Label table: joined below on its 'FULL_ID' column.
df_labels = pd.read_csv(
    'all-ClinicalLabels-OLD_case_sensitive.csv',
    low_memory=False,
)

# Input table: its key column is called 'Observations', so it is renamed to
# 'FULL_ID' right after loading to line up with the label table.
df_inputs = pd.read_csv(
    'dataset_Elderberry_plus_120nm_64_onh_vessel_removed.csv',
    low_memory=False,
).rename(columns={'Observations': 'FULL_ID'})

# Inner join: only FULL_ID values present in both tables survive.
merged_df = df_labels.merge(df_inputs, how='inner', on='FULL_ID')
display(merged_df)

# Number of distinct patients left after the join.
# (To debug the join, compare df_labels['FULL_ID'].nunique() with
#  df_inputs['FULL_ID'].nunique().)
print(merged_df['PATIENTID'].nunique())

In [None]:
# Horizontal bar chart of the PET_RESULT class counts, in ascending order.
(
    merged_df['PET_RESULT']
    .value_counts(ascending=True)
    .plot.barh(color='green')
);
# Persist the figure with a transparent background and a tight bounding box.
plt.savefig('plot0.png', transparent=True, bbox_inches='tight', dpi=300)

In [None]:
# Class counts of the PET_RESULT label, least frequent class first.
merged_df['PET_RESULT'].value_counts(ascending=True)

# <font color='green'>3. First manual classifier and study of the results</font>

In [49]:
from sklearn import metrics, preprocessing, tree
from sklearn.metrics import make_scorer, SCORERS
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, fbeta_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold 

In [50]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Separate the PET_RESULT label from the remaining columns of merged_df;
# PATIENTID is kept aside as the grouping key.
y = merged_df['PET_RESULT']
X = merged_df.drop(columns=['PET_RESULT'])
groups = merged_df['PATIENTID']

# Disabled alternative: undersample instead of oversample.
# rus = RandomUnderSampler(random_state=42)
# X, y = rus.fit_resample(X, y, groups=groups)

# Balance the classes with random oversampling (seeded for repeatability).
# NOTE(review): resampling happens before the train/test split, so the held-out
# rows are resampled as well — confirm this is intended.
ros = RandomOverSampler(random_state=42)
X, y = ros.fit_resample(X, y)

In [51]:
# #Balancing data using the SMOTE oversampler to get better and less biased results
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler
# from imblearn.under_sampling import NearMiss

# variables=merged_df.drop(['FULL_ID','PATIENTID','IOL','AGE','GENDER','EYE',
#                          'COGNITION','HIS_SCORE','PET_RESULT',
#                          'UNANIMOUS','APOE_GENOTYPE'], axis=1)
# X= variables
# y= merged_df.loc[:,'PET_RESULT']

# #Balancing the data
# #sampling = RandomUnderSampler(sampling_strategy='auto')
# sampling = SMOTE(sampling_strategy='auto')
# X, y = sampling.fit_resample(X,y)

In [None]:
# Horizontal bar chart of the label counts in y (after resampling), ascending.
(
    y.value_counts(ascending=True)
    .plot.barh(color='purple')
);
# Persist the figure with a transparent background and a tight bounding box.
plt.savefig('plot1.png', transparent=True, bbox_inches='tight', dpi=300)

In [53]:
# kf=StratifiedKFold(n_splits=5)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5) 
# # 80% training and 20% test

In [54]:
from sklearn.model_selection import GroupKFold

# Grouping key for the split: the patient identifier of every row in X.
groups = X['PATIENTID']

# 5-fold splitter that never places one group on both sides of a split.
gkf = GroupKFold(n_splits=5)

# Generate every fold and keep the LAST one as the single train/test split.
# NOTE(review): the other four folds are produced but never used for training
# or evaluation — confirm a single hold-out split is what is wanted here.
train_idx, test_idx = list(gkf.split(X, y, groups))[-1]
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
    # train and evaluate model

In [None]:
# Number of distinct patients on each side of the split (read before the
# identifier column is removed below).
print(X_train['PATIENTID'].nunique())
print(X_test['PATIENTID'].nunique())

# Identifier and clinical-metadata columns that must not be used as features.
# - 'PET_RESULT' is deliberately absent: it was already removed when X was
#   built, and asking drop() for a missing label raises KeyError.
# - 'PATIENTID' is included: it is an identifier, not a measurement, and the
#   earlier (commented-out) feature selection removed it as well.
non_feature_columns = ['FULL_ID', 'PATIENTID', 'IOL', 'AGE', 'GENDER', 'EYE',
                       'COGNITION', 'HIS_SCORE',
                       'UNANIMOUS', 'APOE_GENOTYPE']

# Apply the same column selection to both splits so the test matrix always has
# the layout the model is trained on.
X_train = X_train.drop(columns=non_feature_columns)
X_test = X_test.drop(columns=non_feature_columns)

In [None]:
# Show the training split as it stands at this point in the notebook.
display(X_train)

In [None]:
from sklearn.decomposition import PCA

# Give the test split exactly the columns (and column order) of the training
# split before any projection, so both matrices describe the same features.
X_test = X_test[X_train.columns]

# Reduce to 18 principal components. random_state seeds the ARPACK solver so
# the projection is reproducible across runs.
pca = PCA(n_components=18, svd_solver='arpack', random_state=42)

# Fit the projection on the training split only, then apply that same fitted
# projection to the test split. Without the second step the model would be
# scored on a matrix with a different shape/meaning than it was trained on.
X_train=pca.fit_transform(X_train)
X_test=pca.transform(X_test)
display(X_train)

# <font color='green'>5. Automated Classifier and Hyperparameter tuning with TPOT-AutoML</font>

In [None]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GroupKFold
from tpot import TPOTClassifier

# Inner cross-validation used by TPOT while it searches for pipelines.
# The previous `cv = kf` referred to a name that is only defined in
# commented-out cells (NameError on a fresh kernel). A grouped splitter is used
# instead so that all rows of one patient - including the exact duplicates
# created by oversampling - stay inside a single inner fold.
inner_cv = GroupKFold(n_splits=5)
# Patient id of every training row, positionally aligned with X_train/y_train.
train_groups = groups.iloc[train_idx]

# Define TPOT classifier
tpot_model = TPOTClassifier(generations=20, population_size=10,
                            periodic_checkpoint_folder="tpot_mnst1.txt", verbosity=2,
                            random_state=42, cv=inner_cv, scoring='accuracy', n_jobs=-1, config_dict=None)

# Fit/start training (groups are required by the grouped inner CV)
tpot_model.fit(X_train, y_train, groups=train_groups)
print('Done training/fitting TPOT session.')

# Get TPOT's score on test set (default metric is 'accuracy'; define something else in TPOT classifier if needed)
print('TPOTs score on test set is...')
print(tpot_model.score(X_test, y_test))

# Export the best pipeline
tpot_model.export('tpot_best_pipeline.py')

# Collect every evaluated pipeline as one record: its string representation,
# its internal CV score (as a sortable column) and the full info dict.
my_dict = list(tpot_model.evaluated_individuals_.items())
records = [
    {'model': model_name,
     'cv_score': model_info.get('internal_cv_score'),
     'model_info': model_info}
    for model_name, model_info in my_dict
]
# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# growing a frame row by row is quadratic. Explicit columns keep the sort below
# valid even if no pipeline was evaluated.
model_scores = pd.DataFrame(records, columns=['model', 'cv_score', 'model_info'])

# Sort by best CV score to worst (top to bottom)
model_scores = model_scores.sort_values('cv_score', ascending=False)
print('Model Scores dataframe is...')
print(model_scores)

# Keep one pipeline per distinct CV score. keep='first' retains one
# representative of each tie; the previous keep=False discarded every tied
# pipeline, including the best one when it was tied.
model_scores = model_scores.drop_duplicates(subset="cv_score", keep='first')

In [48]:
# from string import ascii_letters
# import numpy as np
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# sns.set_theme(style="white")

# # Compute the correlation matrix
# corr = df.drop("legitimate", axis='columns').corr()

# # Generate a mask for the upper triangle
# mask = np.triu(np.ones_like(corr, dtype=bool))

# # Set up the matplotlib figure
# f, ax = plt.subplots(figsize=(20, 15))

# # Generate a custom diverging colormap
# cmap = sns.diverging_palette(230, 20, as_cmap=True)

# # Draw the heatmap with the mask and correct aspect ratio
# fig=sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5})

# plt.savefig('plot4.png', dpi=300, bbox_inches='tight',transparent =True)

In [None]:
# # Create Random Forest classifier object
# clf = RandomForestClassifier(n_jobs=-1)

# # Train Random Forest classifier
# clf = clf.fit(X_train,y_train)

# print('Score: '+str(clf.score(X_test, y_test)))

# #Predict the response for test dataset
# y_pred = clf.predict_proba(X_test)
# y_pred_proba = clf.predict_proba(X_test)[::,1]
# y_predX = clf.predict_proba(X)[::,1]
# y_predent = clf.predict_proba(X_train)[::,1]

# # print(y_pred) 

# fig, ax = plt.subplots(dpi=300)

# disp = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test,
#                                  cmap=plt.cm.Blues, xticks_rotation='vertical', normalize=None, ax=ax)
# disp.ax_.set_title("Confusion matrix, without normalization")

# plt.show()

# #plt.savefig('plot5.png', dpi=300, bbox_inches='tight',transparent =True)

# <font color='green'>4. Manual Classifier and manual Hyperparameter tuning (RandomizedSearchCV)</font>

In [51]:
# kf=StratifiedKFold(n_splits=5)

# models = [
#     DecisionTreeClassifier(),
#     RandomForestClassifier(n_jobs=-1),
#     SVC()
# ]

# CV = 5
# cv_df = pd.DataFrame(index=range(CV * len(models)))
# entries = []
# for model in models:
#   model_name = model.__class__.__name__
#   accuracies = cross_val_score(model, X, y, scoring='accuracy', cv=kf)
#   for fold_idx, accuracy in enumerate(accuracies):
#     entries.append((model_name, fold_idx, accuracy))
# cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
# cv_df

# mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
# std_accuracy = cv_df.groupby('model_name').accuracy.std()

# acc = pd.concat([mean_accuracy, std_accuracy], axis= 1, 
#           ignore_index=True)
# acc.columns = ['Mean Accuracy', 'Standard deviation']
# acc['Mean Accuracy'] = acc['Mean Accuracy'].astype(float).map(lambda n: '{:.2%}'.format(n))
# acc['Standard deviation'] = acc['Standard deviation'].astype(float).map(lambda n: '{:.2%}'.format(n))
# acc

In [None]:
# #Hyperparameters tuning with RandomizedSearchCV

# n_estimators = [100,200,500,1000]
# max_depth = [3, 4, 5, 6]
# min_weight_fraction_leaf=list(np.arange(0.01, 0.12, 0.005))
# criterion=['gini','entropy'] 
# max_features=['sqrt','log2', None]
# bootstrap = [True,False]

# hyperF = dict(max_depth = max_depth, 
#               n_estimators = n_estimators,   
#               min_weight_fraction_leaf = min_weight_fraction_leaf,
#               max_features = max_features,
#               criterion = criterion, 
#               bootstrap = bootstrap)

# est = RandomForestClassifier() 

# grid_obj=RandomizedSearchCV(est, hyperF, n_iter=5, scoring='accuracy',random_state=3,
#                             n_jobs=-1, cv=kf, verbose = 2, return_train_score=True)

# grid_fit=grid_obj.fit(X_train,y_train) 

# df_cv=pd.DataFrame.from_dict(grid_fit.cv_results_)
# df_cv=df_cv.sort_values(by=['mean_test_score'], ascending=False)

# print('Score CV Test: '+str(grid_fit.best_score_),
#       '\n','Score CV Training : '+str(df_cv['mean_train_score'][0]),
#       '\n',grid_fit.best_params_)

# est = RandomForestClassifier(**grid_fit.best_params_, random_state = 5, verbose = 0, n_jobs=-1)

In [None]:
# df_cv is only created by the hyperparameter-search cells, which are currently
# commented out. Guard the preview so Restart & Run All does not stop here with
# a NameError while those cells are disabled.
if 'df_cv' in globals():
    display(df_cv.head())
else:
    print('df_cv is not defined: enable and run a RandomizedSearchCV cell first.')

In [None]:
# from sklearn.model_selection import ParameterGrid
# from sklearn.linear_model import SGDClassifier
# from sklearn.metrics import roc_auc_score
# import parfit.parfit as pf

# grid = {
#     'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5], # regularization strength (not the learning rate)
#     'loss': ['log_loss'], # logistic regression
#     'max_iter':[1000,2000,5000,10000],
#     'penalty': ['l2']}

# est = SGDClassifier() 

# grid_obj=RandomizedSearchCV(est, grid, n_iter=50, scoring='accuracy',random_state=3,
#                             n_jobs=-1, cv=kf, verbose = 2, return_train_score=True)

# grid_fit=grid_obj.fit(X_train,y_train) 

# df_cv=pd.DataFrame.from_dict(grid_fit.cv_results_)
# df_cv=df_cv.sort_values(by=['mean_test_score'], ascending=False)

# print('Score CV Test: '+str(grid_fit.best_score_),
#       '\n','Score CV Training : '+str(df_cv['mean_train_score'][0]),
#       '\n',grid_fit.best_params_)

# est = SGDClassifier(**grid_fit.best_params_, random_state = 5, verbose = 0, n_jobs=-1)