# This notebook evaluates and number of algorithms onto the prediction task thickness grid to latest diagnose

In [230]:
from sklearn.datasets import load_iris 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm 
from sklearn import tree
import os
import csv
import pandas as pd
import numpy as np
path = r'C:\Users\s-oholmber\OCT_depth_project\data\ML_prediction_task_1'

In [216]:
#load data process data
X_path =  os.path.join(path , "X.csv")
X = pd.read_csv(X_path).drop(columns = ["series_ID"])

Y_path =  os.path.join(path , "Y.csv")
Y = pd.read_csv(Y_path).drop(columns = ["Unnamed: 0"]).drop_duplicates()
Y = Y.rename(columns={'Katalog_des_Diagnosecode_DKAT': 'DKAT'})

#X.shape, X[X.isnan()].shape
number_of_record_with_missing_values = sum(X.isnull().any(axis=1))
indexes_with_Nan = ~X.isnull().any(axis=1)
indexes = indexes_with_Nan[indexes_with_Nan != False]

#remove Nans
X = X[indexes_with_Nan]
Y['P_ID_DB'].head(), X['PatientID_DB'].head()

full_data = X.merge(Y, left_on = "PatientID_DB",right_on = "P_ID_DB", how = "left").dropna()
full_data = full_data.drop(['P_ID_DB'], axis = 1)
print("Number of Nan values found in full data are: {}".format(np.sum(full_data.isnull().any(axis=1))))
indexes_with_Nan = full_data.isnull().any(axis=1)
#print("the records without diagnosis are: {}".format(full_data[indexes_with_Nan]))
full_data.columns

Number of Nan values found in full data are: 0


Index(['PatientID_DB', 'C0', 'I1', 'I2', 'N1', 'N2', 'S1', 'S2', 'T1', 'T2',
       'DKAT'],
      dtype='object')

### Construct some pipelines

In [None]:
pipe_lr = Pipeline([('scl', StandardScaler()),
            ('clf', LogisticRegression(random_state=42))])

pipe_lr_pca = Pipeline([('scl', StandardScaler()),
            ('pca', PCA(n_components=2)),
            ('clf', LogisticRegression(random_state=42))])

pipe_rf = Pipeline([('scl', StandardScaler()),
            ('clf', RandomForestClassifier(random_state=42))])

pipe_rf_pca = Pipeline([('scl', StandardScaler()),
            ('pca', PCA(n_components=2)),
            ('clf', RandomForestClassifier(random_state=42))])

pipe_svm = Pipeline([('scl', StandardScaler()),
            ('clf', svm.SVC(random_state=42))])

pipe_svm_pca = Pipeline([('scl', StandardScaler()),
            ('pca', PCA(n_components=2)),
            ('clf', svm.SVC(random_state=42))])

### Set grid params

In [235]:
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]

grid_params_lr = [{'clf__penalty': ['l1', 'l2'],
        'clf__C': param_range_fl,
        'clf__solver': ['liblinear']}] 

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
        'clf__min_samples_leaf': param_range,
        'clf__max_depth': param_range,
        'clf__min_samples_split': param_range[1:]}]

grid_params_svm = [{'clf__kernel': ['linear', 'rbf'], 
        'clf__C': param_range}]

### Construct grid searches

In [236]:
jobs = -1

gs_lr = GridSearchCV(estimator=pipe_lr,
            param_grid=grid_params_lr,
            scoring='accuracy',
            cv=10) 

gs_lr_pca = GridSearchCV(estimator=pipe_lr_pca,
            param_grid=grid_params_lr,
            scoring='accuracy',
            cv=10)

gs_rf = GridSearchCV(estimator=pipe_rf,
            param_grid=grid_params_rf,
            scoring='accuracy',
            cv=10, 
            n_jobs=jobs)

gs_rf_pca = GridSearchCV(estimator=pipe_rf_pca,
            param_grid=grid_params_rf,
            scoring='accuracy',
            cv=10, 
            n_jobs=jobs)

gs_svm = GridSearchCV(estimator=pipe_svm,
            param_grid=grid_params_svm,
            scoring='accuracy',
            cv=10,
            n_jobs=jobs)

gs_svm_pca = GridSearchCV(estimator=pipe_svm_pca,
            param_grid=grid_params_svm,
            scoring='accuracy',
            cv=10,
            n_jobs=jobs)

### Fit the grid search objects

In [237]:
#extract features and labels for train and test
X = full_data.loc[:, 'C0':'T2']
Y = full_data.loc[:, 'DKAT']

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# List of pipelines for ease of iteration
grids = [gs_lr, gs_lr_pca, gs_rf, gs_rf_pca, gs_svm, gs_svm_pca]

# Dictionary of pipelines and classifier types for ease of reference
grid_dict = {0: 'Logistic Regression', 1: 'Logistic Regression w/PCA', 
        2: 'Random Forest', 3: 'Random Forest w/PCA', 
        4: 'Support Vector Machine', 5: 'Support Vector Machine w/PCA'}

# Fit the grid search objects
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search	
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])



Performing model optimizations...

Estimator: Logistic Regression




Best params: {'clf__C': 0.1, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best training accuracy: 0.231
Test set accuracy score for best params: 0.236 

Estimator: Logistic Regression w/PCA




Best params: {'clf__C': 0.5, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best training accuracy: 0.202
Test set accuracy score for best params: 0.211 

Estimator: Random Forest




Best params: {'clf__criterion': 'entropy', 'clf__max_depth': 10, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 2}
Best training accuracy: 0.426
Test set accuracy score for best params: 0.445 

Estimator: Random Forest w/PCA




Best params: {'clf__criterion': 'gini', 'clf__max_depth': 8, 'clf__min_samples_leaf': 2, 'clf__min_samples_split': 7}
Best training accuracy: 0.259
Test set accuracy score for best params: 0.279 

Estimator: Support Vector Machine




Best params: {'clf__C': 10, 'clf__kernel': 'rbf'}
Best training accuracy: 0.342
Test set accuracy score for best params: 0.333 

Estimator: Support Vector Machine w/PCA




Best params: {'clf__C': 7, 'clf__kernel': 'rbf'}
Best training accuracy: 0.247
Test set accuracy score for best params: 0.257 

Classifier with best test set accuracy: Random Forest


In [238]:
# Save pipeline to file 
problem_path = r"C:\Users\s-oholmber\OCT_depth_project\prediction_problem1"
write_loc = os.path.join(problem_path, 'best_pipeline.pkl')
joblib.dump(best_pipe, write_loc, compress=1) 
print('Saved %s pipeline to file' % pipe_dict[best_clf]) 


Saved Decision Tree pipeline to file
