In [1]:
import numpy as np
import cv2
import os
import pandas as pd
np.set_printoptions(suppress=True)
from matplotlib import pyplot as plt
%matplotlib inline
import time
import re
from sqlalchemy import create_engine
import pickle

##Sklearn (Model Imports)
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold,RandomizedSearchCV,GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, f1_score
from sklearn.metrics import accuracy_score, classification_report,roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
import sklearn
import warnings
warnings.filterwarnings('ignore')
from Convert_To_Num import convert_wrapper,convert_series_to_float,convert_series_to_int

# Pull In Data From AWS

In [None]:
engine_aws = create_engine('postgresql://**ec2IP**/project03',echo=False)
meta = MetaData(engine_aws)


## Create The Data Frame From AWS

In [None]:
col=[]
for i in range(1,1801):
    col=np.append(col,'P'+str(i))

col=np.append(col,"Class")
col = tuple(col)

In [None]:
conn = engine_aws.connect()
pixel_data=Table('pixel_data',meta,autoload=True,schema='public')
select_st = pixel_data.select()
res = conn.execute(select_st)
temp_list=[]
count=0
for _row in res:
    count+=1
    if count%1000==0:
        print(count)
    image_list=[]
    image_list=np.append(image_list,np.array(list(_row[1])))
    image_list=tuple(np.append(image_list,_row[2].lower()))
    dict_temp={col[i]:image_list[i] for i in range(0,len(col))}
    temp_list.append(dict_temp)
    
df=pd.DataFrame(temp_list)

# Clean Data

In [5]:
convert_wrapper(df,to_int=True)

In [None]:
df_pix=pd.get_dummies(df_pix,columns=['Class'],prefix='class',drop_first=True)

# EDA
1. Check number of classes (Normal v Drusen) - Good to start with
2. Check the classification differences between Normal and Drusen - Does not look good.
    - If quick modeling doesn't look good, will need to find new image processing features

In [6]:
## Check number of classes
df['Class'].value_counts()

normal    26565
drusen     8866
Name: Class, dtype: int64

# Model Creation
**Random Forest Worked Best**

## Set Up KFolds and Data For Model
**Normal = True**

In [6]:
df=pd.get_dummies(df,columns=['Class'],prefix='class',drop_first=True)

In [7]:
y = np.array(df['class_normal'])
X = np.array(df.drop(columns=['class_normal']))
kf = StratifiedKFold(n_splits=3)

In [22]:
std = StandardScaler()

## Random Forest
1. Accuracy
2. AUC

In [8]:
forest = RandomForestClassifier(n_estimators=1400,min_samples_split=2,min_samples_leaf=1,max_features='auto',max_depth=40,bootstrap=False,n_jobs=-1)
for train,test in kf.split(X,y):
    forest.fit(X[train],y[train])
    print("The score for Random Forest is")
    print("Training: {:6.2f}%".format(100*forest.score(X[train], y[train])))
    print("Test set: {:6.2f}%".format(100*forest.score(X[test], y[test])))
    pred_prob = forest.predict_proba(X[test])
    pred = forest.predict(X[test])
    print("AUC Score For Random Forest: {:0.2f}%".format(100*roc_auc_score(y[test],pred_prob[:,1])))
    print("AUC Score For Random Forest: {:0.2f}%".format(100*roc_auc_score(y[test],pred)))
    print('\n')
    

The score for Random Forest is
Training:  99.49%
Test set:  80.27%
AUC Score For Random Forest: 85.52%
AUC Score For Random Forest: 60.69%


The score for Random Forest is
Training:  99.40%
Test set:  79.36%
AUC Score For Random Forest: 84.36%
AUC Score For Random Forest: 58.96%


The score for Random Forest is
Training:  99.51%
Test set:  79.75%
AUC Score For Random Forest: 83.81%
AUC Score For Random Forest: 59.69%




# Random Forest CV - Finds the Best Parameters

In [12]:
#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


In [13]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier(n_estimators=1400,min_samples_split=2,min_samples_leaf=1,max_features='auto',max_depth=40,bootstrap=False)
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, scoring='roc_auc', n_iter = 50, cv = 2, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X, y)


Fitting 2 folds for each of 50 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed: 32.6min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 168.7min finished


RandomizedSearchCV(cv=2, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=50, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=2)

In [14]:
rf_random.best_params_

{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 40,
 'bootstrap': False}

Best Params
{'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 40,
 'bootstrap': False}

# Pickle Model

In [None]:
pickle.dump(forest,open('RF_Pix_Model.p','wb'))