In [None]:
import pandas as pd
import numpy as np

import sklearn.model_selection as skl
from sklearn import metrics as skm
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

from imblearn.over_sampling import SMOTE 

import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

## 1. Load Data

In [None]:
# Read the predictor table (used as X for training) and the table of
# stress-incidence labels (one column per incidence threshold) from disk.
features = pd.read_csv("data/s-2_features_preprocessed_19_20.csv")
all_thresholds = pd.read_csv("data/stress_incidence_labels_25.csv")

# Discard the leading column of each table; a new frame is bound to the same
# name instead of mutating the frame in place.
features = features.drop(columns=features.columns[0])
all_thresholds = all_thresholds.drop(columns=all_thresholds.columns[0])

In [None]:
# Column names of both tables, kept so that predictors and labels can be
# selected by name once the two tables are joined.
feature_list = features.columns.tolist()
threshold_list = all_thresholds.columns.tolist()

In [None]:
# Put predictors and labels side by side in a single frame (column-wise
# concatenation), then preview the first rows.
data = pd.concat((features, all_thresholds), axis="columns")
data.head()

In [None]:
from sklearn.svm import SVC as svc

from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.ensemble import AdaBoostClassifier as adaboost
from sklearn.ensemble import GradientBoostingClassifier as gboost

from sklearn.tree import DecisionTreeClassifier as trees

from sklearn.linear_model import LogisticRegression as logreg

from sklearn.naive_bayes import GaussianNB as gnb
from sklearn.naive_bayes import MultinomialNB as mnb

from sklearn.neighbors import KNeighborsClassifier as knn

from sklearn.neural_network import MLPClassifier as nn

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as qda

from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate models and the hyper-parameter values sampled for each of them by
# RandomizedSearchCV. Every entry maps a short model name to
#   'model'  : an unfitted estimator instance
#   'params' : {parameter name: list of candidate values}

model_params = {
    'svm': {
        'model': svc(random_state=4),
        'params' : {
            'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1.0, 10.0],
            'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
            'kernel': ['rbf','sigmoid', 'linear']
        }
    },
    'random_forest': {
        'model': rf(random_state=4),
        'params' : {
            'n_estimators': list(range(100,510,100)),
            'criterion': ['gini', 'entropy'],
            'min_samples_split':[2,4,6,8]
        }
    },
    'decision_tree': {
        'model': trees(random_state=4),
        'params': {
            'criterion': ['gini','entropy'],
            'splitter':['best','random'],
            'min_samples_split':[2,3,4,5,8,10]
        }
    },
    'ada_boost': {
        'model': adaboost(random_state=4),
        'params': {
            'n_estimators': [20,50,100,150],
            'learning_rate': [0.01,0.1,1.0,1.5,2.0]
        }
    },
    'gradient_boosting': {
        'model': gboost(random_state=4),
        'params': {
            'learning_rate':[0.01,0.1,1.0,1.5,2.0],
            'n_estimators':[20,50,100,150],
            # NOTE(review): 'mse' is only accepted by older scikit-learn releases
            # (renamed to 'squared_error' in 1.0, removed in 1.2) - adjust to the
            # installed version.
            'criterion':['friedman_mse', 'mse']
        }
    },
    'logistic_regression' : {
        'model': logreg(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
            'max_iter':[50,75,100,200,300,400,500]
        }
    },
    'naive_bayes_gaussian': {
        'model': gnb(),
        'params': {
            'var_smoothing': [0.000000001,0.0000001,0.00001,0.001,0.1]
        }
    },
    # NOTE(review): MultinomialNB rejects negative feature values - confirm the
    # preprocessed features are non-negative.
    'naive_bayes_multinomial': {
        'model': mnb(),
        'params': {
            'alpha': [0.001,0.01,0.1,1.0,10.,100.,1000.],
            'fit_prior': [True,False]
        }
    },
    'k_nearest_neighbors': {
        'model':knn(),
        'params': {
            # n_jobs is deliberately not part of the search space: it only sets
            # the degree of parallelism and has no influence on the predictions,
            # so sampling it would spend search iterations on duplicate models.
            'n_neighbors':[1,2,5,8,10,15,20],
            'p':[1,2,3,4,5],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
    },
    'mlp': {
        'model':nn(random_state=4),
        'params': {
            'hidden_layer_sizes':[50,100,150],
            'activation': ['identity','logistic','tanh','relu'],
            'solver': ['lbfgs', 'sgd','adam'],
            'alpha': [0.001,0.01,0.1],
            'learning_rate': ['constant','invscaling','adaptive'],
            'max_iter': [400,500,600,700,800,900,1000]
        }
    },
    'linear_discriminant': {
        'model': lda(),
        'params': {
            'solver': ['lsqr','eigen'], #'svd' was excluded, as it did not compute the covariance matrix
            'shrinkage': [None,'auto',0.01,0.1,0.2,0.5,0.75,1],
            'store_covariance':[True,False]
        }
    },
    'quadratic_discriminant': {
        'model': qda(),
        'params': {
            'reg_param': [0.0,0.01,0.1,0.5],
            'store_covariance':[True,False],
            'tol': [0.001]
        }
    }

}

### Make a first test with GridSearchCV without searching all possible incidence thresholds

## Applying Random Search for thresholds 5-30 using the grid of classifiers and hyperparameters selected in the previous step

### Info concerning data splitting into train and test

All scikit-learn estimators whose names end with `CV` perform cross-validation internally. You still need to keep a separate test set for measuring the final performance.

So you first need to split the whole dataset into a training set and a test set. Forget about the test set for a while.

Then pass only the training data to the search. The search splits this training data further into training and validation folds to tune the hyper-parameters passed to it, and finally refits the model on the whole training set with the best parameters found.

Now you need to test this model on the test data you kept aside in the beginning. This will give you the near real world performance of model.

If you pass the whole dataset to GridSearchCV, test data leaks into the parameter tuning, and the final model may not perform as well on new, unseen data.

In [None]:
%%time
# create an empty dictionary with lists to store the outputs of each iteration

storing_dict = {'incidence_threshold':[],
                'model_name':[],
                'auc_score': [],
                'sensitivity':[],
                'specificity':[],
                'f1_score':[],
                'avg_precision':[],
                'best_parameters':[]
               }

# iterate for every threshold we want to test
for i in threshold_list:
#     for each threshold iteration redefine the features (X) and label (y(incidence_threshold))
    X = data[feature_list]
    y = data[i]
    
#     augment dataset using SMOTE
    sm = SMOTE(random_state=4)
    X_sm, y_sm = sm.fit_resample(X, y)
    
#     split the oversampled dataset into training and test sets
    X_train, X_test, y_train, y_test = skl.train_test_split(X_sm,y_sm,test_size=0.30,random_state=4)
    
#     instantiate the grid and loop for all models we want to test
    for k in model_params:
        rndm_grid = RandomizedSearchCV(model_params[k]['model'],
                    param_distributions=model_params[k]['params'],
                    n_iter=100,
                    cv=10,
                    scoring='roc_auc',
                    random_state=4
                   )
        
#         fit the model to the data
        clf = rndm_grid.fit(X_train,y_train)
        
#         apply predictions on the validation set
        y_pred = clf.predict(X_test)
    
#     create a confusion matrix
        confusion = skm.confusion_matrix(y_test,y_pred)
    
#         calculate true positives, true negatives, false positives and false negatives based on the confusion matrix
        TP = confusion[1,1]
        TN = confusion[0,0]
        FP = confusion[0,1]
        FN = confusion[1,0]
#         calculate additional metrics for reference
        f1 = skm.f1_score(y_test,y_pred)
        sens = skm.recall_score(y_test, y_pred)
        spec = TN / float(TN + FP)
        avg_precision = skm.average_precision_score(y_test,y_pred)
    
#         store the best values for each iteration
        storing_dict['incidence_threshold'].append(i)
        storing_dict['model_name'].append(k)
        storing_dict['auc_score'].append(rndm_grid.best_score_)
        storing_dict['sensitivity'].append(sens)
        storing_dict['specificity'].append(spec)
        storing_dict['f1_score'].append(f1)
        storing_dict['avg_precision'].append(avg_precision)
        storing_dict['best_parameters'].append(rndm_grid.best_params_)

In [None]:
# Cross-validated AUC next to the model that produced it, one line per search run.
# (Reads the lists filled by the search loop; `storing_list_test` was never defined.)
for auc, model_name in zip(storing_dict['auc_score'], storing_dict['model_name']):
    print(auc, '--->', model_name)

### Preview the results and find the best performing model for this testing

In [None]:
# Turn the collected search records into a table and display it ranked by
# AUC, best run first.
df_results = pd.DataFrame.from_dict(storing_dict)
df_results.sort_values(by='auc_score', ascending=False)

In [None]:
# Write the AUC-ranked results table to an Excel workbook.
# check the export name here before running
to_extract = df_results.sort_values(by='auc_score', ascending=False)
with pd.ExcelWriter('rndm_search_outputs/all_incidence_iter_100_cv10.xlsx', mode='w') as xlsx_writer:
    to_extract.to_excel(xlsx_writer)

In [None]:
# number of (threshold, model) search runs stored in the results table
df_results.shape[0]

In [None]:
# Select the search run with the highest cross-validated ROC AUC.
# Only runs that beat the chance level of 0.5 are eligible.

best_thresh = "default"
best_auc = 0.5
best_index = None  # positional row of the winning run in df_results; stays None if no run beats 0.5

print(best_thresh)
print(best_auc)
print("Iterations are initiated")
print("-----------------------------------")

# Scan the results table row by row and keep the best AUC seen so far
for j in range(df_results.shape[0]):
    if df_results["auc_score"].iloc[j] > best_auc:
        print("WE FOUND ONE!!!")
        print('Iteration '+ str(j))
        best_auc = df_results["auc_score"].iloc[j]
        # the threshold is stored under 'incidence_threshold' by the search loop
        best_thresh = df_results["incidence_threshold"].iloc[j]
        print("Optimum incidence threshold = ",best_thresh)
        print("Best computed auc = ",best_auc)
        print("-------------------------------------------------")
        best_index = j

print("-----------------------------------")
print("END OF ITERATIONS")
print("-----------------------------------")
if best_index is None:
    # nothing exceeded the baseline, so there is no row to show
    print("No run exceeded the baseline AUC of", best_auc)
else:
    print(df_results.iloc[best_index])

# 2. Single test runs for optimum threshold & classifier parameters

After the optimum threshold and parameter settings are found, this part is used to run the model with ALL  metrics and export it

### Plot Receiver Operator Characteristic to assess trained models

In [None]:
# cross-validated AUC of every search run, highest first
# (`roc_auc` is not defined at this point of the notebook, so the results table is used)
df_results["auc_score"].sort_values(ascending=False)

In [None]:
# Scratch call removed: skm.roc_curve needs the true labels and the predicted
# scores as arguments and raises a TypeError when called without them.
# The ROC curve of the final classifier is plotted further down, after it is fitted.

In [None]:
# Scratch call removed: the ROC plotting helper needs a fitted estimator plus
# test data as arguments and raises a TypeError when called without them.
# The ROC curve of the final classifier is plotted further down, after it is fitted.

In [None]:
# Incidence threshold of the winning search run, as a string.
# NOTE(review): the label columns appear to be named "incidence_<n>" and the
# prefix is added again where this value is used, so it is stripped here;
# a value stored without the prefix passes through unchanged.
best_inc_value = str(df_results["incidence_threshold"].iloc[best_index]).replace("incidence_", "")
print(best_inc_value)

In [None]:
# define x and y for this single test run
x = pd.read_csv("data/s-2_features_preprocessed_19_20.csv") # all features
all_thresholds = pd.read_csv("data/stress_incidence_labels_25.csv")  # all 25 labels

# Drop the leading column of both tables, exactly as done when the data was
# first loaded, so this run uses the same predictors as the search.
x = x.drop(columns=x.columns[0])
all_thresholds = all_thresholds.drop(columns=all_thresholds.columns[0])

y = all_thresholds["incidence_" + best_inc_value]

# AUGMENT (SMOTE)
# The sampler is created here instead of relying on the `sm` left over from the search loop.
# NOTE(review): X_sm / y_sm are oversampled before any train/test split; splitting
# them afterwards puts synthetic samples into the test set.
sm = SMOTE(random_state=4)
X_sm, y_sm = sm.fit_resample(x, y)

# Get metrics for the best model

### Best performance by RF classifier

In [None]:
# Random forest with fixed settings on the incidence_6 label.
x = pd.read_csv("data/s-2_features_preprocessed_19_20.csv")
# drop the leading column, as done when the features were first loaded
x = x.drop(columns=x.columns[0])
y = all_thresholds.incidence_6

# Split first, then oversample the training part only, so the test set
# contains original samples exclusively. stratify=y keeps the class ratio.
x_train, x_test, y_train, y_test = skl.train_test_split(
    x, y, test_size=0.33, random_state=4, stratify=y
)

sm = SMOTE(random_state=4)
x_train, y_train = sm.fit_resample(x_train, y_train)

clf = rf(n_estimators=200, min_samples_split=2, criterion='entropy', random_state=4)

clf.fit(x_train,y_train)

# hard class predictions for the held-out samples
y_pred = clf.predict(x_test)

In [None]:
# Random forest for the selected incidence threshold, using the parameters
# found for it by the randomised search.
clf_type = 'RF'

# label column of the winning run and the same predictors the search used
selected_threshold = df_results["incidence_threshold"].iloc[best_index]
x = data[feature_list]
y = data[selected_threshold]

# split data into training and testing sets BEFORE oversampling, so that the
# test set holds original samples only; stratify=y keeps the class ratio
x_train, x_test, y_train, y_test = skl.train_test_split(
    x, y, test_size=0.33, random_state=4, stratify=y
)
x_train, y_train = SMOTE(random_state=4).fit_resample(x_train, y_train)

# tuned random-forest parameters for this threshold (the search stores one
# 'random_forest' row per threshold in df_results)
rf_runs = df_results[
    (df_results["model_name"] == "random_forest")
    & (df_results["incidence_threshold"] == selected_threshold)
]
rf_params = rf_runs["best_parameters"].iloc[0]

# train an RF model on the training set (seeded for reproducibility)
clf = rf(random_state=4, **rf_params)

clf.fit(x_train,y_train)

# apply prediction on the testing set
y_pred = clf.predict(x_test)

In [None]:
# Confusion matrix of the test-set predictions, printed for inspection.
confusion = skm.confusion_matrix(y_test, y_pred)
print(confusion)

# Name the four cells: first index = true class, second index = predicted class.
TN, FP = confusion[0, 0], confusion[0, 1]
FN, TP = confusion[1, 0], confusion[1, 1]

In [None]:
# Draw the confusion matrix of the fitted classifier on the test set.
fig, ax = plt.subplots(figsize=(8, 8))
# plot_confusion_matrix was removed from scikit-learn in 1.2; newer releases
# provide ConfusionMatrixDisplay.from_estimator. Use whichever is available.
if hasattr(skm.ConfusionMatrixDisplay, "from_estimator"):
    skm.ConfusionMatrixDisplay.from_estimator(clf, x_test, y_test, cmap='Greys', ax=ax)
else:
    skm.plot_confusion_matrix(clf, x_test, y_test, cmap='Greys', ax=ax)

In [None]:
# Save the confusion-matrix figure. The path is written as a raw string so the
# backslashes are taken literally (the former literal relied on invalid escape
# sequences such as "\D" and "\P"); the resulting path is unchanged.
# NOTE(review): absolute, machine-specific path - consider a project-relative output folder.
fig.savefig(r'D:\Dropbox\Publications\sentinel-2 olive tree stress detection\Results & Discussion\figures\best_threshold_6_rf.jpg')

In [None]:
# metrics for the model

# Predicted probability of the positive class. The ranking metrics (average
# precision, ROC AUC) are defined on continuous scores; feeding them hard 0/1
# predictions collapses the curve to a single operating point.
# assumes the positive class is clf.classes_[1] - TODO confirm
y_score = clf.predict_proba(x_test)[:, 1]

accuracy = skm.accuracy_score(y_test,y_pred)
avg_precision = skm.average_precision_score(y_test,y_score)
f1_score = skm.f1_score(y_test,y_pred)
recall = skm.recall_score(y_test, y_pred)
# rates derived from the confusion-matrix counts computed earlier
spec = TN / float(TN + FP)
fpr = FP / float(TN + FP)
tpr = TP / float(TP + FN)
roc_auc = skm.roc_auc_score(y_test, y_score)

print("accuracy = ", accuracy)
print("average precision = ",avg_precision)
print("f1 score = ",f1_score)
print("recall = ",recall)
print("specificity = ",spec)
print("false positive rate (1 - specificity) = ",fpr)
print("true positive rate (sensitivity) = ",tpr)
print("**area under the ROC curve** = ",roc_auc)

In [None]:
# ROC curve of the fitted classifier on the test set.
# plot_roc_curve was removed from scikit-learn in 1.2; newer releases provide
# RocCurveDisplay.from_estimator. Use whichever is available.
if hasattr(skm.RocCurveDisplay, "from_estimator"):
    skm.RocCurveDisplay.from_estimator(clf, x_test, y_test)
else:
    skm.plot_roc_curve(clf, x_test, y_test)



In [None]:
# metrics for the model

# Predicted probability of the positive class. The ranking metrics (average
# precision, ROC AUC) are defined on continuous scores; feeding them hard 0/1
# predictions collapses the curve to a single operating point.
# assumes the positive class is clf.classes_[1] - TODO confirm
y_score = clf.predict_proba(x_test)[:, 1]

accuracy = skm.accuracy_score(y_test,y_pred)
avg_precision = skm.average_precision_score(y_test,y_score)
f1_score = skm.f1_score(y_test,y_pred)
recall = skm.recall_score(y_test, y_pred)
# rates derived from the confusion-matrix counts computed earlier
spec = TN / float(TN + FP)
fpr = FP / float(TN + FP)
tpr = TP / float(TP + FN)
roc_auc = skm.roc_auc_score(y_test, y_score)

print("accuracy = ", accuracy)
print("average precision = ",avg_precision)
print("f1 score = ",f1_score)
print("recall = ",recall)
print("specificity = ",spec)
print("false positive rate (1 - specificity) = ",fpr)
print("true positive rate (sensitivity) = ",tpr)
print("**area under the ROC curve** = ",roc_auc)

# 3. Exports for presenting results

In [None]:
# One-row summary table of the selected model: threshold, classifier type and
# every metric computed above.

# column names and their values, listed in display order
summary_columns = ["inc_thres", "classifier_type", "accuracy", "averag_precision",
                   "f1_score", "recall", "specificity", "false_positive_rate",
                   "true_positive_rate", "roc_auc"]
summary_values = ["incidence_" + best_inc_value, clf_type, accuracy, avg_precision,
                  f1_score, recall, spec, fpr, tpr, roc_auc]

dict_compare = dict(zip(summary_columns, summary_values))

# all values are scalars, so an explicit single-row index is supplied
df_opt_clfs = pd.DataFrame(dict_compare, columns=summary_columns, index=[0])