In [None]:
"""
Main classification code designed to sample and separate training and test data. 
Scaling workflow is included within this code with the scale applied to high spatial 
resolution data. Any tuning or training is done using the variables created in this cell. 
"""
import time
from datetime import datetime
import psutil
import geopandas as gpd
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid
from sklearn import metrics
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable, axes_size
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
import os
import re
import math
from sklearn.model_selection import cross_val_score, StratifiedKFold, StratifiedShuffleSplit
from collections import Counter
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
import statistics as st
from itertools import combinations 
import itertools
def plot_confusion_matrix(cm, target_names, title='Confusion matrix', cmap=None, normalize=True, 
                         save_path = ""):
    """Draw a confusion matrix as an annotated heat map, show it and optionally save it.

    Parameters
    ----------
    cm : array-like, square
        Confusion-matrix counts. Rows are labelled "True label" and columns
        "Predicted label" on the figure.
    target_names : sequence of str or None
        Tick labels for both axes; ticks are left at their defaults when None.
    title : str
        Figure title. It is split on capital letters to insert spaces, with
        special handling for titles containing "MLPClassifier" and for the
        exact title "LinearSVC".
    cmap : matplotlib colormap or None
        Colour map for the heat map; "Blues" when None.
    normalize : bool
        When True the cell annotations are row percentages, otherwise counts.
        The heat map itself is always drawn from the raw counts.
    save_path : str
        When non-empty the figure is also written to this path at 300 dpi.

    Returns
    -------
    None
    """
    titlefont = {"fontname":"Times New Roman", "fontsize" : 35, "fontweight": "bold"}
    labelfont = {"fontname":"Times New Roman", "fontsize" : 25}

    cm = np.asarray(cm)
    total = float(np.sum(cm))
    # An all-zero matrix would otherwise divide by zero and print NaN.
    accuracy = (np.trace(cm) / total) * 100 if total else 0.0
    misclass = (1 - (accuracy / 100)) * 100

    if cmap is None:
        cmap = plt.get_cmap("Blues")

    plt.figure(figsize=(8, 8))
    plt.imshow(cm, interpolation = "nearest", cmap = cmap)
    plt.grid(False)
    cb = plt.colorbar(fraction = 0.046, pad = 0.04)
    cb.ax.tick_params(labelsize='large')

    # Split e.g. "NeuClass" into ["Neu", "Class"] so the title can be spaced.
    titlelist = re.findall("[A-Z][^A-Z]*", title)
    if "MLPClassifier" in title:
        newtitle = "".join(titlelist[:3]) + " " + titlelist[3]
    elif title == "LinearSVC":
        newtitle = "".join(titlelist[-3:-1]) + "M Classifier"
    else:
        # Fall back to the raw title when it contains no capital letters.
        newtitle = " ".join(titlelist) or title
    plt.title(newtitle, fontdict = titlefont)

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation = 45, **labelfont)
        plt.yticks(tick_marks, target_names, rotation = 0, **labelfont)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows without any true samples stay at 0 % instead of becoming NaN.
        cm = np.divide(
            cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0
        ) * 100
        thresh = cm.max() / 1.5
        cell_format = "{:0.2f}"
    else:
        thresh = cm.max() / 2
        cell_format = "{:,}"

    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color = "white" if cm[i, j] > thresh else "black", **labelfont)

    plt.ylabel("True label", fontdict = labelfont)
    plt.xlabel(
        "Predicted label\naccuracy={:0.2f}; misclass={:0.2f}".format(
            accuracy, misclass), fontdict = labelfont
    )
    # Lay out after the labels exist so they are not clipped.
    plt.tight_layout()

    if save_path != "":
        plt.savefig(save_path, dpi = 300)
    plt.show()

# Run number; it is appended to every "Final_Classification" base name below.
iteration = 16

# Output folder and base name for the Cork (FHkHz) results.
classes_path_root = r"D:/Documents/PhD/GIS/Classification_Code_Output/FHkHz"

classes_path_base = r"/Final_Classification" + str(iteration)

# Output folder and base name for the Wexford results.
wex_path_root = r"D:/Documents/PhD/GIS/Classification_Code_Output/Wex"

wex_path_base = r"/Final_Classification" + str(iteration)

# Output folder and base name for the Hemptons results.
hem_path_root = r"D:/Documents/PhD/GIS/Classification_Code_Output/Hemp"

hem_path_base = r"/Final_Classification" + str(iteration)

# Semicolon-delimited class table for the Cork data; passed to dataset_split
# as `input_classes`, which reads its "id" and "Class_Names" columns.
csv_path = (
    r"D:/Documents/PhD/GIS/Classification_Code_Output/FHkHz/Samples BPI2 Copyrev.csv"
)

# Same location as classes_path_root + classes_path_base + ".shp".
# Not referenced again in the visible part of this cell.
classes_path_new = (
    r"D:/Documents/PhD/GIS/Classification_Code_Output/FHkHz/Final_Classification" \
    + str(iteration) + ".shp"
)

# Not referenced in the visible part of this cell.
cv_path_root = r"D:/Documents/PhD/GIS/Classification_Code_Output/Cross_Validation"

# Not referenced in the visible part of this cell.
classreport_path_root = (
    r"D:/Documents/PhD/GIS/Classification_Code_Output/Classification_Reports_Confusion_Matrices"
)

# Base path handed to accuracy_analysis as `save_path`; that function appends
# the model label plus ".png" / "report.csv".
corkcm_image = classes_path_root + classes_path_base

# Shapefile output paths for the Wexford south / north / total results.
wexs_classes_path_new = wex_path_root + wex_path_base + "s.shp"

wexn_classes_path_new = wex_path_root + wex_path_base + "n.shp"

wextotal_classes_path_new = wex_path_root + wex_path_base + "to.shp"

hem_classes_path_new = hem_path_root + hem_path_base + ".shp"

# Read eagerly at cell level. dataset_split re-reads the CSV itself, so this
# frame is not used in the visible part of the cell.
csv_file = pd.read_csv(csv_path, delimiter = ";", header = 0)

# Confusion-matrix / report base paths (see corkcm_image above).
wexscm_image = wex_path_root + wex_path_base + "s"

wexncm_image = wex_path_root + wex_path_base + "n"

wextocm_image = wex_path_root + wex_path_base + "to"

hemcm_image = hem_path_root + hem_path_base 

# NOTE(review): wex_path_base already starts with "/", so this evaluates to
# ".../WexHem//Final_Classification<iteration>" (double slash) -- confirm intended.
wexhemcm_image = wex_path_root + "Hem" + "/" + wex_path_base

# Object shapefiles passed to dataset_split as `input_objects`.
intersect_path = r"D:/Documents/PhD/GIS/Classification_Code_Output/FHkHz/BPIInterJoin2vii.shp"

hemshape_path = r"D:/Documents/PhD/GIS/Classification_Code_Output/Hemp/HemJoin5revii.shp"

wexsshape_path = r"D:/Documents/PhD/GIS/Classification_Code_Output/Wex/WexSJoin5rev.shp"

wexseshape_path = r"D:/Documents/PhD/GIS/Classification_Code_Output/Wex/WexSeJoin5rev.shp"

wexswshape_path = r"D:/Documents/PhD/GIS/Classification_Code_Output/Wex/WexSwJoin5rev.shp"

wexnshape_path = r"D:/Documents/PhD/GIS/Classification_Code_Output/Wex/WexNJoin5rev.shp"

# Class table used for every Wexford and Hemptons dataset_split call.
wex_classes_path = (
    r"D:/Documents/PhD/GIS/Classification_Code_Output/WexHem/SamplesWexHemviirevcorvi.csv"
)

# Columns removed before the feature matrix is built (passed to prep_for_norm
# as `columns_to_ignore`, and used by create_id_file to list the variables).
col_drop_normalisation = [
    "geometry", "Class", "Flat", "Ridge", "Valley", "RelBorderR", "RelBorderV",
    "HueSlopeCos", "HueSlopeSin",
]

# Shapefile attribute name -> column name used in this cell; applied by
# dataset_split through DataFrame.rename(columns=...).
rename_dict = {
    "Classifi_1": "Ridge", "Classified": "Flat", "Classifi_2": "Valley", "Mean_BPI25": "bpi25z", 
    "Mean_BPI9": "bpi9z", "HSI_Transf": "HueSlopeSin", "HSI_Tran_1": "HueSlopeCos", 
    "VRM25z": "vrm25z", "Zero9": "zero9z", "Zero3": "Mean_Zerom","VRM9z": "vrm9z", 
    "Rel_border": "RelBorderV", "Rel_bord_1": "RelBorderR"
}

# Additional renames merged into rename_dict before the Hemptons / Wexford
# shapefiles are loaded.
extra_dict = {
    "HSV_HueR_1": "HueSlopeSin", "HSV_HueRSl": "HueSlopeCos", 
    "Border_Len" : "Border_L_1", "Mean_Zero9": "zero9z", "Mean_BPI3": "Mean_BPI",
     "Mean_VRM3": "Mean_VRM"
}

# Keys under which each model's predictions are stored in the *_model_dict
# dictionaries; create_df turns them into column names.
neulabel = "NeuClass"

neulabel2 = "NeuClass2"

svmlabel = "SVM_Class"

knnlabel = "KNN_Class"

rfclabel = "Random_Forest_Class"

voting = "Vote_Class"

def dataset_split(input_classes, input_objects, rename_dict, valid_or_train, n_classes):
    """Load classified objects and split them into balanced samples, test and flat sets.

    Parameters
    ----------
    input_classes : str
        Path to a ``;``-delimited CSV with ``id`` and ``Class_Names`` columns.
    input_objects : str
        Path readable by ``geopandas.read_file``. The layer must carry a
        ``Class`` column of ids and, after renaming, ``Ridge``, ``Valley`` and
        ``Flat`` indicator columns.
    rename_dict : dict
        Column renames applied with ``rename(columns=rename_dict)``.
    valid_or_train : str
        ``"train"`` keeps labelled "Lower" objects flagged Valley *or* Flat and
        "Upper" objects flagged Ridge *or* Flat; any other value keeps only the
        Valley / Ridge ones.
    n_classes : int
        When 2, class names are collapsed with ``class_split`` (defined
        outside this block).

    Returns
    -------
    tuple
        The arity depends on which subsets are empty:

        * ``(samples_equal_df, csv_dictionary)`` when there are neither flat
          objects nor test objects;
        * ``(test_data, not_waves)`` when there are no labelled samples;
        * ``(samples_equal_df, test_data)`` when there are no flat objects;
        * ``(samples_equal_df, test_data, not_waves, csv_dictionary)`` otherwise.

        ``samples_equal_df`` holds the same number of rows for every class
        (the size of the smallest class, drawn with ``random_state=42``).
    """
    csv_file = pd.read_csv(input_classes, delimiter = ";", header = 0)
    class_names_list = csv_file["Class_Names"].to_list()
    print(np.unique(class_names_list))

    csv_dictionary = dict(zip(csv_file["id"].to_list(), class_names_list))
    csv_classes = list(csv_dictionary.values())

    objects_total = gpd.read_file(input_objects)

    # Translate class ids to names; anything not listed in the CSV becomes "Flat".
    objects_total["Class"] = np.select(
        [objects_total["Class"] == class_id for class_id in csv_dictionary],
        list(csv_dictionary.values()), "Flat"
    )
    objects_total = objects_total.replace(-9999, np.nan)
    objects_total = objects_total.rename(columns = rename_dict)

    waves = objects_total.loc[
        (objects_total["Valley"] == 1) | (objects_total["Ridge"] == 1)
        | objects_total["Class"].isin(csv_classes)
    ]
    not_waves = objects_total.loc[objects_total["Flat"] == 1]

    lower_list = [name for name in csv_classes if "Lower" in name]
    higher_list = [name for name in csv_classes if "Upper" in name]

    samples = waves.loc[waves["Class"].isin(csv_classes)]
    upper = samples.loc[samples["Class"].isin(higher_list)]
    lower = samples.loc[samples["Class"].isin(lower_list)]

    if valid_or_train == "train":
        # "or Flat" preserves the original samples.
        correct_lower = lower.loc[(lower["Valley"] == 1) | (lower["Flat"] == 1)]
        correct_upper = upper.loc[(upper["Ridge"] == 1) | (upper["Flat"] == 1)]
    else:
        # All flat samples are excluded.
        correct_lower = lower.loc[lower["Valley"] == 1]
        correct_upper = upper.loc[upper["Ridge"] == 1]

    # Plain DataFrame copy so the column assignment below never hits a slice.
    samples = pd.DataFrame(pd.concat([correct_lower, correct_upper])).copy()

    if samples.empty:
        # Nothing to balance; keeps the "Test and Waves only" branch reachable.
        samples_equal_df = samples
    else:
        if n_classes == 2:
            samples["Class"] = samples.apply(
                lambda row: class_split(row, "Class"), axis = 1
            )

        # groupby(..., as_index=False).size() returns a DataFrame on
        # pandas >= 1.1, so take the per-class counts from a plain groupby.
        n_per_class = samples.groupby("Class").size().min()
        sampled_index = pd.concat([
            group.sample(n_per_class, random_state = 42)
            for _, group in samples.groupby("Class")
        ]).index
        samples_equal_df = samples.loc[sampled_index, :]

    test_data = waves.loc[waves.index.difference(samples_equal_df.index)]

    no_flat_objects = not_waves[not_waves.columns.difference(["Class"])].empty

    if no_flat_objects and test_data.empty:
        print("\nSamples Equal only")
        return samples_equal_df, csv_dictionary
    elif samples.empty:
        print("\nTest and Waves only")
        # assign() returns a new frame, avoiding a write into a slice.
        not_waves = not_waves.assign(Class = "Flat")
        return test_data, not_waves
    elif no_flat_objects:
        print("\nSamples and Test only")
        return samples_equal_df, test_data
    else:
        print("\nAll Data")
        return samples_equal_df, test_data, not_waves, csv_dictionary

# Interpolation method used for the first gap-filling pass in prep_for_norm.
interpol = "linear"


def _fill_missing(frame):
    """Return ``frame`` with NaNs filled by interpolating down each column.

    A first pass uses the module-level ``interpol`` method; if gaps remain, a
    second pass uses plain linear interpolation. Frames that are empty or
    already complete are returned unchanged.
    """
    if frame.empty or not frame.isnull().values.any():
        return frame

    frame = frame.interpolate(
        method = interpol, order = 2, axis = 0, limit_direction = "both"
    )
    if frame.isnull().values.any():
        frame = frame.interpolate(method = "linear", limit_direction = "both")
    return frame


def _feature_frame(data, columns_to_drop):
    """Drop ``columns_to_drop`` (ignoring missing ones), sort columns by name, fill gaps."""
    features = data.drop(columns = columns_to_drop, errors = "ignore")
    features = features.sort_index(axis = 1)
    return _fill_missing(pd.DataFrame(features))


def prep_for_norm(
    samples = pd.DataFrame(), test_data = pd.DataFrame(), not_waves_data = pd.DataFrame(), 
    columns_to_ignore = [], valid_or_train = None
):
    """Turn the frames produced by ``dataset_split`` into gap-free NumPy arrays.

    Each frame has ``columns_to_ignore`` removed, its remaining columns sorted
    by name and its missing values interpolated.

    Parameters
    ----------
    samples : DataFrame
        Labelled samples; must contain a ``Class`` column unless empty.
    test_data : DataFrame
        Unlabelled objects to classify.
    not_waves_data : DataFrame
        Objects already known to be flat.
    columns_to_ignore : list of str
        Columns that are not features. The list is never modified.
    valid_or_train : str or None
        When ``"train"`` the sorted feature column index is appended to the
        returned tuple.

    Returns
    -------
    tuple
        * ``samples`` empty: ``(test_data_array, not_waves_array)``.
        * ``test_data`` and ``not_waves_data`` empty:
          ``(sample_classes, sample_var_array)``.
        * only ``not_waves_data`` empty:
          ``(sample_classes, sample_var_array, test_data_array)``.
        * otherwise:
          ``(sample_classes, sample_var_array, test_data_array, not_waves_array)``.

        In the last three cases ``collist`` is appended when
        ``valid_or_train == "train"``.
    """
    cols = list(columns_to_ignore)

    test_data_array = _feature_frame(test_data, cols).to_numpy()

    if samples.empty:
        # Previously this branch read not_waves_variables before assigning it.
        not_waves_array = _feature_frame(not_waves_data, cols).to_numpy()
        return test_data_array, not_waves_array

    sample_classes = samples["Class"].to_numpy()

    # The label column is kept out of the frame so interpolation only ever
    # sees the feature columns.
    sample_variables_df = _feature_frame(samples, cols + ["Class"])
    print(sample_variables_df.columns)
    print(np.unique(sample_classes))

    sample_var_array = sample_variables_df.to_numpy()
    collist = sample_variables_df.columns
    with_columns = valid_or_train == "train"

    if (test_data.empty) and (not_waves_data.empty):
        outputs = (sample_classes, sample_var_array)
    elif not_waves_data.empty:
        outputs = (sample_classes, sample_var_array, test_data_array)
    else:
        not_waves_array = _feature_frame(not_waves_data, cols).to_numpy()
        if with_columns:
            print("\nAll data with column list")
        else:
            print("\nAll data without column list")
        outputs = (sample_classes, sample_var_array, test_data_array, not_waves_array)

    if with_columns:
        outputs = outputs + (collist,)
    return outputs

def scaled_data_train(sample_var_array, test_data_array, not_waves_array, scaler):
    """Fit a scaler on the training samples and apply it to all three arrays.

    Parameters
    ----------
    sample_var_array : array-like
        Training features; the scaler is fitted on these only.
    test_data_array, not_waves_array : array-like
        Further feature arrays transformed with the fitted scaler.
    scaler : str
        ``"minmax"`` for ``MinMaxScaler`` or ``"standard"`` for ``StandardScaler``.

    Returns
    -------
    tuple
        ``(sample_stan, test_stan, not_waves_stan, fitted_scaler)``.

    Raises
    ------
    ValueError
        If ``scaler`` is neither ``"minmax"`` nor ``"standard"``. Previously
        this fell through to an UnboundLocalError at the return statement.
    """
    if scaler == "minmax":
        print("\nMinmax")
        fitted_scaler = MinMaxScaler()
    elif scaler == "standard":
        print("\nStandard")
        fitted_scaler = StandardScaler()
    else:
        raise ValueError(
            "scaler must be 'minmax' or 'standard', got {!r}".format(scaler)
        )

    fitted_scaler.fit(sample_var_array)
    sample_stan = fitted_scaler.transform(sample_var_array)
    test_stan = fitted_scaler.transform(test_data_array)
    not_waves_stan = fitted_scaler.transform(not_waves_array)
    return sample_stan, test_stan, not_waves_stan, fitted_scaler

def scaled_valid_deploy(
    scaler, sample_var_array = np.array([]), test_data_array = np.array([]),
    not_waves_array = np.array([])
):
    """Apply an already-fitted scaler to validation / deployment arrays.

    ``sample_var_array`` is always transformed. ``test_data_array`` and
    ``not_waves_array`` are optional: the return value is a single array when
    both are empty, a 2-tuple when only ``not_waves_array`` is empty, and a
    3-tuple ``(valid, test, not_waves)`` otherwise.
    """
    valid_stan = scaler.transform(sample_var_array)

    test_missing = test_data_array.size == 0
    flat_missing = not_waves_array.size == 0

    if test_missing and flat_missing:
        print("\nValid standardised")
        return valid_stan

    # From here on the test array is transformed even when it is empty.
    test_stan = scaler.transform(test_data_array)

    if flat_missing:
        print("\nValid and test standardised")
        return valid_stan, test_stan

    return valid_stan, test_stan, scaler.transform(not_waves_array)

def model_deploy(model, train_data = np.array([]), test_data = np.array([])):
    """Predict with ``model`` on ``train_data`` and, if non-empty, ``test_data``.

    Returns the training predictions alone when ``test_data`` is empty,
    otherwise the tuple ``(train_pred, test_pred)``.
    """
    train_pred = model.predict(train_data)

    if test_data.size != 0:
        return train_pred, model.predict(test_data)

    return train_pred

def create_df(original_data, data, model_pred_dict, header, n_classes, index = []):
    """Build a result table from feature data, source attributes and predictions.

    A frame is made from ``data`` with columns ``header`` (and ``index`` when
    one is supplied). The ``Ridge``, ``Valley``, ``Flat`` and ``geometry``
    columns of ``original_data`` are copied across positionally, its ``Class``
    column is stored as ``Labels``, and every entry of ``model_pred_dict``
    becomes a column named after its key. When ``n_classes`` is 2 each
    prediction column is additionally passed row by row through
    ``class_reunite`` (defined outside this block).
    """
    frame_options = {"columns": header}
    if len(index) != 0:
        frame_options["index"] = index
    data_df = pd.DataFrame(data, **frame_options)

    # Read every source column first, then attach them in the same order.
    carried_over = {
        target: original_data[source].to_numpy()
        for target, source in (
            ("Ridge", "Ridge"),
            ("Valley", "Valley"),
            ("Flat", "Flat"),
            ("geometry", "geometry"),
            ("Labels", "Class"),
        )
    }
    for target, values in carried_over.items():
        data_df[target] = values

    reunite = n_classes == 2
    for model, classes in model_pred_dict.items():
        data_df[model] = classes
        if reunite:
            data_df[model] = data_df.apply(
                lambda row: class_reunite(row, model), axis = 1
            )

    return data_df

def create_id_file(path1, path2, sampleshape, csv_path, csv_dict, test_data, model1, model2):
    """Write a text file describing the inputs and models behind a classification.

    The file is created next to ``path1`` with the extension replaced by
    ``.txt`` and lists the sample shapefile, the sample file name, the class
    dictionary, the feature columns (every column of ``test_data`` not in the
    module-level ``col_drop_normalisation``) and the hyperparameters of both
    models.

    Parameters
    ----------
    path1 : str
        Output path whose extension is swapped for ``.txt``.
    path2 : object
        Unused; kept for call compatibility.
    sampleshape, csv_path : str
        Paths whose base names are recorded.
    csv_dict : dict
        Class dictionary recorded verbatim.
    test_data : DataFrame
        Frame whose columns define the variables used.
    model1, model2 : estimator
        Objects exposing ``get_params()``.
    """
    # splitext only strips the final extension, so dots elsewhere in the path
    # (e.g. "./out/run.1/result.shp") no longer truncate the file name.
    idfilename = os.path.splitext(path1)[0] + ".txt"

    variables_used = [x for x in test_data.columns if x not in col_drop_normalisation]

    lines = [
        "Shapefile used:" + os.path.basename(sampleshape),
        "Sample file used: " + os.path.basename(csv_path).split(".")[0] + ".asc",
        "Samples used: " + str(csv_dict),
        "Number of variables used: " + str(len(variables_used)),
        "Variables used: " + str(variables_used),
        # get_params must be called; the bare attribute is a bound method.
        "Hyperparameters chosen for first neunet: " + str(model1.get_params()),
        "Hyperparameters chosen for second neunet: " + str(model2.get_params()),
    ]

    # The context manager closes the file even if a write fails.
    with open(idfilename, "w") as idfile:
        idfile.write("\n".join(lines))
    return

def accuracy_analysis(train_classes, predicted_classes, label, class_label, save_path = ""):
    """Plot a confusion matrix and print summary metrics for one set of predictions.

    ``class_label`` selects and orders the classes of the confusion matrix.
    When ``save_path`` is non-empty the figure is written to
    ``save_path + label + ".png"`` and the classification report (with the
    kappa coefficient added) to ``save_path + label + "report.csv"``.
    The value printed as "Accuracy Score" is the balanced accuracy.
    Nothing is returned.
    """
    cm = confusion_matrix(train_classes, predicted_classes, labels = class_label)

    show_only = save_path == ""
    image_path = "" if show_only else save_path + label + ".png"

    plot_confusion_matrix(
        cm, class_label, title = label, cmap = plt.cm.Blues, normalize = True,
        save_path = image_path
    )

    if not show_only:
        report = metrics.classification_report(
            train_classes, predicted_classes, output_dict = True
        )
        report["Kappa_Coefficient"] = metrics.cohen_kappa_score(
            train_classes, predicted_classes
        )
        pd.DataFrame.from_dict(report).to_csv(save_path + label + "report.csv")

    print(metrics.classification_report(train_classes, predicted_classes))

    # (caption, metric) pairs, evaluated and printed strictly in this order.
    summary = (
        ("\nKappa Score: ",
         lambda: metrics.cohen_kappa_score(train_classes, predicted_classes)),
        ("Accuracy Score: ",
         lambda: metrics.balanced_accuracy_score(train_classes, predicted_classes)),
        ("\nMacro Precision: ",
         lambda: metrics.precision_score(
             train_classes, predicted_classes, average = "macro")),
        ("Micro Precision: ",
         lambda: metrics.precision_score(
             train_classes, predicted_classes, average = "micro")),
        ("\nMacro Recall: ",
         lambda: metrics.recall_score(
             train_classes, predicted_classes, average = "macro")),
        ("Micro Recall: ",
         lambda: metrics.recall_score(
             train_classes, predicted_classes, average = "micro")),
        ("Macro F1: ",
         lambda: metrics.f1_score(
             train_classes, predicted_classes, average = "macro")),
    )
    for caption, compute in summary:
        print(caption, round(compute(), 3))
    
# Mode flags understood by dataset_split / prep_for_norm, scaler choice and
# number of target classes.
train, valid = "train", "valid"
scale_type = "standard"
n_classes = 4

# One (train, test, not-waves) trio of prediction dictionaries per site.
# Keys are model labels; values are that model's predictions.
cork_train_model_dict, cork_test_model_dict, cork_not_waves_dict = {}, {}, {}

wexto_train_model_dict, wexto_test_model_dict, wexto_not_waves_dict = {}, {}, {}

wexse_train_model_dict, wexse_test_model_dict, wexse_not_waves_dict = {}, {}, {}

wexsw_train_model_dict, wexsw_test_model_dict, wexsw_not_waves_dict = {}, {}, {}

wexs_train_model_dict, wexs_test_model_dict, wexs_not_waves_dict = {}, {}, {}

wexn_train_model_dict, wexn_test_model_dict, wexn_not_waves_dict = {}, {}, {}

hem_train_model_dict, hem_test_model_dict, hem_not_waves_dict = {}, {}, {}

# ---------------------------------------------------------------------------
# Cork Harbour: load, scale and train every model on the balanced samples.
# ---------------------------------------------------------------------------
print("\nCork Harbour\n")

# NOTE(review): the split is requested in `valid` mode while prep_for_norm
# below is called in `train` mode -- confirm this combination is intended.
samples_equal, test_data, not_waves, cork_dict = dataset_split(
    csv_path, intersect_path, rename_dict, valid, n_classes
)

sample_classes, sample_var_array, test_array, not_waves_array, collist = prep_for_norm(
    samples_equal, test_data, not_waves, col_drop_normalisation, train
)

# The scaler fitted here is reused unchanged for every other site.
sample_scaled, test_scaled, not_waves_scaled, scaler = scaled_data_train(
    sample_var_array = sample_var_array, test_data_array = test_array,
    not_waves_array = not_waves_array, scaler = scale_type
)

# Hyperparameters of the two neural networks.
mlp1_params = {
    "alpha": 0.001,
    "hidden_layer_sizes": [22, 20, 16, 8],
    "learning_rate_init": 0.01,
    "random_state": 396,
}

mlp2_params = {
    "alpha": 0.1,
    "hidden_layer_sizes": [20, 22, 22, 20],
    "learning_rate_init": 0.001,
    "random_state": 173,
}

neunet_model = MLPClassifier(**mlp1_params)
neunet_model2 = MLPClassifier(**mlp2_params)

# The voting ensemble gets its own, separately fitted copies of both networks.
estimators = [
    ("mlp1", MLPClassifier(**mlp1_params)),
    ("mlp2", MLPClassifier(**mlp2_params)),
]

start = time.time()

weights = [1, 1]

clf = VotingClassifier(estimators = estimators, voting = "soft", weights = weights)
clf.fit(sample_scaled, sample_classes)

# model_deploy returns (train predictions, test predictions).
cork_train_model_dict[voting], cork_test_model_dict[voting] = model_deploy(
    model = clf, train_data = sample_scaled, test_data = test_scaled
)
cork_not_waves_dict[voting] = "Flat"

neunet_model.fit(sample_scaled, sample_classes)
neunet_model2.fit(sample_scaled, sample_classes)

cork_train_model_dict[neulabel], cork_test_model_dict[neulabel] = model_deploy(
    model = neunet_model, train_data = sample_scaled, test_data = test_scaled
)
cork_train_model_dict[neulabel2], cork_test_model_dict[neulabel2] = model_deploy(
    model = neunet_model2, train_data = sample_scaled, test_data = test_scaled
)

cork_not_waves_dict[neulabel] = "Flat"
cork_not_waves_dict[neulabel2] = "Flat"

# Linear support vector machine.
svm_params = {
    "C": 100,
    "dual": False,
    "fit_intercept": True,
    "loss": "squared_hinge",
    "penalty": "l1",
    "random_state": 57,
    "tol": 0.1,
}

svm_model = LinearSVC(**svm_params)
svm_model.fit(sample_scaled, sample_classes)

cork_train_model_dict[svmlabel], cork_test_model_dict[svmlabel] = model_deploy(
    model = svm_model, train_data = sample_scaled, test_data = test_scaled
)
cork_not_waves_dict[svmlabel] = "Flat"

print("\nAll Models Trained!!!")

def _deploy_all_models(sample_scaled, test_scaled, train_dict, test_dict):
    """Predict with every trained model and store the results under its label.

    Fills ``train_dict`` and ``test_dict`` in place, in the fixed order
    NeuClass, NeuClass2, Vote_Class, SVM_Class (the order defines the column
    order produced later by ``create_df``).
    """
    for label, fitted_model in (
        (neulabel, neunet_model),
        (neulabel2, neunet_model2),
        (voting, clf),
        (svmlabel, svm_model),
    ):
        train_dict[label], test_dict[label] = model_deploy(
            fitted_model, sample_scaled, test_scaled
        )


# ---------------------------------------------------------------------------
# Hemptons
# ---------------------------------------------------------------------------
print("\nHemptons\n")
cork_not_waves_dict[rfclabel] = "Flat"

cork_not_waves_dict[knnlabel] = "Flat"

rename_dict.update(extra_dict) # to ensure that all datasets have the same column

hem_samples_equal, hem_test_data, hem_not_waves, hem_dict = dataset_split(
    wex_classes_path, hemshape_path, rename_dict, valid, n_classes
)

hem_sample_classes, hem_sample_array, hem_test_array, hem_nwaves_array = prep_for_norm(
    hem_samples_equal, hem_test_data, hem_not_waves, col_drop_normalisation
)

# Validation sites reuse the scaler fitted on the Cork samples.
hem_sample_scaled, hem_test_scaled, hem_nwaves_scaled = scaled_valid_deploy(
    scaler, hem_sample_array, hem_test_array, hem_nwaves_array
)

_deploy_all_models(
    hem_sample_scaled, hem_test_scaled, hem_train_model_dict, hem_test_model_dict
)

# ---------------------------------------------------------------------------
# Wexford: south, south-east, south-west and north
# ---------------------------------------------------------------------------
wexs_samples_equal, wexs_test_data, wexs_not_waves, wexs_dict = dataset_split(
    wex_classes_path, wexsshape_path, rename_dict, valid, n_classes
)

wexse_samples_equal, wexse_test_data, wexse_not_waves, wexse_dict = dataset_split(
    wex_classes_path, wexseshape_path, rename_dict, valid, n_classes
)

wexsw_samples_equal, wexsw_test_data, wexsw_not_waves, wexsw_dict = dataset_split(
    wex_classes_path, wexswshape_path, rename_dict, valid, n_classes
)

wexn_samples_equal, wexn_test_data, wexn_not_waves, wexn_dict = dataset_split(
    wex_classes_path, wexnshape_path, rename_dict, valid, n_classes
)

print("\nWex S Data")
wexs_sample_classes, wexs_sample_array, wexs_test_array, wexs_nwaves_array = prep_for_norm(
    wexs_samples_equal, wexs_test_data, wexs_not_waves, col_drop_normalisation
)

# NOTE(review): the south-east call passes the *south* flat objects and
# re-assigns wexs_nwaves_array (to the same content) -- confirm that
# wexse_not_waves is deliberately unused.
wexse_sample_classes, wexse_sample_array, wexse_test_array, wexs_nwaves_array = prep_for_norm(
    wexse_samples_equal, wexse_test_data, wexs_not_waves, col_drop_normalisation
)

wexsw_sample_classes, wexsw_sample_array, wexsw_test_array = prep_for_norm(
    wexsw_samples_equal, wexsw_test_data, pd.DataFrame(), col_drop_normalisation
)

print("\nWex N Data")
wexn_sample_classes, wexn_sample_array, wexn_test_array, wexn_nwaves_array = prep_for_norm(
    wexn_samples_equal, wexn_test_data, wexn_not_waves, col_drop_normalisation
)

wexn_sample_scaled, wexn_test_scaled, wexn_nwaves_scaled = scaled_valid_deploy(
    scaler, wexn_sample_array, wexn_test_array, wexn_nwaves_array
)

wexs_sample_scaled, wexs_test_scaled, wexs_nwaves_scaled = scaled_valid_deploy(
    scaler, wexs_sample_array, wexs_test_array, wexs_nwaves_array
)

# South-west and south-east are scaled together with the south flat objects.
wexsw_sample_scaled, wexsw_test_scaled, wexsw_nwaves_scaled = scaled_valid_deploy(
    scaler, wexsw_sample_array, wexsw_test_array, wexs_nwaves_array
)

wexse_sample_scaled, wexse_test_scaled, wexs_nwaves_scaled = scaled_valid_deploy(
    scaler, wexse_sample_array, wexse_test_array, wexs_nwaves_array
)

# "Total" Wexford = south followed by north.
wexto_sample_scaled = np.concatenate((wexs_sample_scaled, wexn_sample_scaled))

wexto_test_scaled = np.concatenate((wexs_test_scaled, wexn_test_scaled))

_deploy_all_models(
    wexto_sample_scaled, wexto_test_scaled, wexto_train_model_dict, wexto_test_model_dict
)

_deploy_all_models(
    wexn_sample_scaled, wexn_test_scaled, wexn_train_model_dict, wexn_test_model_dict
)

_deploy_all_models(
    wexs_sample_scaled, wexs_test_scaled, wexs_train_model_dict, wexs_test_model_dict
)

_deploy_all_models(
    wexse_sample_scaled, wexse_test_scaled, wexse_train_model_dict, wexse_test_model_dict
)

_deploy_all_models(
    wexsw_sample_scaled, wexsw_test_scaled, wexsw_train_model_dict, wexsw_test_model_dict
)

# Kept from the original workflow: the total voting prediction is replaced by
# the concatenation of the per-region voting predictions.
wexto_train_model_dict[voting] = np.concatenate((
    wexs_train_model_dict[voting], wexn_train_model_dict[voting]
))

wexto_sample_classes = np.concatenate((
    wexs_sample_classes, wexn_sample_classes
))

# Flat objects are never classified; every model column is set to "Flat".
wexs_not_waves_dict.update(dict.fromkeys((neulabel, neulabel2, svmlabel, voting), "Flat"))
wexn_not_waves_dict.update(dict.fromkeys((neulabel, neulabel2, svmlabel, voting), "Flat"))
hem_not_waves_dict.update(dict.fromkeys((neulabel, neulabel2, svmlabel, voting), "Flat"))

# Label of the model whose predictions are evaluated below.
model = svmlabel

class_labels = ["Upper Slope", "Upper Stoss", "Lower Slope", "Lower Stoss"]

change_dict = {}

wexse_change_dict = {}

# Every confusion matrix is indexed by the sorted set of TRUE classes, so a
# class the model never predicts still appears as a row.
print("\nCork " + model + " Model Train Score")
accuracy_analysis(
    sample_classes, cork_train_model_dict[model], model,
    sorted(set(sample_classes)),
    corkcm_image
)

print("\nWex North " + model + " Model Valid Score")
accuracy_analysis(
    wexn_sample_classes, wexn_train_model_dict[model], model,
    sorted(set(wexn_sample_classes)),
    wexncm_image
)

print("\nWex South " + model + " Model Valid Score")
accuracy_analysis(
    wexs_sample_classes, wexs_train_model_dict[model], model,
    sorted(set(wexs_sample_classes)),
    wexscm_image
)

# The two headings below were previously swapped relative to the data.
print("\nWex South East " + model + " Model Valid Score")
accuracy_analysis(
    wexse_sample_classes, wexse_train_model_dict[model], model,
    sorted(set(wexse_sample_classes))
)

print("\nWex South West " + model + " Model Valid Score")
accuracy_analysis(
    wexsw_sample_classes, wexsw_train_model_dict[model], model,
    sorted(set(wexsw_sample_classes))
)

print("\nWex Total " + model + " Model Valid Score")
accuracy_analysis(
    wexto_sample_classes, wexto_train_model_dict[model], model,
    sorted(set(wexto_sample_classes)),
    wextocm_image
)

print("\nHem " + model + " Model Valid Score")
accuracy_analysis(
    hem_sample_classes, hem_train_model_dict[model], model,
    sorted(set(hem_sample_classes)),
    hemcm_image
)

# Combined Wexford + Hemptons validation set.
wexhem_classes = np.concatenate((wexto_sample_classes, hem_sample_classes))

wexhem_pred = np.concatenate((wexto_train_model_dict[model], hem_train_model_dict[model]))

print("\nWexhem " + model + " Model Valid Score")

accuracy_analysis(
    wexhem_classes, wexhem_pred, model,
    sorted(set(wexhem_classes)), wexhemcm_image
)

def _classified_frame(records, predictions, **index_option):
    """Call create_df for one data subset.

    `records` is passed for both of create_df's first two arguments, followed
    by the prediction dictionary and the shared `collist` / `n_classes`; an
    `index=` keyword is forwarded only when the caller supplies one.
    """
    return create_df(
        records, records, predictions, collist, n_classes, **index_option
    )


# --- Cork: samples, test data and not-waves objects ---
cork_sample_df = _classified_frame(
    samples_equal, cork_train_model_dict, index = samples_equal.index
)
cork_test_df = _classified_frame(test_data, cork_test_model_dict)
cork_not_waves_df = _classified_frame(not_waves, cork_not_waves_dict)

# --- Wex South / Wex North samples ---
wexs_sample_df = _classified_frame(
    wexs_samples_equal, wexs_train_model_dict, index = wexs_samples_equal.index
)
wexn_sample_df = _classified_frame(
    wexn_samples_equal, wexn_train_model_dict, index = wexn_samples_equal.index
)

# --- Wex South / Wex North test data ---
wexs_test_df = _classified_frame(wexs_test_data, wexs_test_model_dict)
wexn_test_df = _classified_frame(wexn_test_data, wexn_test_model_dict)

# --- Wex South / Wex North not-waves objects ---
wexs_not_waves_df = _classified_frame(wexs_not_waves, wexs_not_waves_dict)
wexn_not_waves_df = _classified_frame(wexn_not_waves, wexn_not_waves_dict)

# --- Hem: samples, test data and not-waves objects ---
# NOTE(review): unlike the Cork / Wex sample frames, the Hem sample frame is
# built without an explicit index - confirm that is intended.
hem_sample_df = _classified_frame(hem_samples_equal, hem_train_model_dict)
hem_test_df = _classified_frame(hem_test_data, hem_test_model_dict)
hem_not_waves_df = _classified_frame(hem_not_waves, hem_not_waves_dict)

# Report the elapsed wall-clock time since `start` (set earlier in this cell).
end = time.time()

duration = end - start

print(duration)

# NOTE(review): these entries are written after the *_not_waves_df frames have
# already been created, and the svmlabel entries repeat assignments made
# earlier in the cell - they look redundant; confirm before removing.
wexn_not_waves_dict[neulabel] = "Flat"
wexn_not_waves_dict[svmlabel] = "Flat"

hem_not_waves_dict[neulabel] = "Flat"
hem_not_waves_dict[svmlabel] = "Flat"

# Stack samples, test data and not-waves objects into one table per study area.
cork_classification_final = pd.concat([cork_sample_df, cork_test_df, cork_not_waves_df])

wexs_classification_final = pd.concat([wexs_sample_df, wexs_test_df, wexs_not_waves_df])

wexn_classification_final = pd.concat([wexn_sample_df, wexn_test_df, wexn_not_waves_df])

hem_classification_final = pd.concat([hem_sample_df, hem_test_df, hem_not_waves_df])


def _geoframe_with_area(frame, area_factor = None):
    """Wrap `frame` in a GeoDataFrame and add an "Area" column.

    The column holds the geometry areas, multiplied by `area_factor` when one
    is given and left unscaled otherwise.
    """
    geoframe = gpd.GeoDataFrame(frame, geometry = "geometry")
    areas = geoframe.area
    geoframe["Area"] = areas if area_factor is None else areas * area_factor
    return geoframe


# presumably converts square metres to square kilometres - TODO confirm
# against the CRS units of the layers.
area_factor = 1 * 10**-6

# NOTE(review): the Cork area is stored unscaled while the other three areas
# are multiplied by `area_factor` - confirm the difference is intended.
cork_classification_final_gdf = _geoframe_with_area(cork_classification_final)

wexn_classification_final_gdf = _geoframe_with_area(
    wexn_classification_final, area_factor
)

wexs_classification_final_gdf = _geoframe_with_area(
    wexs_classification_final, area_factor
)

hem_classification_final_gdf = _geoframe_with_area(
    hem_classification_final, area_factor
)

# Export the Wex South samples: full table, per-label means and a shapefile.
paper_dir = r"D:/Documents/PhD/Writing&Learning/Paper 1"

wexs_sample_df.to_csv(paper_dir + "/WexSSamples.csv")

wexs_sample_df_group =  wexs_sample_df.groupby("Labels")

# numeric_only=True: the frame also carries a geometry column (used for the
# shapefile below) that cannot be averaged; pandas >= 2.0 raises instead of
# silently dropping such columns.
mean_group = wexs_sample_df_group.mean(numeric_only = True)

mean_group.to_csv(paper_dir + "/WexSSamplesmean.csv")

wexs_sample_gdf = gpd.GeoDataFrame(wexs_sample_df, geometry = "geometry")

wexs_sample_gdf.to_file(paper_dir + "/WexSSamples.shp")

# Write the final classification layer of every study area to disk.
cork_classification_final_gdf.to_file(classes_path_new)

wexn_classification_final_gdf.to_file(wexn_classes_path_new)

wexs_classification_final_gdf.to_file(wexs_classes_path_new)

hem_classification_final_gdf.to_file(hem_classes_path_new)

# Run create_id_file once per study area on the layers written above.
# NOTE(review): the Cork call passes csv_path and pairs with the Wex South
# layer, whereas the other calls pass wex_classes_path and pair with the Cork
# layer - confirm against create_id_file's signature.
create_id_file(
    classes_path_new, wexs_classes_path_new, intersect_path, csv_path, cork_dict,
    test_data, neunet_model, neunet_model2
)

create_id_file(
    wexs_classes_path_new, classes_path_new, intersect_path, wex_classes_path, wexs_dict,
    wexs_test_data, neunet_model, neunet_model2
)

create_id_file(
    wexn_classes_path_new, classes_path_new, intersect_path, wex_classes_path, wexn_dict,
    wexn_test_data, neunet_model, neunet_model2
)

create_id_file(
    hem_classes_path_new, classes_path_new, intersect_path, wex_classes_path, hem_dict,
    hem_test_data, neunet_model, neunet_model2
)



In [None]:
"""
This is the cross validation workflow. Please note that the scaling operation must be
redeployed due to the splitting of the sampled data to avoid inclusion of information 
from the test dataset. Any model that is used can be taken from variables created in the
prior cell i.e. svm_model. Outputs will show total rows in the training dataset, and the 
mean accuracy metrics for the classifier. 
"""

n_splits = 10

stratcv = StratifiedKFold(n_splits = n_splits, shuffle = True, random_state = 42)
wexs = "WexS"
wexn = "WexN"
wexto = "WexTo"
hem = "Hem"
corktrain = "Cork_Train"
corktest = "Cork_Test"

# NOTE: `model` is the same estimator object as svm_model, so it is re-fitted
# in place on every fold below.
model = svm_model

params = svm_params

model_name = type(model).__name__

print(model_name)

# split_data is only used to generate the fold indices; the rows that are
# actually trained on come from the un-normalised cork_data so that the scaler
# can be re-fitted on each training fold.
split_data = sample_scaled
cork_data = samples_equal.drop(labels = col_drop_normalisation, axis = 1, errors = "ignore")
wexn_data = wexn_samples_equal.drop(labels = col_drop_normalisation, axis = 1, errors = "ignore")
wexs_data = wexs_samples_equal.drop(labels = col_drop_normalisation, axis = 1, errors = "ignore")
wexto_data = pd.concat([wexs_data, wexn_data])
hem_data = hem_samples_equal.drop(labels = col_drop_normalisation, axis = 1, errors = "ignore")


def _fill_gaps(frame):
    """Return a new frame with missing values interpolated down each column."""
    return frame.interpolate(
        method = interpol, order = 2, axis = 0, limit_direction = "both"
    )


def _kappa_and_accuracy(true_classes, predicted_classes):
    """Return (Cohen's kappa, balanced accuracy) for one set of predictions."""
    return (
        metrics.cohen_kappa_score(true_classes, predicted_classes),
        metrics.balanced_accuracy_score(true_classes, predicted_classes),
    )


def _print_cv_summary(results, title, kappa_column, accuracy_column):
    """Print mean (+/- std) and the per-fold values of a kappa/accuracy pair."""
    kappas = results[kappa_column]
    accuracies = results[accuracy_column]
    print("\n%s Kappas: %0.4f (+/- %0.4f)" % (title, kappas.mean(), kappas.std()))
    print("%s Accuracies: %0.4f (+/- %0.4f)" % (title, accuracies.mean(), accuracies.std()))
    print(kappas.values)
    print(accuracies.values)


# Gap-filling the validation areas does not depend on the fold, so it is done
# once here; only the scaling (fitted per training fold) happens in the loop.
wexn_data_copy = _fill_gaps(wexn_data)
wexs_data_copy = _fill_gaps(wexs_data)
wexto_data = _fill_gaps(wexto_data)
hem_data = _fill_gaps(hem_data)

# (score-column prefix, unscaled feature array, reference labels)
validation_sets = [
    ("wexn", wexn_data_copy.to_numpy(), wexn_sample_classes),
    ("wexs", wexs_data_copy.to_numpy(), wexs_sample_classes),
    ("wexto", wexto_data.to_numpy(), wexto_sample_classes),
    ("hem", hem_data.to_numpy(), hem_sample_classes),
]

scaler2 = StandardScaler()

fold_scores = []
scores = {}
cross = 0
validation = 0

for train_index, test_index in stratcv.split(split_data, sample_classes):

    train_subset = _fill_gaps(cork_data.iloc[train_index])
    test_subset = _fill_gaps(cork_data.iloc[test_index])

    print(train_subset.shape)

    # The scaler is fitted on the training fold only and then re-used for the
    # held-out fold and for every validation area.
    train_subset_array = train_subset.to_numpy()
    scaler2.fit(train_subset_array)
    train_subset_scaled = scaler2.transform(train_subset_array)
    test_subset_scaled = scaler2.transform(test_subset.to_numpy())

    model.fit(train_subset_scaled, sample_classes[train_index])

    y_train_pred = model.predict(train_subset_scaled)
    y_test_pred = model.predict(test_subset_scaled)

    training_kappa_score, training_accuracy = _kappa_and_accuracy(
        sample_classes[train_index], y_train_pred
    )
    test_kappa_score, test_accuracy = _kappa_and_accuracy(
        sample_classes[test_index], y_test_pred
    )

    # A fresh record per fold; the records are turned into a frame once the
    # loop has finished (DataFrame.append no longer exists in pandas >= 2.0).
    scores = {
        "fold": cross,
        "params": params,
        "train_kappa_score": round(training_kappa_score, 3),
        "train_acc_score": round(training_accuracy, 3),
        "test_kappa_score": round(test_kappa_score, 3),
        "test_acc_score": round(test_accuracy, 3),
        "kappa_dif": round(abs(training_kappa_score - test_kappa_score), 3),
        "acc_diff": round(abs(training_accuracy - test_accuracy), 3),
    }

    for site_prefix, site_array, site_classes in validation_sets:
        site_pred = model.predict(scaler2.transform(site_array))
        site_kappa, site_accuracy = _kappa_and_accuracy(site_classes, site_pred)
        scores[site_prefix + "_kappa_score"] = round(site_kappa, 3)
        scores[site_prefix + "_acc_score"] = round(site_accuracy, 3)

    fold_scores.append(scores)

    cross += 1

results_df = pd.DataFrame(fold_scores)

_print_cv_summary(results_df, "Cork Train", "train_kappa_score", "train_acc_score")
_print_cv_summary(results_df, "Cork Test", "test_kappa_score", "test_acc_score")
_print_cv_summary(results_df, "Cork Diff", "kappa_dif", "acc_diff")
_print_cv_summary(results_df, "Wex North", "wexn_kappa_score", "wexn_acc_score")
_print_cv_summary(results_df, "Wex South", "wexs_kappa_score", "wexs_acc_score")
_print_cv_summary(results_df, "Wex Total", "wexto_kappa_score", "wexto_acc_score")
# The Hem scores were previously computed but never reported.
_print_cv_summary(results_df, "Hem", "hem_kappa_score", "hem_acc_score")


In [None]:
"""
This cell will seek to save any cross validation results. 
"""
print(model_name)
cv_path_root = r"D:/Documents/PhD/Writing&Learning/Paper 1/Cross_Validation"
cv_path = os.path.join(
    cv_path_root, str(model_name) + str(iteration) + "2.csv"
).replace("\\", "/")

cols = [
    "Train Accuracy", "Test Accuracy", "Acc Diff", "Train Kappa", "Test Kappa",
    "Kappa Diff"
]


def _as_percent(score_column):
    """Return the per-fold scores of one results_df column scaled by 100."""
    return [score * 100 for score in score_column.to_list()]


train_kappa = _as_percent(results_df.train_kappa_score)
train_accuracy = _as_percent(results_df.train_acc_score)
test_kappa = _as_percent(results_df.test_kappa_score)
test_accuracy = _as_percent(results_df.test_acc_score)
train_test_acc_diff = _as_percent(results_df.acc_diff)
train_test_kappa_diff = _as_percent(results_df.kappa_dif)

# Same order as `cols`, so that each list lines up with its output column.
metric_lists = [
    train_accuracy, test_accuracy, train_test_acc_diff, train_kappa, test_kappa,
    train_test_kappa_diff
]

# One row per fold.
accuracy_list = list(zip(*metric_lists))

dataframe = pd.DataFrame(accuracy_list, columns = cols)

# Summary rows: the mean is stored unrounded, the sample standard deviation
# is rounded to two decimals.
mean = [st.mean(values) for values in metric_lists]

std = [round(st.stdev(values), 2) for values in metric_lists]

# Fold labels follow the number of folds actually present in results_df
# rather than assuming ten, so the "Fold" column always matches the row count.
folds = [*range(1, len(accuracy_list) + 1)]

folds.extend(["Mean", "Standard Deviation"])

mean_len = dataframe.shape[0]

dataframe.loc[mean_len] = mean

std_len = dataframe.shape[0]

dataframe.loc[std_len] = std

dataframe.insert(loc = 0, column = "Fold", value = folds)

print(dataframe)

print(cv_path)

dataframe.to_csv(cv_path, index = False)

In [None]:
"""
The following code creates the parameter grid for the hyperparameter tuning, this can be altered
to suit each individual classifier algorithm. 
"""
# Random seeds tried for every other parameter combination.
rsrange = range(1, 11)

# Candidate values per LinearSVC parameter; the grid below is the full
# cross-product of these lists.
linearsvmparams = {
    "penalty": ["l1", "l2"],
    "loss": ["hinge", "squared_hinge"],
    "dual": [True, False],
    "tol": [0.001, 0.01, 0.1, 1],
    "C": [1, 10, 100, 1000, 0.1, 0.01],
    "multi_class": ["ovr", "crammer_singer"],
    "fit_intercept": [True, False],
    "random_state": list(rsrange),
}

# Un-tuned estimators.
svm = LinearSVC()

# Log the wall-clock time at which the grid was built.
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

mlp = MLPClassifier()

start_cv = time.time()

# Materialise every parameter combination as a list of dicts.
paramgrid = list(ParameterGrid(linearsvmparams))

data = samples_equal.copy()

# Show the seeds in use and the number of parameter sets in the grid.
print(list(rsrange))
print(len(paramgrid))

In [None]:
"""
This cell contains the code required to distribute the hyperparameter tuning process across
multiple cores. We recommend that this is conducted in runs of 10000 parameter sets or less as 
a memory leak in the code prevents full utilisation of the code across larger sets of parameters.
The kappa coefficient is the metric that we chose to evaluate classification efficacy, other
parameters can be input to replace this. 
"""

import ray

now = datetime.now()

current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)

# Leave one physical core free for the notebook itself. psutil.cpu_count can
# return None when the count cannot be determined, and a single-core machine
# would otherwise yield 0 workers, so always keep at least one.
physical_cores = psutil.cpu_count(logical = False) or 1
num_cpus = max(1, physical_cores - 1)

start_ray = time.time()

better_scores = []

# Minimum kappa a parameter set must reach to be reported back.
best_score = 0.525

try:
    if __name__ == "__main__":

        ray.init(num_cpus = num_cpus)

        @ray.remote
        def parallel_processing(
            model, pm, train_data, train_labels, wexto_test_data, wexto_test_labels, scoredict
        ):
            """Fit `model` with parameter set `pm` and score it on the test data.

            Returns (kappa, pm) when Cohen's kappa on the test data is at
            least `best_score`; returns None when it is lower or when the
            parameter set could not be fitted/scored. `scoredict` is accepted
            but not used.
            """
            model.set_params(**pm)
            try:
                model.fit(train_data, train_labels)
                wexto_test_pred = model.predict(wexto_test_data)
                tokappa_score = metrics.cohen_kappa_score(
                    wexto_test_labels, wexto_test_pred
                )
            except Exception as error:
                # Best effort: a parameter set that fails is skipped, but the
                # reason is reported instead of being silently discarded.
                print("\nSkipped " + str(pm) + ": " + str(error))
                return None

            if tokappa_score >= best_score:
                print("\nPay dirt: " + str(round(tokappa_score, 3)))
                best_data = pm
                return (tokappa_score, best_data)

            print("\nNo: " + str(round(tokappa_score, 3)))
            return None

        # One remote task per parameter set; ray.get blocks until all finish.
        results = ray.get([
            parallel_processing.remote(
                svm, parameter, sample_scaled, sample_classes,
                wexto_sample_scaled_equal, wexto_sample_classes_equal, better_scores
            ) for parameter in paramgrid
        ])
finally:
    # Always release the Ray workers, even if the run failed or was
    # interrupted, so that the cell can be executed again.
    ray.shutdown()
    end_ray = time.time()

In [None]:
"""
The code in this cell allows for the shutdown of the ray kernel and display of the resultant
parameters. All null values are filtered from the list at this time.
"""
# Stop Ray and take the end timestamp for the run.
ray.shutdown()
end_ray = time.time()

# Elapsed time of the tuning run, in minutes.
totaltime = (end_ray - start_ray) / 60

# Keep only the tasks that returned something (tasks below the threshold or
# that failed return None).
resultsreal = [outcome for outcome in results if outcome]

print(totaltime)
print(resultsreal)

In [None]:
"""
The code here is written to create combinations of MLP layers. This code is designed to extract
the combination with the highest accuracy score for the low resolution dataset. 
"""
# Best Cohen's kappa seen so far; only strictly better ensembles are printed.
better_kappa = 0
# Try every unordered pair of parameter dictionaries from param_list.
for combo in combinations(param_list, 2):
    # Two MLPs, one per parameter dictionary of the pair.
    estimators = [
        ("mlp1", MLPClassifier(**combo[0])),
        ("mlp2", MLPClassifier(**combo[1]))
    ] 
    # Candidate weight pairs for the two estimators. combinations() pairs the
    # list entries by position, and 1 appears at both ends of the list, so
    # pairs such as (1.5, 1) are generated as well as (1, 1.5).
    for com in combinations([1,1.5, 2, 1.25, 1], 2):
        weights = [com[0], com[1]]
        # Soft-voting ensemble fitted on the scaled training samples and
        # scored on the scaled Wex Total samples.
        model = VotingClassifier(estimators, voting = "soft", weights = weights)
        model.fit(sample_scaled, sample_classes)
        testo_pred = model.predict(wexto_sample_scaled)
        # Balanced accuracy is computed but selection below uses kappa only.
        testo_accuracy = metrics.balanced_accuracy_score(wexto_sample_classes, testo_pred) 
        testo_kappa = metrics.cohen_kappa_score(wexto_sample_classes, testo_pred) 
        # Report each new best ensemble; the winner is printed, not stored.
        if testo_kappa > better_kappa:
            better_kappa = testo_kappa
            print(estimators)
            print(weights)
            print(testo_kappa)