## Step 1. Import necessary libraries

In [None]:
# import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import time

import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, multilabel_confusion_matrix
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, GridSearchCV, GroupKFold, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Normalizer, OneHotEncoder, RobustScaler, StandardScaler
from sklearn.svm import SVC

from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})

## Step 2a. Declare user variables

In [None]:
# --- Run switches -----------------------------------------------------------
# is_test gates the exploratory cells (frame preview, info, describe, heatmap).
is_test = False
# When True a single CSV (see list_filenames) is read; otherwise every CSV in
# the per-layout data directory is concatenated.
is_dataset_one_file = False

# --- User selections --------------------------------------------------------
user_specified_steps = 1          # embedded in the saved figure file names
user_specified_layout = 'layout1'
# The layout number is the trailing character of the layout key ('layout1' -> 1).
user_specified_layout_int = int(user_specified_layout[-1])
user_specified_nth = 3            # rows are filtered to this `nth` value when it is an int
user_specified_clf = 'all'

user_test_size = 0.2
user_random_state = 1

# NOTE(review): 16 / 20 matches the number of sXX sensor columns per layout —
# confirm that this is what ith_inp_col is meant to track.
ith_inp_col = 16 if user_specified_layout_int == 1 else 20

# --- Per-layout lookup tables -----------------------------------------------
list_filenames = {
    'layout1': 'dataset1.csv',
    'layout2': 'dataset2.csv',
}
list_columns = {
    'layout1_all': ['timestamp', 'posture_id', 'posture_label',
                    's01', 's02', 's03', 's04', 's05', 's06', 's07', 's08',
                    's09', 's10', 's11', 's12', 's13', 's14', 's15', 's16',
                    'birth_year', 'sex', 'height', 'weight', 'bmi', 'bmi_label',
                    'full_name', 'nth', 'round'],
    'layout2_all': ['timestamp', 'posture_id', 'posture_label',
                    's01', 's02', 's03', 's04', 's05', 's06', 's07', 's08',
                    's09', 's10', 's11', 's12', 's13', 's14', 's15', 's16',
                    's17', 's18', 's19', 's20',
                    'birth_year', 'sex', 'height', 'weight', 'bmi', 'bmi_label',
                    'full_name', 'nth', 'round'],
    'layout1_cat_inp': [],
    'layout1_num_inp': ['s01', 's02', 's03', 's04', 's05', 's06', 's07', 's08',
                        's09', 's10', 's11', 's12', 's13', 's14', 's15', 's16',
                        'height', 'weight', 'bmi'],
    'layout2_cat_inp': [],
    'layout2_num_inp': ['s01', 's02', 's03', 's04', 's05', 's06', 's07', 's08',
                        's09', 's10', 's11', 's12', 's13', 's14', 's15', 's16',
                        's17', 's18', 's19', 's20',
                        'height', 'weight', 'bmi'],
}
list_positions = [
    'Yearner_Right', 'Yearner_Left', 'Fetal_Right', 'Fetal_Left',
    'Log_Right', 'Log_Left', 'Supine', 'Prone',
    # 'Empty'
]

# --- Column groups derived from the selected layout -------------------------
filename = list_filenames[user_specified_layout]
cols_all = list_columns[user_specified_layout + '_all']
cols_cat_inp = list_columns[user_specified_layout + '_cat_inp']
cols_num_inp = list_columns[user_specified_layout + '_num_inp']
cols_inp = cols_cat_inp + cols_num_inp
# Everything that is loaded but is not a model input. A list comprehension
# (rather than a set difference) keeps the original column order, so the
# result is identical from run to run.
cols_drp = [col for col in cols_all if col not in cols_inp]

cols_num_inp_std = ['height', 'weight', 'bmi']
# Remaining numeric inputs (the sensor columns), again in a stable order.
cols_num_inp_nrm = [col for col in cols_num_inp if col not in cols_num_inp_std]

col_grp = 'full_name'   # grouping key for the group-wise cross-validation
col_trg = 'posture_id'  # prediction target

# Show floats with two decimals in pandas output; restore Matplotlib's
# original rc parameters in case an earlier seaborn call changed them.
pd.set_option('display.float_format', lambda x: '%.2f' % x)
sns.reset_orig()
print(cols_drp) # this print line to be removed

## Step 2b. Declare and prepare needed variables

In [None]:
# Load the raw samples into `df`, either from one pre-merged CSV or by
# concatenating every CSV found in the per-layout data directory.
if is_dataset_one_file:
    # `filename` is resolved relative to the current working directory.
    df = pd.read_csv(filename, usecols=cols_all)
else:
    import glob
    import os

    # os.chdir("C:/Users/Julianne/Documents/Notebooks/thesis/data/set_{}".format(user_specified_layout_int))
    os.chdir("C:/Users/Julianne/Downloads/set_{} clean".format(user_specified_layout_int))

    extension = 'csv'
    # glob returns files in arbitrary order; sort so the row order of `df`
    # is the same on every run and machine.
    list_raw_filenames = sorted(glob.glob('*.{}'.format(extension)))
    if not list_raw_filenames:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise FileNotFoundError(
            "No .{} files found in {}".format(extension, os.getcwd())
        )

    df = pd.concat([pd.read_csv(f, usecols=cols_all) for f in list_raw_filenames])

    # Keep only rows whose `nth` value is at most 5 ...
    df = df[df.nth <= 5]
    # ... and, when a single integer `nth` was requested, only that one.
    if isinstance(user_specified_nth, int):
        print('user_specified_nth is an instance of int')
        df = df[df.nth == user_specified_nth]

    ## export to csv
    # combined_csv.to_csv("dataset{}.csv".format(user_specified_layout_int), index=False, encoding='utf-8-sig')

    # Switch to the notebook directory; later cells save figures using
    # relative paths.
    os.chdir("C:/Users/Julianne/Documents/Notebooks/thesis")

In [None]:
# Debug peek at the loaded frame, only when is_test is enabled.
# NOTE(review): Jupyter auto-displays a bare expression only when it is the
# last top-level statement of a cell; nested inside `if`, this line shows
# nothing. Wrap it in display()/print() if output is intended.
if is_test:
    df

In [None]:
# When is_test is enabled, print the frame summary produced by DataFrame.info().
if is_test:
    df.info()

In [None]:
# Model inputs: the loaded frame minus every column listed in cols_drp.
df_inp = df.drop(cols_drp, axis='columns')
# X is an alias of the input frame at this point, not a copy.
X = df_inp

# Target labels and group keys, taken from the full frame as NumPy arrays so
# they stay row-aligned with df_inp.
y, groups = df[col_trg].values, df[col_grp].values

In [None]:
# Integer positions of the categorical / numerical input columns inside
# df_inp. Positions are used so the ColumnTransformer can select columns from
# a plain NumPy array.
# NOTE(review): Index.get_indexer yields -1 for a label that is missing
# instead of raising — verify every listed column exists in df_inp.
cols_cat_inp_idx = df_inp.columns.get_indexer(cols_cat_inp)
cols_num_inp_idx = df_inp.columns.get_indexer(cols_num_inp)

In [None]:
# #apply SelectKBest class to extract top 10 best features
# bestfeatures = SelectKBest(score_func=chi2, k=10)
# fit = bestfeatures.fit(X,y)
# dfscores = pd.DataFrame(fit.scores_)
# dfcolumns = pd.DataFrame(X.columns)
# #concat two dataframes for better visualization 
# featureScores = pd.concat([dfcolumns,dfscores],axis=1)
# featureScores.columns = ['Specs','Score']  #naming the dataframe columns
# print(featureScores.nlargest(10,'Score'))  #print 10 best features

## Step 3. Visualize the dataset

In [None]:
# Summary statistics of the model inputs, only when is_test is enabled.
# NOTE(review): the result of describe() is a bare expression nested inside
# `if`, so Jupyter does not display it; wrap it in display()/print() if the
# table should be shown.
if is_test:
    df_inp.describe()

In [None]:
# pd.plotting.scatter_matrix(X, figsize=(10, 10));

In [None]:
# When is_test is enabled, draw a lower-triangle correlation heatmap of the
# model input columns.
if is_test:
    # Create correlation matrix
    corr_matrix = df.drop(columns=cols_drp).corr()

    # Define mask used to cover squares above diagonal
    # (np.triu keeps the diagonal, so the diagonal cells are masked as well).
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    plt.figure(figsize=(20, 10), facecolor='w', edgecolor='k')
    plt.title('Correlation Matrix - ' + user_specified_layout)
    # NOTE(review): sns.set() changes global plotting defaults and is called
    # after the figure and title already exist — confirm this order is intended.
    sns.set(font_scale=1.2)
    sns.heatmap(corr_matrix, cmap='coolwarm', center = 0, annot=True, fmt='.1g', mask=mask)

In [None]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode (categories unseen during fit are ignored at transform time).
steps_cat_inp = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]

# Numerical branch: mean imputation only; no scaler is applied.
steps_num_inp = [('imputer', SimpleImputer(strategy='mean'))]

pipe_cat_inp = Pipeline(steps=steps_cat_inp)
pipe_num_inp = Pipeline(steps=steps_num_inp)

# Route each column group (selected by integer position) to its branch.
ct = ColumnTransformer([
    ('categorical', pipe_cat_inp, cols_cat_inp_idx),
    ('numerical', pipe_num_inp, cols_num_inp_idx),
])

## Step 4. Split the data into training and testing datasets

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=user_test_size, random_state=user_random_state, stratify=y)

In [None]:
# Number of distinct values in the grouping column; the same value is used
# as n_splits for the GroupKFold splitters.
df[col_grp].nunique()

In [None]:
# From here on X is a plain NumPy array, so rows can be selected with the
# integer index arrays produced by the splitter.
X = df_inp.to_numpy()

# One fold per distinct group value.
gkf = GroupKFold(n_splits=df[col_grp].nunique())
gkf_split = gkf.split(X, y, groups=groups)

# Every pass overwrites the previous assignment, so once the loop ends the
# *_train / *_test names hold the last fold only.
for train_index, test_index in gkf_split:
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

## Step 5. Declare needed functions

In [None]:
def generate_confusion_matrix(cnf_matrix, classes, normalize=False, title='Confusion Matrix'):
    """Draw a confusion matrix on the current Matplotlib axes and return it.

    The plot is built from the ``cnf_matrix`` and ``classes`` arguments only;
    no module-level prediction arrays are read. It is drawn on ``plt.gca()``,
    so a figure created by the caller right before this call is the one that
    gets filled (and later saved).

    Parameters
    ----------
    cnf_matrix : array-like of shape (n_classes, n_classes)
        Raw counts, rows = true labels, columns = predicted labels.
    classes : sequence of str
        Tick labels, one per row/column of ``cnf_matrix``.
    normalize : bool, default False
        When True each row is divided by its sum (rows with no samples become
        all zeros) and cells are rendered with two decimals.
    title : str, default 'Confusion Matrix'
        Title placed above the plot.

    Returns
    -------
    numpy.ndarray
        The matrix that was plotted (row-normalized when ``normalize``).
    """
    cnf_matrix = np.asarray(cnf_matrix)

    if normalize:
        row_totals = cnf_matrix.sum(axis=1, keepdims=True)
        # Guard the division so a class without true samples gives a row of
        # zeros instead of NaN.
        cnf_matrix = np.divide(
            cnf_matrix.astype(float),
            row_totals,
            out=np.zeros(cnf_matrix.shape, dtype=float),
            where=row_totals != 0,
        )
        print("Confusion Matrix, With Normalized Counts")
        values_format = '.2f'
    else:
        print("Confusion Matrix, Without Normalized Counts")
        values_format = None  # let scikit-learn choose the cell format

    matrix_display = ConfusionMatrixDisplay(confusion_matrix=cnf_matrix,
                                            display_labels=classes)
    matrix_display.plot(ax=plt.gca(), xticks_rotation=45, values_format=values_format)

    plt.title(title)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    return cnf_matrix

In [None]:
def plot_confusion_matrix(actual_targets, predicted_targets, classifier):
    """Print summary metrics and save raw and normalized confusion matrices.

    Prints, in this order: accuracy, micro-averaged precision, micro-averaged
    recall, micro-averaged F1, and the per-class classification report. Then
    two figures are drawn, saved as PNG files in the current working
    directory, and shown.

    Parameters
    ----------
    actual_targets : array-like
        True labels.
    predicted_targets : array-like
        Predicted labels, aligned with ``actual_targets``.
    classifier : str
        Classifier name; used in the figure titles and file names.
    """
    cm = confusion_matrix(actual_targets, predicted_targets)
    np.set_printoptions(precision=2)

    print(accuracy_score(actual_targets, predicted_targets))
    print(precision_score(actual_targets, predicted_targets, average='micro'))
    print(recall_score(actual_targets, predicted_targets, average='micro'))
    print(f1_score(actual_targets, predicted_targets, average='micro'))
    print(classification_report(actual_targets, predicted_targets, target_names=list_positions))

    # %d (decimal) is used for the integer fields: %x would render them in
    # hexadecimal, e.g. 10 would appear as "a" in titles and file names.

    # Plot non-normalized confusion matrix
    plt.figure()
    generate_confusion_matrix(
        cm,
        classes=list_positions,
        title='%s Confusion Matrix - Layout %d (Without Normalized Counts)'
              % (classifier, user_specified_layout_int),
    )
    plt.savefig(
        'CM-%dS-L%d-%s-nonnormalized.png'
        % (user_specified_steps, user_specified_layout_int, classifier),
        format='png', bbox_inches="tight",
    )
    plt.show()

    # Plot normalized confusion matrix
    plt.figure()
    generate_confusion_matrix(
        cm,
        classes=list_positions,
        normalize=True,
        title='%s Confusion Matrix - Layout %d (With Normalized Counts)'
              % (classifier, user_specified_layout_int),
    )
    plt.savefig(
        'CM-%dS-L%d-%s-normalized.png'
        % (user_specified_steps, user_specified_layout_int, classifier),
        format='png', bbox_inches="tight",
    )
    plt.show()

In [None]:
def _build_pipeline(classifier):
    """Return a fresh preprocessing + estimator pipeline for ``classifier``."""
    if classifier == 'MLP':
        # Seeded like the SVC below so repeated runs give the same result.
        estimator = MLPClassifier(max_iter=5000, random_state=user_random_state)
    elif classifier == 'SVM':
        estimator = SVC(kernel='rbf', random_state=user_random_state)
    else:
        raise ValueError("classifier must be 'MLP' or 'SVM', got %r" % (classifier,))
    return Pipeline([('pre', ct), ('clf', estimator)])


def evaluate_model(data_x, data_y, classifier):
    """Cross-validate ``classifier`` with one fold per group and plot each fold.

    The splitter is a GroupKFold whose number of splits equals the number of
    distinct values in ``df[col_grp]``. For every fold a pipeline is fitted on
    the training rows, the test rows are predicted, and a row-normalized
    confusion matrix is shown whose title carries the height, weight and bmi
    of the first test row.

    Parameters
    ----------
    data_x : numpy.ndarray
        Feature matrix. Its columns must be in the order of ``df_inp`` and its
        rows aligned with the module-level ``groups`` array.
    data_y : numpy.ndarray
        Target labels, aligned with ``data_x``.
    classifier : {'MLP', 'SVM'}
        Which estimator to evaluate.

    Returns
    -------
    actual_targets, predicted_targets : numpy.ndarray
        True and predicted labels of all test folds, concatenated in fold
        order. Labels keep the dtype of ``data_y`` / the estimator output.

    Raises
    ------
    ValueError
        If ``classifier`` is neither 'MLP' nor 'SVM'.
    """
    group_k_fold = GroupKFold(n_splits=df[col_grp].nunique())

    # Loop-invariant values, computed once.
    cols_identifying_idx = df_inp.columns.get_indexer(['height', 'weight', 'bmi'])
    # Passing the full label set keeps every per-fold matrix the same shape,
    # even when a fold's test rows happen to miss a class.
    all_labels = np.unique(data_y)

    actual_chunks = []
    predicted_chunks = []

    for train_index, test_index in group_k_fold.split(data_x, data_y, groups=groups):
        train_x, train_y = data_x[train_index], data_y[train_index]
        test_x, test_y = data_x[test_index], data_y[test_index]

        pipe = _build_pipeline(classifier)
        pipe.fit(train_x, train_y)
        predicted_labels = pipe.predict(test_x)

        actual_chunks.append(test_y)
        predicted_chunks.append(predicted_labels)

        ConfusionMatrixDisplay.from_predictions(test_y, predicted_labels,
                                                labels=all_labels,
                                                display_labels=list_positions,
                                                xticks_rotation=45,
                                                normalize='true',
                                                values_format='.2f')
        # The first test row's height/weight/bmi labels the fold in the title.
        plt.title('%s Confusion Matrix - Layout %d (height: %.2f, weight: %.2f, bmi: %.2f)'
                  % (classifier, user_specified_layout_int,
                     test_x[0, cols_identifying_idx[0]],
                     test_x[0, cols_identifying_idx[1]],
                     test_x[0, cols_identifying_idx[2]]))
        plt.tight_layout()
        plt.show()

    return np.concatenate(actual_chunks), np.concatenate(predicted_chunks)

## Step 6. Execute

In [None]:
# Aliases passed to evaluate_model below: the feature matrix and the labels.
data = X
target = y

In [None]:
# Group-wise cross-validation of the SVM pipeline; collects the true and
# predicted labels of all test folds.
actual_targets, predicted_targets = evaluate_model(data, target, 'SVM')

In [None]:
# Print the metrics and save the confusion-matrix figures for the SVM run.
plot_confusion_matrix(actual_targets, predicted_targets, 'SVM')

In [None]:
# Same evaluation for the MLP pipeline; this overwrites the SVM results held
# in actual_targets / predicted_targets.
actual_targets, predicted_targets = evaluate_model(data, target, 'MLP')

In [None]:
# Print the metrics and save the confusion-matrix figures for the MLP run.
plot_confusion_matrix(actual_targets, predicted_targets, 'MLP')