In [21]:
from Scripts.utils.pd_classification_funcs import *
from Scripts.utils.pd_feature_preprocessing_funcs import get_all_dataset_paths
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
import warnings
import itertools
warnings.filterwarnings("ignore")

# Classification Analysis
This final notebook goes over the various classifiers used and tested on the three datasets. In total, 17 classifiers were used and evaluated using cross-validation, where each of the six subjects' data was used in turn as a testing set.

In [22]:
# Relative locations of the input feature files and of the result CSVs.
final_dataset_path = '../../Study/Data/features/filtered/'
avg_results_path = '../../Results/average/'  # destination of the averaged-scores CSV
folds_results_path = '../../Results/folds/'  # base folder for per-fold score CSVs; a dataset sub-folder is appended in a later cell
# Paths of the 'filtered' feature files, as returned by the project helper.
all_filtered_dataset_paths = get_all_dataset_paths('filtered')

In [23]:
# Inspect the discovered paths before picking one; the listing can contain
# non-dataset entries (e.g. '.DS_Store'), so not every element is a CSV.
all_filtered_dataset_paths

['../../Study/Data/features/filtered/.DS_Store',
 '../../Study/Data/features/filtered/pd_whole_filtered_features.csv',
 '../../Study/Data/features/filtered/pd_500-100-windows_filtered_features.csv',
 '../../Study/Data/features/filtered/pd_1000-1000-windows_filtered_features.csv',
 '../../Study/Data/features/filtered/pd_500-500-windows_filtered_features.csv',
 '../../Study/Data/features/filtered/pd_5000-1000-windows_filtered_features.csv',
 '../../Study/Data/features/filtered/pd_1000-250-windows_filtered_features.csv',
 '../../Study/Data/features/filtered/pd_light-intervals-windows_filtered_features.csv',
 '../../Study/Data/features/filtered/pd_500-250-windows_filtered_features.csv',
 '../../Study/Data/features/filtered/pd_250-250-windows_filtered_features.csv']

In [24]:
# Pick the dataset by file name instead of by list position: the order of the
# listing is not guaranteed and it can contain non-dataset entries such as
# '.DS_Store', so a fixed index may silently select the wrong file.
selected_dataset_filename = 'pd_whole_filtered_features.csv'
dataset_path = next(
    (path for path in all_filtered_dataset_paths
     if path.split('/')[-1] == selected_dataset_filename),
    None,
)
if dataset_path is None:
    # Fail loudly here rather than with an opaque error when the CSV is read.
    raise FileNotFoundError(
        selected_dataset_filename + ' was not found among the filtered dataset paths'
    )

In [25]:
# Confirm which dataset file was selected.
dataset_path

'../../Study/Data/features/filtered/pd_whole_filtered_features.csv'

In [26]:
# The dataset name is the second '_'-separated token of the file name,
# e.g. 'pd_whole_filtered_features.csv' -> 'whole'.
dataset_name = dataset_path.split('/')[-1].split('_')[1]

# Append the dataset sub-folder only once, so that re-running this cell does
# not keep growing the path (e.g. '.../folds/whole/whole/').
dataset_subfolder = dataset_name + '/'
if not folds_results_path.endswith('/' + dataset_subfolder):
    folds_results_path += dataset_subfolder

In [27]:
# Confirm the dataset name parsed from the file name.
dataset_name

'whole'

In [28]:
# Load the selected feature table.
# NOTE(review): `pd` is not imported explicitly in this notebook; presumably it
# arrives through the star import from pd_classification_funcs — confirm.
dataset_df = pd.read_csv(dataset_path)

## Splitting Dataset

In [29]:
# 'ID' value of every row in the selected dataset; passed to cross_val_model
# as `id_groupings` in the evaluation cells below.
dataset_id_groupings = dataset_df['ID']

In [30]:
# Target labels and feature matrix of the selected dataset, copied out of the
# DataFrame as arrays.
dataset_target = dataset_df['PD_Class'].copy().values
# Features are every column from position 4 onward.
# NOTE(review): assumes the first four columns hold non-feature metadata
# (including 'ID' and 'PD_Class') — confirm against the CSV header.
dataset_features = dataset_df.iloc[:, 4:].copy().values

## Training & Testing Models

In [31]:
def create_mlp_list(neurons_per_layer_range, hidden_layers_range, max_iter=1000, random_state=None):
    """Build one untrained MLPClassifier per hidden-layer size combination.

    Every combination of ``hidden_layers_range`` layer widths, each width taken
    from ``1..neurons_per_layer_range``, is generated in the order produced by
    ``itertools.product`` (e.g. (1, 1), (1, 2), ..., (2, 1), ...).

    Parameters
    ----------
    neurons_per_layer_range : int
        Largest number of neurons allowed in a single hidden layer (inclusive).
    hidden_layers_range : int
        Number of hidden layers in every generated network.
    max_iter : int, default 1000
        Forwarded to ``MLPClassifier(max_iter=...)``.
    random_state : int or None, default None
        Forwarded to ``MLPClassifier(random_state=...)``. Pass an integer to
        make the networks' initialisation reproducible between runs.

    Returns
    -------
    list
        One classifier per layer-size combination; empty when no combination
        exists (e.g. ``neurons_per_layer_range < 1`` with at least one layer).
    """
    layer_widths = range(1, neurons_per_layer_range + 1)
    return [
        MLPClassifier(max_iter=max_iter, hidden_layer_sizes=layer_sizes, random_state=random_state)
        for layer_sizes in itertools.product(layer_widths, repeat=hidden_layers_range)
    ]

In [32]:
# All classification models that allow negative values for features.
# The keys are used to look the models up in the evaluation cells; 'ANNs' maps
# to a list of MLP candidates (two hidden layers, 1-5 neurons per layer).
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machine': SVC(kernel='sigmoid'),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Extra Trees': ExtraTreesClassifier(),
    # Hard-voting ensemble. The estimator labels now name the estimators they
    # are attached to (they previously read 'sv' and 'rf' for AdaBoost and
    # Extra Trees); the ensemble members themselves are unchanged.
    # NOTE(review): if SVC / Random Forest were the intended members, swap the
    # estimators instead of the labels.
    'Voting': VotingClassifier(
        voting="hard",
        estimators=[
            ('lr', LogisticRegression()),
            ('ab', AdaBoostClassifier()),
            ('et', ExtraTreesClassifier()),
        ],
    ),
    'ANNs': create_mlp_list(5, 2)
}

In [33]:
# Empty summary table: a leading 'Model' column followed by one column per
# scoring method.
model_score_columns = ['Model', *scoring_methods.keys()]
avg_results = pd.DataFrame(columns=model_score_columns)

### Logistic Regression

In [335]:
# Cross-validate logistic regression with subject-ID grouping; the call yields
# the averaged scores (dict-like) and the per-fold scores.
# NOTE(review): the notebook intro describes six subjects each serving as a
# test set, while groupk_folds=5 is used here — confirm the intended fold count.
lr_results, lr_fold_scores = cross_val_model(
    model=models['Logistic Regression'],
    features=dataset_features,
    target=dataset_target,
    id_groupings=dataset_id_groupings,
    groupk_folds=5,
    print_scores=False,
)
lr_results['Model'] = 'Logistic Regression'
# Append the averaged scores as one new row of the summary table.
avg_results = pd.concat(
    [avg_results, pd.DataFrame.from_dict(lr_results, orient='index').T],
    ignore_index=True,
)

In [336]:
# Per-fold scores returned by cross_val_model for logistic regression.
lr_fold_scores

Unnamed: 0,Accuracy,Precision,Sensitivity,Specificity,F1
0,0.567773,0.622435,0.64566,0.82,0.562071
1,0.541494,0.513636,0.514525,0.602754,0.512086
2,0.664975,0.632756,0.637942,0.72281,0.634657
3,0.678249,0.67831,0.678515,0.670699,0.678174
4,0.627095,0.629287,0.6281,0.576712,0.626524


In [337]:
# Save the per-fold logistic-regression scores in the dataset's folds folder;
# spaces in the model name become '-' in the file name.
lr_fold_scores.to_csv(
    folds_results_path
    + 'pd_' + dataset_name + '_'
    + lr_results['Model'].replace(' ', '-')
    + '_fold_scores.csv',
    index=False,
)

### Support Vector Machine

In [338]:
# Cross-validate the support vector machine with subject-ID grouping; the call
# yields the averaged scores (dict-like) and the per-fold scores.
sv_results, sv_fold_scores = cross_val_model(
    model=models['Support Vector Machine'],
    features=dataset_features,
    target=dataset_target,
    id_groupings=dataset_id_groupings,
    groupk_folds=5,
    print_scores=False,
)
sv_results['Model'] = 'Support Vector Machine'
# Append the averaged scores as one new row of the summary table.
avg_results = pd.concat(
    [avg_results, pd.DataFrame.from_dict(sv_results, orient='index').T],
    ignore_index=True,
)

In [339]:
# Per-fold scores returned by cross_val_model for the support vector machine.
sv_fold_scores

Unnamed: 0,Accuracy,Precision,Sensitivity,Specificity,F1
0,0.576763,0.569122,0.586255,0.6075,0.550743
1,0.506916,0.50269,0.502965,0.51589,0.492828
2,0.515228,0.495143,0.49468,0.559192,0.490079
3,0.546908,0.545316,0.543539,0.642473,0.540571
4,0.628492,0.630052,0.629306,0.587671,0.628172


In [340]:
# Save the per-fold SVM scores in the dataset's folds folder; spaces in the
# model name become '-' in the file name.
sv_fold_scores.to_csv(
    folds_results_path
    + 'pd_' + dataset_name + '_'
    + sv_results['Model'].replace(' ', '-')
    + '_fold_scores.csv',
    index=False,
)

### Random Forest

In [341]:
# Cross-validate the random forest with subject-ID grouping; the call yields
# the averaged scores (dict-like) and the per-fold scores.
rf_results, rf_fold_scores = cross_val_model(
    model=models['Random Forest'],
    features=dataset_features,
    target=dataset_target,
    id_groupings=dataset_id_groupings,
    groupk_folds=5,
    print_scores=False,
)
rf_results['Model'] = 'Random Forest'
# Append the averaged scores as one new row of the summary table.
avg_results = pd.concat(
    [avg_results, pd.DataFrame.from_dict(rf_results, orient='index').T],
    ignore_index=True,
)

In [342]:
# Per-fold scores returned by cross_val_model for the random forest.
rf_fold_scores

Unnamed: 0,Accuracy,Precision,Sensitivity,Specificity,F1
0,0.639004,0.650619,0.677249,0.7675,0.622919
1,0.612033,0.584869,0.595788,0.658898,0.582456
2,0.660533,0.623502,0.642984,0.719923,0.638484
3,0.606671,0.623482,0.605418,0.659946,0.616046
4,0.703911,0.703045,0.706998,0.632877,0.704477


In [343]:
# Save the per-fold random-forest scores in the dataset's folds folder; spaces
# in the model name become '-' in the file name.
rf_fold_scores.to_csv(
    folds_results_path
    + 'pd_' + dataset_name + '_'
    + rf_results['Model'].replace(' ', '-')
    + '_fold_scores.csv',
    index=False,
)

### Artificial Neural Networks (MLPs)

In [59]:
# Evaluate every candidate MLP architecture and remember the best one.
# Candidates are ranked by a weighted sum of their averaged scores in which
# accuracy counts slightly more than F1 and specificity.
ann_accuracy_weight = 1.2

best_ann_model = None
best_ann_aggr_model_score = 0
best_ann_model_fold_scores = None

# One-row frames are collected here and concatenated once after the loop;
# calling pd.concat on every iteration re-copies the whole summary table.
ann_result_rows = []
for model in models['ANNs']:
    curr_ann_results, curr_ann_fold_scores = cross_val_model(
        model=model,
        features=dataset_features,
        target=dataset_target,
        id_groupings=dataset_id_groupings,
        groupk_folds=5,
        print_scores=False,
    )
    curr_ann_aggr_score = (
        curr_ann_results['Accuracy'] * ann_accuracy_weight
        + curr_ann_results['F1']
        + curr_ann_results['Specificity']
    )
    # e.g. hidden_layer_sizes=(3, 3) -> 'ANN (3,3)'
    curr_ann_results['Model'] = 'ANN ' + str(model.hidden_layer_sizes).replace(' ', '')
    ann_result_rows.append(pd.DataFrame.from_dict(curr_ann_results, orient='index').T)
    # Strict '>' keeps the earliest architecture when aggregate scores tie.
    if curr_ann_aggr_score > best_ann_aggr_model_score:
        best_ann_aggr_model_score = curr_ann_aggr_score
        best_ann_model_fold_scores = curr_ann_fold_scores
        best_ann_model = curr_ann_results['Model']

# Append all ANN rows to the summary table in evaluation order.
avg_results = pd.concat([avg_results, *ann_result_rows], ignore_index=True)

In [60]:
# Label of the ANN configuration with the highest aggregate score.
best_ann_model

'ANN (3,3)'

In [61]:
# Per-fold scores of the best-scoring ANN configuration.
best_ann_model_fold_scores

Unnamed: 0,Accuracy,Precision,Sensitivity,Specificity,F1
0,0.777778,0.857143,0.7625,0.8,0.828571
1,0.5,0.863636,0.75,1.0,0.805668
2,0.9375,1.0,0.875,1.0,0.873016
3,0.8125,0.25,0.8125,0.75,0.333333
4,0.625,0.718182,0.625,1.0,0.676113


In [62]:
# Save the best ANN's per-fold scores in the dataset's folds folder; spaces in
# the model label become '-' in the file name.
best_ann_model_fold_scores.to_csv(
    folds_results_path
    + 'pd_' + dataset_name + '_'
    + best_ann_model.replace(' ', '-')
    + '_fold_scores.csv',
    index=False,
)

## Final Results

In [348]:
# Averaged cross-validation scores, one row per evaluated model.
avg_results

Unnamed: 0,Model,Accuracy,Precision,Sensitivity,Specificity,F1
0,Logistic Regression,0.615917,0.615285,0.620948,0.678595,0.602702
1,Support Vector Machine,0.554861,0.548465,0.551349,0.582545,0.540478
2,Random Forest,0.64443,0.637103,0.645687,0.687829,0.632876
3,"ANN (1,1)",0.593678,0.614715,0.584144,0.7475,0.481777
4,"ANN (1,2)",0.620685,0.612174,0.565505,0.659635,0.551622
5,"ANN (1,3)",0.604435,0.617938,0.632596,0.658258,0.614458
6,"ANN (1,4)",0.615358,0.626882,0.623616,0.672196,0.605667
7,"ANN (1,5)",0.612984,0.611947,0.620316,0.653444,0.603158
8,"ANN (2,1)",0.631013,0.534949,0.652445,0.76807,0.608689
9,"ANN (2,2)",0.620751,0.62562,0.598143,0.669868,0.606178


In [349]:
# Save the summary table of averaged scores for this dataset.
avg_results.to_csv(
    avg_results_path
    + 'pd_' + dataset_name
    + '_classification_avg_results.csv',
    index=False,
)