<a href="https://colab.research.google.com/github/KingDurian/oss/blob/main/parkinsons_train_probabilities.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
from google.colab import drive
drive.mount('/content/drive')
!cp /content/drive/MyDrive/config.py .
!cp drive/MyDrive/model_dispatcher.py .
import config
import model_dispatcher


import sys
import importlib.util


importlib.reload(config)
importlib.reload(model_dispatcher)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


<module 'model_dispatcher' from '/content/model_dispatcher.py'>

In [56]:

# configuration information are in these files

import model_dispatcher

import itertools

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn import tree
from sklearn.metrics import classification_report, confusion_matrix



## Data Preprocessing

In [57]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer, LabelEncoder, OrdinalEncoder
# Column Transformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [58]:
def transform_data(df, fold):    

    # new ratio features
    numeric_features = ['MDVP_Fo_Hz_', 'MDVP_Fhi_Hz_', 'MDVP_Flo_Hz_', 'MDVP_Jitter_', 'MDVP_Jitter_Abs_',
 'MDVP_RAP', 'MDVP_PPQ', 'Jitter_DDP', 'MDVP_Shimmer', 'MDVP_Shimmer_dB_', 'Shimmer_APQ3', 'Shimmer_APQ5',
 'MDVP_APQ', 'Shimmer_DDA', 'NHR', 'HNR', 'RPDE', 'DFA', 'spread1', 'spread2', 'D2', 'PPE']
            
    # training data - use all the data that does not belong to the fold
    # validation data - use all the data that is marked with that fold
    df_train = df.loc[df.kfold != fold].reset_index(drop=True)
    df_valid = df.loc[df.kfold == fold].reset_index(drop=True)
    
    # train
    # drop the target/label
    X_train = df_train[numeric_features]
    y_train = df_train['status'].values
    
    # validation
    # drop the target/label
    X_valid = df_valid[numeric_features]
    y_valid = df_valid['status'].values
    
    # learn from entire dataset for transformation - good idea or no?
    colTransformer = None

    colTransformer = ColumnTransformer([('standardize',StandardScaler(),numeric_features)],
        remainder="passthrough")
        
    colTransformer.fit(df[numeric_features])   
    
    train_data_transformed = colTransformer.transform (X_train)
    valid_data_transformed = colTransformer.transform (X_valid)
        
    return (train_data_transformed, valid_data_transformed, y_train, y_valid)    

In [59]:
def run_fold(fold, modelName):
    df = pd.read_csv(config.TRAINING_FILE)
      
    
    x_train, x_valid, y_train, y_valid = transform_data(df, fold)
    
    classifier = model_dispatcher.models[modelName]
    classifier.fit(x_train, y_train)
    
    # create predictions for validation data
    # We are predicting probabilites scores
    preds = classifier.predict_proba(x_valid)    
    preds = np.where(preds[:,1] >= config.THRESHOLD,1,0)
    
    print("Threshold", config.THRESHOLD)
    # calculate and print accuracy metrics
    accuracy = metrics.accuracy_score(y_valid, preds)
    
    print(f'Fold: {fold}, Accuracy:{accuracy:0.4f}')
    
    # Compute confusion matrix
    print('Confusion Matrix')    
    cnf_matrix = confusion_matrix(y_valid,preds,labels=[0,1])    
    print(cnf_matrix)
    
    print('Confusion Matrix - Normalized')
    cnf_matrix = confusion_matrix(y_valid,preds,labels=[0,1], normalize='true')    
    print(cnf_matrix)
    
    # calculate and print accuracy metrics
    accuracy = metrics.accuracy_score(y_valid, preds)
    recall = metrics.recall_score(y_valid, preds)
    precision = metrics.precision_score(y_valid, preds)
    f1_score = metrics.f1_score(y_valid, preds)

    print(f'  Fold: {fold}, Accuracy:{accuracy:0.4f}, F1Score:{f1_score:0.4f}, Recall:{recall:0.4f}, Precision:{precision:0.4f}')    
  
    


In [60]:
# train the models with logistic regression
for fold in range(5):
    run_fold(fold=fold, modelName= "knn" )

Threshold 0.7
Fold: 0, Accuracy:0.6923
Confusion Matrix
[[ 4  5]
 [ 7 23]]
Confusion Matrix - Normalized
[[0.44444444 0.55555556]
 [0.23333333 0.76666667]]
  Fold: 0, Accuracy:0.6923, F1Score:0.7931, Recall:0.7667, Precision:0.8214
Threshold 0.7
Fold: 1, Accuracy:0.7692
Confusion Matrix
[[ 6  3]
 [ 6 24]]
Confusion Matrix - Normalized
[[0.66666667 0.33333333]
 [0.2        0.8       ]]
  Fold: 1, Accuracy:0.7692, F1Score:0.8421, Recall:0.8000, Precision:0.8889
Threshold 0.7
Fold: 2, Accuracy:0.9231
Confusion Matrix
[[ 8  2]
 [ 1 28]]
Confusion Matrix - Normalized
[[0.8        0.2       ]
 [0.03448276 0.96551724]]
  Fold: 2, Accuracy:0.9231, F1Score:0.9492, Recall:0.9655, Precision:0.9333
Threshold 0.7
Fold: 3, Accuracy:0.7949
Confusion Matrix
[[ 7  3]
 [ 5 24]]
Confusion Matrix - Normalized
[[0.7        0.3       ]
 [0.17241379 0.82758621]]
  Fold: 3, Accuracy:0.7949, F1Score:0.8571, Recall:0.8276, Precision:0.8889
Threshold 0.7
Fold: 4, Accuracy:0.5897
Confusion Matrix
[[ 2  8]
 [ 8 21