In [1]:
import sys
sys.path.insert(0, '/Users/matthewashman/github/MasterProject2018')

# Import necessary modules. Set settings. Import data.
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import pywt
import math
from IPython.display import HTML

# For model building
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, recall_score, make_scorer
from sklearn import svm, naive_bayes, neighbors, gaussian_process
from sklearn.linear_model import LogisticRegression
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import RBF
from scipy.spatial.distance import euclidean
from sklearn.decomposition import PCA
from skrvm import RVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score

# For feature extraction
from scipy.interpolate import CubicSpline      # for warping
from statsmodels.robust import mad
from tsfresh.feature_extraction import feature_calculators
from FeatureExtraction.feature_tools import detect_peaks
from sklearn.utils import resample
import fastdtw

# Miscelaneous
from IPython.display import display, clear_output
import pdb

plt.style.use('default')

X_train = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_train.pkl')
X_validation = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_validation.pkl')
X_augmented_01 = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_augmented_01.pkl')
X_augmented_02 = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_augmented_02.pkl')
X_augmented_03 = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_augmented_03.pkl')
X_augmented_04 = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_augmented_04.pkl')
X_test = pd.read_pickle('/Users/matthewashman/github/MasterProject2018/EPDataAnalysis/Final Report/X_test.pkl')

  from pandas.core import datetools


## Isolate Feature Matrices, Targets and Information

In [3]:
# Isolate feature matrices, target vectors and information for upsampled dataset
X_train_ = X_train.drop(['Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2'], axis=1)
y_train = X_train['Label'].astype(int)
info_train = X_train[['Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2']]

X_augmented_01_ = X_augmented_01.drop(['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2'], axis=1)
y_augmented_01 = X_augmented_01['Label'].astype(int)
info_augmented_01 = X_augmented_01[['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2']]

X_augmented_02_ = X_augmented_02.drop(['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2'], axis=1)
y_augmented_02 = X_augmented_02['Label'].astype(int)
info_augmented_02 = X_augmented_02[['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2']]

X_augmented_03_ = X_augmented_03.drop(['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2'], axis=1)
y_augmented_03 = X_augmented_03['Label'].astype(int)
info_augmented_03 = X_augmented_03[['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2']]

X_augmented_04_ = X_augmented_04.drop(['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2'], axis=1)
y_augmented_04 = X_augmented_04['Label'].astype(int)
info_augmented_04 = X_augmented_04[['Augmented', 'Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2']]

X_validation_ = X_validation.drop(['Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2'], axis=1)
y_validation = X_validation['Label'].astype(int)
info_validation = X_validation[['Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2']]

X_test_ = X_test.drop(['Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2'], axis=1)
y_test = X_test['Label'].astype(int)
info_test = X_test[['Channel', 'Coupling Interval', 'Data', 'Label', 'Patient', 'Type', 'S1/S2']]

## Evaluating Performance of Multiple Models

### Performance using non-augmented data

In [4]:
import warnings
warnings.filterwarnings("ignore")

# Get cross validation scores on training data, following by test score.
models = (LogisticRegression(penalty='l1', C=1, random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced'),
          naive_bayes.GaussianNB())
model_names = ('Logistic Regression', 'Naive Bayes')

best_f1 = 0
for clf, model_name in zip(models, model_names):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(model_name)
    clf.fit(X_train_.values, y_train.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))

~~~~~~~~~~~~~~~~~~~~~~~~~
Logistic Regression
     t/p  Green Amber   Red 
    Green 241.0  20.0   5.0 
    Amber   7.0  30.0   6.0 
      Red   0.0   2.0  11.0 
F1 Score: 0.884375813739826
~~~~~~~~~~~~~~~~~~~~~~~~~
Naive Bayes
     t/p  Green Amber   Red 
    Green 248.0  16.0   2.0 
    Amber  17.0  20.0   6.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8602171890707018


In [None]:
%matplotlib qt

best_f1 = 0
for clf, model_name in zip(models, model_names):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(model_name)
    clf.fit(X_train_.values, y_train.values)
    predictions = clf.predict(X_validation_.values)
    prediction_probabilities = clf.predict_proba(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))
    
    error_amber = ((y_validation.values == 1) & (predictions != y_validation.values))
    error_amber = [i for (i,x) in enumerate(error_amber) if x==True]
    x=[0, 1, 2]
    fig = plt.figure(figsize=(10,3), dpi=80)
    for error_idx in error_amber:
        plt.plot(x, prediction_probabilities[error_idx,:], 'k')
    
    plt.draw()
    plt.waitforbuttonpress()
    plt.close()

~~~~~~~~~~~~~~~~~~~~~~~~~
Logistic Regression
     t/p  Green Amber   Red 
    Green 241.0  20.0   5.0 
    Amber   7.0  30.0   6.0 
      Red   0.0   2.0  11.0 
F1 Score: 0.884375813739826


#### Investigate effect of C

In [5]:
# C is inverse regularisation term
c_s = np.logspace(-5, 5, num=21)
c_scores = []
best_f1 = 0
best_c = 0
for c in c_s:
    clf = LogisticRegression(penalty='l1', C=(1/c), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
    clf.fit(X_train_.values, y_train.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('c: ' + str(c))
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))
    if f1 > best_f1:
            best_f1 = f1
            best_c = c
            
    c_scores.append(f1)
         
print(best_c)
print(best_f1)

~~~~~~~~~~~~~~~~~~~~~~~~~
c: 1e-05
     t/p  Green Amber   Red 
    Green 217.0  48.0   1.0 
    Amber   1.0  38.0   4.0 
      Red   0.0   5.0   8.0 
F1 Score: 0.8413314312376314
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 3.1622776601683795e-05
     t/p  Green Amber   Red 
    Green 236.0  27.0   3.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8716949046897079
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0001
     t/p  Green Amber   Red 
    Green 242.0  19.0   5.0 
    Amber   7.0  29.0   7.0 
      Red   0.0   2.0  11.0 
F1 Score: 0.884316764782866
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.00031622776601683794
     t/p  Green Amber   Red 
    Green 240.0  21.0   5.0 
    Amber   7.0  30.0   6.0 
      Red   0.0   2.0  11.0 
F1 Score: 0.8817867001873214
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.001
     t/p  Green Amber   Red 
    Green 241.0  22.0   3.0 
    Amber   7.0  32.0   4.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.884335959582826
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0031622776601683794
     t/p  Green 

### Performance using augmented data

#### Degree: 0.1

In [6]:
best_f1 = 0
    
for clf, model_name in zip(models, model_names):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(model_name)
    clf.fit(X_augmented_01_.values, y_augmented_01.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))

~~~~~~~~~~~~~~~~~~~~~~~~~
Logistic Regression
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8816915957675624
~~~~~~~~~~~~~~~~~~~~~~~~~
Naive Bayes
     t/p  Green Amber   Red 
    Green 248.0  15.0   3.0 
    Amber  17.0  17.0   9.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8504836191602528


In [7]:
aug01_c_scores = []
best_f1 = 0
best_c = 0
for c in c_s:
    clf = LogisticRegression(penalty='l1', C=(1/c), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
    clf.fit(X_augmented_01_.values, y_augmented_01.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('c: ' + str(c))
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))
    if f1 > best_f1:
            best_f1 = f1
            best_c = c
            
    aug01_c_scores.append(f1)
        
     
print(best_c)
print(best_f1)

~~~~~~~~~~~~~~~~~~~~~~~~~
c: 1e-05
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8816915957675624
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 3.1622776601683795e-05
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8816915957675624
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0001
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8816915957675624
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.00031622776601683794
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8816915957675624
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.001
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8816915957675624
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0031622776601683794
     t/p  Gree

#### Degree: 0.2

In [8]:
best_f1 = 0
    
for clf, model_name in zip(models, model_names):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(model_name)
    clf.fit(X_augmented_02_.values, y_augmented_02.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))

~~~~~~~~~~~~~~~~~~~~~~~~~
Logistic Regression
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8843675633000622
~~~~~~~~~~~~~~~~~~~~~~~~~
Naive Bayes
     t/p  Green Amber   Red 
    Green 248.0  14.0   4.0 
    Amber  17.0  15.0  11.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8439908442526446


In [9]:
aug02_c_scores = []
best_f1 = 0
best_c = 0
for c in c_s:
    clf = LogisticRegression(penalty='l1', C=(1/c), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
    clf.fit(X_augmented_02_.values, y_augmented_02.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('c: ' + str(c))
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))
    if f1 > best_f1:
            best_f1 = f1
            best_c = c
            
    aug02_c_scores.append(f1)
        
     
print(best_c)
print(best_f1)

~~~~~~~~~~~~~~~~~~~~~~~~~
c: 1e-05
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8843675633000622
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 3.1622776601683795e-05
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8843675633000622
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0001
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8843675633000622
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.00031622776601683794
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8843675633000622
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.001
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8843675633000622
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0031622776601683794
     t/p  Gree

#### Degree: 0.3

In [10]:
best_f1 = 0
    
for clf, model_name in zip(models, model_names):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(model_name)
    clf.fit(X_augmented_03_.values, y_augmented_03.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))

~~~~~~~~~~~~~~~~~~~~~~~~~
Logistic Regression
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   5.0  33.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8910962043466824
~~~~~~~~~~~~~~~~~~~~~~~~~
Naive Bayes
     t/p  Green Amber   Red 
    Green 248.0  15.0   3.0 
    Amber  17.0  18.0   8.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8537514049857454


In [11]:
aug03_c_scores = []
best_f1 = 0
best_c = 0
for c in c_s:
    clf = LogisticRegression(penalty='l1', C=(1/c), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
    clf.fit(X_augmented_03_.values, y_augmented_03.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('c: ' + str(c))
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))
    if f1 > best_f1:
            best_f1 = f1
            best_c = c
            
    aug03_c_scores.append(f1)
        
     
print(best_c)
print(best_f1)

~~~~~~~~~~~~~~~~~~~~~~~~~
c: 1e-05
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   5.0  33.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8910962043466824
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 3.1622776601683795e-05
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   5.0  33.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8910962043466824
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0001
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   5.0  33.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8910962043466824
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.00031622776601683794
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   5.0  33.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8910962043466824
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.001
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   5.0  33.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8910962043466824
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0031622776601683794
     t/p  Gree

#### Degree: 0.4

In [12]:
best_f1 = 0
log_reg_scores_04 = []
nb_scores_04 = []
    
for clf, model_name in zip(models, model_names):
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print(model_name)
    clf.fit(X_augmented_04_.values, y_augmented_04.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))

~~~~~~~~~~~~~~~~~~~~~~~~~
Logistic Regression
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   6.0  31.0   6.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8850578639644505
~~~~~~~~~~~~~~~~~~~~~~~~~
Naive Bayes
     t/p  Green Amber   Red 
    Green 248.0  15.0   3.0 
    Amber  17.0  19.0   7.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8569949777039558


In [13]:
aug04_c_scores = []
best_f1 = 0
best_c = 0
for c in c_s:
    clf = LogisticRegression(penalty='l1', C=(1/c), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
    clf.fit(X_augmented_04_.values, y_augmented_04.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('c: ' + str(c))
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))
    if f1 > best_f1:
            best_f1 = f1
            best_c = c
            
    aug04_c_scores.append(f1)
        
     
print(best_c)
print(best_f1)

~~~~~~~~~~~~~~~~~~~~~~~~~
c: 1e-05
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   6.0  32.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8878007308203975
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 3.1622776601683795e-05
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   6.0  32.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8878007308203975
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0001
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   6.0  32.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8878007308203975
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.00031622776601683794
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   6.0  32.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8878007308203975
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.001
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   6.0  32.0   5.0 
      Red   0.0   3.0  10.0 
F1 Score: 0.8878007308203975
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0031622776601683794
     t/p  Gree

In [22]:
%matplotlib qt
mycolors = ['tab:red', 'tab:blue', 'tab:green', 'tab:orange', 'tab:brown', 'tab:grey', 'tab:pink', 
            'tab:olive', 'deeppink', 'steelblue', 'firebrick', 'mediumseagreen']  

c_s = np.logspace(-5, 5, num=21)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,4), dpi= 80)
lines=[]
plt.plot(c_s, c_scores, color=mycolors[0])
plt.plot(c_s, aug01_c_scores, color=mycolors[1])
plt.plot(c_s, aug02_c_scores, color=mycolors[2])
plt.plot(c_s, aug03_c_scores, color=mycolors[3])
plt.plot(c_s, aug04_c_scores, color=mycolors[4])
plt.grid(True)
plt.ylabel(r'$\tilde{F}_1$ Score', fontsize=12)
plt.ylim([0.8, 0.9])
plt.xlabel('c', fontsize=12)
plt.xscale('log')
lines = ax.get_lines()
plt.legend(['$M_0 \\ (\sigma_A=0)$', '$M_1 \\ (\sigma_A=0.1)$', '$M_2 \\ (\sigma_A=0.2)$', 
            '$M_3 \\ (\sigma_A=0.3)$', '$M_4 \\ (\sigma_A=0.4)$'], fontsize=12)

# Remove borders
plt.gca().spines["top"].set_alpha(0.0)    
plt.gca().spines["bottom"].set_alpha(0.3)
plt.gca().spines["right"].set_alpha(0.0)    
plt.gca().spines["left"].set_alpha(0.3)   
plt.tight_layout()
plt.show()

In [145]:
c_best = c_s[np.argmax(c_scores)]

clf0 = LogisticRegression(penalty='l1', C=(1/c_best), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
clf0.fit(X_train_.values, y_train.values)
predictions0 = clf0.predict(X_validation_.values)
cm0 = confusion_matrix(y_validation.values, predictions0)
print('~~~~~~~~~~~~~~~~~~~~~~~~~')
print_cm(cm0, ['Green', 'Amber','Red'])

clf3 = LogisticRegression(penalty='l1', C=(1/c_best), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
clf3.fit(X_augmented_03_.values, y_augmented_03.values)
predictions3 = clf3.predict(X_validation_.values)
cm3 = confusion_matrix(y_validation.values, predictions3)
print('~~~~~~~~~~~~~~~~~~~~~~~~~')
print_cm(cm3, ['Green', 'Amber','Red'])

~~~~~~~~~~~~~~~~~~~~~~~~~
     t/p  Green Amber   Red 
    Green 238.0  23.0   5.0 
    Amber   4.0  33.0   6.0 
      Red   0.0   2.0  11.0 
~~~~~~~~~~~~~~~~~~~~~~~~~
     t/p  Green Amber   Red 
    Green 241.0  24.0   1.0 
    Amber   5.0  33.0   5.0 
      Red   0.0   3.0  10.0 


In [16]:
%matplotlib qt
mycolors = ['tab:red', 'tab:blue', 'tab:green', 'tab:orange', 'tab:brown', 'tab:grey', 'tab:pink', 
            'tab:olive', 'deeppink', 'steelblue', 'firebrick', 'mediumseagreen']  

x = np.linspace(1, 15, 15).astype(int)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10,5), dpi= 80)
lines=[]
plt.plot(x, log_reg_scores_01, color=mycolors[0])
plt.plot(x, nb_scores_01, '--', color=mycolors[0])
plt.plot(x, log_reg_scores_02, color=mycolors[1])
plt.plot(x, nb_scores_02, '--', color=mycolors[1])
plt.plot(x, log_reg_scores_03, color=mycolors[2])
plt.plot(x, nb_scores_03, '--', color=mycolors[2])
plt.plot(x, log_reg_scores_04, color=mycolors[3])
plt.plot(x, nb_scores_04, '--', color=mycolors[3])
plt.grid(True)
plt.ylabel('F1 Score', fontsize=12)
plt.xlabel('Number of Features', fontsize=12)
lines = ax.get_lines()
legend1 = plt.legend([lines[i] for i in [0,2,4,6]], ['$\M_1 \sigma_A=0.1$', '$\sigma=0.2$', '$\sigma=0.3$', '$\sigma=0.4$'], 
                        loc=(0.01,0.65))
legend2 = plt.legend([lines[i] for i in [0,1]], ['Logistic Regression', 'Naïve Bayes'], loc=2)
ax.add_artist(legend1)
ax.add_artist(legend2)

# Remove borders
plt.gca().spines["top"].set_alpha(0.0)    
plt.gca().spines["bottom"].set_alpha(0.3)
plt.gca().spines["right"].set_alpha(0.0)    
plt.gca().spines["left"].set_alpha(0.3)   
plt.tight_layout()
plt.show()

NameError: name 'log_reg_scores_01' is not defined

### Investigating Weight Values

In [159]:
feature_names = X_train_.columns
print(feature_names)
feature_stds = []
for feature in feature_names:
    feature_stds.append(np.std(X_train_[feature].values))
    
# C is inverse regularisation term
c_s = [0.01, 1, 100]
for c in c_s:
    clf = LogisticRegression(penalty='l1', C=(1/c), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
    clf.fit(X_train_.values, y_train.values)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('c: ' + str(c))
    green_weights = clf.coef_[0]/feature_stds
    amber_weights = clf.coef_[1]/feature_stds
    red_weights = clf.coef_[2]/feature_stds
    green_features = feature_names[abs(green_weights)>0.025*max(abs(green_weights))]
    amber_features = feature_names[abs(amber_weights)>0.025*max(abs(amber_weights))]
    red_features = feature_names[abs(red_weights)>0.025*max(abs(red_weights))]
    print(str(len(green_features)) + ' Green Features: ' + str(green_features) + '\n')
    
    for i, feature in enumerate(feature_names):
        print(feature + ': ' + str(green_weights[i]))
        
    print(str(len(amber_features)) + ' Amber Features: ' + str(amber_features) + '\n')
    
    for i, feature in enumerate(feature_names):
        print(feature + ': ' + str(amber_weights[i]))
        
    print(str(len(red_features)) + ' Red Features: ' + str(red_features)+ '\n')
    
    for i, feature in enumerate(feature_names):
        print(feature + ': ' + str(red_weights[i]))
        

Index(['DTW Distance', 'Location of Maximum Energy',
       'Location of Maximum Energy 2', 'Mean Absolute Value',
       'Mean Absolute Value 2', 'Number of Peaks', 'Number of Peaks 2',
       'Percentage Fractionation', 'Percentage Fractionation 2',
       'Ratio Above 1xSTD', 'Ratio Above 1xSTD 2',
       'Sample Entropy Around Max Energy',
       'Sample Entropy Around Max Energy 2', 'Width of Maximum Energy',
       'Width of Maximum Energy 2'],
      dtype='object')
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.01
14 Green Features: Index(['DTW Distance', 'Location of Maximum Energy 2', 'Mean Absolute Value',
       'Mean Absolute Value 2', 'Number of Peaks', 'Number of Peaks 2',
       'Percentage Fractionation', 'Percentage Fractionation 2',
       'Ratio Above 1xSTD', 'Ratio Above 1xSTD 2',
       'Sample Entropy Around Max Energy',
       'Sample Entropy Around Max Energy 2', 'Width of Maximum Energy',
       'Width of Maximum Energy 2'],
      dtype='object')

DTW Distance: -0.094223520617

In [97]:
feature_names = X_augmented_03_.columns
feature_stds = []
for feature in feature_names:
    feature_stds.append(np.std(X_augmented_03_[feature].values))
    
for c in c_s:
    clf = LogisticRegression(penalty='l1', C=(1/c), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
    clf.fit(X_augmented_03_.values, y_augmented_03.values)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('c: ' + str(c))
    green_weights = clf.coef_[0]/feature_stds
    amber_weights = clf.coef_[1]/feature_stds
    red_weights = clf.coef_[2]/feature_stds
    green_features = feature_names[abs(green_weights)>0.00001]
    amber_features = feature_names[abs(amber_weights)>0.00001]
    red_features = feature_names[abs(red_weights)>0.00001]
    print('Green Features: ' + str(green_features))
    print('Amber Features: ' + str(amber_features))
    print('Red Features: ' + str(red_features))

~~~~~~~~~~~~~~~~~~~~~~~~~
c: 1e-05
F1 Score: 0.8355866449447941
Green Features: Index(['DTW Distance', 'Location of Maximum Energy',
       'Location of Maximum Energy 2', 'Mean Absolute Value',
       'Mean Absolute Value 2', 'Number of Peaks', 'Number of Peaks 2',
       'Percentage Fractionation', 'Percentage Fractionation 2',
       'Ratio Above 1xSTD', 'Ratio Above 1xSTD 2',
       'Sample Entropy Around Max Energy',
       'Sample Entropy Around Max Energy 2', 'Width of Maximum Energy',
       'Width of Maximum Energy 2'],
      dtype='object')
Amber Features: Index(['DTW Distance', 'Location of Maximum Energy',
       'Location of Maximum Energy 2', 'Mean Absolute Value',
       'Mean Absolute Value 2', 'Number of Peaks', 'Number of Peaks 2',
       'Percentage Fractionation', 'Percentage Fractionation 2',
       'Ratio Above 1xSTD', 'Ratio Above 1xSTD 2',
       'Sample Entropy Around Max Energy',
       'Sample Entropy Around Max Energy 2', 'Width of Maximum Energy',
       'W

~~~~~~~~~~~~~~~~~~~~~~~~~
c: 1000.0
F1 Score: 0.8355866449447941
Green Features: Index(['Location of Maximum Energy 2', 'Width of Maximum Energy 2'], dtype='object')
Amber Features: Index(['Location of Maximum Energy'], dtype='object')
Red Features: Index(['Location of Maximum Energy 2', 'Percentage Fractionation 2',
       'Width of Maximum Energy 2'],
      dtype='object')


In [84]:
feature_names = X_augmented_01_.columns

# C is inverse regularisation term
c_s = np.logspace(-5, 3.0, num=9)
c_scores = []
best_f1 = 0
best_c = 0
for c in c_s:
    clf = LogisticRegression(penalty='l1', C=(1/c), random_state=1, solver='saga', multi_class='multinomial', class_weight='balanced')
    clf.fit(X_augmented_01_.values, y_augmented_01.values)
    predictions = clf.predict(X_validation_.values)
    cm = confusion_matrix(y_validation.values, predictions)
    print('~~~~~~~~~~~~~~~~~~~~~~~~~')
    print('c: ' + str(c))
    green_features = feature_names[clf.coef_[0]<0.001]
    amber_features = feature_names[clf.coef_[1]<0.001]
    red_features = feature_names[clf.coef_[2]<0.001]
    print_cm(cm, ['Green', 'Amber','Red'])
    f1 = f1_score(y_validation.values, predictions, average='weighted')
    print('F1 Score: ' + str(f1))
    print('Green Features: ' + str(green_features))
    print('Amber Features: ' + str(amber_features))
    print('Red Features: ' + str(red_features))
    if f1 > best_f1:
            best_f1 = f1
            best_c = c
            
    c_scores.append(f1)
        
     
print(best_c)
print(best_f1)

~~~~~~~~~~~~~~~~~~~~~~~~~
c: 1e-05
     t/p  Green Amber   Red 
    Green 241.0  23.0   2.0 
    Amber   7.0  31.0   5.0 
      Red   0.0   4.0   9.0 
F1 Score: 0.8816915957675624
Green Features: Index(['DTW Distance', 'Location of Maximum Energy 2', 'Mean Absolute Value 2',
       'Number of Peaks', 'Number of Peaks 2', 'Percentage Fractionation',
       'Percentage Fractionation 2', 'Ratio Above 1xSTD 2',
       'Sample Entropy Around Max Energy',
       'Sample Entropy Around Max Energy 2', 'Width of Maximum Energy 2'],
      dtype='object')
Amber Features: Index(['Number of Peaks 2', 'Percentage Fractionation 2',
       'Ratio Above 1xSTD 2', 'Width of Maximum Energy'],
      dtype='object')
Red Features: Index(['Location of Maximum Energy', 'Mean Absolute Value',
       'Ratio Above 1xSTD', 'Sample Entropy Around Max Energy',
       'Sample Entropy Around Max Energy 2', 'Width of Maximum Energy'],
      dtype='object')
~~~~~~~~~~~~~~~~~~~~~~~~~
c: 0.0001
     t/p  Green Amber   Re

In [2]:
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    
    # Begin CHANGES
    fst_empty_cell = (columnwidth-3)//2 * " " + "t/p" + (columnwidth-3)//2 * " "
    
    if len(fst_empty_cell) < len(empty_cell):
        fst_empty_cell = " " * (len(empty_cell) - len(fst_empty_cell)) + fst_empty_cell
    # Print header
    print("    " + fst_empty_cell, end=" ")
    # End CHANGES
    
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
        
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()