# Preprocessing

In [42]:
# Needed libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# evaluate a logistic regression model using k-fold cross-validation
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Anova feat selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# KNN classifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# Decision Tree classifier
from sklearn import tree

In [53]:
# Load the project helper scripts with %run so the functions they define become
# available in this notebook. The paths are relative, so the scripts are looked
# up from the notebook's working directory.

# getAnovaFeat.py -> get_anova_feat (ANOVA feature selection)
%run getAnovaFeat.py
##############################################################
## All model files should return model for confusion matrix ##
##############################################################
# knn.py -> knn_pred (KNN classifier)
%run knn.py

# dt.py -> decision_tree_pred (decision tree classifier)
%run dt.py

# ridge.py -> ridge_reg_pred (ridge classifier)
%run ridge.py

# randForest.py -> rf_pred (random forest classifier)
%run randForest.py

In [3]:
# Importing both csv files and joining them into one table.
df1 = pd.read_csv("Data/public-annotations.csv")
df2 = pd.read_csv("Data/public_cohen_dataset_features.csv")
# DataFrame.join aligns on the row index, so this assumes both files list the
# same samples in the same order -- TODO confirm against the data files.
df = df1.join(df2)

# Dropping the per-item S- scores and all J- measurements ("junior")
df = df.drop(columns=['S-A', 'S-B', 'S-C', 'S-D', 'S-E', 'S-F',
                      'J-A', 'J-B', 'J-C', 'J-D', 'J-E', 'J-F', 'J-Global'])

# Adding severity class 1, 2, 3 from the score in the second column
# (presumably S-Global -- verify against the CSV header):
#   score <= 6        -> class 1
#   6 < score <= 12   -> class 2
#   anything else     -> class 3
# The assignment is vectorised and targets the column by name, so it covers
# every row of the table (not just the first 192) and does not depend on
# s_class sitting at column position 109.
severity_score = df.iloc[:, 1].to_numpy()
df['s_class'] = np.select(
    [severity_score <= 6, severity_score <= 12],  # first matching condition wins
    [1, 2],
    default=3,
)

# dropping filename column and S-Global score
df_features = df.drop(columns=['filename', 'S-Global'])


In [4]:
# Setting features and response.
# The target is picked by name rather than by position (previously columns
# 0:107 / 107:108), so the split stays correct if the number of feature
# columns changes. Y is kept as a one-column DataFrame, as before.
X = df_features.drop(columns='s_class')
Y = df_features[['s_class']]

# Splitting 80/20 (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=20)

# Scaling: the scaler is fitted on the training rows only and the same
# transform is then applied to both parts, so the hold-out rows do not
# influence the scaling statistics.
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)  # hold-out features on the training scale
y_train = np.ravel(y_train)               # flatten the one-column frame to 1-D

# Total of each class in train
print(*(int((y_train == label).sum()) for label in (1, 2, 3)))

## Lasso (L1-penalised) Logistic Regression

In [6]:
# L1-penalised ("lasso") logistic regression fitted on the scaled training features.
clf = LogisticRegression(penalty='l1', max_iter=300, random_state=0, solver='liblinear').fit(X_scaled, y_train)

# Accuracy on the very rows the model was fitted on. This is a training
# (resubstitution) score, not a held-out one, and is labelled as such below.
train_score = clf.score(X_scaled, y_train)
test_score = train_score  # legacy name kept for backward compatibility; NOT a hold-out score

# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# cross_val_score clones the estimator and refits it on every fold, so the
# already-fitted classifier can be passed directly; a second full fit of an
# identical model is unnecessary.
model = clf

# evaluate model
scores = cross_val_score(model, X_scaled, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy on train data: %.3f || CV Accuracy: %.3f' % (train_score, mean(scores)))

Accuracy on test data: 0.765 || CV Accuracy: 0.536


# Ridge Regression
## Feature Selection - ANOVA
**55 Features**

In [28]:
# Ask get_anova_feat for 55 columns of the scaled training matrix
# (presumably the highest ANOVA F-scores -- see getAnovaFeat.py).
X_train_selected_ANOVA = get_anova_feat(55, X_scaled, y_train)

# ridge_reg_pred is given only the training data; the object it returns is
# kept in `model` so the next cell can build a confusion matrix from it.
model = ridge_reg_pred(X_train_selected_ANOVA, y_train)

Accuracy on test data: 0.791 || CV Accuracy: 0.541


  f = msb / msw


In [29]:
# Predict on the same ANOVA-selected training matrix that was passed to the
# helper which produced `model`, then tabulate true classes (rows) against
# predicted classes (columns).
y_pred = model.predict(X_train_selected_ANOVA)

confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[41, 15,  2],
       [ 7, 61,  2],
       [ 0,  6, 19]], dtype=int64)

**19 features (rule of ten)**

In [30]:
# Ask get_anova_feat for 19 columns of the scaled training matrix
# (presumably the highest ANOVA F-scores -- see getAnovaFeat.py).
X_train_selected_ANOVA = get_anova_feat(19, X_scaled, y_train)

# Retrying the ridge helper (ridge_reg_pred) with the smaller ANOVA-selected feature set
model = ridge_reg_pred(X_train_selected_ANOVA, y_train)

Accuracy on test data: 0.673 || CV Accuracy: 0.566


  f = msb / msw


In [31]:
# Predict on the 19-column training matrix handed to ridge_reg_pred above and
# tabulate true classes (rows) against predicted classes (columns).
y_pred = model.predict(X_train_selected_ANOVA)

confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[33, 22,  3],
       [11, 55,  4],
       [ 1,  9, 15]], dtype=int64)

# KNN
## Feature Selection - ANOVA
**55 Features**

In [39]:
# KNN 55 features K = 5
X_train_selected_ANOVA = get_anova_feat(55, X_scaled, y_train)

model = knn_pred(X_train_selected_ANOVA, y_train, 5)

Accuracy on test data: 0.654 || CV Accuracy: 0.483


  f = msb / msw


In [33]:
# Predict on the same training matrix that was passed to knn_pred and tabulate
# true classes (rows) against predicted classes (columns).
y_pred = model.predict(X_train_selected_ANOVA)

confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[38, 19,  1],
       [14, 55,  1],
       [ 4, 14,  7]], dtype=int64)

**19 Features**

In [40]:
# KNN 19 Features K = 5
X_train_selected_ANOVA = get_anova_feat(19, X_scaled, y_train)

model = knn_pred(X_train_selected_ANOVA, y_train, 5)

Accuracy on test data: 0.634 || CV Accuracy: 0.405


  f = msb / msw


In [41]:
# Predict on the 19-column training matrix that was passed to knn_pred and
# tabulate true classes (rows) against predicted classes (columns).
y_pred = model.predict(X_train_selected_ANOVA)

confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[37, 21,  0],
       [22, 47,  1],
       [ 6,  6, 13]], dtype=int64)

# Decision Tree
## Feature Selection - ANOVA
**55 Features**

In [45]:
#Selecting features for the model 
# Selecting features for the model: get_anova_feat is asked for 55 columns of
# the scaled training matrix (presumably the highest ANOVA F-scores).
X_train_selected_ANOVA = get_anova_feat(55, X_scaled, y_train)

# decision_tree_pred receives only the training data; its return value is kept
# in `model` for later inspection.
model = decision_tree_pred(X_train_selected_ANOVA, y_train)

  f = msb / msw


Accuracy on test data: 1.000 || CV Accuracy: 0.359


**19 Features**

In [46]:
#Selecting features for the model 
# Selecting features for the model: get_anova_feat is asked for 19 columns of
# the scaled training matrix (presumably the highest ANOVA F-scores).
X_train_selected_ANOVA = get_anova_feat(19, X_scaled, y_train)

# decision_tree_pred receives only the training data; its return value is kept
# in `model` for later inspection.
model = decision_tree_pred(X_train_selected_ANOVA, y_train)

  f = msb / msw


Accuracy on test data: 1.000 || CV Accuracy: 0.463


# Random Forest 

**Max-depth = 3**

In [61]:
# Random forest on the full scaled training matrix (no ANOVA selection here).
# The third argument (3) is presumably the max depth named in the heading
# above -- confirm against randForest.py.
model = rf_pred(X_scaled, y_train, 3)

Accuracy on test data: 0.784 || CV Accuracy: 0.523
