# Preprocessing

In [138]:
# Needed libraries
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# evaluate a logistic regression model using k-fold cross-validation
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [139]:
# Importing both csv and joining
df1 = pd.read_csv("Data/public-annotations.csv")
df2 = pd.read_csv("Data/public_cohen_dataset_features.csv")
df = df1.join(df2)

# Dropping J- measurements for "junior"
df = df.drop(['S-A', 'S-B', 'S-C', 'S-D', 'S-E', 'S-F', 'J-A', 'J-B', 'J-C', 'J-D', 'J-E', 'J-F', 'J-Global'], axis = 1)

# Adding severity class 1, 2, 3
df['s_class'] = 0

for i in range(192):
    if df.iloc[i, 1] <= 6:
        df.iloc[i, 109] = 1
        continue
    elif df.iloc[i, 1] <= 12:
        df.iloc[i, 109] = 2
        continue
    else:
        df.iloc[i, 109] = 3
        
        
# dropping filename column and S-Global score
df_features = df.drop('filename', axis = 1)
df_features = df_features.drop('S-Global', axis = 1)


In [140]:
# Setting features and response
X = df_features.iloc[:, 0:107]
Y = df_features.iloc[:, 107:108]

# Splitting 80/20
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=20)

# Scaling
scaler = preprocessing.StandardScaler().fit(X_train)
X_scaled = scaler.transform(X_train)
y_train = np.ravel(y_train)

Logistic Regression - Lasso Feature Selection

In [169]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty = 'l1', max_iter = 300, random_state=0,  solver='liblinear').fit(X_scaled, y_train)

In [170]:
clf.predict(X_scaled[:2, :])

array([1, 2], dtype=int64)

In [171]:
clf.score(X_scaled, y_train)

# Get confusion matrix

0.7647058823529411

In [172]:
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# create model
model = LogisticRegression(penalty = 'l1', max_iter = 300, random_state=0,  solver='liblinear').fit(X_scaled, y_train)

# evaluate model
scores = cross_val_score(model, X_scaled, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.536 (0.122)


Feature Selection - ANOVA

In [115]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

In [158]:
fs = SelectKBest(score_func = f_classif, k = 55)
X_train_selected_ANOVA = fs.fit_transform(X_scaled, y_train)
# Retrying Log Regression with ANOVA selected features
clf = LogisticRegression(penalty = 'l2', max_iter = 150, random_state=0).fit(X_train_selected_ANOVA, y_train)
clf.score(X_train_selected_ANOVA, y_train)

  f = msb / msw


0.7908496732026143

In [159]:
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# create model
model = LogisticRegression(penalty = 'l2', max_iter = 150, random_state=0).fit(X_train_selected_ANOVA, y_train)

# evaluate model
scores = cross_val_score(model, X_train_selected_ANOVA, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.541 (0.153)


In [164]:
fs = SelectKBest(score_func = f_classif, k = 19)
X_train_selected_ANOVA = fs.fit_transform(X_scaled, y_train)
# Retrying Log Regression with ANOVA selected features
clf = LogisticRegression(penalty = 'l2', max_iter = 150, random_state=0).fit(X_train_selected_ANOVA, y_train)
clf.score(X_train_selected_ANOVA, y_train)

  f = msb / msw


0.673202614379085

In [165]:
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# create model
model = LogisticRegression(penalty = 'l2', max_iter = 150, random_state=0).fit(X_train_selected_ANOVA, y_train)

# evaluate model
scores = cross_val_score(model, X_train_selected_ANOVA, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

Accuracy: 0.566 (0.150)
