# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV


# Data

In [15]:
dev = pd.read_csv('Data/dev.csv')
X_dev = dev.drop(columns=['hospital_death'])
y_dev = dev['hospital_death']

dev_us = pd.read_csv('Data/dev_us.csv')
X_dev_us = dev_us.drop(columns=['hospital_death'])
y_dev_us = dev_us['hospital_death']

dev_os = pd.read_csv('Data/dev_os.csv')
X_dev_os = dev_os.drop(columns=['hospital_death'])
y_dev_os = dev_os['hospital_death']

dev_smote = pd.read_csv('Data/dev_smote.csv')
X_dev_smote = dev_smote.drop(columns=['hospital_death'])
y_dev_smote = dev_smote['hospital_death']

dev_pca95 = pd.read_pickle('Data/dev_pca95.pkl')
X_dev_pca95 = dev_pca95.drop(columns=['hospital_death'])
y_dev_pca95 = dev_pca95['hospital_death']

dev_us_pca95 = pd.read_pickle('Data/dev_us_pca95.pkl')
X_dev_us_pca95 = dev_us_pca95.drop(columns=['hospital_death'])
y_dev_us_pca95 = dev_us_pca95['hospital_death']

dev_os_pca95 = pd.read_pickle('Data/dev_os_pca95.pkl')
X_dev_os_pca95 = dev_os_pca95.drop(columns=['hospital_death'])
y_dev_os_pca95 = dev_os_pca95['hospital_death']

dev_smote_pca95 = pd.read_pickle('Data/dev_smote_pca95.pkl')
X_dev_smote_pca95 = dev_smote_pca95.drop(columns=['hospital_death'])
y_dev_smote_pca95 = dev_smote_pca95['hospital_death']

test = pd.read_csv("Data/test.csv")
X_test = test.drop(columns=['hospital_death'])
y_test = test['hospital_death']

test_pca95 = pd.read_pickle("Data/test_pca95.pkl")
X_test_pca95 = test_pca95.drop(columns=['hospital_death'])
y_test_pca95 = test_pca95['hospital_death']

test_us_pca95 = pd.read_pickle("Data/test_us_pca95.pkl")
X_test_us_pca95 = test_us_pca95.drop(columns=['hospital_death'])
y_test_us_pca95 = test_us_pca95['hospital_death']

test_os_pca95 = pd.read_pickle("Data/test_os_pca95.pkl")
X_test_os_pca95 = test_os_pca95.drop(columns=['hospital_death'])
y_test_os_pca95 = test_os_pca95['hospital_death']

test_smote_pca95 = pd.read_pickle("Data/test_smote_pca95.pkl")
X_test_smote_pca95 = test_smote_pca95.drop(columns=['hospital_death'])
y_test_smote_pca95 = test_smote_pca95['hospital_death']

data = {
    "original": (X_dev, y_dev, X_test, y_test),
    "us": (X_dev_us, y_dev_us, X_test, y_test),
    "os": (X_dev_os, y_dev_os, X_test, y_test),
    "smote": (X_dev_smote, y_dev_smote, X_test, y_test)
}

pca_data = {
    "pca95": (X_dev_pca95, y_dev_pca95, X_test_pca95, y_test_pca95),
    "us_pca95": (X_dev_us_pca95, y_dev_us_pca95, X_test_us_pca95, y_test_us_pca95),
    "os_pca95": (X_dev_os_pca95, y_dev_os_pca95, X_test_os_pca95, y_test_os_pca95),
    "smote_pca95": (X_dev_smote_pca95, y_dev_smote_pca95, X_test_smote_pca95, y_test_smote_pca95)
}


# LinearSVC (L1 Regularization)

In [16]:
L1_params = {
    'linearsvc__penalty': ['l1'],
    'linearsvc__loss': ['squared_hinge'],
    'linearsvc__dual': [False],
    'linearsvc__C': list(np.logspace(-5, 1, 20)),
    'linearsvc__class_weight': ['balanced'],
    'linearsvc__random_state': [42]
}

svc_pipe = make_pipeline(StandardScaler(), LinearSVC())
L1_grid_accuracy = GridSearchCV(svc_pipe, L1_params, scoring='accuracy', refit=True)
L1_grid_ap = GridSearchCV(svc_pipe, L1_params, scoring='average_precision', refit=True)
L1_grid_auc = GridSearchCV(svc_pipe, L1_params, scoring='roc_auc', refit=True)

for key in data.keys():
    L1_grid_accuracy.fit(data[key][0], data[key][1])
    print("L1 / {} [Accuracy]: {}".format(key, L1_grid_accuracy.score(data[key][2], data[key][3])))
    L1_grid_ap.fit(data[key][0], data[key][1])
    print("L1 / {} [Average Precision]: {}".format(key, L1_grid_ap.score(data[key][2], data[key][3])))
    L1_grid_auc.fit(data[key][0], data[key][1])
    print("L1 / {} [ROC AUC]: {}".format(key, L1_grid_auc.score(data[key][2], data[key][3])))


L1 / original [Accuracy]: 0.9137000490650384
L1 / original [Average Precision]: 0.4707165614147216
L1 / original [ROC AUC]: 0.8736291549382836
L1 / us [Accuracy]: 0.8054298642533937
L1 / us [Average Precision]: 0.4664117894045946
L1 / us [ROC AUC]: 0.8722180552016728
L1 / os [Accuracy]: 0.8017772447255084
L1 / os [Average Precision]: 0.4695332414264789
L1 / os [ROC AUC]: 0.8734469158436069
L1 / smote [Accuracy]: 0.8015591778880227
L1 / smote [Average Precision]: 0.46502719058459285
L1 / smote [ROC AUC]: 0.8725315742894748


In [18]:
L1_params = {
    'penalty': ['l1'],
    'loss': ['squared_hinge'],
    'dual': [False],
    'C': list(np.logspace(-5, 1, 20)),
    'class_weight': ['balanced'],
    'random_state': [42]
}

svc = LinearSVC()
L1_pca_grid_accuracy = GridSearchCV(svc, L1_params, scoring='accuracy', refit=True)
L1_pca_grid_ap = GridSearchCV(svc, L1_params, scoring='average_precision', refit=True)
L1_pca_grid_auc = GridSearchCV(svc, L1_params, scoring='roc_auc', refit=True)

for key in pca_data.keys():
    L1_pca_grid_accuracy.fit(pca_data[key][0], pca_data[key][1])
    print("L1 / {} [Accuracy]: {}".format(key, L1_pca_grid_accuracy.score(pca_data[key][2], pca_data[key][3])))
    L1_pca_grid_ap.fit(pca_data[key][0], pca_data[key][1])
    print("L1 / {} [Average Precision]: {}".format(key, L1_pca_grid_ap.score(pca_data[key][2], pca_data[key][3])))
    L1_pca_grid_auc.fit(pca_data[key][0], pca_data[key][1])
    print("L1 / {} [ROC AUC]: {}".format(key, L1_pca_grid_auc.score(pca_data[key][2], pca_data[key][3])))


L1 / pca95 [Accuracy]: 0.8029220956223082
L1 / pca95 [Average Precision]: 0.3997539894326848
L1 / pca95 [ROC AUC]: 0.8241407813025327


ValueError: X has 48 features, but LinearSVC is expecting 47 features as input.

# LinearSVC (L2 Regularization)

In [29]:
L2_params = {
    'linearsvc__penalty': ['l2'],
    'linearsvc__loss': ['squared_hinge'],
    'linearsvc__dual': [False],
    'linearsvc__C': list(np.logspace(-5, 1, 20)),
    'linearsvc__class_weight': ['balanced'],
    'linearsvc__random_state': [42]
}

svc_pipe = make_pipeline(StandardScaler(), LinearSVC())
L2_grid_accuracy = GridSearchCV(svc_pipe, L2_params, scoring='accuracy', refit=True)
L2_grid_ap = GridSearchCV(svc_pipe, L2_params, scoring='average_precision', refit=True)
L2_grid_auc = GridSearchCV(svc_pipe, L2_params, scoring='roc_auc', refit=True)

for key in data.keys():
    L2_grid_accuracy.fit(data[key][0], data[key][1])
    print("L2 / {} [Accuracy]: {}".format(key, L2_grid_accuracy.score(data[key][2], data[key][3])))
    L2_grid_ap.fit(data[key][0], data[key][1])
    print("L2 / {} [Average Precision]: {}".format(key, L2_grid_ap.score(data[key][2], data[key][3])))
    L2_grid_auc.fit(data[key][0], data[key][1])
    print("L2 / {} [ROC AUC]: {}".format(key, L2_grid_auc.score(data[key][2], data[key][3])))


L2 / original [Accuracy]: 0.8015046611786513
L2 / original [Average Precision]: 0.4712604566819849
L2 / original [ROC AUC]: 0.8735828696004836
L2 / us [Accuracy]: 0.804175979937851
L2 / us [Average Precision]: 0.4671176779853225
L2 / us [ROC AUC]: 0.8719495022441605
L2 / os [Accuracy]: 0.8023224118192226
L2 / os [Average Precision]: 0.4693318563515138
L2 / os [ROC AUC]: 0.8734485365842627
L2 / smote [Accuracy]: 0.8016136945973941
L2 / smote [Average Precision]: 0.4661245818391683
L2 / smote [ROC AUC]: 0.872606693734292


In [30]:
L2_params = {
    'penalty': ['l2'],
    'loss': ['squared_hinge'],
    'dual': [False],
    'C': list(np.logspace(-5, 1, 20)),
    'class_weight': ['balanced'],
    'random_state': [42]
}

svc = LinearSVC()
L2_pca_grid_accuracy = GridSearchCV(svc, L2_params, scoring='accuracy', refit=True)
L2_pca_grid_ap = GridSearchCV(svc, L2_params, scoring='average_precision', refit=True)
L2_pca_grid_auc = GridSearchCV(svc, L2_params, scoring='roc_auc', refit=True)

for key in pca_data.keys():
    L2_pca_grid_accuracy.fit(pca_data[key][0], pca_data[key][1])
    print("L2 / {} [Accuracy]: {}".format(key, L2_pca_grid_accuracy.score(pca_data[key][2], pca_data[key][3])))
    L2_pca_grid_ap.fit(pca_data[key][0], pca_data[key][1])
    print("L2 / {} [Average Precision]: {}".format(key, L2_pca_grid_ap.score(pca_data[key][2], pca_data[key][3])))
    L2_pca_grid_auc.fit(pca_data[key][0], pca_data[key][1])
    print("L2 / {} [ROC AUC]: {}".format(key, L2_pca_grid_auc.score(pca_data[key][2], pca_data[key][3])))


L2 / pca95 [Accuracy]: 0.8028130622035654
L2 / pca95 [Average Precision]: 0.41129548588316384
L2 / pca95 [ROC AUC]: 0.8237837660585245


ValueError: X has 48 features, but LinearSVC is expecting 47 features as input.