# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV


# Data

In [2]:
def _split_target(frame, target='hospital_death'):
    """Return ``(frame, features, labels)`` for a loaded table.

    ``features`` is ``frame`` without the ``target`` column and ``labels`` is
    that column on its own. ``frame`` itself is passed through unchanged so
    the full table stays available under its own name.
    """
    return frame, frame.drop(columns=[target]), frame[target]


# --- Development tables stored as CSV ---
dev, X_dev, y_dev = _split_target(pd.read_csv('Data/dev.csv'))
dev_kmeans, X_dev_kmeans, y_dev_kmeans = _split_target(pd.read_csv('Data/dev_kmeans.csv'))
dev_us, X_dev_us, y_dev_us = _split_target(pd.read_csv('Data/dev_us.csv'))
dev_os, X_dev_os, y_dev_os = _split_target(pd.read_csv('Data/dev_os.csv'))
dev_smote, X_dev_smote, y_dev_smote = _split_target(pd.read_csv('Data/dev_smote.csv'))

# --- Development tables stored as pickles ---
# NOTE: pd.read_pickle can execute arbitrary code while loading; only point it
# at files produced by this project.
dev_pca95, X_dev_pca95, y_dev_pca95 = _split_target(pd.read_pickle('Data/dev_pca95.pkl'))
dev_us_pca95, X_dev_us_pca95, y_dev_us_pca95 = _split_target(pd.read_pickle('Data/dev_us_pca95.pkl'))
dev_os_pca95, X_dev_os_pca95, y_dev_os_pca95 = _split_target(pd.read_pickle('Data/dev_os_pca95.pkl'))
dev_smote_pca95, X_dev_smote_pca95, y_dev_smote_pca95 = _split_target(
    pd.read_pickle('Data/dev_smote_pca95.pkl')
)

# --- Test tables stored as CSV ---
test, X_test, y_test = _split_target(pd.read_csv("Data/test.csv"))
test_kmeans, X_test_kmeans, y_test_kmeans = _split_target(pd.read_csv("Data/test_kmeans.csv"))
test_normalized, X_test_normalized, y_test_normalized = _split_target(
    pd.read_csv("Data/test_normalized.csv")
)
# The file on disk is spelled "normalization", unlike the variable name.
test_normalized_kmeans, X_test_normalized_kmeans, y_test_normalized_kmeans = _split_target(
    pd.read_csv("Data/test_normalization_kmeans.csv")
)

# --- Test tables stored as pickles ---
test_pca95, X_test_pca95, y_test_pca95 = _split_target(pd.read_pickle("Data/test_pca95.pkl"))
test_us_pca95, X_test_us_pca95, y_test_us_pca95 = _split_target(pd.read_pickle("Data/test_us_pca95.pkl"))
test_os_pca95, X_test_os_pca95, y_test_os_pca95 = _split_target(pd.read_pickle("Data/test_os_pca95.pkl"))
test_smote_pca95, X_test_smote_pca95, y_test_smote_pca95 = _split_target(
    pd.read_pickle("Data/test_smote_pca95.pkl")
)

# Every entry below is a (X_train, y_train, X_eval, y_eval) tuple.

# All four dev variants in `data` are evaluated on the same normalized test split.
_normalized_eval = (X_test_normalized, y_test_normalized)
data = {
    "original": (X_dev, y_dev, *_normalized_eval),
    "us": (X_dev_us, y_dev_us, *_normalized_eval),
    "os": (X_dev_os, y_dev_os, *_normalized_eval),
    "smote": (X_dev_smote, y_dev_smote, *_normalized_eval),
}

# Each *_pca95 dev table is paired with the test table of the same name.
pca_data = {
    "pca95": (X_dev_pca95, y_dev_pca95, X_test_pca95, y_test_pca95),
    "us_pca95": (X_dev_us_pca95, y_dev_us_pca95, X_test_us_pca95, y_test_us_pca95),
    "os_pca95": (X_dev_os_pca95, y_dev_os_pca95, X_test_os_pca95, y_test_os_pca95),
    "smote_pca95": (X_dev_smote_pca95, y_dev_smote_pca95, X_test_smote_pca95, y_test_smote_pca95),
}

# Only the "*_normalized" pairings are active; the two pairings against the
# plain test tables are kept below, disabled.
data_v2 = {
#     "original": (X_dev, y_dev, X_test, y_test),
    "original_normalized": (X_dev, y_dev, X_test_normalized, y_test_normalized),
#     "kmeans": (X_dev_kmeans, y_dev_kmeans, X_test_kmeans, y_test_kmeans),
    "kmeans_normalized": (X_dev_kmeans, y_dev_kmeans, X_test_normalized_kmeans, y_test_normalized_kmeans),
}


# LinearSVC (L1 Regularization)

In [3]:
# Search space for the 'linearsvc' pipeline step: only C varies, every other
# setting is pinned to a single value.
L1_params = dict(
    linearsvc__penalty=['l1'],
    linearsvc__loss=['squared_hinge'],
    linearsvc__dual=[False],
    linearsvc__C=list(np.logspace(-5, 1, 20)),
    linearsvc__class_weight=['balanced'],
    linearsvc__random_state=[42],
)

svc_pipe = make_pipeline(StandardScaler(), LinearSVC())

# One grid search per selection metric. With refit=True each search refits its
# best configuration, and .score() reports that search's own metric.
L1_grid_accuracy, L1_grid_ap, L1_grid_auc, L1_grid_f1 = (
    GridSearchCV(svc_pipe, L1_params, scoring=metric, refit=True)
    for metric in ('accuracy', 'average_precision', 'roc_auc', 'f1')
)

for key in data.keys():
    X_fit, y_fit, X_eval, y_eval = data[key]
    # Fit and report in a fixed order so the printed log groups by dataset.
    for grid, template in (
        (L1_grid_accuracy, "L1 / {} [Accuracy]: {}"),
        (L1_grid_ap, "L1 / {} [Average Precision]: {}"),
        (L1_grid_auc, "L1 / {} [ROC AUC]: {}"),
        (L1_grid_f1, "L1 / {} [F1-score]: {}"),
    ):
        grid.fit(X_fit, y_fit)
        print(template.format(key, grid.score(X_eval, y_eval)))


L1 / original [Accuracy]: 0.9137000490650384
L1 / original [Average Precision]: 0.4707165614147216
L1 / original [ROC AUC]: 0.8736291549382836
L1 / original [F1-score]: 0.4005270092226614
L1 / us [Accuracy]: 0.8054298642533937
L1 / us [Average Precision]: 0.4664117894045946
L1 / us [ROC AUC]: 0.8722180552016728
L1 / us [F1-score]: 0.40127495386680084
L1 / os [Accuracy]: 0.8017772447255084
L1 / os [Average Precision]: 0.4695332414264789
L1 / os [ROC AUC]: 0.8734469158436069
L1 / os [F1-score]: 0.400988467874794
L1 / smote [Accuracy]: 0.8015591778880227
L1 / smote [Average Precision]: 0.46502719058459285
L1 / smote [ROC AUC]: 0.8725315742894748
L1 / smote [F1-score]: 0.399736147757256


In [3]:
## Using Average Precision
#
# For every dataset in `data_v2`: grid-search C with average precision as the
# selection metric, then hold C at the selected value and report accuracy,
# ROC AUC and F1 for that same C. Each reported number is the search's own
# metric evaluated on the dataset's held-out split.

# (printed label, scikit-learn scoring name); the first entry selects C.
L1_ap_metrics = (
    ("Average Precision", "average_precision"),
    ("Accuracy", "accuracy"),
    ("ROC AUC", "roc_auc"),
    ("F1-score", "f1"),
)

for key, (X_train, y_train, X_eval, y_eval) in data_v2.items():
    # Keys containing "normalized" are fit through a StandardScaler pipeline,
    # so their grid keys carry the pipeline step prefix; any other key is fit
    # on a bare LinearSVC with unprefixed keys.
    if "normalized" in key:
        estimator = make_pipeline(StandardScaler(), LinearSVC())
        prefix = 'linearsvc__'
    else:
        estimator = LinearSVC()
        prefix = ''

    c_key = prefix + 'C'
    L1_params = {
        prefix + 'penalty': ['l1'],
        prefix + 'loss': ['squared_hinge'],
        prefix + 'dual': [False],
        c_key: list(np.logspace(-5, 1, 20)),
        prefix + 'class_weight': ['balanced'],
        prefix + 'random_state': [42],
        prefix + 'max_iter': [1000],
    }

    for position, (label, scoring) in enumerate(L1_ap_metrics):
        # GridSearchCV clones the estimator, so sharing it across searches is safe.
        search = GridSearchCV(estimator, L1_params, scoring=scoring, refit=True)
        search.fit(X_train, y_train)
        print("L1 / {} [{}]: {}".format(key, label, search.score(X_eval, y_eval)))
        print(f"    L1 / {key} [{label}] best_params_: {search.best_params_}")

        if position == 0:
            # Collapse the C grid to the value chosen by the selection metric
            # so the remaining metrics describe that one model.
            L1_params[c_key] = [search.best_params_[c_key]]


L1 / original_normalized [Average Precision]: 0.4707165614147216
    L1 / original_normalized [Average Precision] best_params_: {'linearsvc__C': 0.0007847599703514606, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l1', 'linearsvc__random_state': 42}
L1 / original_normalized [Accuracy]: 0.7998691598975086
    L1 / original_normalized [Accuracy] best_params_: {'linearsvc__C': 0.0007847599703514606, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l1', 'linearsvc__random_state': 42}
L1 / original_normalized [ROC AUC]: 0.8724873996836917
    L1 / original_normalized [ROC AUC] best_params_: {'linearsvc__C': 0.0007847599703514606, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l1'

In [3]:
## Using F1-score
#
# For every dataset in `data_v2`: grid-search C with F1 as the selection
# metric, then hold C at the selected value and report accuracy, ROC AUC and
# average precision for that same C. Each reported number is the search's own
# metric evaluated on the dataset's held-out split.

# (printed label, scikit-learn scoring name); the first entry selects C.
L1_f1_metrics = (
    ("F1-score", "f1"),
    ("Accuracy", "accuracy"),
    ("ROC AUC", "roc_auc"),
    ("Average Precision", "average_precision"),
)

for key, (X_train, y_train, X_eval, y_eval) in data_v2.items():
    # Keys containing "normalized" are fit through a StandardScaler pipeline,
    # so their grid keys carry the pipeline step prefix; any other key is fit
    # on a bare LinearSVC with unprefixed keys.
    if "normalized" in key:
        estimator = make_pipeline(StandardScaler(), LinearSVC())
        prefix = 'linearsvc__'
    else:
        estimator = LinearSVC()
        prefix = ''

    c_key = prefix + 'C'
    L1_params = {
        prefix + 'penalty': ['l1'],
        prefix + 'loss': ['squared_hinge'],
        prefix + 'dual': [False],
        c_key: list(np.logspace(-5, 1, 20)),
        prefix + 'class_weight': ['balanced'],
        prefix + 'random_state': [42],
        prefix + 'max_iter': [1000],
    }

    for position, (label, scoring) in enumerate(L1_f1_metrics):
        # GridSearchCV clones the estimator, so sharing it across searches is safe.
        search = GridSearchCV(estimator, L1_params, scoring=scoring, refit=True)
        search.fit(X_train, y_train)
        print("L1 / {} [{}]: {}".format(key, label, search.score(X_eval, y_eval)))
        print(f"    L1 / {key} [{label}] best_params_: {search.best_params_}")

        if position == 0:
            # Collapse the C grid to the value chosen by the selection metric
            # so the remaining metrics describe that one model.
            L1_params[c_key] = [search.best_params_[c_key]]


L1 / original_normalized [F1-score]: 0.4005270092226614
    L1 / original_normalized [F1-score] best_params_: {'linearsvc__C': 0.06158482110660261, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l1', 'linearsvc__random_state': 42}
L1 / original_normalized [Accuracy]: 0.8015591778880227
    L1 / original_normalized [Accuracy] best_params_: {'linearsvc__C': 0.06158482110660261, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l1', 'linearsvc__random_state': 42}
L1 / original_normalized [ROC AUC]: 0.8735587092572183
    L1 / original_normalized [ROC AUC] best_params_: {'linearsvc__C': 0.06158482110660261, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l1', 'linearsvc__random_sta

In [4]:
## Using Accuracy
#
# For every dataset in `data_v2`: grid-search C with accuracy as the selection
# metric, then hold C at the selected value and report F1, ROC AUC and average
# precision for that same C. Each reported number is the search's own metric
# evaluated on the dataset's held-out split.
#
# NOTE(review): the recorded run for "original_normalized" selected C=1e-05
# and then scored F1 = 0.0 and ROC AUC = 0.5 at that C, so selecting on
# accuracy appears to pick a degenerate model here. Interpret the accuracy
# figure with that in mind.

# (printed label, scikit-learn scoring name); the first entry selects C.
L1_accuracy_metrics = (
    ("Accuracy", "accuracy"),
    ("F1-score", "f1"),
    ("ROC AUC", "roc_auc"),
    ("Average Precision", "average_precision"),
)

for key, (X_train, y_train, X_eval, y_eval) in data_v2.items():
    # Keys containing "normalized" are fit through a StandardScaler pipeline,
    # so their grid keys carry the pipeline step prefix; any other key is fit
    # on a bare LinearSVC with unprefixed keys.
    if "normalized" in key:
        estimator = make_pipeline(StandardScaler(), LinearSVC())
        prefix = 'linearsvc__'
    else:
        estimator = LinearSVC()
        prefix = ''

    c_key = prefix + 'C'
    L1_params = {
        prefix + 'penalty': ['l1'],
        prefix + 'loss': ['squared_hinge'],
        prefix + 'dual': [False],
        c_key: list(np.logspace(-5, 1, 20)),
        prefix + 'class_weight': ['balanced'],
        prefix + 'random_state': [42],
        prefix + 'max_iter': [1000],
    }

    for position, (label, scoring) in enumerate(L1_accuracy_metrics):
        # GridSearchCV clones the estimator, so sharing it across searches is safe.
        search = GridSearchCV(estimator, L1_params, scoring=scoring, refit=True)
        search.fit(X_train, y_train)
        print("L1 / {} [{}]: {}".format(key, label, search.score(X_eval, y_eval)))
        print(f"    L1 / {key} [{label}] best_params_: {search.best_params_}")

        if position == 0:
            # C is always taken from the fitted accuracy search, for the
            # pipeline and the bare estimator alike; reading best_params_ from
            # any search that has not been fitted raises AttributeError.
            L1_params[c_key] = [search.best_params_[c_key]]


L1 / original_normalized [Accuracy]: 0.9137000490650384
    L1 / original_normalized [Accuracy] best_params_: {'linearsvc__C': 1e-05, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l1', 'linearsvc__random_state': 42}
L1 / original_normalized [F1-score]: 0.0
    L1 / original_normalized [F1-score] best_params_: {'linearsvc__C': 1e-05, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l1', 'linearsvc__random_state': 42}
L1 / original_normalized [ROC AUC]: 0.5
    L1 / original_normalized [ROC AUC] best_params_: {'linearsvc__C': 1e-05, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l1', 'linearsvc__random_state': 42}
L1 / original_normalized [Average Precision]: 0.086299950934961

# LinearSVC (L2 Regularization)

In [4]:
# Search space for the 'linearsvc' pipeline step: only C varies, every other
# setting is pinned to a single value.
L2_params = dict(
    linearsvc__penalty=['l2'],
    linearsvc__loss=['squared_hinge'],
    linearsvc__dual=[False],
    linearsvc__C=list(np.logspace(-5, 1, 20)),
    linearsvc__class_weight=['balanced'],
    linearsvc__random_state=[42],
)

svc_pipe = make_pipeline(StandardScaler(), LinearSVC())

# One grid search per selection metric. With refit=True each search refits its
# best configuration, and .score() reports that search's own metric.
L2_grid_accuracy, L2_grid_ap, L2_grid_auc, L2_grid_f1 = (
    GridSearchCV(svc_pipe, L2_params, scoring=metric, refit=True)
    for metric in ('accuracy', 'average_precision', 'roc_auc', 'f1')
)

for key in data.keys():
    X_fit, y_fit, X_eval, y_eval = data[key]
    # Fit and report in a fixed order so the printed log groups by dataset.
    for grid, template in (
        (L2_grid_accuracy, "L2 / {} [Accuracy]: {}"),
        (L2_grid_ap, "L2 / {} [Average Precision]: {}"),
        (L2_grid_auc, "L2 / {} [ROC AUC]: {}"),
        (L2_grid_f1, "L2 / {} [F1-score]: {}"),
    ):
        grid.fit(X_fit, y_fit)
        print(template.format(key, grid.score(X_eval, y_eval)))


L2 / original [Accuracy]: 0.8015046611786513
L2 / original [Average Precision]: 0.4712604566819849
L2 / original [ROC AUC]: 0.8735828696004836
L2 / original [F1-score]: 0.40019746585486254
L2 / us [Accuracy]: 0.804175979937851
L2 / us [Average Precision]: 0.4671176779853225
L2 / us [ROC AUC]: 0.8719495022441605
L2 / us [F1-score]: 0.401133711237079
L2 / os [Accuracy]: 0.8023224118192226
L2 / os [Average Precision]: 0.4693318563515138
L2 / os [ROC AUC]: 0.8734485365842627
L2 / os [F1-score]: 0.4022420046158919
L2 / smote [Accuracy]: 0.8016136945973941
L2 / smote [Average Precision]: 0.4661245818391683
L2 / smote [ROC AUC]: 0.872606693734292
L2 / smote [F1-score]: 0.39960402573832704


In [5]:
## Using F1-score

for key in data_v2.keys():
    L2_params_pipe = {
        'linearsvc__penalty': ['l2'],
        'linearsvc__loss': ['squared_hinge'],
        'linearsvc__dual': [False],
        'linearsvc__C': list(np.logspace(-5, 1, 20)),
        'linearsvc__class_weight': ['balanced'],
        'linearsvc__random_state': [42],
        'linearsvc__max_iter': [1000]
    }

    L2_params = {
        'penalty': ['l2'],
        'loss': ['squared_hinge'],
        'dual': [False],
        'C': list(np.logspace(-5, 1, 20)),
        'class_weight': ['balanced'],
        'random_state': [42],
        'max_iter': [1000]
    }

    svc_pipe = make_pipeline(StandardScaler(), LinearSVC())
    L2_grid_accuracy_pipe = GridSearchCV(svc_pipe, L2_params_pipe, scoring='accuracy', refit=True)
    L2_grid_ap_pipe = GridSearchCV(svc_pipe, L2_params_pipe, scoring='average_precision', refit=True)
    L2_grid_auc_pipe = GridSearchCV(svc_pipe, L2_params_pipe, scoring='roc_auc', refit=True)
    L2_grid_f1_pipe = GridSearchCV(svc_pipe, L2_params_pipe, scoring='f1', refit=True)

    L2_grid_accuracy = GridSearchCV(LinearSVC(), L2_params, scoring='accuracy', refit=True)
    L2_grid_ap = GridSearchCV(LinearSVC(), L2_params, scoring='average_precision', refit=True)
    L2_grid_auc = GridSearchCV(LinearSVC(), L2_params, scoring='roc_auc', refit=True)
    L2_grid_f1 = GridSearchCV(LinearSVC(), L2_params, scoring='f1', refit=True)
    
    if "normalized" in key:
        L2_grid_f1_pipe.fit(data_v2[key][0], data_v2[key][1])
        print("L2 / {} [F1-score]: {}".format(key, L2_grid_f1_pipe.score(data_v2[key][2], data_v2[key][3])))
        print(f"    L2 / {key} [F1-score] best_params_: {L2_grid_f1_pipe.best_params_}")
        L2_params_pipe['linearsvc__C'] = [L2_grid_f1_pipe.best_params_['linearsvc__C']]
        L2_grid_accuracy_pipe = GridSearchCV(svc_pipe, L2_params_pipe, scoring='accuracy', refit=True)
        L2_grid_auc_pipe = GridSearchCV(svc_pipe, L2_params_pipe, scoring='roc_auc', refit=True)
        L2_grid_ap_pipe = GridSearchCV(svc_pipe, L2_params_pipe, scoring='average_precision', refit=True)
        L2_grid_accuracy_pipe.fit(data_v2[key][0], data_v2[key][1])
        print("L2 / {} [Accuracy]: {}".format(key, L2_grid_accuracy_pipe.score(data_v2[key][2], data_v2[key][3])))
        print(f"    L2 / {key} [Accuracy] best_params_: {L2_grid_accuracy_pipe.best_params_}")
        L2_grid_auc_pipe.fit(data_v2[key][0], data_v2[key][1])
        print("L2 / {} [ROC AUC]: {}".format(key, L2_grid_auc_pipe.score(data_v2[key][2], data_v2[key][3])))
        print(f"    L2 / {key} [ROC AUC] best_params_: {L2_grid_auc_pipe.best_params_}")
        L2_grid_ap_pipe.fit(data_v2[key][0], data_v2[key][1])
        print("L2 / {} [Average Precision]: {}".format(key, L2_grid_ap_pipe.score(data_v2[key][2], data_v2[key][3])))
        print(f"    L2 / {key} [Average Precision] best_params_: {L2_grid_ap_pipe.best_params_}")
    else:
        L2_grid_f1.fit(data_v2[key][0], data_v2[key][1])
        print("L2 / {} [F1-score]: {}".format(key, L2_grid_f1.score(data_v2[key][2], data_v2[key][3])))
        print(f"    L2 / {key} [F1-score] best_params_: {L2_grid_f1.best_params_}")
        L2_params['C'] = [L2_grid_f1.best_params_['C']]
        L2_grid_accuracy = GridSearchCV(LinearSVC(), L2_params, scoring='accuracy', refit=True)
        L2_grid_auc = GridSearchCV(LinearSVC(), L2_params, scoring='roc_auc', refit=True)
        L2_grid_ap = GridSearchCV(LinearSVC(), L2_params, scoring='average_precision', refit=True)
        L2_grid_accuracy.fit(data_v2[key][0], data_v2[key][1])
        print("L2 / {} [Accuracy]: {}".format(key, L2_grid_accuracy.score(data_v2[key][2], data_v2[key][3])))
        print(f"    L2 / {key} [Accuracy] best_params_: {L2_grid_accuracy.best_params_}")
        L2_grid_auc.fit(data_v2[key][0], data_v2[key][1])
        print("L2 / {} [ROC AUC]: {}".format(key, L2_grid_auc.score(data_v2[key][2], data_v2[key][3])))
        print(f"    L2 / {key} [ROC AUC] best_params_: {L2_grid_auc.best_params_}")
        L2_grid_ap.fit(data_v2[key][0], data_v2[key][1])
        print("L2 / {} [Average Precision]: {}".format(key, L2_grid_ap.score(data_v2[key][2], data_v2[key][3])))
        print(f"    L2 / {key} [Average Precision] best_params_: {L2_grid_ap.best_params_}")


L2 / original_normalized [F1-score]: 0.40019746585486254
    L2 / original_normalized [F1-score] best_params_: {'linearsvc__C': 0.003359818286283781, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l2', 'linearsvc__random_state': 42}
L2 / original_normalized [Accuracy]: 0.8012865943411656
    L2 / original_normalized [Accuracy] best_params_: {'linearsvc__C': 0.003359818286283781, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l2', 'linearsvc__random_state': 42}
L2 / original_normalized [ROC AUC]: 0.8735451779573241
    L2 / original_normalized [ROC AUC] best_params_: {'linearsvc__C': 0.003359818286283781, 'linearsvc__class_weight': 'balanced', 'linearsvc__dual': False, 'linearsvc__loss': 'squared_hinge', 'linearsvc__max_iter': 1000, 'linearsvc__penalty': 'l2', 'linearsvc__random

# SVC

In [None]:
## Using F1-score

# For every dataset in `data_v2`: fit an RBF-kernel SVC through GridSearchCV
# with F1 as the first metric, then report accuracy, ROC AUC and average
# precision. Each reported number is the search's own metric evaluated on the
# dataset's held-out split.

# (printed label, scikit-learn scoring name); the first entry is the
# selection metric.
SVC_f1_metrics = (
    ("F1-score", "f1"),
    ("Accuracy", "accuracy"),
    ("ROC AUC", "roc_auc"),
    ("Average Precision", "average_precision"),
)

for key, (X_train, y_train, X_eval, y_eval) in data_v2.items():
    # Keys containing "normalized" are fit through a StandardScaler pipeline,
    # so their grid keys carry the pipeline step prefix; any other key is fit
    # on a bare SVC with unprefixed keys. Both paths use SVC: the grid sets
    # 'kernel', which LinearSVC does not accept.
    if "normalized" in key:
        estimator = make_pipeline(StandardScaler(), SVC())
        prefix = 'svc__'
    else:
        estimator = SVC()
        prefix = ''

    c_key = prefix + 'C'
    svc_params = {
        # The C search is currently disabled, so C stays at the value SVC()
        # is constructed with. To enable it, add:
        #     c_key: list(np.logspace(-5, 1, 20)),
        prefix + 'kernel': ['rbf'],
        prefix + 'class_weight': ['balanced'],
        prefix + 'random_state': [42],
    }

    for position, (label, scoring) in enumerate(SVC_f1_metrics):
        # GridSearchCV clones the estimator, so sharing it across searches is safe.
        search = GridSearchCV(estimator, svc_params, scoring=scoring, refit=True)
        search.fit(X_train, y_train)
        print("SVC / {} [{}]: {}".format(key, label, search.score(X_eval, y_eval)))
        print(f"    SVC / {key} [{label}] best_params_: {search.best_params_}")

        if position == 0 and c_key in search.best_params_:
            # Only relevant once C is part of the grid: hold it at the value
            # chosen by the selection metric for the remaining metrics.
            svc_params[c_key] = [search.best_params_[c_key]]


SVC / original_normalized [F1-score]: 0.4508573818485989
    SVC / original_normalized [F1-score] best_params_: {'svc__class_weight': 'balanced', 'svc__kernel': 'rbf', 'svc__random_state': 42}
SVC / original_normalized [Accuracy]: 0.856839121190645
    SVC / original_normalized [Accuracy] best_params_: {'svc__class_weight': 'balanced', 'svc__kernel': 'rbf', 'svc__random_state': 42}
SVC / original_normalized [ROC AUC]: 0.8697304067531363
    SVC / original_normalized [ROC AUC] best_params_: {'svc__class_weight': 'balanced', 'svc__kernel': 'rbf', 'svc__random_state': 42}
SVC / original_normalized [Average Precision]: 0.4066883285080185
    SVC / original_normalized [Average Precision] best_params_: {'svc__class_weight': 'balanced', 'svc__kernel': 'rbf', 'svc__random_state': 42}
SVC / kmeans_normalized [F1-score]: 0.0
    SVC / kmeans_normalized [F1-score] best_params_: {'svc__class_weight': 'balanced', 'svc__kernel': 'rbf', 'svc__random_state': 42}
