In [5]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import RFE 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pickle 
import pandas as pd
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

In [2]:
def get_pickles(pickle_dir='../../data/processed/pickles'):
    """Load the pickled train/test split and the full dataset.

    Parameters
    ----------
    pickle_dir : str, optional
        Directory containing ``x_train.p``, ``x_test.p``, ``y_train.p``,
        ``y_test.p``, ``X.p`` and ``y.p``. Defaults to the location the
        notebook has always read from.

    Returns
    -------
    tuple
        ``((x_train, x_test, y_train, y_test), (X, y))`` in that order.
    """
    def _load(name):
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # files produced by this project.
        # The context manager closes the handle even if unpickling fails.
        with open(f'{pickle_dir}/{name}.p', 'rb') as handle:
            return pickle.load(handle)

    x_train = _load('x_train')
    x_test = _load('x_test')
    y_train = _load('y_train')
    y_test = _load('y_test')
    X = _load('X')
    y = _load('y')

    return (x_train, x_test, y_train, y_test), (X,y)

# Load the pre-computed split plus the full dataset.
(x_train, x_test, y_train, y_test), (X,y) = get_pickles()


print(f'Original X: {X.shape}\tOriginal y: {y.shape}')
print(f'Train X: {x_train.shape}\tTrain y: {y_train.shape}')
print(f'Test X: {x_test.shape}\tTest y: {y_test.shape}')

# The targets are single-column frames (shape (n, 1)); flatten them to 1-D
# arrays before handing them to the estimators.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()


# Columns that get standardised; the remaining columns are left as loaded.
numeric_columns = ['amount_tsh', 'gps_height', 'population', 'time_passed']

standard = StandardScaler()

# Fit the scaler on the training data only ...
x_train[numeric_columns] = standard.fit_transform(x_train[numeric_columns])
# ... and apply the same fitted transform to the test data, so that models
# trained on scaled features are not evaluated on raw ones.
x_test[numeric_columns] = standard.transform(x_test[numeric_columns])

Original X: (57247, 53)	Original y: (57247, 1)
Train X: (48659, 53)	Train y: (48659, 1)
Test X: (8588, 53)	Test y: (8588, 1)


In [None]:
# Sweep the number of features kept by RFE (5..20) and remember the count
# that gives the highest ROC AUC on the test split.

# Define the base estimator here so this cell runs on a fresh kernel; it was
# previously only defined in a later cell.
svm = LinearSVC(max_iter = 1250)

best_score = 0
best_i = 0
pbar = tqdm(range(5,21))
for i in pbar:
    rfe = RFE(svm, n_features_to_select = i, verbose = 2)

    rfe.fit(x_train,y_train)
    # NOTE(review): the AUC below is computed from hard label predictions and
    # the feature count is selected on the test split -- confirm that both
    # are intended.
    pred = rfe.predict(x_test)
    roc_auc = roc_auc_score(y_test, pred)
    if roc_auc > best_score:
        best_score = roc_auc
        best_i = i

In [9]:
# Define the base estimator here so this cell runs on a fresh kernel; it was
# previously only defined in a later cell.
svm = LinearSVC(max_iter = 1250)

# One RFE selector per candidate feature count (5..20), then fit each of them
# on the training data. This is 16 full RFE runs, so expect it to be slow.
rfe_objects = [RFE(svm, n_features_to_select = n_features, verbose = 2) for n_features in range(5,21)]
rfe_fitted = [selector.fit(x_train,y_train) for selector in rfe_objects]

Fitting estimator with 53 features.




Fitting estimator with 52 features.


KeyboardInterrupt: 

In [6]:
# Base linear SVM handed to the feature selector.
svm = LinearSVC(max_iter=1250)

# Recursive feature elimination down to 20 features, logging each step.
rfe = RFE(
    estimator=svm,
    n_features_to_select=20,
    verbose=2,
)

rfe.fit(x_train, y_train)

In [None]:
# Score of the fitted selector on the test split.
print('Score', rfe.score(x_test,y_test))

# Boolean mask over the input columns: True where RFE kept the feature.
column_masks = rfe.support_
print(column_masks)

# Translate the mask back into the names of the retained columns.
orig_columns = x_train.columns
new_columns = [
    column_name
    for column_name, is_selected in zip(orig_columns, column_masks)
    if is_selected
]
print(new_columns)

In [None]:
# Restrict both splits to the columns selected by RFE.
x_train_new = x_train[new_columns]
x_test_new= x_test[new_columns]

# Hyper-parameters that are valid for every penalty/loss combination.
shared_grid = {
    'tol': [.0001, .001, .01],
    'C': [.5, 1.0, 2],
    'max_iter': [1000, 3000, 5000],
}

# LinearSVC does not accept every penalty/loss/dual combination:
#   * penalty='l1' with loss='hinge' is unsupported,
#   * penalty='l1' with loss='squared_hinge' requires dual=False,
#   * penalty='l2' with loss='hinge' requires dual=True.
# The grid is split so only valid combinations are searched instead of
# spending fits on candidates that can only fail.
param_grid = [
    {'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge'], 'dual': [True], **shared_grid},
    {'penalty': ['l1'], 'loss': ['squared_hinge'], 'dual': [False], **shared_grid},
]

svm = LinearSVC()
gs = GridSearchCV(svm, param_grid = param_grid, verbose = 2, n_jobs = -1)
gs.fit(x_train_new, y_train)

In [None]:
# Persist the fitted grid search; the context manager flushes and closes the
# file even if pickling raises.
with open('../../models/LinearSVC_GridSearch.p', 'wb') as model_file:
    pickle.dump(gs, model_file)

In [None]:
# Display the estimator that the grid search refit with its best-scoring
# parameter combination.
gs.best_estimator_

In [None]:

# Build the tuned model from the parameters the grid search selected.
# The previous constructor call passed SGDClassifier-only keywords (alpha,
# l1_ratio, learning_rate, ...), which LinearSVC rejects with a TypeError.
svm_tuned = LinearSVC(**gs.best_params_)

svm_tuned.fit(x_train_new,y_train)

print(f'Test Score: {svm_tuned.score(x_test_new,y_test)}')