In [13]:
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pickle 
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

In [14]:
def get_pickles(): 
    x_train = pickle.load(open(f'../../../data/processed/pickles/cluster_x_train.p', 'rb'))
    x_test = pickle.load(open(f'../../../data/processed/pickles/cluster_x_test.p', 'rb'))
    y_train = pickle.load(open(f'../../../data/processed/pickles/cluster_y_train.p', 'rb'))
    y_test = pickle.load(open(f'../../../data/processed/pickles/cluster_y_test.p', 'rb'))
    X = pickle.load(open('../../../data/processed/pickles/cluster_X.p', 'rb'))
    y = pickle.load(open('../../../data/processed/pickles/cluster_y.p', 'rb'))

    return (x_train, x_test, y_train, y_test), (X,y)

(x_train, x_test, y_train, y_test), (X,y) = get_pickles()


print(f'Original X: {X.shape}\tOriginal y: {y.shape}')
print(f'Train X: {x_train.shape}\tTrain y: {y_train.shape}')
print(f'Test X: {x_test.shape}\tTest y: {y_test.shape}')

y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

Original X: (57247, 55)	Original y: (57247, 1)
Train X: (42935, 54)	Train y: (42935, 1)
Test X: (14312, 54)	Test y: (14312, 1)


In [19]:
#Vanilla 
sgd_svm = SGDClassifier()
sgd_svm.fit(x_train, y_train)
pred = sgd_svm.predict(x_test)
print(f'Vanilla ROC: {roc_auc_score(y_test, pred)}')

Vanilla ROC: 0.6360571472410942


In [11]:
#PCA
sgd_svm = SGDClassifier()
best_score = 0 
best_i = 0
r_back = [i for i in range(1,len(x_train.columns)+1)]
pbar = tqdm(r_back)
for i in pbar: 
    pbar.set_description(f'Testing {i} PCA Features || Best AUC: {best_score} || Best n: {best_i}')
    pca = PCA(n_components = i)
    pca.fit(x_train)
    x_train_new = pca.transform(x_train)
    x_test_new = pca.transform(x_test)
    lin_svm.fit(x_train_new,y_train)
    pred = sgd_svm.predict(x_test_new)
    roc_auc = roc_auc_score(y_test, pred)
    if roc_auc > best_score: 
        best_score = roc_auc 
        best_i = i
    #masks for columns that are important
        pickle.dump(best_i, open('../../../models/SGDSVM_PCA.p', 'wb'))

Testing 54 PCA Features || Best AUC: 0.6707173114074427 || Best n: 48: 100%|██████████████████████████████████████████████████████████████████████████████| 54/54 [00:53<00:00,  1.01it/s]


In [12]:
sgd_svm = SGDClassifier()
best_score = 0 
best_i = 0
r_back = [i for i in range(8,53)]
pbar = tqdm(r_back)
for i in pbar: 
    pbar.set_description(f'Testing {i} RFE Features || Best AUC: {best_score} || Best n: {best_i}')
    rfe = RFE(sgd_svm, n_features_to_select = i, step = 3)

    rfe.fit(x_train,y_train)
    pred = rfe.predict(x_test)
    roc_auc = roc_auc_score(y_test, pred)
    if roc_auc > best_score: 
        best_score = roc_auc 
        best_i = i
    #masks for columns that are important
        column_masks = rfe.support_

        orig_columns = x_train.columns
        new_columns = [x for x,y in zip(orig_columns, column_masks) if y == True]
        pickle.dump(new_columns, open('../../../models/SGDSVM_Columns.p', 'wb'))

        


Testing 52 Features || Best AUC: 0.6971892456171424 || Best n: 10: 100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [03:41<00:00,  4.92s/it]


In [15]:
columns = pickle.load(open('../../../models/SGDSVM_Columns.p', 'rb'))
print(columns)

['basin_Lake Rukwa', 'extract_india mark ii', 'extract_climax', 'waterpoint_communal standpipe multiple', 'waterpoint_other', 'waterpoint_improved spring', 'quantity_enough', 'quantity_insufficient', 'quantity_dry', 'quantity_seasonal']


In [16]:
x_train_new = x_train[columns]
x_test_new= x_test[columns]
param_grid = {
    'loss': ['hinge', 'log', 'perceptron', 'modified_huber', 'squared_hinge'],
    'penalty': ['l2', 'l1'],
    'alpha': [.001, .0001, .1, .01],
    'l1_ratio': [.15, .45, .01, .015],
    'max_iter': [1000, 3000],
    'tol': [.001, .01, .1],
    'epsilon': [.1, .01],
    'early_stopping': [True, False]
}
svm = SGDClassifier()
gs = GridSearchCV(svm, param_grid = param_grid, verbose = 2, n_jobs = 10)
gs.fit(x_train_new, y_train)

Fitting 5 folds for each of 3840 candidates, totalling 19200 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 tasks      | elapsed:    1.6s
[Parallel(n_jobs=10)]: Done 142 tasks      | elapsed:    3.3s
[Parallel(n_jobs=10)]: Done 345 tasks      | elapsed:    6.2s
[Parallel(n_jobs=10)]: Done 628 tasks      | elapsed:   10.3s
[Parallel(n_jobs=10)]: Done 993 tasks      | elapsed:   15.4s
[Parallel(n_jobs=10)]: Done 1438 tasks      | elapsed:   21.7s
[Parallel(n_jobs=10)]: Done 1965 tasks      | elapsed:   29.0s
[Parallel(n_jobs=10)]: Done 2775 tasks      | elapsed:   47.6s
[Parallel(n_jobs=10)]: Done 3491 tasks      | elapsed:  2.1min
[Parallel(n_jobs=10)]: Done 4260 tasks      | elapsed:  3.7min
[Parallel(n_jobs=10)]: Done 5181 tasks      | elapsed:  4.9min
[Parallel(n_jobs=10)]: Done 6112 tasks      | elapsed:  5.2min
[Parallel(n_jobs=10)]: Done 7125 tasks      | elapsed:  5.4min
[Parallel(n_jobs=10)]: Done 8468 tasks      | elapsed:  6.0min
[Parallel(n_jobs=10)]: Done 9863 tasks      | 

GridSearchCV(cv=None, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     validation_fraction=0...
             param_grid={'alpha': [0.001, 0.0001, 0.1, 0.01],
                         'early_stopping': [True, False],
                         'epsilon': [0.1, 0.01],
                         'l1_ratio': [0.15, 0.45, 0.01, 0.015],
                         'loss': ['hinge', 'log', 'perceptron',
                    

In [17]:
pickle.dump(gs, open('../../../models/SGD_SVM_GridSearch.p', 'wb'))
gs.best_estimator_

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=True, epsilon=0.01, eta0=0.0, fit_intercept=True,
              l1_ratio=0.45, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.1,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [18]:

svm_tuned = SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=True, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='squared_hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l1',
              power_t=0.5, random_state=None, shuffle=True, tol=0.1,
              validation_fraction=0.1, verbose=0, warm_start=False)

svm_tuned.fit(x_train_new,y_train)
test_pred = svm_tuned.predict(x_test_new)
print(f'Test Score: {svm_tuned.score(x_test_new,y_test)}')
print(f'Tuned ACC Score: {roc_auc_score(y_test, test_pred)}')

Test Score: 0.7145751816657351
Tuned ACC Score: 0.6971892456171424
