In [1]:
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import RFE 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pickle 
import pandas as pd

In [2]:
def get_pickles(): 
    x_train = pickle.load(open(f'../../data/processed/pickles/x_train.p', 'rb'))
    x_test = pickle.load(open(f'../../data/processed/pickles/x_test.p', 'rb'))
    y_train = pickle.load(open(f'../../data/processed/pickles/y_train.p', 'rb'))
    y_test = pickle.load(open(f'../../data/processed/pickles/y_test.p', 'rb'))
    X = pickle.load(open('../../data/processed/pickles/X.p', 'rb'))
    y = pickle.load(open('../../data/processed/pickles/y.p', 'rb'))

    return (x_train, x_test, y_train, y_test), (X,y)

(x_train, x_test, y_train, y_test), (X,y) = get_pickles()


print(f'Original X: {X.shape}\tOriginal y: {y.shape}')
print(f'Train X: {x_train.shape}\tTrain y: {y_train.shape}')
print(f'Test X: {x_test.shape}\tTest y: {y_test.shape}')

y_train = y_train.values.ravel()
y_test = y_test.values.ravel()


standard = StandardScaler() 

x_train[['amount_tsh', 'gps_height', 
         'population', 'time_passed']] = standard.fit_transform(x_train[['amount_tsh', 
                                                                         'gps_height', 'population', 'time_passed']])

Original X: (57247, 53)	Original y: (57247, 1)
Train X: (48659, 53)	Train y: (48659, 1)
Test X: (8588, 53)	Test y: (8588, 1)


In [3]:
svm = SGDClassifier(n_jobs = 8)
rfe = RFE(svm, n_features_to_select = 5, verbose = 2)
rfe.fit(x_train,y_train)

Fitting estimator with 53 features.
Fitting estimator with 52 features.
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 fe

RFE(estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                            early_stopping=False, epsilon=0.1, eta0=0.0,
                            fit_intercept=True, l1_ratio=0.15,
                            learning_rate='optimal', loss='hinge',
                            max_iter=1000, n_iter_no_change=5, n_jobs=8,
                            penalty='l2', power_t=0.5, random_state=None,
                            shuffle=True, tol=0.001, validation_fraction=0.1,
                            verbose=0, warm_start=False),
    n_features_to_select=5, step=1, verbose=2)

In [5]:
print('Score', rfe.score(x_test,y_test))
#masks for columns that are important
column_masks = rfe.support_
print(column_masks)

orig_columns = x_train.columns
new_columns = [x for x,y in zip(orig_columns, column_masks) if y == True]
print(new_columns)

Score 0.7078481602235678
[False False False False False False False False False False False False
 False  True False False False False False False False False False False
 False False  True False False  True False  True False False False False
 False False False False False False False False False False False False
 False False False  True False]
['basin_Lake Rukwa', 'extract_climax', 'waterpoint_communal standpipe multiple', 'waterpoint_other', 'quantity_dry']


In [6]:
x_train_new = x_train[new_columns]
x_test_new= x_test[new_columns]
param_grid = {
    'loss': ['hinge', 'log', 'perceptron', 'modified_huber', 'squared_hinge'],
    'penalty': ['l2', 'l1'],
    'alpha': [.001, .0001, .1, .01],
    'l1_ratio': [.15, .45, .01, .015],
    'max_iter': [1000, 3000],
    'tol': [.001, .01, .1],
    'epsilon': [.1, .01],
    'early_stopping': [True, False]
}
svm = SGDClassifier()
gs = GridSearchCV(svm, param_grid = param_grid, verbose = 2, n_jobs = -1)
gs.fit(x_train_new, y_train)

Fitting 5 folds for each of 3840 candidates, totalling 19200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed:    5.3s
[Parallel(n_jobs=-1)]: Done 616 tasks      | elapsed:    8.4s
[Parallel(n_jobs=-1)]: Done 981 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done 1426 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 1953 tasks      | elapsed:   23.0s
[Parallel(n_jobs=-1)]: Done 2624 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 4002 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 5540 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done 7242 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 9104 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 11130 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 13316 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 15666 tasks     

GridSearchCV(cv=None, error_score=nan,
             estimator=SGDClassifier(alpha=0.0001, average=False,
                                     class_weight=None, early_stopping=False,
                                     epsilon=0.1, eta0=0.0, fit_intercept=True,
                                     l1_ratio=0.15, learning_rate='optimal',
                                     loss='hinge', max_iter=1000,
                                     n_iter_no_change=5, n_jobs=None,
                                     penalty='l2', power_t=0.5,
                                     random_state=None, shuffle=True, tol=0.001,
                                     validation_fraction=0...
             param_grid={'alpha': [0.001, 0.0001, 0.1, 0.01],
                         'early_stopping': [True, False],
                         'epsilon': [0.1, 0.01],
                         'l1_ratio': [0.15, 0.45, 0.01, 0.015],
                         'loss': ['hinge', 'log', 'perceptron',
                    

In [7]:
pickle.dump(gs, open('../../models/SGD_SVM_GridSearch.p', 'wb'))

In [8]:
gs.best_estimator_

SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=True, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.45, learning_rate='optimal', loss='modified_huber',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [9]:
x_test_new= x_test[new_columns]

svm_tuned = SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=True, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='squared_hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l1',
              power_t=0.5, random_state=None, shuffle=True, tol=0.1,
              validation_fraction=0.1, verbose=0, warm_start=False)

svm_tuned.fit(x_train_new,y_train)

print(f'Test Score: {svm_tuned.score(x_test_new,y_test)}')

Test Score: 0.6934094084769445
