In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score, GridSearchCV
import pandas as pd
import pickle 
import numpy as np
from tqdm import tqdm
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.metrics import roc_auc_score

In [2]:
def get_pickles(): 
    x_train = pickle.load(open(f'../../../data/processed/pickles/cluster_x_train.p', 'rb'))
    x_test = pickle.load(open(f'../../../data/processed/pickles/cluster_x_test.p', 'rb'))
    y_train = pickle.load(open(f'../../../data/processed/pickles/cluster_y_train.p', 'rb'))
    y_test = pickle.load(open(f'../../../data/processed/pickles/cluster_y_test.p', 'rb'))
    X = pickle.load(open('../../../data/processed/pickles/cluster_X.p', 'rb'))
    y = pickle.load(open('../../../data/processed/pickles/cluster_y.p', 'rb'))

    return (x_train, x_test, y_train, y_test), (X,y)

(x_train, x_test, y_train, y_test), (X,y) = get_pickles()


print(f'Original X: {X.shape}\tOriginal y: {y.shape}')
print(f'Train X: {x_train.shape}\tTrain y: {y_train.shape}')
print(f'Test X: {x_test.shape}\tTest y: {y_test.shape}')

y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

Original X: (57247, 55)	Original y: (57247, 1)
Train X: (42935, 54)	Train y: (42935, 1)
Test X: (14312, 54)	Test y: (14312, 1)


In [10]:
#Vanilla 
lg = LogisticRegression(max_iter = 3000) 
lg.fit(x_train,y_train)
test_pred = lg.predict(x_test)
print(f'Vanilla ROC Score: {roc_auc_score(y_test, test_pred)}')

Vanilla ROC Score: 0.713695346683728


In [8]:
lg = LogisticRegression(max_iter = 3000)
best_score = 0 
best_i = 0
r_back = [i for i in range(1,len(x_train.columns)+1)]
pbar = tqdm(r_back)
for i in pbar: 
    pbar.set_description(f'Testing {i} PCA Features || Best AUC: {best_score} || Best n: {best_i}')
    pca = PCA(n_components = i)
    pca.fit(x_train)
    x_train_new = pca.transform(x_train)
    x_test_new = pca.transform(x_test)
    lg.fit(x_train_new,y_train)
    pred = lg.predict(x_test_new)
    roc_auc = roc_auc_score(y_test, pred)
    if roc_auc > best_score: 
        best_score = roc_auc 
        best_i = i
    #masks for columns that are important
        pickle.dump(best_i, open('../../../models/Logistic_PCA.p', 'wb'))

Testing 54 PCA Features || Best AUC: 0.7145282072313638 || Best n: 41: 100%|██████████████████████████████████████████████████████████████████████████████| 54/54 [02:26<00:00,  2.72s/it]


In [5]:
lg = LogisticRegression(max_iter = 3000)
best_score = 0 
best_i = 0
r_back = [i for i in range(8,53)]
pbar = tqdm(r_back)
for i in pbar: 
    pbar.set_description(f'Testing {i} Features || Best AUC: {best_score} || Best n: {best_i}')
    rfe = RFE(lg, n_features_to_select = i, step = 3)

    rfe.fit(x_train,y_train)
    pred = rfe.predict(x_test)
    roc_auc = roc_auc_score(y_test, pred)
    if roc_auc > best_score: 
        best_score = roc_auc 
        best_i = i
    #masks for columns that are important
        column_masks = rfe.support_

        orig_columns = x_train.columns
        new_columns = [x for x,y in zip(orig_columns, column_masks) if y == True]
        pickle.dump(new_columns, open('../../../models/LGColumns.p', 'wb'))

        


Testing 52 Features || Best AUC: 0.7120531464795856 || Best n: 51: 100%|██████████████████████████████████████████████████████████████████████████████████| 45/45 [05:44<00:00,  7.65s/it]


In [11]:
columns = pickle.load(open('../../../models/LGColumns.p', 'rb'))
print(columns)

['gps_height', 'permit', 'time_passed', 'basin_Lake Nyasa', 'basin_Lake Victoria', 'basin_Pangani', 'basin_Ruvuma / Southern Coast', 'basin_Internal', 'basin_Lake Tanganyika', 'basin_Wami / Ruvu', 'basin_Rufiji', 'basin_Lake Rukwa', 'extract_gravity', 'extract_submersible', 'extract_swn 80', 'extract_nira/tanira', 'extract_india mark ii', 'extract_extract_other', 'extract_ksb', 'extract_windmill', 'extract_afridev', 'extract_mono', 'extract_india mark iii', 'extract_cemo', 'extract_climax', 'extract_walimi', 'waterpoint_communal standpipe', 'waterpoint_communal standpipe multiple', 'waterpoint_hand pump', 'waterpoint_other', 'waterpoint_improved spring', 'waterpoint_cattle trough', 'waterpoint_dam', 'source_spring', 'source_rainwater harvesting', 'source_dam', 'source_machine dbh', 'source_other', 'source_shallow well', 'source_river', 'source_hand dtw', 'source_lake', 'quality_soft', 'quality_salty', 'quality_milky', 'quality_fluoride', 'quality_coloured', 'quantity_enough', 'quantity

In [12]:

param_grid = {
    'max_iter': [100, 1000,],
    'penalty': ['l1', 'l2'],
    'solver': ['sag', 'saga', 'liblinear', 'lbfgs', 'newton-cg'],
    'multi_class': ['ovr']  
}
x_train_new = x_train[columns]
x_test_new= x_test[columns]
log = LogisticRegression()
gs = GridSearchCV(log, param_grid = param_grid, verbose = 2, n_jobs = 11)
gs.fit(x_train_new, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=11)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  19 tasks      | elapsed:    2.0s
[Parallel(n_jobs=11)]: Done 100 out of 100 | elapsed:  1.5min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=11,
             param_grid={'max_iter': [100, 1000], 'multi_class': ['ovr'],
                         'penalty': ['l1', 'l2'],
                         'solver': ['sag', 'saga', 'liblinear', 'lbfgs',
                                    'newton-cg']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, 

In [14]:
pickle.dump(gs, open('../../../models/Logistic_GridSearch.p', 'wb'))
gs.best_estimator_

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
log_tuned = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

log_tuned.fit(x_train_new, y_train)

test_pred = log_tuned.predict(x_test_new)


print(f'Tuned ROC Score: {roc_auc_score(y_test, test_pred)}')
print(f'Tuned ACC Score: {log_tuned.score(x_test_new, y_test)}')

Tuned ROC Score: 0.7120797456081445
Tuned ACC Score: 0.7276411403018446
