# Model - Hyperparameter Tuning - Logistic Regression

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import pickle

In [0]:
##############################################################################
##########                DATABASE FUNCTIONS                     #############
##############################################################################
#### Read function to import data from the SQL to a pandas dataframe.
def readSQL(query, db_file=None):
    """Run an SQL query against the SQLite database and return the result.

    Parameters
    ----------
    query : str
        SQL statement handed to ``pandas.read_sql_query``.
    db_file : str, optional
        Path of the SQLite database file. When omitted, the notebook-level
        ``DB_FILE`` is used.

    Returns
    -------
    pandas.DataFrame
        The rows returned by ``query``.
    """
    import pandas as pd
    import sqlite3 as sql3
    db = sql3.connect(DB_FILE if db_file is None else db_file)
    try:
        # Close the connection even when the query raises, so a failing cell
        # does not leave the database file open.
        return pd.read_sql_query(query, db)
    finally:
        db.close()

#### Write a pandas dataframe into an SQL table. Use overwrite=True to replace a
#### pre-existing table with the same name. Use append=True to append the data in
#### the dataframe to a pre-existing table. With neither flag set, writing to an existing table fails.
def writeSQL(df,tablename,overwrite=False, append=False, db_file=None):
    """Write a DataFrame into a table of the SQLite database.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to store; written with ``df.to_sql``.
    tablename : str
        Name of the target table.
    overwrite : bool, default False
        Replace a pre-existing table of the same name. Takes precedence
        over ``append``.
    append : bool, default False
        Append the rows to a pre-existing table of the same name.
    db_file : str, optional
        Path of the SQLite database file. When omitted, the notebook-level
        ``DB_FILE`` is used.

    Raises
    ------
    ValueError
        Raised by ``to_sql`` when the table already exists and neither
        ``overwrite`` nor ``append`` is set.
    """
    import sqlite3 as sql
    if overwrite:
        action = "replace"
    elif append:
        action = "append"
    else:
        action = "fail"
    db = sql.connect(DB_FILE if db_file is None else db_file)
    try:
        # to_sql raises when action == "fail" and the table exists; the
        # finally clause makes sure the connection is still released.
        df.to_sql(tablename, db, if_exists=action)
    finally:
        db.close()
def listTables(db_file=None):
    """List the tables stored in the SQLite database.

    Parameters
    ----------
    db_file : str, optional
        Path of the SQLite database file. When omitted, the notebook-level
        ``DB_FILE`` is used.

    Returns
    -------
    list of tuple
        One 1-tuple ``(name,)`` per table, as returned by ``fetchall`` on the
        ``sqlite_master`` query.
    """
    import sqlite3 as sql3
    db = sql3.connect(DB_FILE if db_file is None else db_file)
    try:
        cur = db.cursor()
        try:
            cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
            return cur.fetchall()
        finally:
            cur.close()
    finally:
        # Release the connection even if the query fails.
        db.close()

In [0]:
import os

# Location of the SQLite database, relative to the notebook's working directory.
# os.path.join picks the right separator for the platform; the previous
# "%s\Data\loans.db" literal relied on invalid escape sequences ("\D", "\l")
# and only produced a usable path on Windows.
cwd = os.getcwd()
DB_FILE = os.path.join(cwd, "Data", "loans.db")

In [0]:
# Read every row of the X_train_scaled table into a DataFrame.
X_Train = readSQL("SELECT * FROM X_train_scaled")

In [0]:
# Read every row of the Y_train table into a DataFrame.
Y_Train = readSQL("SELECT * FROM Y_train")

In [0]:
# Remove the "index" column that comes back from the database table.
X_Train = X_Train.drop(columns=["index"])

In [0]:
# Remove the "index" column that comes back from the database table.
Y_Train = Y_Train.drop(columns=["index"])

In [0]:
# Read every row of the X_dev_scaled table into a DataFrame.
X_Dev = readSQL("SELECT * FROM X_dev_scaled")

In [0]:
# Remove the "index" column that comes back from the database table.
X_Dev = X_Dev.drop(columns=["index"])

In [0]:
# Read every row of the Y_dev table into a DataFrame.
Y_Dev = readSQL("SELECT * FROM Y_dev")

In [0]:
# Remove the "index" column that comes back from the database table.
Y_Dev = Y_Dev.drop(columns=["index"])

In [0]:
# Flatten both label frames into 1-D arrays before they are passed to fit() and the metrics.
Y_Train, Y_Dev = Y_Train.values.ravel(), Y_Dev.values.ravel()

## Base Logistic Regression

In [0]:
# Initialize the predictive model object by loading the saved base model from disk.
# The commented-out constructor is kept as a record of the settings it presumably
# was created with.
#mod_logistic = LogisticRegression(random_state=1207,solver ="saga", max_iter=10000,C=1)
# os.path.join replaces the "%s\Models\..." literal, which relied on invalid escape
# sequences ("\M", "\l") and only worked on Windows.
filename = os.path.join(cwd, "Models", "logistic_base_model.sav")
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model files from a trusted source.
with open(filename, 'rb') as model_file:  # closes the handle once the model is loaded
    mod_logistic = pickle.load(model_file)

In [0]:
# Train the model using the training sets.
# The call is left as the cell's last expression so the fitted estimator's repr is displayed.
mod_logistic.fit(X_Train, Y_Train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1207, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

In [0]:
# predict_proba output of the base model on the training set; the AUC cell reads column 1 of it.
pred_train = mod_logistic.predict_proba(X_Train)

In [0]:
# predict_proba output of the base model on the dev set; the AUC cell reads column 1 of it.
pred_dev = mod_logistic.predict_proba(X_Dev)

In [0]:
# ROC AUC of the base model on the training set, scored with column 1 of the predicted probabilities.
train_score = metrics.roc_auc_score(Y_Train, pred_train[:,1])

In [0]:
# ROC AUC of the base model on the dev set, scored with column 1 of the predicted probabilities.
dev_score = metrics.roc_auc_score(Y_Dev, pred_dev[:,1])

In [0]:
# Base model ROC AUC: training set on the first line, dev set on the second.
print(train_score, dev_score, sep="\n")

0.6942647732514526
0.6894901040048504


## Checking Parameters with GridSearch

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [0]:
# Grid-search the regularisation strength C and the penalty type of the saga
# logistic regression; every candidate is scored by 5-fold cross-validated ROC AUC.
pipeline = Pipeline([('mdl', LogisticRegression(random_state=1207, solver="saga", max_iter=10000))])
parameters = {"mdl__C": (0.001, 0.01, 0.1, 1, 10, 100), "mdl__penalty": ["l1", "l2"]}

grid_search = GridSearchCV(pipeline, parameters, n_jobs=2, cv=5, verbose=1, scoring='roc_auc')
grid_search.fit(X_Train, Y_Train)
print('Best score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

# classification_report compares class labels, so it needs predict();
# the probability matrix returned by predict_proba() is rejected by the metric.
predictions = grid_search.predict(X_Dev)
print(metrics.classification_report(Y_Dev, predictions))

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed: 88.8min
[Parallel(n_jobs=2)]: Done  60 out of  60 | elapsed: 153.3min finished


Best score: 0.680
Best parameters set:
	mdl__C: 0.01
	mdl__penalty: 'l1'
             precision    recall  f1-score   support

          0       0.82      1.00      0.90     20869
          1       0.53      0.02      0.04      4610

avg / total       0.77      0.82      0.74     25479



In [0]:
# Dev-set classification report for the best estimator found by the grid search.
# classification_report compares class labels, so it needs predict();
# the probability matrix returned by predict_proba() is rejected by the metric.
predictions = grid_search.predict(X_Dev)
print(metrics.classification_report(Y_Dev, predictions))

## Checking the best model by GridSearch

In [0]:
# Refit a standalone model with the settings reported as best by the grid search
# (C=0.01, l1 penalty). The fit call stays last so the estimator repr is displayed.
mod_logistic_searchBest = LogisticRegression(
    penalty='l1',
    C=0.01,
    solver="saga",
    max_iter=10000,
    random_state=1207,
)
mod_logistic_searchBest.fit(X_Train, Y_Train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=1207, solver='saga', tol=0.0001,
          verbose=0, warm_start=False)

In [0]:
# predict_proba output of the grid-search model on the training set; the AUC cell reads column 1 of it.
pred_train_searchBest = mod_logistic_searchBest.predict_proba(X_Train)

In [0]:
# predict_proba output of the grid-search model on the dev set; the AUC cell reads column 1 of it.
pred_dev_searchBest = mod_logistic_searchBest.predict_proba(X_Dev)

In [0]:
# ROC AUC of the grid-search model on the training set, scored with column 1 of the predicted probabilities.
train_score_searchBest = metrics.roc_auc_score(Y_Train, pred_train_searchBest[:,1])

In [0]:
# ROC AUC of the grid-search model on the dev set, scored with column 1 of the predicted probabilities.
dev_score_searchBest = metrics.roc_auc_score(Y_Dev, pred_dev_searchBest[:,1])

In [0]:
# Grid-search model ROC AUC: training set on the first line, dev set on the second.
print(train_score_searchBest, dev_score_searchBest, sep="\n")

0.6907755334747212
0.6858102122225318


## Changing the parameters manually

In [0]:
# Manual sweep: fit one saga logistic regression per (penalty, C) pair and record
# ROC AUC and model.score() on both the training and the dev set.
c_param = [1e-07, 1e-06,1e-05,1e-04,0.001, 0.01, 0.1,5,10,100,1000]
l_param = ["l1","l2"]
res = []
for penalty in l_param:
    for c_value in c_param:
        model = LogisticRegression(solver="saga",max_iter=10000, C=c_value,penalty=penalty, random_state=1207,n_jobs=2)
        model = model.fit(X_Train, Y_Train)
        # Sweep-specific names: the base model's pred_train / pred_dev / train_score /
        # dev_score are read again further down and must not be overwritten here.
        sweep_train_auc = metrics.roc_auc_score(Y_Train, model.predict_proba(X_Train)[:,1])
        sweep_dev_auc = metrics.roc_auc_score(Y_Dev, model.predict_proba(X_Dev)[:,1])
        # Score each data set once and reuse the value for both the record and the log line.
        sweep_train_acc = model.score(X_Train, Y_Train)
        sweep_dev_acc = model.score(X_Dev, Y_Dev)
        res.append([penalty, c_value, sweep_train_auc, sweep_dev_auc, sweep_train_acc, sweep_dev_acc])
        print("Penalty: %s | C: %s | Train AUC: %s | Dev AUC: %s | Train Score: %s | Dev Score: %s" % (penalty, c_value, sweep_train_auc, sweep_dev_auc, sweep_train_acc, sweep_dev_acc))



Penalty: l1 | C: 1e-07 | Train AUC: 0.5 | Dev AUC: 0.5 | Train Score: 0.8196147883592693 | Dev Score: 0.8190666823658699
Penalty: l1 | C: 1e-06 | Train AUC: 0.5 | Dev AUC: 0.5 | Train Score: 0.8196147883592693 | Dev Score: 0.8190666823658699
Penalty: l1 | C: 1e-05 | Train AUC: 0.5 | Dev AUC: 0.5 | Train Score: 0.8196147883592693 | Dev Score: 0.8190666823658699
Penalty: l1 | C: 0.0001 | Train AUC: 0.5 | Dev AUC: 0.5 | Train Score: 0.8196147883592693 | Dev Score: 0.8190666823658699
Penalty: l1 | C: 0.001 | Train AUC: 0.6798024897358176 | Dev AUC: 0.6769002357335177 | Train Score: 0.8196196942640162 | Dev Score: 0.8191059303740336
Penalty: l1 | C: 0.01 | Train AUC: 0.6907755334747212 | Dev AUC: 0.6858102122225318 | Train Score: 0.820066131595989 | Dev Score: 0.8194591624475058
Penalty: l1 | C: 0.1 | Train AUC: 0.6940768244759581 | Dev AUC: 0.6891799365300055 | Train Score: 0.8199287662630742 | Dev Score: 0.8202048746026139
Penalty: l1 | C: 5 | Train AUC: 0.6942668541949883 | Dev AUC: 0.68

In [0]:
# Turn the list of sweep records into a DataFrame with one named column per metric.
res = pd.DataFrame(
    res,
    columns=[
        'L',
        'C',
        'Train_auc_score',
        'Dev_auc_score',
        'Train_Score',
        'Dev_score',
    ],
)

In [0]:
# Work on a copy of the complete sweep results so that `res` itself stays untouched.
# Slicing off the last 12 rows would discard the l1/C=1000 run and every l2 run, and
# `res` has no column labelled 0, so drop(0, axis=1) raises a KeyError.
df = res.copy()

In [0]:
# One-row frame for the base model (l2 penalty, C=1), with the same columns as the sweep results.
# The AUCs are recomputed from mod_logistic here: by this point train_score / dev_score
# have been overwritten by the manual sweep loop and no longer describe the base model.
base_train_auc = metrics.roc_auc_score(Y_Train, mod_logistic.predict_proba(X_Train)[:, 1])
base_dev_auc = metrics.roc_auc_score(Y_Dev, mod_logistic.predict_proba(X_Dev)[:, 1])
base = pd.DataFrame(
    [['l2', 1, base_train_auc, base_dev_auc,
      mod_logistic.score(X_Train, Y_Train), mod_logistic.score(X_Dev, Y_Dev)]],
    columns=['L','C','Train_auc_score','Dev_auc_score','Train_Score','Dev_score'],
)

In [0]:
# Add the base-model row below the sweep results, renumbering the index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df, base], ignore_index=True, sort=False)

In [0]:
# Gap between training and dev AUC for every row.
df = df.assign(AUC_Diff=df['Train_auc_score'] - df['Dev_auc_score'])

In [0]:
# Rank the runs: highest training AUC first, ties broken by dev AUC and then by the AUC gap.
df.sort_values(
    by=['Train_auc_score', 'Dev_auc_score', 'AUC_Diff'],
    ascending=False,
)

Unnamed: 0,L,C,Train_auc_score,Dev_auc_score,Train_Score,Dev_score,AUC_Diff
10,l1,1000.0,0.694269,0.689505,0.819953,0.820166,0.004764
9,l1,100.0,0.694269,0.689505,0.819958,0.820166,0.004764
21,l2,1000.0,0.694268,0.689507,0.819958,0.820166,0.004762
20,l2,100.0,0.694268,0.689507,0.819963,0.820166,0.004762
8,l1,10.0,0.694268,0.689503,0.819963,0.820166,0.004765
19,l2,10.0,0.694268,0.689505,0.819963,0.820166,0.004762
18,l2,5.0,0.694268,0.689505,0.819973,0.820166,0.004763
7,l1,5.0,0.694267,0.6895,0.819968,0.820166,0.004766
22,l2,1.0,0.694265,0.68949,0.819973,0.820126,0.004775
17,l2,0.1,0.694227,0.689391,0.819953,0.820205,0.004836


The best models use either the l1 or the l2 penalty with C = 100 or C = 1000; their train and dev AUC scores are virtually identical.

In [0]:
# Second manual sweep over larger C values, recording the same metrics as the first sweep.
c_param = [500,1500,2000,5000,10000]
l_param = ["l1","l2"]
res2 = []
for penalty in l_param:
    for c_value in c_param:
        model = LogisticRegression(solver="saga",max_iter=10000, C=c_value,penalty=penalty, random_state=1207,n_jobs=2)
        model = model.fit(X_Train, Y_Train)
        # Sweep-specific names keep the base model's pred_train / pred_dev /
        # train_score / dev_score intact.
        sweep_train_auc = metrics.roc_auc_score(Y_Train, model.predict_proba(X_Train)[:,1])
        sweep_dev_auc = metrics.roc_auc_score(Y_Dev, model.predict_proba(X_Dev)[:,1])
        # Score each data set once and reuse the value for both the record and the log line.
        sweep_train_acc = model.score(X_Train, Y_Train)
        sweep_dev_acc = model.score(X_Dev, Y_Dev)
        res2.append([penalty, c_value, sweep_train_auc, sweep_dev_auc, sweep_train_acc, sweep_dev_acc])
        print("Penalty: %s | C: %s | Train AUC: %s | Dev AUC: %s | Train Score: %s | Dev Score: %s" % (penalty, c_value, sweep_train_auc, sweep_dev_auc, sweep_train_acc, sweep_dev_acc))



Penalty: l1 | C: 500 | Train AUC: 0.6942689011154308 | Dev AUC: 0.6895045209715934 | Train Score: 0.819953295786809 | Dev Score: 0.8201656265944504
Penalty: l1 | C: 1500 | Train AUC: 0.6942689178827925 | Dev AUC: 0.6895046249151172 | Train Score: 0.819953295786809 | Dev Score: 0.8201656265944504
Penalty: l1 | C: 2000 | Train AUC: 0.6942689305804063 | Dev AUC: 0.6895045625490028 | Train Score: 0.819953295786809 | Dev Score: 0.8201656265944504
Penalty: l1 | C: 5000 | Train AUC: 0.6942689283013473 | Dev AUC: 0.6895045521546506 | Train Score: 0.819953295786809 | Dev Score: 0.8201656265944504
Penalty: l1 | C: 10000 | Train AUC: 0.694268924882759 | Dev AUC: 0.6895045833377077 | Train Score: 0.819953295786809 | Dev Score: 0.8201656265944504
Penalty: l2 | C: 500 | Train AUC: 0.6942683306995545 | Dev AUC: 0.6895065270816017 | Train Score: 0.819958201691556 | Dev Score: 0.8201656265944504
Penalty: l2 | C: 1500 | Train AUC: 0.6942683538157229 | Dev AUC: 0.689506620630773 | Train Score: 0.81995820

In [0]:
# Turn the second sweep's records into a DataFrame with one named column per metric.
res2 = pd.DataFrame(
    res2,
    columns=[
        'L',
        'C',
        'Train_auc_score',
        'Dev_auc_score',
        'Train_Score',
        'Dev_score',
    ],
)

In [0]:
# Gap between training and dev AUC for every row of the second sweep.
res2 = res2.assign(AUC_Diff=res2['Train_auc_score'] - res2['Dev_auc_score'])

In [0]:
# Rank the runs: highest training AUC first, ties broken by dev AUC and then by the AUC gap.
res2.sort_values(
    by=['Train_auc_score', 'Dev_auc_score', 'AUC_Diff'],
    ascending=False,
)

Unnamed: 0,L,C,Train_auc_score,Dev_auc_score,Train_Score,Dev_score,AUC_Diff
2,l1,2000,0.694269,0.689505,0.819953,0.820166,0.004764
3,l1,5000,0.694269,0.689505,0.819953,0.820166,0.004764
4,l1,10000,0.694269,0.689505,0.819953,0.820166,0.004764
1,l1,1500,0.694269,0.689505,0.819953,0.820166,0.004764
0,l1,500,0.694269,0.689505,0.819953,0.820166,0.004764
7,l2,2000,0.694268,0.689507,0.819958,0.820166,0.004762
8,l2,5000,0.694268,0.689507,0.819958,0.820166,0.004762
9,l2,10000,0.694268,0.689507,0.819958,0.820166,0.004762
6,l2,1500,0.694268,0.689507,0.819958,0.820166,0.004762
5,l2,500,0.694268,0.689507,0.819958,0.820166,0.004762


We take the model with l2 penalty and C=100 as the best model: larger values of C give no measurable improvement in train or dev AUC.

In [0]:
# Final model: l2 penalty with C=100, fitted on the full training set.
model = LogisticRegression(
    penalty='l2',
    C=100,
    solver="saga",
    max_iter=10000,
    random_state=1207,
    n_jobs=2,
).fit(X_Train, Y_Train)

In [0]:
##Save the model
# os.path.join replaces the "%s\Models\..." literal, which relied on invalid escape
# sequences ("\M", "\l") and only worked on Windows.
filename = os.path.join(cwd, "Models", "logistic_best_model.sav")
# The with-block closes the file, so the pickle is fully flushed to disk when the cell finishes.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)