## Logistic Regression

In [2]:
import pandas as pd
#Reading train data, splitting matrix X and target Y
train = pd.read_csv('/Users/jigyasasachdeva/Desktop/Data/train.csv', index_col = 0)
X = train.loc[:, train.columns != 'CARAVAN']
y = train['CARAVAN'].astype('category')

In [3]:
#Value counts of the target variable 
train['CARAVAN'].value_counts()/train.shape[0]

0    0.940227
1    0.059773
Name: CARAVAN, dtype: float64

In [4]:
#Importing important libraries and splitting given train data into train and val
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix,roc_curve, roc_auc_score, precision_score, recall_score, precision_recall_curve, f1_score

#70-30 split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=27)

In [6]:
#Defining logistic regression model
logit_model = LogisticRegression(random_state=27)
#Fitting the model on train data 
logit_model.fit(X_train, y_train)
#Testing the model on test data
y_pred = logit_model.predict(X_val)

#Evaluating performance of baseline
print(f'Logistic Regression Confusion Matrix: \n{confusion_matrix(y_val, y_pred)}')
print(f'Logistic Regression Accuracy: {accuracy_score(y_val,y_pred)}')
print(f'Logistic Regression Area Under Curve: {roc_auc_score(y_val, y_pred)}')
print(f'Logistic Regression Recall: {recall_score(y_val,y_pred)}')
print(f'Logistic Regression Precision: {precision_score(y_val,y_pred)}')
print(f'Logistic Regression F1 score: {f1_score(y_val,y_pred)}')

Logistic Regression Confusion Matrix: 
[[1636    2]
 [ 108    1]]
Logistic Regression Accuracy: 0.9370349170005724
Logistic Regression Area Under Curve: 0.5039766553528021
Logistic Regression Recall: 0.009174311926605505
Logistic Regression Precision: 0.3333333333333333
Logistic Regression F1 score: 0.01785714285714286


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
#Assigning class weights based on opposite proportion of classes and re-fitting the model
logit_2 = LogisticRegression(random_state=26 , class_weight= {0:5.97, 1:94.03})
logit_2.fit(X_train, y_train)

#Testing the model on validation data
y_pred = logit_2.predict(X_val)
#Evaluation metrics:
print(f'Logistic Regression Confusion Matrix: \n{confusion_matrix(y_val, y_pred)}')
print(f'Logistic Regression Accuracy: {accuracy_score(y_val,y_pred)}')
print(f'Logistic Regression Area Under Curve: {roc_auc_score(y_val, y_pred)}')
print(f'Logistic Regression Recall: {recall_score(y_val,y_pred)}')
print(f'Logistic Regression Precision: {precision_score(y_val,y_pred)}')
print(f'Logistic Regression F1 score: {f1_score(y_val,y_pred)}')

Logistic Regression Confusion Matrix: 
[[1154  484]
 [  35   74]]
Logistic Regression Accuracy: 0.7029192902117917
Logistic Regression Area Under Curve: 0.6917083935432559
Logistic Regression Recall: 0.6788990825688074
Logistic Regression Precision: 0.13261648745519714
Logistic Regression F1 score: 0.22188905547226387


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
#For grid search CV
#Assigning random weights
weights = [{0:1.0,1:0.001}, {0:1.0,1:0.01}, {0:1.0,1:0.1}, {0:1.0,1:1.0}, 
     {0:1.0,1:10}, {0:1.0,1:100}, {0:1.0,1:200}, {0:1.0,1:300},
     {0:1.0,1:400},{0:1.0,1:500}, {0:1.0,1:1000}, {0:0.01,1:1.0}, 
     {0:0.01,1:10}, {0:0.01,1:100}, {0:0.001,1:1.0}, {0:0.005,1:1.0}, 
     {0:10,1:0.1}, {0:10,1:1000}, {0:100,1:1000} ]
hyperparam_grid = {"class_weight": weights }

In [13]:
#Trying grid search of class weight based on auc 

logit_3 = LogisticRegression(random_state=27)
grid = GridSearchCV(logit_3,hyperparam_grid, scoring="roc_auc", cv=100, n_jobs=-1, refit=True)
#Fitting on the entire train data
grid.fit(X,y)
print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')

Best score: 0.7521018518518519 with param: {'class_weight': {0: 1.0, 1: 10}}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
#Trying grid search of class weight based on f1 score 

logit_3 = LogisticRegression(random_state=13)
grid = GridSearchCV(logit_3,hyperparam_grid, scoring="f1", cv=100, n_jobs=-1, refit=True)
#Fitting on the entire train data
grid.fit(X,y)
print(f'Best score: {grid.best_score_} with param: {grid.best_params_}')

Best score: 0.2269089262131057 with param: {'class_weight': {0: 100, 1: 1000}}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Grid search CV for weights assigned to class labels on Validation data

logit_4 = LogisticRegression(random_state=27, class_weight={0: 100, 1: 1000})
# fit it
logit_4.fit(X_train, y_train)
# test
y_pred = logit_4.predict(X_val)
# performance
print(f'Accuracy Score: {accuracy_score(y_val,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_val, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_val, y_pred)}')
print(f'Recall score: {recall_score(y_val,y_pred)}')
print(f'F1 score: {metrics.f1_score(y_val,y_pred)}')

Accuracy Score: 0.80475
Confusion Matrix: 
[[3099  663]
 [ 118  120]]
Area Under Curve: 0.6639828180075857
Recall score: 0.5042016806722689
F1 score: 0.23506366307541626


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
# define hyperparameters
weights = [{0:1.0,1:0.001}, {0:1.0,1:0.01}, {0:1.0,1:0.1}, {0:1.0,1:1.0}, 
           {0:1.0,1:10}, {0:1.0,1:100}, {0:1.0,1:200}, {0:1.0,1:300},
           {0:1.0,1:400},{0:1.0,1:500}, {0:1.0,1:1000}, {0:0.01,1:1.0}, 
           {0:0.01,1:10}, {0:0.01,1:100}, {0:0.001,1:1.0}, {0:0.005,1:1.0}, 
           {0:10,1:0.1}, {0:10,1:1000}, {0:100,1:1000} ]
c = np.arange(0.5, 20.0, 0.5)
hyperparameter_grid = {"class_weight": weights,
                       "penalty": ["l1", "l2"], #Lasso or Ridge
                       "C": crange,
                       "fit_intercept": [True, False]  }

In [20]:
# logistic model classifier
logit_5 = LogisticRegression(random_state=27)
# define evaluation procedure
grid = GridSearchCV(logit_5, hyperparameter_grid, scoring="f1", cv=100, n_jobs=-1, refit=True)
grid.fit(X,y)
print(f'Best F1 score: {grid.best_score_} with parameters: {grid.best_params_}')

Best score: 0.23155389923114378 with param: {'C': 1.5, 'class_weight': {0: 1.0, 1: 10}, 'fit_intercept': True, 'penalty': 'l2'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [22]:
#Final model with best parameters
logit_6 = LogisticRegression(random_state=37,
                         C=1.5,
                         fit_intercept=True, 
                         penalty='l2',
                         class_weight={0: 1.0, 1: 10} )
logit_6.fit(X,y)

#Reading test data
test = pd.read_csv('/Users/jigyasasachdeva/Desktop/Data/test.csv', index_col = 0)
X_test = test.loc[:, test.columns != 'CARAVAN']
y_test = test['CARAVAN'].astype('category')

#predicting on test data
y_pred = logit_6.predict(X_test)
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}') 
print(f'Recall score: {recall_score(y_test,y_pred)}')
print(f'F1 score: {metrics.f1_score(y_test,y_pred)}')
print(f'Precision score: {precision_score(y_test,y_pred)}')

Accuracy Score: 0.808
Confusion Matrix: 
[[3114  648]
 [ 120  118]]
Area Under Curve: 0.66177475774999
Recall score: 0.4957983193277311
F1 score: 0.23505976095617528
Precision score: 0.15404699738903394


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
## Repeating procedure on SMOTE data
smote_data = pd.read_csv('/Users/jigyasasachdeva/Desktop/Data/SMOTE_traindata.csv', index_col = 0)
X_smote = smote_data.loc[:, smote_data.columns != 'CARAVAN']
y_smote = smote_data['CARAVAN'].astype('category')

# logistic model classifier
logit_7 = LogisticRegression(random_state=13)
# define evaluation procedure
grid = GridSearchCV(logit_7, hyperparam_grid, scoring="f1", cv=10, n_jobs=-1, refit=True)
grid.fit(X_smote,y_smote)
print(f'Best F1 score: {grid.best_score_} with parameters: {grid.best_params_}')

Best score: 0.7537089872425629 with param: {'C': 4.0, 'class_weight': {0: 1.0, 1: 1.0}, 'fit_intercept': True, 'penalty': 'l2'}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
#Re-evaluating SMOTE model on the train data to evaluate better fit between logit_6 and logit_8
logit_8 = LogisticRegression(random_state=37,
                         C=4.0,
                         fit_intercept=True, 
                         penalty='l2',
                         class_weight={0: 1.0, 1: 10} )
logit_8.fit(X_smote,y_smote)
#Testing on train data
y_pred = lg4.predict(X)
# performance
print(f'Accuracy Score: {accuracy_score(y,y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y, y_pred)}')
print(f'Area Under Curve: {roc_auc_score(y, y_pred)}') 
print(f'Recall score: {recall_score(y,y_pred)}')
print(f'F1 score: {metrics.f1_score(y,y_pred)}')
print(f'Precision score: {precision_score(y,y_pred)}')

Accuracy Score: 0.54475
Confusion Matrix: 
[[2014 1748]
 [  73  165]]
Area Under Curve: 0.6143154231389525
Recall score: 0.6932773109243697
F1 score: 0.15341701534170155
Precision score: 0.08625196027182436


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
## This proves that logit 6 is much more better than logit 8 and hence, we are using normal data with 
## tuned hyper parameters