In [1]:
#Let's mount the google drive with colab:

from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#importing python libraries for analysis:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


from sklearn.model_selection import train_test_split,cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [4]:
df_Lending_Club = pd.read_csv("/content/drive/MyDrive/Kaggle/Lending_Club_Loan_approval_Optimization.csv")
df_Lending_Club.drop("Unnamed: 0", axis=1, inplace=True)
df_Lending_Club.head()

Unnamed: 0,Amount Requested,Risk_Score,Debt-To-Income Ratio,Employment Length,Target
0,3600.0,677.0,5.91,10,1
1,24700.0,717.0,16.06,10,1
2,20000.0,697.0,10.78,10,1
3,10400.0,697.0,25.37,3,1
4,11950.0,692.0,10.2,4,1


#**Let's split the data for training and testing:**



In [5]:
# spliting training and testing data
from sklearn.model_selection import train_test_split

X = df_Lending_Club.iloc[:,:-1].values
y = df_Lending_Club.iloc[:,-1].values

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [6]:
print(X_train.shape)
print(y_train.shape)

(1658244, 4)
(1658244,)


#**Let's Scale Down :**

In [7]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler, RobustScaler

Normalisation = MinMaxScaler()      # Scale down from 0 to 1
Standardisation = StandardScaler()  # Scale down mean=0, std=1. (range = -3 to +3)
Robust = RobustScaler()             # Median and IQR values stored and then transformed.

In [8]:
#Fit the data

norm = Normalisation.fit(X_train)
stand = Standardisation.fit(X_train)
robust = Robust.fit(X_train)

In [9]:
#Transform the Training dataset:

X_train_norm = norm.transform(X_train)    
X_train_stand = stand.transform(X_train)
X_train_robust = robust.transform(X_train)

In [10]:
#Transform the Testing dataset:

X_test_norm = norm.transform(X_test)
X_test_stand = stand.transform(X_test)
X_test_robust = robust.transform(X_test)

#**We can achieve above step by using like this too,**

* **X_train_norm = MinMaxScaler().fit_transform(X_train)**  #Fit for train
* **X_test_norm = norm.transform(X_test)**  
#transform alone sufficient(already train got fit)

or

* **norm = MinMaxScaler().fit(X_train)**
* **X_train_norm = norm.transform(X_train)**
* **X_test_norm = norm.transform(X_test)**

or 
 
* **norm = MinMaxScaler().fit_transform(X_train)**
* **X_test_norm = norm.transform(X_test)**

In [11]:
X_train = [X_train, X_train_norm, X_train_stand, X_train_robust]
y_train = y_train
X_test = [X_test, X_test_norm, X_test_stand, X_test_robust]
y_test = y_test

#**(1) Let's apply LogisticRegression Algorithm:**

In [12]:
from sklearn.linear_model import LogisticRegression
Model_lr = LogisticRegression()
Model_Norm_lr = LogisticRegression()
Model_Stand_lr = LogisticRegression()
Model_Robust_lr = LogisticRegression()

In [13]:
Models = ["LogisticRegression Without Scaling :", "LogisticRegression with Normalisation :", 
          "LogisticRegression with Standardisation :", "LogisticRegression with Robust Scaling :"]

LR = [Model_lr, Model_Norm_lr, Model_Stand_lr, Model_Robust_lr]

In [14]:
for i in range (4):
  LR[i].fit(X_train[i], y_train)       #Fit the training data in Logistic Regression
  y_pred = LR[i].predict(X_test[i])    #Predict the score
  y_pred
  print()
  print(Models[i])                     #Print the Model description
  print("Accuracy score of training data {} %:".format(LR[i].score(X_train[i], y_train)*100))
  print("Accuracy score of testing data {} %".format(LR[i].score(X_test[i], y_test)*100))


LogisticRegression Without Scaling :
Accuracy score of training data 78.80281792064376 %:
Accuracy score of testing data 78.94766042232526 %

LogisticRegression with Normalisation :
Accuracy score of training data 81.89217027168499 %:
Accuracy score of testing data 81.95879024126668 %

LogisticRegression with Standardisation :
Accuracy score of training data 85.40588719151103 %:
Accuracy score of testing data 85.44319064458392 %

LogisticRegression with Robust Scaling :
Accuracy score of training data 85.73267866490094 %:
Accuracy score of testing data 85.78162011954787 %


Looks Standardisation and Robust Scaling both provides good accuracy comparing others. So let's proceed hyperparameter tuning with Standardisation and Robust Scaling.

In [15]:
from sklearn.metrics import confusion_matrix

models = ["Confusion Matrix with Standardisation Scaling :","Confusion Matrix with Robust Scaling :"]

for i in range (2):
  y_pred_test = LR[i].predict(X_test[i])    #Predict the score
  cm = confusion_matrix(y_test, y_pred_test)
  print(models[i])
  print(cm)
  print()

Confusion Matrix with Standardisation Scaling :
[[169649  37850]
 [ 49425 157638]]

Confusion Matrix with Robust Scaling :
[[159821  47678]
 [ 27114 179949]]



#**(i) Standardisation :**

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l1', 'l2']
c_values = [0.1, 1, 10, 15, 20, 100, 105, 110]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train_stand, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))



Best: 0.857322 using {'C': 20, 'penalty': 'l2', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'lbfgs'}
0.852119 (0.001028) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
0.835195 (0.001021) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.835195 (0.001021) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.835174 (0.001028) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 1, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 1, 'penalty': 'l1', 'solver': 'lbfgs'}
0.852199 (0.000938) with: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
0.853582 (0.000940) with: {'C': 1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.853582 (0.000940) with: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.853550 (0.000947) with: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 10, '

#**Let's Retrain the Model by using the best parameters :**

Stratified CV:  Best: 0.857322 using {'C': 20, 'penalty': 'l2', 'solver': 'lbfgs'}

In [24]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
logistic_R = LogisticRegression( C= 20, penalty= 'l2', solver='lbfgs', n_jobs=-1)
logistic_R.fit(X_train_stand, y_train)
print("Accuracy score of training data {} %:".format(logistic_R.score(X_train_stand, y_train)*100))
print("Accuracy score of testing data {} %".format(logistic_R.score(X_test_stand, y_test)*100))

Accuracy score of training data 85.7344275028283 %:
Accuracy score of testing data 85.78958032815358 %


#**(ii) Randomization:**

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

model = LogisticRegression()
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l1', 'l2']
c_values = [0.01, 0.1, 1, 10, 15, 20, 100, 105]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train_robust, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.857356 using {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.000000 (0.000000) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'lbfgs'}
0.846117 (0.002116) with: {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
0.857356 (0.000852) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
0.857356 (0.000853) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
0.857353 (0.000852) with: {'C': 0.01, 'penalty': 'l2', 'solver': 'liblinear'}
0.000000 (0.000000) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'newton-cg'}
0.000000 (0.000000) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'lbfgs'}
0.846090 (0.002122) with: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
0.857322 (0.000857) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.857322 (0.000857) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.857329 (0.000859) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.000000 (0.00000

#**Let's Retrain the Model by using the best parameters :**

stratified : Best: 0.857356 using {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}

In [27]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
logistic_R = LogisticRegression( C= 0.01, penalty= 'l2', solver='lbfgs', n_jobs=-1)
logistic_R.fit(X_train_robust, y_train)
print("Accuracy score of training data {} %:".format(logistic_R.score(X_train_robust, y_train)*100))
print("Accuracy score of testing data {} %".format(logistic_R.score(X_test_robust, y_test)*100))

Accuracy score of training data 85.7356335979506 %:
Accuracy score of testing data 85.785720833072 %
