In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [28]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import numpy as np

# Import Data

In [29]:
#Import cleaned csv file
data = pd.read_csv("output.csv")
data.columns

Index(['Customer ID',
       'If a customer has bounced in first EMI (1 : Bounced, 0 : Not bounced)',
       'Number of times bounced in recent 12 months',
       'Maximum MOB (Month of business with TVS Credit)',
       'Number of times bounced while repaying the loan', 'EMI', 'Loan Amount',
       'Tenure',
       'Dealer codes from where customer has purchased the Two wheeler',
       'No of advance EMI paid', 'Rate of Interest',
       'Gender(Male = 0, Female = 1)',
       'Age at which customer has taken the loan', 'Number of loans',
       ' Number of secured loans', ' Number of unsecured loans',
       'Maximum amount sanctioned in the Live loans',
       'Number of new loans in last 3 months',
       'Total sanctioned amount in the secured Loans which are Live',
       'Total sanctioned amount in the unsecured Loans which are Live',
       'Maximum amount sanctioned for any Two wheeler loan',
       'Time since last Personal loan taken (in months)',
       'Time since first co

In [30]:
#Select only relevant columns
model_data = data[[
    'EMI', 
    'Loan Amount', 
    'Maximum amount sanctioned for any Two wheeler loan',
    'Age at which customer has taken the loan', 
    'Rate of Interest', 
    'Number of times 30 days past due in last 6 months', 
    'Maximum MOB (Month of business with TVS Credit)', 
    'Number of times 60 days past due in last 6 months', 
    'Number of loans', 
    'Maximum amount sanctioned in the Live loans', 
    'Number of times 90 days past due in last 3 months', 
    'Tenure', 
    'Number of times bounced while repaying the loan',
    'Target variable ( 1: Defaulters / 0: Non-Defaulters)'
]]


In [31]:
#Rename Dependent Variable Column to "Class"
model_data = model_data.rename(columns={"Target variable ( 1: Defaulters / 0: Non-Defaulters)": "Class"})
model_data.head()

Unnamed: 0,EMI,Loan Amount,Maximum amount sanctioned for any Two wheeler loan,Age at which customer has taken the loan,Rate of Interest,Number of times 30 days past due in last 6 months,Maximum MOB (Month of business with TVS Credit),Number of times 60 days past due in last 6 months,Number of loans,Maximum amount sanctioned in the Live loans,Number of times 90 days past due in last 3 months,Tenure,Number of times bounced while repaying the loan,Class
0,2432.0,46500.0,46500.0,40.0,12.75,0,24.0,0,1,0.0,0,24.0,0.0,0
1,1495.0,28168.0,28168.0,47.0,13.65,0,24.0,0,1,0.0,0,24.0,1.0,0
2,1707.0,38900.0,38900.0,31.0,12.65,31,26.0,31,9,55000.0,16,30.0,0.0,0
3,2128.0,42900.0,42900.0,24.0,9.5,0,24.0,0,1,0.0,0,24.0,0.0,0
4,1824.0,40900.0,40900.0,30.0,13.5,0,27.0,0,1,0.0,0,30.0,0.0,0


# Train the Model

In [32]:
# Assign X (data) and y (target)
X = model_data.drop('Class', axis=1)
y = model_data['Class']
print(f"X Shape: {X.shape}")
print(f"y Shape: {y.shape}")

X Shape: (73532, 13)
y Shape: (73532,)


In [33]:
#Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8, random_state=10)

#Resample Data using Oversampling Technique

In [34]:
# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
not_defaulted = X[X.Class==0]
defaulted = X[X.Class==1]

print(not_defaulted.Class.value_counts())
print(defaulted.Class.value_counts())

0    57489
Name: Class, dtype: int64
1    1336
Name: Class, dtype: int64


In [35]:
from sklearn.utils import resample
# upsample minority
defaulted_upsampled = resample(defaulted,
                               replace=True,
                               n_samples=len(not_defaulted),
                               random_state=27)
# combine majority and upsampled minority
upsampled = pd.concat([not_defaulted, defaulted_upsampled])
upsampled.Class.value_counts()

1    57489
0    57489
Name: Class, dtype: int64

In [36]:
#redefine the X and y training samples
y_train = upsampled.Class
X_train = upsampled.drop('Class', axis=1)

In [37]:
#Define the classifier as a logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(C=0.004832930238571752, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=5000, multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
classifier

#fit the training samples in the classifier
classifier.fit(X_train, y_train)
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7754266033502061
Testing Data Score: 0.8415720405249201


# Assessment 

In [38]:
#Compare with test data
predictions = classifier.predict(X_test)

print(sum(predictions), sum(y_test))

total = 0
default_num = 0

for i in range(len(predictions)):
    if(predictions[i] == y_test.array[i]):
        total +=1
        if(predictions[i]==1):
            default_num+=1
print("-------" * 2)            
print(default_num, sum(y_test))
print("-------" * 2)
print(total/len(predictions))

2448 292
--------------
205 292
--------------
0.8415720405249201


# Grid Search

In [39]:
#Import Grid Search
from sklearn.model_selection import GridSearchCV

#Define param_grid
param_grid = [    
    {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
    }
]

#define clf
clf = GridSearchCV(classifier, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)



In [14]:
best_clf = clf.fit(X_train, y_train)

Fitting 3 folds for each of 1600 candidates, totalling 4800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:  5.1min
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed: 18.6min
[Parallel(n_jobs=-1)]: Done 788 tasks      | elapsed: 37.1min
[Parallel(n_jobs=-1)]: Done 1238 tasks      | elapsed: 62.3min
[Parallel(n_jobs=-1)]: Done 1788 tasks      | elapsed: 92.0min
[Parallel(n_jobs=-1)]: Done 2438 tasks      | elapsed: 141.1min
[Parallel(n_jobs=-1)]: Done 3188 tasks      | elapsed: 181.9min
[Parallel(n_jobs=-1)]: Done 4038 tasks      | elapsed: 229.7min
[Parallel(n_jobs=-1)]: Done 4800 out of 4800 | elapsed: 278.2min finished


In [20]:
best_clf.best_estimator_

LogisticRegression(C=0.004832930238571752, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=5000, multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)