In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,RandomizedSearchCV,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,StackingClassifier,AdaBoostClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,make_scorer
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder

In [2]:
data_trans = pd.read_csv("Data/Transformed.csv")
data_org = pd.read_csv("Data/Datavalidation.csv")
data_org.drop(columns=["person_emp_exp","cb_person_cred_hist_length","person_age","person_gender"],inplace=True)

In [3]:
#encoding dataset for DecisionTree -> SMOTE
onehot = OneHotEncoder()
ot = onehot.fit_transform(data_org.iloc[:,[0,1,2,6]])
data = pd.DataFrame(ot.toarray(),columns=onehot.get_feature_names_out())
new = pd.concat([data,data_org.drop(columns=["person_education","person_home_ownership","loan_intent","previous_loan_defaults_on_file"])],axis=1)
new

Unnamed: 0,person_education_Associate,person_education_Bachelor,person_education_Doctorate,person_education_High School,person_education_Master,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,...,loan_intent_PERSONAL,loan_intent_VENTURE,previous_loan_defaults_on_file_No,previous_loan_defaults_on_file_Yes,loan_int_rate,loan_percent_income,credit_score,person_income,loan_amnt,loan_status
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,16.02,0.49,561,71948,35000,1
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,11.14,0.08,504,12282,1000,0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,12.87,0.44,635,12438,5500,1
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,15.23,0.44,675,79753,35000,1
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,14.27,0.53,586,66135,35000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44988,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,15.66,0.31,645,47971,15000,1
44989,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,14.07,0.14,621,65800,9000,1
44990,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,10.02,0.05,668,56942,2771,1
44991,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,13.23,0.36,604,33164,12000,1


In [4]:
x = data_trans.iloc[:,0:22]
Y = data_trans.iloc[:,22]
x_train,x_test,y_train,y_test = train_test_split(x,Y,test_size=0.2,random_state=1)
x_resampled,y_resampled = SMOTE(random_state=1).fit_resample(x_train,y_train) #Balancing the labelled feature

### LogisticRegression

In [15]:
#LogisticRegression
LG = LogisticRegression()
lg_fit = LG.fit(x_resampled,y_resampled)
lg_fit

In [20]:
lg_predict = lg_fit.predict(x_test)
print("accuracy:",accuracy_score(y_test,lg_predict))
print("precision:",precision_score(y_test,lg_predict))

accuracy: 0.8613179242138015
precision: 0.6282578875171467


In [66]:
#TUNING USING GridSearchCV
params = {
    "penalty":["l2","l1"],
    "class_weight":[None,"balanced"],
    # "random_state":[1],
    "solver":['liblinear','saga'],
}
score = make_scorer(precision_score,average="binary")
rs = GridSearchCV(LG,params,cv=5,n_jobs=-1,scoring=score)
rs_fit = rs.fit(x_resampled,y_resampled)
rs_fit

In [67]:
final = rs_fit.best_estimator_
predict = final.predict(x_test)
print("accuracy:",accuracy_score(y_test,predict))
print("precision:",precision_score(y_test,predict))

accuracy: 0.8615401711301256
precision: 0.6286890871654084


### SVM

In [57]:
svc = SVC()
svc_fit = svc.fit(x_resampled,y_resampled)
svc_fit

In [58]:
pre = svc_fit.predict(x_test)
print("accuracy:",accuracy_score(y_test,pre))
print("precision:",precision_score(y_test,pre))

accuracy: 0.8747638626514057
precision: 0.6542421015264466


In [None]:
model = rs.fit.best_estimator_
pre = model.predict(x_test)
print("accuracy:",accuracy_score(y_test,pre))
print("precision:",precision_score(y_test,pre))

### DecisionTreeClassifier

In [5]:
x = new.iloc[:,0:22]
Y = new.iloc[:,22]
x_train,x_test,y_train,y_test = train_test_split(x,Y,test_size=0.2,random_state=1)
x_resampled,y_resampled = SMOTE(random_state=1).fit_resample(x_train,y_train)
dt = DecisionTreeClassifier()
dt_fit = dt.fit(x_train,y_train)
dt_fit


In [6]:
pre = dt_fit.predict(x_test)
print("accuracy:",accuracy_score(y_test,pre))
print("precision:",precision_score(y_test,pre))

accuracy: 0.8999888876541838
precision: 0.7718253968253969


In [9]:
#tuning
params = {
    "min_samples_split":[2,4,6,10],
    "min_samples_leaf":[2,4,6,10],
    "criterion":["gini","entropy","log_loss"],
    "max_depth":[4,6,10]
}
score = make_scorer(precision_score,average="binary")
rs = RandomizedSearchCV(dt,param_distributions=params,scoring=score,cv=5,n_jobs=-1)
rs_fit = rs.fit(x_resampled,y_resampled)
rs_fit

In [12]:
pre = rs_fit.predict(x_test)
print("accuracy:",accuracy_score(y_test,pre))
print("precision:",precision_score(y_test,pre))

accuracy: 0.9144349372152462
precision: 0.8631516587677726


In [13]:
final = rs_fit.best_estimator_
joblib.dump(final,"Models/DecisionTree.pkl")

['Models/DecisionTree.pkl']

### RANDOM FOREST

In [14]:
rt = RandomForestClassifier()
rt_fit = rt.fit(x_resampled,y_resampled)
rt_fit

In [15]:
pre = rt_fit.predict(x_test)
print("accuracy:",accuracy_score(y_test,pre))
print("precision:",precision_score(y_test,pre))

accuracy: 0.9224358262029114
precision: 0.8534858387799564


In [21]:
#tuning
params = {
    "n_estimators":[90,100,110,120],
    "min_samples_split":[2,4,6,10],
    "min_samples_leaf":[2,4,6,10],
    "criterion":["gini","entropy","log_loss"],
    "max_depth":[4,6,10]
}
score = make_scorer(precision_score,average="binary")
rt_ = RandomizedSearchCV(rt,param_distributions=params,cv=5,n_jobs=-1,scoring=score)
rt_fit = rt_.fit(x_resampled,y_resampled)
rt_fit

In [22]:
final = rt_.best_estimator_
pre = final.predict(x_test)
print("accuracy:",accuracy_score(y_test,pre))
print("precision:",precision_score(y_test,pre))

accuracy: 0.9068785420602289
precision: 0.7670664206642066


### AdaboostClassifier

In [40]:
boost = AdaBoostClassifier(n_estimators=1000)
ada = boost.fit(x_resampled,y_resampled)
ada

In [41]:
pre = ada.predict(x_test)
print("accuracy:",accuracy_score(y_test,pre))
print("precision:",precision_score(y_test,pre))

accuracy: 0.913212579175464
precision: 0.7997039960532807


### FINAL REPORT: Model Implementation
<li>Used SMOTE for balancing the labelled target feature "loan_status"</li>
<li>Evaluation Metric: Precision_score to reduce the Type2 error(False Positive), minimizing bank's loss</li>
<li>Models:</li>
<ul>LogisticRegression: Accuracy= 82 and Precision=62</ul>
<ul>SVC: Accuracy= 87 and Precision=65 </ul>
<ul>DecisionTreeClassifier: Accuracy= 91 and Precision=86</ul>
<ul>RandomForestClassifier: Accuracy= 92 and Precision=85</ul>
<ul>AdaBoostClassifier: Accuracy= 91 and Precision=80</ul>
<li>Saved DecisionTreeClassifier Model</li>
