In [16]:
# import library 
import pandas as pd
import numpy as np
import time
import pickle
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.feature_selection import RFE

In [17]:
# load the data
dataset = pd.read_csv('Pre_loan_data.csv')
dataset

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,4.083,4,1.6,1,0,0,1,0,0,0
1,45,19,2.833,3,1.5,1,0,0,1,0,0,0
2,39,15,0.917,1,1.0,1,0,0,0,0,0,0
3,35,9,8.333,1,2.7,2,0,0,0,0,0,0
4,35,8,3.750,4,1.0,2,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
4995,29,3,3.333,1,1.9,3,0,0,0,0,1,0
4996,30,4,1.250,4,0.4,1,85,0,0,0,1,0
4997,63,39,2.000,2,0.3,3,0,0,0,0,0,0
4998,65,40,4.083,3,0.5,2,0,0,0,0,1,0


In [18]:
dataset.columns

Index(['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

In [19]:
# spliting the data into input and output

independent = dataset[['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage','Securities Account', 'CD Account',
       'Online', 'CreditCard']]
dependent = dataset[['Personal Loan']]

In [20]:
# split the training and testing dataset
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(independent, dependent, test_size=1/3, random_state=0)
X_train

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
3276,55,31,13.250,1,3.9,3,0,0,0,0,0
3688,51,26,14.917,1,8.1,1,0,1,0,1,0
2204,63,37,1.667,2,0.4,1,76,0,0,0,0
572,39,15,10.667,1,3.4,1,0,0,0,0,0
229,48,24,5.917,2,1.7,1,145,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
4931,57,27,4.583,1,1.4,3,0,0,0,1,0
3264,67,41,9.500,4,2.4,3,0,0,0,1,0
1653,26,1,2.000,2,0.9,3,123,0,0,0,1
2607,57,33,4.083,4,1.5,1,214,1,1,1,1


In [105]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [117]:

# RFE
def rfefeature(indep_X,dep_Y,n):
    rfelist =[]
    selected_features = {}

    log_model = LogisticRegression(solver='lbfgs')
    svm_model = SVC(kernel='linear',random_state =0)
    dt_model = DecisionTreeClassifier(criterion ='gini',max_features ='sqrt',splitter ='best',random_state =0)
    rf_model = RandomForestClassifier(n_estimators =10, criterion ='entropy', random_state =0)
#     nb_model = GaussianNB(priors=None)
    rfemodellist = [log_model,svm_model,dt_model,rf_model]
    for model in rfemodellist:
        print(model)
        # https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
        log_rfe = RFE(estimator=model, n_features_to_select=n)
        print(log_rfe)
        log_fit = log_rfe.fit(indep_X,dep_Y)
        log_rfe_feature= log_fit.transform(indep_X)
        selected_columns = indep_X.columns[log_rfe.support_].tolist()
        selected_features[model.__class__.__name__] = selected_columns
        rfelist.append(log_rfe_feature)
    return rfelist,selected_features

In [118]:
def split_scalar(indep_X,dep_Y):
    X_train,X_test,Y_train,Y_test = train_test_split(indep_X,dep_Y,test_size=0.30,random_state=0)
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    return X_train,X_test,Y_train,Y_test

In [119]:
def cm_pred(classifier,X_test):
    test_pred = classifier.predict(X_test)
    
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(Y_test,test_pred)
    
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    
    accuracy = accuracy_score(Y_test,test_pred)
    
    report = classification_report(Y_test,test_pred)
    return classifier,accuracy,report,X_test,Y_test,cm

In [120]:
def log(X_train,X_test,Y_train):
    classifier = LogisticRegression(random_state =0)
    classifier.fit(X_train,Y_train)
    classifier,accuracy,report,X_test,Y_test,cm = cm_pred(classifier,X_test)
    return classifier,accuracy,report,X_test,Y_test,cm

In [121]:

def svm(X_train,X_test,Y_train):
    classifier = SVC(kernel='linear',random_state =0)
    classifier.fit(X_train,Y_train)
    classifier,accuracy,report,X_test,Y_test,cm = cm_pred(classifier,X_test)
    return classifier,accuracy,report,X_test,Y_test,cm

In [122]:
def dtree(X_train,X_test,Y_train):
    classifier = DecisionTreeClassifier(criterion ='gini',max_features ='sqrt',splitter ='best',random_state =0)
    classifier.fit(X_train,Y_train)
    classifier,accuracy,report,X_test,Y_test,cm = cm_pred(classifier,X_test)
    return classifier,accuracy,report,X_test,Y_test,cm

In [123]:

def random(X_train,X_test,Y_train):
    classifier =RandomForestClassifier(n_estimators =10, criterion ='entropy', random_state =0)
    classifier.fit(X_train,Y_train)
    classifier,accuracy,report,X_test,Y_test,cm = cm_pred(classifier,X_test)
    return classifier,accuracy,report,X_test,Y_test,cm

In [124]:

def nb(X_train,X_test,Y_train):
    classifier = GaussianNB()
    classifier.fit(X_train,Y_train)
    classifier,accuracy,report,X_test,Y_test,cm = cm_pred(classifier,X_test)
    return classifier,accuracy,report,X_test,Y_test,cm

In [125]:
def knn(X_train,X_test,Y_train):
    from sklearn.neighbors import KNeighborsClassifier
    classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
    classifier.fit(X_train, Y_train)
    classifier,accuracy,report,X_test,Y_test,cm=cm_pred(classifier,X_test)
    return  classifier,accuracy,report,X_test,Y_test,cm

In [184]:
def rfe_classification(acclog,accsvm,accdtree,accrandom,accnb,accknn):
    rfe_df =pd.DataFrame(index =['Logistic','SVM','DecisionTree','RandomForest',],columns = ['Logistic','SVM','DecisionTree','RandomForest','NavieBayes','KNN'])
    for number,idex in enumerate(rfe_df.index):
        rfe_df['Logistic'][idex]=acclog[number]
        rfe_df['SVM'][idex]=accsvm[number]
        rfe_df['DecisionTree'][idex]=accdtree[number]
        rfe_df['RandomForest'][idex]=accrandom[number]
        rfe_df['NavieBayes'][idex]=accnb[number]
        rfe_df['KNN'][idex]=accknn[number]

    return rfe_df

In [185]:
indep_X = dataset.drop('Personal Loan',axis =1)
dep_Y = dataset['Personal Loan']
rfelist,selected_features= rfefeature(indep_X,dep_Y,9)
acclog=[]
accsvm=[]
accdtree=[]
accrandom=[]
accnb=[]
accknn=[]

for i in rfelist:
    X_train,X_test,Y_train,Y_test = split_scalar(i,dep_Y)
    classifier,accuracy,report,X_test,Y_test,cm =log(X_train,X_test,Y_train)
    acclog.append(accuracy)
    classifier,accuracy,report,X_test,Y_test,cm =svm(X_train,X_test,Y_train)
    accsvm.append(accuracy)
    classifier,accuracy,report,X_test,Y_test,cm =dtree(X_train,X_test,Y_train)
    accdtree.append(accuracy)
    classifier,accuracy,report,X_test,Y_test,cm =random(X_train,X_test,Y_train)
    accrandom.append(accuracy)
    classifier,accuracy,report,X_test,Y_test,cm =nb(X_train,X_test,Y_train)
    accnb.append(accuracy)
    classifier,accuracy,report,X_test,Y_test,cm =knn(X_train,X_test,Y_train)
    accknn.append(accuracy)
    
result = rfe_classification(acclog,accsvm,accdtree,accrandom,accnb,accknn)
result
print("Selected Features:")
for model_name, features in selected_features.items():
    print(f"{model_name}: {features}")

LogisticRegression()
RFE(estimator=LogisticRegression(), n_features_to_select=9)
SVC(kernel='linear', random_state=0)
RFE(estimator=SVC(kernel='linear', random_state=0), n_features_to_select=9)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

DecisionTreeClassifier(max_features='sqrt', random_state=0)
RFE(estimator=DecisionTreeClassifier(max_features='sqrt', random_state=0),
    n_features_to_select=9)
RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=0)
RFE(estimator=RandomForestClassifier(criterion='entropy', n_estimators=10,
                                     random_state=0),
    n_features_to_select=9)
Selected Features:
LogisticRegression: ['Age', 'Experience', 'Income', 'Family', 'Education', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
SVC: ['Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
DecisionTreeClassifier: ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'CD Account', 'Online']
RandomForestClassifier: ['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'CD Account', 'Online']


In [128]:
#Model Creation
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[10,50,100], 
              'criterion':['gini', 'entropy', 'log_loss'],# paramters are random forest from website
              'max_features':['sqrt', 'log2']}

grid = GridSearchCV(RandomForestClassifier(),parameters, refit = True, verbose =3, n_jobs=-1, scoring='f1') 
#refit true is for finding the best model,
#false will give last model
#https://scikit-learn.org/stable/modules/model_evaluation.html - for scoring 
grid.fit(X_train,Y_train) # can able to call directly without training and testing 

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [129]:
print(grid.best_params_)

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 50}


In [130]:
classifier = grid.cv_results_ 
classifier
print('The value for best parameter: ',format(grid.best_params_))

The value for best parameter:  {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 50}


In [131]:
table = pd.DataFrame.from_dict(classifier)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.071133,0.006845,0.007059,0.006020912,gini,sqrt,10,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.890625,0.924242,0.941176,0.94964,0.888889,0.918915,0.025179,18
1,0.30292,0.04398,0.02187,0.007645219,gini,sqrt,50,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.890625,0.939394,0.964029,0.942029,0.942029,0.935621,0.024199,5
2,0.625851,0.033179,0.032509,0.00311904,gini,sqrt,100,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.899225,0.947368,0.971014,0.942857,0.926471,0.937387,0.023815,2
3,0.058638,0.006051,0.018746,0.01530574,gini,log2,10,"{'criterion': 'gini', 'max_features': 'log2', ...",0.864,0.917293,0.964029,0.948905,0.925373,0.92392,0.034262,14
4,0.340007,0.026876,0.02187,0.007652787,gini,log2,50,"{'criterion': 'gini', 'max_features': 'log2', ...",0.899225,0.924242,0.964029,0.942029,0.942029,0.934311,0.021609,7
5,0.611104,0.021227,0.043465,0.006879983,gini,log2,100,"{'criterion': 'gini', 'max_features': 'log2', ...",0.890625,0.940299,0.963504,0.950355,0.941176,0.937192,0.024737,3
6,0.065053,0.010645,0.011494,0.003467105,entropy,sqrt,10,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.864,0.932331,0.964029,0.942029,0.900763,0.92063,0.034871,17
7,0.31974,0.028449,0.018139,0.003230475,entropy,sqrt,50,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.907692,0.955224,0.971014,0.948905,0.940299,0.944627,0.021023,1
8,0.622478,0.011904,0.03396,0.006848963,entropy,sqrt,100,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.88189,0.939394,0.956522,0.94964,0.926471,0.930783,0.026463,10
9,0.069772,0.011364,0.00735,0.007046842,entropy,log2,10,"{'criterion': 'entropy', 'max_features': 'log2...",0.888889,0.947368,0.948148,0.93617,0.907692,0.925654,0.023505,13


In [132]:
grid_pred = grid.predict(X_test)
grid_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [133]:
#Evalution using confusion matrix
from sklearn.metrics import confusion_matrix
con_mat = confusion_matrix(Y_test,grid_pred)
con_mat

array([[1368,    4],
       [  18,  110]], dtype=int64)

In [134]:
#To create a classification report
from sklearn.metrics import classification_report
class_report = classification_report(Y_test,grid_pred)
print(class_report)


              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1372
           1       0.96      0.86      0.91       128

    accuracy                           0.99      1500
   macro avg       0.98      0.93      0.95      1500
weighted avg       0.99      0.99      0.98      1500



In [135]:

from sklearn.metrics import f1_score
f1_macro=f1_score(Y_test,grid_pred,average='weighted')
print("The f1_macro value for best parameter {}:".format(grid.best_params_),f1_macro)

The f1_macro value for best parameter {'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 50}: 0.9849463159513919


In [136]:
from sklearn.metrics import roc_auc_score #Receiver Operating Characteristic Curve, Area Under the curve
roc_auc_score(Y_test,grid.predict_proba(X_test)[:,1]) # to get a greater probability value, [:,1][R,C]

0.9971841973396501

In [43]:
dataset.columns
#'Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education', 'Mortgage', 'CD Account', 'Online'

Index(['Age', 'Experience', 'Income', 'Family', 'CCAvg', 'Education',
       'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account',
       'Online', 'CreditCard'],
      dtype='object')

In [170]:
Age = (input("Age: "))
Experience = (input("Experience: "))
Income = float(input("Income: "))
Family= int(input("Family: "))
CCAvg = float(input("CCAvg: "))
Education = int(input("Education: "))
Mortgage = int(input("Mortgage: "))
# SecuritiesAccount = int(input("Securities Account: "))
CDAccount = int(input("CD Account: "))
OnlineAccount = int(input("Online: "))
# CreditCard = int(input("CreditCard: "))

Age: 25
Experience: 1
Income: 3
Family: 1
CCAvg: 11
Education: 1
Mortgage: 0
CD Account: 0
Online: 0


In [172]:
Final_prediction =grid.predict([[Age,Experience,Income,Family,CCAvg,Education,Mortgage,CDAccount,OnlineAccount]])
print("Final_prediction of Random Forest: {}".format(Final_prediction))

Final_prediction of Random Forest: [1]


In [166]:
# Saving the model
import pickle
filename= "Personal_Bank_loan_Prediction.sav"

In [167]:
pickle.dump(grid,open(filename,'wb'))
load_model=pickle.load(open("Personal_Bank_loan_Prediction.sav",'rb'))

In [173]:
def get_categorical_prediction(prediction):
    return "Yes" if prediction == 1 else "No"
Age = (input("Age: "))
Experience = (input("Experience: "))
Income = float(input("Income: "))
Family= int(input("Family: "))
CCAvg = float(input("CCAvg: "))
Education = int(input("Education: "))
Mortgage = int(input("Mortgage: "))
# SecuritiesAccount = int(input("Securities Account: "))
CDAccount = int(input("CD Account: "))
OnlineAccount = int(input("Online: "))
# CreditCard = int(input("CreditCard: "))

Age: 24
Experience: 1
Income: 1
Family: 1
CCAvg: 1
Education: 1
Mortgage: 0
CD Account: 0
Online: 0


In [181]:
Final_result = load_model.predict([[Age,Experience,Income,Family,CCAvg,Education,Mortgage,CDAccount,OnlineAccount]])
# Final_result

array([1], dtype=int64)

In [182]:
future_prediction_categorical = get_categorical_prediction(Final_result)
future_prediction_categorical

'Yes'

In [183]:
# Print the categorical prediction
print("Whether this Customer will accept the Personal Loan:", future_prediction_categorical)

Whether this Customer will accept the Personal Loan: Yes
