In [57]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

df = pd.read_csv('crxdata.csv', header = None)
df.columns=['Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity', 'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore', 'DriversLicense', 'Citizen', 'ZipCode', 'Income','ApprovalStatus']
df.describe()

Unnamed: 0,Debt,YearsEmployed,CreditScore,Income
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [58]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    object 
 1   Age             690 non-null    object 
 2   Debt            690 non-null    float64
 3   Married         690 non-null    object 
 4   BankCustomer    690 non-null    object 
 5   EducationLevel  690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    object 
 9   Employed        690 non-null    object 
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    object 
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    object 
 14  Income          690 non-null    int64  
 15  ApprovalStatus  690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


# Pre-processing

In [59]:
df.replace('?',None,inplace = True)
df['Age'] = df['Age'].astype(float)
df.isnull().sum()

Gender            12
Age               12
Debt               0
Married            6
BankCustomer       6
EducationLevel     9
Ethnicity          9
YearsEmployed      0
PriorDefault       0
Employed           0
CreditScore        0
DriversLicense     0
Citizen            0
ZipCode           13
Income             0
ApprovalStatus     0
dtype: int64

In [60]:
for col in df.columns:
    if df[col].dtype == 'float64' or df[col].dtype == 'int64':
        df[col].fillna(df[col].mean(), inplace = True)
        
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].value_counts().index[0], inplace = True)

In [61]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = le.fit_transform(df[col])

In [62]:
df['Married'].replace(0, 1, inplace = True)
df['Married'].replace(2, 0, inplace = True)
df['BankCustomer'].replace(1, 0, inplace = True)
df['BankCustomer'].replace(2, 1, inplace = True)
df['Citizen'].replace(1, 0, inplace = True)
df['Citizen'].replace(2, 1, inplace = True)

In [63]:
df = df.drop(['ZipCode', 'EducationLevel', 'Ethnicity'],axis = 1)
x = df[df.columns[:-1]]
y = df[df.columns[-1]]
x.head()

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,Income
0,1,30.83,0.0,1,0,1.25,1,1,1,0,0,0
1,0,58.67,4.46,1,0,3.04,1,1,6,0,0,560
2,0,24.5,0.5,1,0,1.5,1,0,0,0,0,824
3,1,27.83,1.54,1,0,3.75,1,1,5,1,0,3
4,1,20.17,5.625,1,0,1.71,1,0,0,0,1,0


In [64]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 12)

# KNeighbors

In [65]:
from sklearn.neighbors import KNeighborsClassifier

accuracy = []
acc = {}
for i in range(1,10):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(x_train, y_train)
    pred = knn.predict(x_test)
    accr = accuracy_score(pred, y_test)
    accuracy.append(accr)
    
best = accuracy.index(max(accuracy))+1
knn = KNeighborsClassifier(n_neighbors = best)
knn.fit(x_train, y_train)
acc['knn'] = accuracy_score(y_test, pred)
knnc = classification_report(y_test, pred)
print("Classification report of knn : ")
print(knnc)

Classification report of knn : 
              precision    recall  f1-score   support

           0       0.70      0.65      0.67        88
           1       0.75      0.79      0.77       119

    accuracy                           0.73       207
   macro avg       0.72      0.72      0.72       207
weighted avg       0.73      0.73      0.73       207



# Decision Tree

In [66]:
from sklearn.tree import DecisionTreeClassifier
import sklearn.tree as tree

Tree = DecisionTreeClassifier(criterion="gini", max_depth = 4)
Tree.fit(x_train, y_train)
pred = Tree.predict(x_test)
acc['Decision tree'] = accuracy_score(pred, y_test)
Treec = classification_report(y_test, pred)
print('Classification report of Decision tree : ')
print(Treec)

Classification report of Decision tree : 
              precision    recall  f1-score   support

           0       0.79      0.84      0.81        88
           1       0.88      0.83      0.85       119

    accuracy                           0.84       207
   macro avg       0.83      0.84      0.83       207
weighted avg       0.84      0.84      0.84       207



# Random Forest

In [67]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators = 50, criterion="entropy", max_depth = 5)
RF.fit(x_train, y_train)
pred = RF.predict(x_test)
acc['Random forest'] = accuracy_score(pred, y_test)
RFc = classification_report(y_test, pred)
print('Classification report of Random Forest : ')
print(RFc)

Classification report of Random Forest : 
              precision    recall  f1-score   support

           0       0.84      0.85      0.85        88
           1       0.89      0.88      0.89       119

    accuracy                           0.87       207
   macro avg       0.87      0.87      0.87       207
weighted avg       0.87      0.87      0.87       207



# Support Vector Machine

In [68]:
from sklearn.svm import SVC

svm = SVC(kernel = 'rbf')
svm.fit(x_train, y_train)
pred = svm.predict(x_test)
acc['svm'] = accuracy_score(pred, y_test)
svmc = classification_report(y_test, pred)
print('Classification report of Support Vector Machine : ')
print(svmc)

Classification report of Support Vector Machine : 
              precision    recall  f1-score   support

           0       0.92      0.27      0.42        88
           1       0.65      0.98      0.78       119

    accuracy                           0.68       207
   macro avg       0.78      0.63      0.60       207
weighted avg       0.76      0.68      0.63       207



# Gaussian Naive Bayes

In [69]:
from sklearn.naive_bayes import GaussianNB

GNB = GaussianNB()
GNB.fit(x_train, y_train)
pred = GNB.predict(x_test)
acc['Naive Bayes'] = accuracy_score(pred, y_test)
GNBc = classification_report(y_test, pred)
print('Classification report of Gaussian Naive Bayes : ')
print(GNBc)

Classification report of Gaussian Naive Bayes : 
              precision    recall  f1-score   support

           0       0.84      0.65      0.73        88
           1       0.78      0.91      0.84       119

    accuracy                           0.80       207
   macro avg       0.81      0.78      0.78       207
weighted avg       0.80      0.80      0.79       207



# Logistic Regression

In [70]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression(max_iter = 2000)
LR.fit(x_train, y_train)
pred = LR.predict(x_test)
acc['Logistic Regression'] = accuracy_score(pred, y_test)
LRc = classification_report(y_test, pred)
print('Classification report of Logistic Regression : ')
print(LRc)

Classification report of Logistic Regression : 
              precision    recall  f1-score   support

           0       0.80      0.90      0.84        88
           1       0.92      0.83      0.87       119

    accuracy                           0.86       207
   macro avg       0.86      0.86      0.86       207
weighted avg       0.87      0.86      0.86       207



# Best Classifier

In [71]:
print("The best classifier based on all the reports is : ")
print(f'{list(acc.keys())[list(acc.values()).index(max(acc.values()))]}')

The best classifier based on all the reports is : 
Random forest


# Relation between attributes and approval status

In [73]:
fig = make_subplots(rows=4, cols=3,
                   subplot_titles=("Gender plot", "Age plot", "Debt plot", "Marital Status plot", "Bank Customer plot", "Years Employed plot",
                                  "Prior Default plot", "Employment Status plot", "Credit Score plot", "License plot", "Citizenship plot", "Income plot"))
fig.add_trace(go.Bar(x=df['Gender'], y = df['ApprovalStatus'], name = 'Gender plot'), row=1, col=1)
fig.add_trace(go.Scatter(x=df['Age'], y=df['ApprovalStatus'], mode="markers+text",name = 'Age plot'), row=1, col=2)
fig.add_trace(go.Scatter(x=df['Debt'], y=df['ApprovalStatus'], mode="markers+text",name = 'Debt plot'), row=1, col=3)
fig.add_trace(go.Bar(x=df['Married'], y = df['ApprovalStatus'], name = 'Marital status plot'), row=2, col=1)
fig.add_trace(go.Bar(x=df['BankCustomer'], y = df['ApprovalStatus'], name = 'Bank Customer plot'), row=2, col=2)
fig.add_trace(go.Scatter(x=df['YearsEmployed'], y=df['ApprovalStatus'], mode="markers+text",name = 'Years Employed plot'), row=2, col=3)
fig.add_trace(go.Bar(x=df['PriorDefault'], y = df['ApprovalStatus'], name = 'Prior Default plot'), row=3, col=1)
fig.add_trace(go.Bar(x=df['Employed'], y = df['ApprovalStatus'], name = 'Employment Status plot'), row=3, col=2)
fig.add_trace(go.Scatter(x=df['CreditScore'], y=df['ApprovalStatus'], mode="markers+text",name = 'Credit Score plot'), row=3, col=3)
fig.add_trace(go.Bar(x=df['DriversLicense'], y = df['ApprovalStatus'], name = 'License plot'), row=4, col=1)
fig.add_trace(go.Bar(x=df['Citizen'], y = df['ApprovalStatus'], name = 'Citizenship plot'), row=4, col=2)
fig.add_trace(go.Scatter(x=df['Income'], y=df['ApprovalStatus'], mode="markers+text",name = 'Income plot'), row=4, col=3)
fig.update_layout(height = 1600, width = 960)
fig.show()

In [78]:
print("Enter the personal details : ")
gender = int(input("Enter gender (0 for female, 1 for male) "))
age = float(input("Enter age "))
debt = float(input("Enter debt "))
married = int(input("Enter marital status (0 for unmarried, 1 for married) "))
cust = int(input("Enter if you are a bank customer (0 for yes, 1 for no) "))
year = float(input("Enter years employed "))
priordef = int(input("Enter Prior Default (1 for True, 0 for False) "))
emp = int(input("Enter employment status (0 for employed, 1 for unemployed) "))
cred = int(input(f"Enter credit score between {min(df['CreditScore'])}-{max(df['CreditScore'])} "))
lic = int(input("Enter Driver license availability (0 for yes, 1 for no) "))
citizen = int(input("Enter citizenship (0 for a citizen, 1 for no) "))
income = int(input("Enter income "))
data = [[gender, age, debt, married, cust, year, priordef, emp, cred, lic, citizen, income]]
dd = pd.DataFrame(data, columns = [1,2,3,4,5,6,7,8,9,10,11,12])
pred = LR.predict(dd)
if pred == 1:
    print("Credit card Approved!")
else:
    print("Credit card denied, feel free to apply again")

Enter the personal details : 
Enter gender (0 for female, 1 for male) 1
Enter age 21.08
Enter debt 10.085
Enter marital status (0 for unmarried, 1 for married) 0
Enter if you are a bank customer (0 for yes, 1 for no) 1
Enter years employed 1.25
Enter Prior Default (1 for True, 0 for False) 0
Enter employment status (0 for employed, 1 for unemployed) 0
Enter credit score between 0-67 0
Enter Driver license availability (0 for yes, 1 for no) 0
Enter citizenship (0 for a citizen, 1 for no) 0
Enter income 0
Credit card Approved!



X does not have valid feature names, but LogisticRegression was fitted with feature names

