In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<URL-encoded download URL>" entries (one entry here);
# the download loop splits on ',' and ':' and URL-decodes the second part.
# NOTE(review): the query string carries X-Goog-Date=20240224T020609Z and
# X-Goog-Expires=259200, so this signed link has presumably expired -- regenerate
# this cell from Kaggle if the download fails.
DATA_SOURCE_MAPPING = 'credit-card-approval:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3807174%2F6598147%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240224%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240224T020609Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7c689f52694a7a8e8f8f44ffe94c82ec0055ade971eaf0ea740acdbcb31f583baaaf2c6d7166488ef6d259252f677f9a7f12b77525af8eb2de29c400091a1301ceb32c36d77969f9699008f27cd8194ac24f4af21635d71811bcdd7edf017cc5524589e10d303ca8eca3a33a72f5cfb1a0f9e0b3df3abefb425c8dd92b6b66b038d8709e2cb3248f97640a04c8fcecbafb1731f812f760451176485b795c5ec42dd8970651d289ee00a915a9c69c69b90d303a70bce1215f7b3b65e4b704ea608a72ca897797cd9ccd2a2334cc2317545ecf0c0dbe90e279d4f58f2e8d8a80a4055832a1993f55a5ee276888253b2f831330636565ae1b807a883a4f8c36add6'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded URL>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # maxsplit=1: only the first ':' separates the directory from the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            # The header may be absent; fall back to 0 and skip the progress bar maths
            # instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            print(f'Downloading {directory}, {total_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: tarfile.open() below reopens the file by name.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind to a fresh name; `as tarfile` would replace the tarfile module and
                # break the next non-zip entry.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

**Use the following data to build a machine learning model that predicts credit card approval.**

In [None]:
# Labels for the 16 columns of the headerless file, in file order.
cols = [
    'Gender', 'Age', 'Debt', 'Married', 'BankCustomer', 'EducationLevel', 'Ethnicity',
    'YearsEmployed', 'PriorDefault', 'Employed', 'CreditScore', 'DriversLicense', 'Citizen',
    'ZipCode', 'Income', 'ApprovalStatus',
]

# Read the raw file (no header row) and attach the labels in one chained expression.
df = (
    pd.read_csv('/kaggle/input/credit-card-approval/cc_approvals.data', header=None)
    .set_axis(cols, axis='columns')
)
df.head()

In [None]:
# Per-column count of values pandas already recognises as missing.
df.isna().sum()

In [None]:
# Dtype pandas inferred for each raw column.
df.dtypes

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Per-column count of cells holding the literal '?' placeholder.
df.eq('?').sum()

In [None]:
# Convert every '?' placeholder into NaN so pandas' missing-value tools can see it.
# DataFrame.replace is vectorised and avoids DataFrame.applymap, which is deprecated
# in recent pandas releases.
df1 = df.replace('?', np.nan)
df1.head()

In [None]:
# Missing values per column once '?' has been converted to NaN.
df1.isna().sum()

In [None]:
# Fraction of rows we would lose by dropping every row that contains a null value
rows_total = len(df1)
rows_lost = rows_total - len(df1.dropna())
rows_lost / rows_total

In [None]:
# Drop all rows with null values. The explicit copy makes df2 an independent frame,
# so the column assignments in later cells do not raise SettingWithCopyWarning.
df2 = df1.dropna().copy()

In [None]:
# Cast the two columns in one astype() call. It returns a new frame, so nothing is
# written into a possibly-copied slice (no SettingWithCopyWarning) and the cell can
# be re-run safely.
df2 = df2.astype({'Age': float, 'ZipCode': int})

In [None]:
# Preview the cleaned frame after the dtype casts.
df2.head()

In [None]:
# Distinct labels present in the target column before it is encoded.
df2['ApprovalStatus'].unique()

In [None]:
# Encode the target as integers: '+' -> 1 and '-' -> 0.
# The identity entries (1 -> 1, 0 -> 0) make the cell idempotent: re-running it on an
# already-encoded column leaves it unchanged instead of mapping every value to NaN.
# assign() builds a new frame rather than writing into a possibly-copied slice.
df2 = df2.assign(ApprovalStatus=df2['ApprovalStatus'].map({'+': 1, '-': 0, 1: 1, 0: 0}))

In [None]:
# Column dtypes after cleaning and target encoding.
df2.dtypes

In [None]:
# Names of the columns that hold non-numerical (object) data
df2.select_dtypes(include='object').columns.tolist()

In [None]:
# Sub-frame with only the object-typed (non-numerical) columns.
object_column_names = df2.select_dtypes(include='object').columns.tolist()
df2_non_num = df2[object_column_names]
df2_non_num.head()

In [None]:
# All column labels of the cleaned frame.
df2.columns

In [None]:
# Names of the columns that hold numerical data.
# select_dtypes('number') matches every integer/float width. Comparing dtypes against
# the builtin `int` only matches the platform default integer, which can miss int64
# columns on platforms where that default is 32-bit.
df2.select_dtypes(include='number').columns.tolist()

In [None]:
# Sub-frame with only the numerical columns.
# select_dtypes('number') matches every integer/float width. Comparing dtypes against
# the builtin `int` only matches the platform default integer, which can silently drop
# int64 columns (including the encoded target) where that default is 32-bit.
df2_num = df2.select_dtypes(include='number')
df2_num.head()

In [None]:
# Mean, median, max and min of Age for each approval class.
df2.groupby('ApprovalStatus')['Age'].agg(['mean','median','max','min'])

In [None]:
# Bar chart of the per-class Age summary statistics.
age_summary = df2.groupby('ApprovalStatus')['Age'].agg(['mean','median','max','min'])
age_summary.plot.bar()
plt.show()

In [None]:
# Age distribution per approval class, before any outlier removal.
sns.boxplot(data = df2 , x = 'ApprovalStatus' , y = 'Age')
plt.show()

In [None]:
# Split the data into two parts => card not approved (0) and card approved (1)
approval_flag = df2['ApprovalStatus']

df2_a0 = df2[approval_flag == 0]
df2_a1 = df2[approval_flag == 1]

In [None]:
# Outlier analysis on Age for rows where the card is not approved

not_approved_age = df2_a0['Age']
q1, q3 = np.percentile(not_approved_age, [25, 75])   # first and third quartile
print(q3 , q1)
print()
iqr = q3 - q1
print(iqr)
print()
lf, uf = q1 - 1.5 * iqr, q3 + 1.5 * iqr              # lower fence, upper fence
print(lf , uf)
print()
# Rows outside the fences, as a share of the whole cleaned data set (df2).
is_outlier_0 = (not_approved_age > uf) | (not_approved_age < lf)
percentage_of_outlier_0 = is_outlier_0.sum()/len(df2)
print(percentage_of_outlier_0)

In [None]:
# Size of the not-approved subset before outlier removal.
df2_a0.shape

In [None]:
# Size of the approved subset before outlier removal.
df2_a1.shape

* Remove the rows flagged as outliers for class 0, i.e. when the card was not approved.

In [None]:
# Keep only the not-approved rows whose Age is not beyond either fence
age_is_outlier = (df2_a0['Age'] > uf) | (df2_a0['Age'] < lf)
df2_a0 = df2_a0[~age_is_outlier]

In [None]:
# Size of the not-approved subset after outlier removal.
df2_a0.shape

In [None]:
# Outlier analysis on Age for rows where the card is approved

approved_age = df2_a1['Age']
q1, q3 = np.percentile(approved_age, [25, 75])   # first and third quartile
print(q3 , q1)
print()
iqr = q3 - q1
print(iqr)
print()
lf, uf = q1 - 1.5 * iqr, q3 + 1.5 * iqr          # lower fence, upper fence
print(lf , uf)
print()
# Rows outside the fences, as a share of the whole cleaned data set (df2).
is_outlier_1 = (approved_age > uf) | (approved_age < lf)
percentage_of_outlier_1 = is_outlier_1.sum()/len(df2)
print(percentage_of_outlier_1)

* Remove the rows flagged as outliers for class 1, i.e. when the card was approved.

In [None]:
# Keep only the approved rows whose Age is not beyond either fence
age_is_outlier = (df2_a1['Age'] > uf) | (df2_a1['Age'] < lf)
df2_a1 = df2_a1[~age_is_outlier]

In [None]:
# Size of the approved subset after outlier removal.
df2_a1.shape

In [None]:
# Stack the two filtered class subsets back into a single frame (rows of class 0 first)
df3 = pd.concat([df2_a0, df2_a1], axis=0)

In [None]:
# Shapes before (df2) and after (df3) per-class outlier removal.
for frame in (df2, df3):
    print(frame.shape)

In [None]:
# Share of the cleaned rows that were removed as Age outliers.
rows_before, rows_after = len(df2), len(df3)
(rows_before - rows_after)/rows_before

**Overall Outlier Analysis**

In [None]:
# Age distribution before (df2) and after (df3) per-class outlier removal.
# Explicit axes with a shared y-axis make the two panels directly comparable, and the
# titles say which frame each panel shows.
fig, (ax_before, ax_after) = plt.subplots(1, 2, sharey=True)

sns.boxplot(data = df2 , y = 'Age' , ax = ax_before)
ax_before.set_title('Age before outlier removal')

sns.boxplot(data = df3 , y = 'Age' , ax = ax_after)
ax_after.set_title('Age after outlier removal')

fig.tight_layout()
plt.show()

In [None]:
# Quartiles, IQR and 1.5*IQR fences of Age over the whole cleaned data set.
q1, q3 = np.percentile(df2['Age'], [25, 75])
print(q3 , q1)
print()
iqr = q3 - q1
print(iqr)
print()
lf, uf = q1 - 1.5 * iqr, q3 + 1.5 * iqr   # lower fence, upper fence
print(lf , uf)

In [None]:
# Share of the cleaned rows whose Age lies outside the overall fences.
# The outliers are counted in df2, so the denominator must be len(df2) as well;
# dividing by len(df), the raw frame, understated the share.
((df2['Age'] > uf) | (df2['Age'] < lf)).sum()/len(df2)

In [None]:
# Age distribution per approval class after outlier removal (df3).
sns.boxplot(data = df3 , x = 'ApprovalStatus' , y = 'Age')
plt.show()

In [None]:
# Pairwise relationships between the numerical columns, coloured by approval class.
# NOTE(review): df2_num was derived from df2, i.e. before outlier removal -- confirm
# that plotting the unfiltered data here is intended.
sns.pairplot(df2_num , hue = 'ApprovalStatus')
plt.show()

In [None]:
# Preview the outlier-filtered frame that feeds the encoding step.
df3.head()

In [None]:
# Build the modelling frame with pd.get_dummies, using its default column selection.
# NOTE(review): presumably this one-hot encodes the remaining object columns and
# leaves numeric ones (including ZipCode) unchanged -- confirm in the preview below.
df4 = pd.get_dummies(df3)
df4.head()

In [None]:
# Class balance of the target in the modelling frame.
df4['ApprovalStatus'].value_counts()

In [None]:
# Target vector, and feature matrix (every other column).
Y = df4['ApprovalStatus']
X = df4.drop(columns='ApprovalStatus')

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split. A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
xtrain,xtest,ytrain,ytest = train_test_split(X,Y,train_size=0.8,random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report

****
K_Nearest_Neighbors
****

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# KNN is distance based, so features are standardised first; otherwise large-magnitude
# columns such as Income and ZipCode dominate the neighbour search. The pipeline fits
# the scaler on the training split only and exposes the usual
# fit/predict/predict_proba/score interface.
model1 = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
model1.fit(xtrain,ytrain)

print("Train Data Accuracy" , model1.score(xtrain,ytrain))
print("Test Data Accuracy" , model1.score(xtest,ytest))

In [None]:
# Confusion matrix of the KNN model on the training split.
confusion_matrix(ytrain , model1.predict(xtrain))

In [None]:
# Confusion matrix of the KNN model on the test split.
confusion_matrix(ytest , model1.predict(xtest))

In [None]:
# Classification report of the KNN model on the training split.
print(classification_report(ytrain , model1.predict(xtrain)))

In [None]:
# Classification report of the KNN model on the test split.
print(classification_report(ytest , model1.predict(xtest)))

****
Logistic Regression
****

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features and raise max_iter: on raw, differently-scaled columns the
# default solver tends to stop at its 100-iteration limit with a ConvergenceWarning.
# The pipeline keeps the fit/predict/predict_proba/score interface used further down.
model2 = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model2.fit(xtrain,ytrain)

print("Train Data Accuracy=" , model2.score(xtrain,ytrain))
print("Test Data Accuracy=" , model2.score(xtest,ytest))

In [None]:
# Classification report of the logistic-regression model on the training split.
print(classification_report(ytrain , model2.predict(xtrain)))

In [None]:
# Classification report of the logistic-regression model on the test split.
print(classification_report(ytest , model2.predict(xtest)))

****
Decision Tree Classifier
****

In [None]:
#Grid Search CV -> to tune hyperparameter for the model

from sklearn.model_selection import GridSearchCV

# Fixed seed so the search result is reproducible.
model3 = DecisionTreeClassifier(random_state=42)

param_grid = {
    'max_depth' : [None,1,2,3,4],
    'min_samples_split' : [5,7,9,10,11],
    'min_samples_leaf' : [1,2,3,4,5,6,7,8,9,10]
}

#Create a GridSearchCV object (5-fold cross-validation)
grid_search = GridSearchCV(model3 , param_grid , cv=5)

#Fit on the training split only: searching on the full X, Y lets the held-out test
#rows influence the chosen hyperparameters (data leakage) and inflates test scores.
grid_search.fit(xtrain,ytrain)

#Best Hyperparameter
print("Best Hyperparameter:" , grid_search.best_params_)

In [None]:
# Build the tree from the hyperparameters the grid search just selected, instead of
# values copied by hand from an earlier run that can silently go stale.
# Run this cell directly after the decision-tree grid search: `grid_search` is rebound
# by the random-forest search later in the notebook.
model4 = DecisionTreeClassifier(**grid_search.best_params_, random_state=42)
model4.fit(xtrain,ytrain)

print("Train Data Accuracy=" , model4.score(xtrain,ytrain))
print("Test Data Accuracy=" , model4.score(xtest,ytest))

In [None]:
# Classification report of the decision-tree model on the training split.
print(classification_report(ytrain , model4.predict(xtrain)))

In [None]:
# Classification report of the decision-tree model on the test split.
print(classification_report(ytest , model4.predict(xtest)))

****
Random Forest Classifier
****

In [None]:
#Grid Search CV -> to tune hyperparameter for the model

from sklearn.model_selection import GridSearchCV

# Fixed seed so the search result is reproducible.
model5 = RandomForestClassifier(random_state=42)

param_grid = {
    'n_estimators' : [5,10,20,40,80],
    'max_depth' : [None,1,2,3,4],
    'min_samples_split' : [5,7,9,10,11],
    'min_samples_leaf' : [1,2,3,4,5,6,7,8,9,10]
}

#Create a GridSearchCV object (5-fold cross-validation).
#The grid has 1250 combinations, so n_jobs=-1 spreads the fits over all CPU cores;
#it does not change the result.
grid_search = GridSearchCV(model5 , param_grid , cv=5 , n_jobs=-1)

#Fit on the training split only: searching on the full X, Y lets the held-out test
#rows influence the chosen hyperparameters (data leakage) and inflates test scores.
grid_search.fit(xtrain,ytrain)

#Best Hyperparameter
print("Best Hyperparameter:" , grid_search.best_params_)

In [None]:
# Build the forest from the hyperparameters the grid search just selected, instead of
# values copied by hand from an earlier run; the fixed seed makes the fit reproducible.
# Run this cell directly after the random-forest grid search.
model6 = RandomForestClassifier(**grid_search.best_params_, random_state=42)

model6.fit(xtrain,ytrain)

print("Train Data Accuracy=" , model6.score(xtrain,ytrain))
print("Test Data Accuracy=" , model6.score(xtest,ytest))

In [None]:
# Classification report of the random-forest model on the training split.
print(classification_report(ytrain , model6.predict(xtrain)))

In [None]:
# Classification report of the random-forest model (model6) on the test split.
# This cell previously reported model4, the decision tree, by mistake.
print(classification_report(ytest , model6.predict(xtest)))

**ROC AUC Curve**

In [None]:
from sklearn.metrics import roc_curve,roc_auc_score

In [None]:
#model1.predict_proba(xtrain)[:,1]

In [None]:
# ROC AUC of each fitted model on the held-out test split. Scoring on the training
# rows measures memorisation rather than generalisation and makes the models look
# better than they are.
aucKNN = roc_auc_score(ytest , model1.predict_proba(xtest)[:,1])
aucLogistic = roc_auc_score(ytest , model2.predict_proba(xtest)[:,1])
aucDTC = roc_auc_score(ytest , model4.predict_proba(xtest)[:,1])
aucRFC = roc_auc_score(ytest , model6.predict_proba(xtest)[:,1])

print("KNN=", aucKNN)
print("Logistic=", aucLogistic)
print("DecisionTreeClassifier=", aucDTC)
print("RandomForestClassifier=", aucRFC)

In [None]:
# ROC curve points (false-positive rate, true-positive rate, thresholds) of each model
# on the held-out test split; curves drawn from the training rows overstate how well
# the models separate unseen data.
fprKNN,tprKNN,threshKNN = roc_curve(ytest , model1.predict_proba(xtest)[:,1])
fprLogistic,tprLogistic,threshLogistic = roc_curve(ytest , model2.predict_proba(xtest)[:,1])
fprDTC,tprDTC,threshDTC = roc_curve(ytest , model4.predict_proba(xtest)[:,1])
fprRFC,tprRFC,threshRFC = roc_curve(ytest , model6.predict_proba(xtest)[:,1])

In [None]:
import matplotlib.pyplot as plt

# One ROC curve per model on explicit axes, with axis labels, a title and the
# chance-level diagonal so the figure can be read on its own.
fig, ax = plt.subplots()

ax.plot(fprKNN , tprKNN , marker = '*' , label = "KNN")
ax.plot(fprLogistic, tprLogistic , marker = '*' , label = "Logistic")   # label typo fixed
ax.plot(fprDTC ,tprDTC , marker = '*' , label = "Decision Tree Classifier")
ax.plot(fprRFC ,tprRFC , marker = '*' , label = "Random Forest Classifier")
ax.plot([0, 1], [0, 1], linestyle = '--', color = 'grey', label = "Chance")

ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("ROC curves")
ax.legend()
plt.show()