### 1. Load the data

In [94]:
import pandas as pd
import numpy as np

cc_apps = pd.read_csv("cc_approvals.data", header=None)

cc_apps.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260,0,-
686,a,22.67,0.75,u,g,c,v,2.0,f,t,2,t,g,200,394,-
687,a,25.25,13.5,y,p,ff,ff,2.0,f,t,1,t,g,200,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280,750,-
689,b,35.0,3.375,u,g,c,h,8.29,f,f,0,t,g,0,0,-


### 2. Inspect the data

The features of this dataset have been anonymized to protect the privacy. The probable features in a typical credit card application are Gender, Age, Debt, Married, BankCustomer, EducationLevel, Ethnicity, YearsEmployed, PriorDefault, Employed, CreditScore, DriversLicense, Citizen, ZipCode, Income and finally the ApprovalStatus.

In [82]:
cc_apps_description = cc_apps.describe()
print(cc_apps_description)

print("\n")

cc_apps_info = cc_apps.info()
print(cc_apps_info)

print("\n")

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 no

### 3. Splitting the dataset into train and test sets


In [83]:
cc_apps = cc_apps.drop([11, 13], axis=1)

from sklearn.model_selection import train_test_split

print(cc_apps.head())
# Split into train and test sets
cc_apps_train, cc_apps_test = train_test_split(cc_apps, test_size=0.33, random_state=42)

  0      1      2  3  4  5  6     7  8  9   10 12   14 15
0  b  30.83  0.000  u  g  w  v  1.25  t  t   1  g    0  +
1  a  58.67  4.460  u  g  q  h  3.04  t  t   6  g  560  +
2  a  24.50  0.500  u  g  q  h  1.50  t  f   0  g  824  +
3  b  27.83  1.540  u  g  w  v  3.75  t  t   5  g    3  +
4  b  20.17  5.625  u  g  w  v  1.71  t  f   0  s    0  +


### 4. Handling missing values

In [84]:
cc_apps_train = cc_apps_train.replace('?', np.nan)
cc_apps_test = cc_apps_test.replace('?', np.nan)

In [85]:
cc_apps_train = cc_apps_train.fillna(cc_apps_train.mean())
cc_apps_test = cc_apps_test.fillna(cc_apps_test.mean())

print(cc_apps_train.isnull().sum())

0     8
1     5
2     0
3     6
4     6
5     7
6     7
7     0
8     0
9     0
10    0
12    0
14    0
15    0
dtype: int64


  cc_apps_train = cc_apps_train.fillna(cc_apps_train.mean())
  cc_apps_test = cc_apps_test.fillna(cc_apps_test.mean())


In [86]:
for col in cc_apps_train.columns:
    if cc_apps_train[col].dtypes == 'object':
        cc_apps_train = cc_apps_train.fillna(cc_apps_train[col].value_counts().index[0])
        cc_apps_test = cc_apps_test.fillna(cc_apps_test[col].value_counts().index[0])

print(cc_apps_train.isnull().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
12    0
14    0
15    0
dtype: int64


### 5. Preprocessing the data (part I)

The reindexing step is used for discarding any new categorical feature that'd appear in the test data.

In [87]:
cc_apps_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,12,14,15
382,a,24.33,2.5,y,p,i,bb,4.5,f,f,0,g,456,-
137,b,33.58,2.75,u,g,m,v,4.25,t,t,6,g,0,+
346,b,32.25,1.5,u,g,c,v,0.25,f,f,0,g,122,-
326,b,30.17,1.085,y,p,c,v,0.04,f,f,0,g,179,-
33,a,36.75,5.125,u,g,e,v,5.0,t,f,0,g,4000,+


In [88]:
cc_apps_train = pd.get_dummies(cc_apps_train)
cc_apps_test = pd.get_dummies(cc_apps_test)

cc_apps_test = cc_apps_test.reindex(columns=cc_apps_train.columns, fill_value=0)

cc_apps_train.head()

Unnamed: 0,2,7,10,14,0_a,0_b,1_13.75,1_15.83,1_15.92,1_16.00,...,6_z,8_f,8_t,9_f,9_t,12_g,12_p,12_s,15_+,15_-
382,2.5,4.5,0,456,1,0,0,0,0,0,...,0,1,0,1,0,1,0,0,0,1
137,2.75,4.25,6,0,0,1,0,0,0,0,...,0,0,1,0,1,1,0,0,1,0
346,1.5,0.25,0,122,0,1,0,0,0,0,...,0,1,0,1,0,1,0,0,0,1
326,1.085,0.04,0,179,0,1,0,0,0,0,...,0,1,0,1,0,1,0,0,0,1
33,5.125,5.0,0,4000,1,0,0,0,0,0,...,0,0,1,1,0,1,0,0,1,0


When a dataset has varying ranges as in this credit card approvals dataset, one a small change in a particular feature may not have a significant effect on the other feature, which can cause a lot of problems when predictive modeling.

In [89]:
from sklearn.preprocessing import MinMaxScaler

X_train, y_train = cc_apps_train.iloc[:, 0:12].values, cc_apps_train.iloc[:, 12].values
X_test, y_test = cc_apps_test.iloc[:, 0:12].values, cc_apps_test.iloc[:, 12].values

scaler = MinMaxScaler(feature_range=(0, 1))

rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.fit_transform(X_test)

### 6. Preprocessing the data (part II)

Essentially, predicting if a credit card application will be approved or not is a classification task. According to UCI, our dataset contains more instances that correspond to "Denied" status than instances corresponding to "Approved" status. Specifically, out of 690 instances, there are 383 (55.5%) applications that got denied and 307 (44.5%) applications that got approved.

This gives us a benchmark. A good machine learning model should be able to accurately predict the status of the applications with respect to these statistics.

In [90]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

logreg.fit(rescaledX_train, y_train)

LogisticRegression()

### 7. Making predictions and evaluating performance

In [91]:
from sklearn.metrics import confusion_matrix

y_pred = logreg.predict(rescaledX_test)

print("Accuracy of logistic regression classifier: ", logreg.score(rescaledX_test, y_test))

confusion_matrix(y_test, y_pred)

Accuracy of logistic regression classifier:  1.0


array([[228]])

In [92]:
from sklearn.model_selection import GridSearchCV

tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

param_grid = dict(tol=tol, max_iter=max_iter)

In [93]:
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

grid_model_result = grid_model.fit(rescaledX_train, y_train)

best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

best_model = grid_model_result.best_estimator_
print("Accuracy of best model: ", best_model.score(rescaledX_test, y_test))

Best: 0.993525 using {'max_iter': 100, 'tol': 0.01}
Accuracy of best model:  1.0


