# An automatic credit card approval predictor using machine learning techniques



In [46]:
# loading libraries 
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression # logreg is a clf model!
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


In [29]:
# loading dataset
data=pd.read_csv('crx.data', header=None)


In [30]:
#Inspect dataset
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [31]:
# inspect dataset in detail 
cc_description= data.describe()
print (cc_description) 

               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


In [32]:
# Print DataFrame information
cc_info = data.info()
print(cc_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      690 non-null    object 
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB
None


In [33]:
# inspect missing values in dataset
data.tail(17)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


# Our dataset contains both numeric and non-numeric data (specifically data that are of float64, int64 and object types). Specifically, features 2, 7, 10, and 14 contain numeric values (of types float64, float64, int64, and int64 respectively) and all the other features contain non-numeric values.

# The dataset also contains values from several ranges. Some features have a value range of 0–28, some have a range of 2–67, and some have a range of 1017–100000. Apart from these, we can get useful statistical information (like mean, max, and min) about the features that have numerical values.

# Finally, the dataset has missing values, which we’ll take care of in this task. The missing values in the dataset are labelled with ‘?’, which can be seen in the last cell’s output.


In [34]:
# Replace ? mark with Nan temporarily 
data = data.replace("?", np.nan)

In [35]:
# inspect data again 
data.tail(17)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
673,,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


In [36]:
# impute the missing values with a strategy called mean imputation.
data.fillna("NaN", inplace=True)
# Count the number of NaNs in the dataset to verify
data.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

# There are still some missing values to be imputed for columns 0, 1, 3, 4, 5, 6, and 13. 
# All of these columns contain non-numeric data and this is why the mean imputation strategy would not work here. 
# This needs a different treatment.
# missing values with the most frequent values as present in the respective columns. 
# This is good practice when it comes to imputing missing values for categorical data in general.

In [37]:
# Iterate over each column of data
for col in data.columns:
    # Check if the column is of object type
    if data[col].dtype == 'object':
        # Impute with the most frequent value
        data = data.fillna(data[col].value_counts().index[0])

# Count the number of NaNs in the dataset and print the counts to verify
print(data.isnull().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


# preprocessing data 

Remaining preprocessing steps 
1. Convert the non-numeric data into numeric.
2. Split the data into train and test sets.
3. Scale the feature values to a uniform range.


# First, we will be converting all the non-numeric values into numeric ones. We do this because not only its result in a faster computation but also many machine learning models (like XGBoost) (and especially the ones developed using scikit-learn) require the data to be in a strictly numeric format. We will do this by using a technique called label encoding.

In [38]:
# Instantiate LabelEncoder
le = LabelEncoder()

# Iterate over all the values of each column and extract their dtypes
for col in data.columns.to_numpy():
    # Compare if the dtype is object
    if data[col].dtype =='object':
    # Use LabelEncoder to do the numeric transformation
        data[col]=le.fit_transform(data[col])

# Spliting the data into train and test

In [39]:
# Drop the features 11 (Driving licence and 13 (Zip code) and convert the DataFrame to a NumPy array
data = data.drop([11, 13], axis=1)
data = data.to_numpy()

# Segregate features and labels into separate variables
X,y = data[:,0:13] , data[:,13]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                y,
                                test_size=0.33,
                                random_state=42)

# The data is now split into two separate sets — train and test sets respectively. We are only left with one final preprocessing step of scaling before we can fit a machine learning model to the data.

# Now, let’s try to understand what these scaled values mean in the real world. Let’s use CreditScore as an example. The credit score of a person is their creditworthiness based on their credit history. The higher this number, the more financially trustworthy a person is considered to be. So, a CreditScore of 1 is the highest since we're rescaling all the values to the range of 0-1.


In [41]:
# Instantiate MinMaxScaler and use it to rescale X_train and X_test
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

# Fitting a logistic regression model to the train set
1. if a credit card application will be approved or not is a classification task
2.  According to UCI, our dataset contains more instances that correspond to “Denied” status than instances corresponding to “Approved” status. Specifically, out of 690 instances, there are 383 (55.5%) applications that got denied and 307 (44.5%) applications that got approved.
3. This gives us a benchmark. A good machine learning model should be able to accurately predict the status of the applications with respect to these statistics.
4. Which model should we pick? A question to ask is: are the features that affect the credit card approval decision process correlated with each other? Although we can measure correlation, that is outside the scope of this notebook, so we’ll rely on our intuition that they indeed are correlated for now. 
5. Because of this correlation, we’ll take advantage of the fact that generalized linear models perform well in these cases. Let’s start our machine learning modelling with a Logistic Regression model (a generalized linear model).

In [43]:

# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)

LogisticRegression()

# Making predictions and evaluating performance
1. But how well does our model perform?
2. We will now evaluate our model on the test set with respect to classification accuracy. 
3. But we will also take a look at the model’s confusion matrix. 
4. In the case of predicting credit card applications. 
5. it is equally important to see if our machine learning model is able to predict the approval status of the applications as denied that originally got denied. 
6. If our model is not performing well in this aspect, then it might end up approving the application that should have been approved. 
7. The confusion matrix helps us to view our model’s performance from these aspects.

In [45]:

# Use logreg to predict instances from the test set and store it
y_pred = logreg.predict(rescaledX_test)

# Get the accuracy score of logreg model and print it
print("Accuracy of logistic regression classifier: ", 
      logreg.score(rescaledX_test, y_test))

# Print the confusion matrix of the logreg model
confusion_matrix(y_test, y_pred)

Accuracy of logistic regression classifier:  0.8377192982456141


array([[93, 10],
       [27, 98]])

# Grid searching and making the model perform better
1. Our model was pretty good! It was able to yield an accuracy score of almost 84%.
2. For the confusion matrix, the first element of the first row of the confusion matrix denotes the true negatives meaning the number of negative instances (denied applications) predicted by the model correctly. 
3. And the last element of the second row of the confusion matrix denotes the true positives meaning the number of positive instances (approved applications) predicted by the model correctly.

# We can perform a grid search of the model parameters to improve the model’s ability to predict credit card approvals.
scikit-learn’s implementation of logistic regression consists of different hyperparameters but we will grid search over the following two:
tol
max_iter

In [47]:
# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

# Create a dictionary where tol and max_iter are keys and the lists of their values are corresponding values
param_grid = dict(tol = tol, max_iter = max_iter)

# Finding the best performing model
1. We have defined the grid of hyperparameter values and converted them into a single dictionary format which GridSearchCV() expects as one of its parameters. Now, we will begin the grid search to see which values perform best.
2. We will instantiate GridSearchCV() with our earlier logreg model with all the data we have. 
3. Instead of passing train and test sets separately, we will supply X (scaled version) and y. 
4. We will also instruct GridSearchCV() to perform a cross-validation of five folds.
We’ll end the article by storing the best-achieved score and the respective best parameters.
While building this credit card predictor, we tackled some of the most widely-known preprocessing steps such as scaling, label encoding, and missing value imputation. We finished with some machine learning to predict if a person’s application for a credit card would get approved or not given some information about that person.

In [48]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator= logreg, param_grid= param_grid, cv= 5)

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.852174 using {'max_iter': 100, 'tol': 0.01}
