In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [34]:
df = pd.read_csv('cc_approvals.csv', header= None) 

In [35]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


## Its quite common to see this type of datasets where all the features and data are being anonymized.

### Lets try to figure out which are the most important features in our dataset.
#### The typical features in a credit card are:
    * Gender
    * Age
    * Debt
    * Married
    * BankCustomer
    * EducationLevel
    * Ethnicity
    * YearsEmployeed
    * PriorDefault
    * Employed
    * CreditScore
    * ZipCode
    * Income
    * ApprovalStatus
### This gives us a pretty good starting point, and we can map these features with respect to the columns in the output

In [36]:
df.describe()

Unnamed: 0,2,7,10,14
count,690.0,690.0,690.0,690.0
mean,4.758725,2.223406,2.4,1017.385507
std,4.978163,3.346513,4.86294,5210.102598
min,0.0,0.0,0.0,0.0
25%,1.0,0.165,0.0,0.0
50%,2.75,1.0,0.0,5.0
75%,7.2075,2.625,3.0,395.5
max,28.0,28.5,67.0,100000.0


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    object 
 1   1       690 non-null    object 
 2   2       690 non-null    float64
 3   3       690 non-null    object 
 4   4       690 non-null    object 
 5   5       690 non-null    object 
 6   6       690 non-null    object 
 7   7       690 non-null    float64
 8   8       690 non-null    object 
 9   9       690 non-null    object 
 10  10      690 non-null    int64  
 11  11      690 non-null    object 
 12  12      690 non-null    object 
 13  13      690 non-null    object 
 14  14      690 non-null    int64  
 15  15      690 non-null    object 
dtypes: float64(2), int64(2), object(12)
memory usage: 86.4+ KB


In [38]:
df.tail(17)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
673,?,29.5,2.0,y,p,e,h,2.0,f,f,0,f,g,256,17,-
674,a,37.33,2.5,u,g,i,h,0.21,f,f,0,f,g,260,246,-
675,a,41.58,1.04,u,g,aa,v,0.665,f,f,0,f,g,240,237,-
676,a,30.58,10.665,u,g,q,h,0.085,f,t,12,t,g,129,3,-
677,b,19.42,7.25,u,g,m,v,0.04,f,t,1,f,g,100,1,-
678,a,17.92,10.21,u,g,ff,ff,0.0,f,f,0,f,g,0,50,-
679,a,20.08,1.25,u,g,c,v,0.0,f,f,0,f,g,0,0,-
680,b,19.5,0.29,u,g,k,v,0.29,f,f,0,f,g,280,364,-
681,b,27.83,1.0,y,p,d,h,3.0,f,f,0,f,g,176,537,-
682,b,17.08,3.29,u,g,i,v,0.335,f,f,0,t,g,140,2,-


## Handling missing values

## We've uncovered some issues that will affect the performance of our machine learning model(s) if they go unchanged:

 * Our dataset contains both numeric and non-numeric data (specifically data that are of float64, int64 and object types). Specifically, the features 2, 7, 10 and 14 contain numeric values (of types float64, float64, int64 and int64 respectively) and all the other features contain non-numeric values.
 * The dataset also contains values from several ranges. Some features have a value range of 0 - 28, some have a range of 2 - 67, and some have a range of 1017 - 100000. Apart from these, we can get useful statistical information (like mean, max, and min) about the features that have numerical values.
 * Finally, the dataset has missing values, which we'll take care of in this task. The missing values in the dataset are labeled with '?', which can be seen in the last cell's output.
 
### Now, let's temporarily replace these missing value question marks with NaN.

In [39]:
df.isnull().values.sum()

0

In [40]:
df = df.replace('?', np.nan)

#### Total NaN:

In [41]:
df.isnull().values.sum()

67

### Total NaN in each column:

In [42]:
df.isnull().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

#### We replaced all the question marks with NaNs. This is going to help us in the next missing value treatment that we are going to perform.

#### An important question that gets raised here is why are we giving so much importance to missing values? Can't they be just ignored? Ignoring missing values can affect the performance of a machine learning model heavily. While ignoring the missing values our machine learning model may miss out on information about the dataset that may be useful for its training. Then, there are many models which cannot handle missing values implicitly.

### So, to avoid this problem, we are going to impute the missing values with a strategy called mean imputation.

In [43]:
df.fillna(df.mean(), inplace=True)

In [44]:
df.isnull().values.sum()

67

### We dont see any change because all the missing values are not in a number type of variable, instead they are all in an object type

#### We have successfully taken care of the missing values present in the numeric columns. There are still some missing values to be imputed for columns 0, 1, 3, 4, 5, 6 and 13. All of these columns contain non-numeric data and this why the mean imputation strategy would not work here. This needs a different treatment.

#### We are going to impute these missing values with the most frequent values as present in the respective columns. This is good practice when it comes to imputing missing values for categorical data in general.

In [45]:
for col in df:
    # Check if the column is of object type
    if df[col].dtypes == 'object':
        # Impute with the most frequent value
        df = df.fillna(df[col].value_counts().index[0])

In [46]:
df.isnull().values.sum()

0

## Preprocessing the data
### The missing values are now successfully handled.

#### There is still some minor but essential data preprocessing needed before we proceed towards building our machine learning model. We are going to divide these remaining preprocessing steps into three main tasks:
    * Convert the non-numeric data into numeric.
    * Split the data into train and test sets.
    * Scale the feature values to a uniform range.
    
#### First, we will be converting all the non-numeric values into numeric ones. We do this because not only it results in a faster computation but also many machine learning models (like XGBoost) (and especially the ones developed using scikit-learn) require the data to be in a strictly numeric format. We will do this by using a technique called label encoding.

In [47]:
from sklearn.preprocessing import LabelEncoder

In [48]:
le = LabelEncoder()

In [49]:
# Iterate over all the values of each column and extract their dtypes
for col in df:
    # Compare if the dtype is object
    if df[col].dtypes =='object':
    # Use LabelEncoder to do the numeric transformation
        le.fit(df[col])
        df[col]=le.transform(df[col])
#  information of the new dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       690 non-null    int32  
 1   1       690 non-null    int32  
 2   2       690 non-null    float64
 3   3       690 non-null    int32  
 4   4       690 non-null    int32  
 5   5       690 non-null    int32  
 6   6       690 non-null    int32  
 7   7       690 non-null    float64
 8   8       690 non-null    int32  
 9   9       690 non-null    int32  
 10  10      690 non-null    int64  
 11  11      690 non-null    int32  
 12  12      690 non-null    int32  
 13  13      690 non-null    int32  
 14  14      690 non-null    int64  
 15  15      690 non-null    int32  
dtypes: float64(2), int32(12), int64(2)
memory usage: 54.0 KB


## Splitting the dataset into train and test set

#### Now, we will split our data into train set and test set to prepare our data for two different phases of machine learning modeling: training and testing. Ideally, no information from the test data should be used to scale the training data or should be used to direct the training process of a machine learning model. Hence, we first split the data and then apply the scaling.

#### Also, features like DriversLicense and ZipCode are not as important as the other features in the dataset for predicting credit card approvals. We should drop them to design our machine learning model with the best set of features. In Data Science literature, this is often referred to as feature selection.

In [50]:
from sklearn.model_selection import train_test_split

In [51]:
df.drop([11, 13], axis=1, inplace=True)

In [52]:
df = df.values

In [54]:
X, y = df[:,0:12], df[:,13]

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

### The data is now split into two separate sets - train and test sets respectively. We are only left with one final preprocessing step of scaling before we can fit a machine learning model to the data.

In [57]:
from sklearn.preprocessing import StandardScaler

In [60]:
scaler = StandardScaler()
scaler.fit(X_train)
rescaledX_train = scaler.transform(X_train)
rescaledX_test = scaler.transform(X_test)

## Using Decision Tree Classifier Model

In [80]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [65]:
dt_class = DecisionTreeClassifier()

In [72]:
dt_class.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [73]:
y_pred = dt_class.predict(X_test)

In [77]:
accuracy_score(y_test,y_pred)

0.7971014492753623

In [81]:
confusion_matrix(y_test, y_pred)

array([[54, 16],
       [12, 56]], dtype=int64)

#### For the confusion matrix, the first element of the of the first row of the confusion matrix denotes the true negatives meaning the number of negative instances (denied applications) predicted by the model correctly. And the last element of the second row of the confusion matrix denotes the true positives meaning the number of positive instances (approved applications) predicted by the model correctly.

In [84]:
from sklearn.model_selection import GridSearchCV

In [88]:
def dtree_grid_search(X,y,nfolds):
    #create a dictionary of all values we want to test
    param_grid = { 'criterion':['gini','entropy'],'max_depth': np.arange(3, 15)}
    # decision tree model
    dtree_model=DecisionTreeClassifier()
    #use gridsearch to test all values
    dtree_gscv = GridSearchCV(dtree_model, param_grid, cv=nfolds)
    #fit model to data
    dtree_gscv.fit(X, y)
    return dtree_gscv

In [89]:
df_class_best = dtree_grid_search(rescaledX_train, y_train, 3)

In [90]:
y_pred_2 = df_class_best.predict(rescaledX_test)

In [91]:
accuracy_score(y_test,y_pred_2)

0.8478260869565217

### Using GridSearch we can see that we got 5% of increase in our model performance