<a href="https://colab.research.google.com/github/HectorPulido/Angry-birds-like-game-made-with-UNITY-and-C-/blob/master/%20Credit_Approval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysis of Credit Approval Data
This is a dataset from [Credit Approval Data Set](https://archive.ics.uci.edu/ml/datasets/credit+approval) More info here [Analysis of Credit Approval Data](https://rstudio-pubs-static.s3.amazonaws.com/73039_9946de135c0a49daa7a0a9eda4a67a72.html)

## Importing Dependencies

In [0]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np


## Downloading the dataset

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"
!wget {url}
!ls

--2019-07-01 05:37:48--  https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 32218 (31K) [application/x-httpd-php]
Saving to: ‘crx.data.4’


2019-07-01 05:37:48 (996 KB/s) - ‘crx.data.4’ saved [32218/32218]

crx.data  crx.data.1  crx.data.2  crx.data.3  crx.data.4  sample_data


## Here we go...

In [3]:
cc_apps = pd.read_csv('crx.data', header = None)
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


## Dataset summary

In [4]:
# Print summary statistics
cc_apps_description = cc_apps.describe()
print(cc_apps_description)

print("\n")

# Print DataFrame information
cc_apps_info = cc_apps.info()
print(cc_apps_info)

print("\n")

# Inspect missing values in the dataset
cc_app_missing = cc_apps.tail(17)
print(cc_app_missing)


               2           7          10             14
count  690.000000  690.000000  690.00000     690.000000
mean     4.758725    2.223406    2.40000    1017.385507
std      4.978163    3.346513    4.86294    5210.102598
min      0.000000    0.000000    0.00000       0.000000
25%      1.000000    0.165000    0.00000       0.000000
50%      2.750000    1.000000    0.00000       5.000000
75%      7.207500    2.625000    3.00000     395.500000
max     28.000000   28.500000   67.00000  100000.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
0     690 non-null object
1     690 non-null object
2     690 non-null float64
3     690 non-null object
4     690 non-null object
5     690 non-null object
6     690 non-null object
7     690 non-null float64
8     690 non-null object
9     690 non-null object
10    690 non-null int64
11    690 non-null object
12    690 non-null object
13    690 non-null object
14    690 non-null int64

## Replacing ? by NaN

In [5]:
# Inspect missing values in the dataset before replacing
print(cc_apps.isna().sum())

# Replace the '?'s with NaN
cc_apps = cc_apps.replace(to_replace = "?", value = np.nan)

# Inspect missing values in the dataset after replacing
print(cc_apps.isna().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64
0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64


## Remove NaN changing it by the mean

In [6]:
# Inspect missing values in the dataset before fillna
print(cc_apps.isna().sum())

cc_apps[2] = cc_apps[2].fillna(cc_apps[2].mean())
cc_apps[7] = cc_apps[7].fillna(cc_apps[7].mean())
cc_apps[10] = cc_apps[10].fillna(cc_apps[10].mean())
cc_apps[14] = cc_apps[14].fillna(cc_apps[14].mean())

# Inspect missing values in the dataset after fillna
print(cc_apps.isna().sum())

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64
0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64


## Change the null values by the mode

In [7]:
# Iterate over each column of cc_apps
for col in cc_apps.columns:
    # Check if the column is of object type
    if cc_apps[col].dtypes == 'object':
        # Impute with the most frequent value
        cc_apps.fillna(cc_apps[col].value_counts().index[0], inplace=True)        

print(cc_apps.isna().sum())

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64


## Converting labels into numbers

In [8]:
le = LabelEncoder()
for col in cc_apps.columns:
    if cc_apps[col].dtypes == 'object':
        cc_apps[col] = le.fit(cc_apps[col]).transform(cc_apps[col])
        
print(cc_apps.head(7))
print(cc_apps.tail(7))


   0    1      2   3   4   5   6     7   8   9   10  11  12   13     14  15
0   1  156  0.000   2   1  13   8  1.25   1   1   1   0   0   68      0   0
1   0  328  4.460   2   1  11   4  3.04   1   1   6   0   0   11    560   0
2   0   89  0.500   2   1  11   4  1.50   1   0   0   0   0   96    824   0
3   1  125  1.540   2   1  13   8  3.75   1   1   5   1   0   31      3   0
4   1   43  5.625   2   1  13   8  1.71   1   0   0   0   2   37      0   0
5   1  168  4.000   2   1  10   8  2.50   1   0   0   1   0  115      0   0
6   1  179  1.040   2   1  12   4  6.50   1   0   0   1   0   54  31285   0
     0    1       2   3   4   5   6      7   8   9   10  11  12   13   14  15
683   1  208   0.750   3   3   4   8  0.585   0   0   0   0   0   84    3   1
684   1  240   3.290   2   1  10   8  3.500   0   0   0   1   2  129    0   1
685   1   52  10.085   3   3   5   4  1.250   0   0   0   0   0   90    0   1
686   0   71   0.750   2   1   2   8  2.000   0   1   2   1   0   67  394   1
68

## Drop 11 and 13 for the performance sake

In [9]:

cc_apps = cc_apps.drop([11, 13], axis=1, errors="ignore")
cc_apps_array = np.array(cc_apps)

# # Segregate features and labels into separate variables
X,y = cc_apps_array[:,0:12] , cc_apps_array[:,13]
print(cc_apps_array.shape)
print(X.shape)
print(y.shape)

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(690, 14)
(690, 12)
(690,)
(462, 12)
(228, 12)
(462,)
(228,)


## Rescaling the data

In [0]:
scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_X_train = scaler.fit_transform(X_train)
rescaled_X_test = scaler.fit_transform(X_test)

## Training the model

In [11]:
logreg = LogisticRegression(solver='liblinear')
logreg.fit(rescaled_X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

## How much did we fuck it up?

In [12]:
y_pred = logreg.predict(rescaled_X_test)

print("Accuracy of logistic regression classifier in Train: ", logreg.score(rescaled_X_train, y_train))
print("Accuracy of logistic regression classifier in Test: ", logreg.score(rescaled_X_test, y_test))

confusion_matrix(y_test, y_pred)

Accuracy of logistic regression classifier in Train:  0.8722943722943723
Accuracy of logistic regression classifier in Test:  0.8377192982456141


array([[92, 11],
       [26, 99]])

## Little bit of meta-learning

In [13]:
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]
param_grid = dict(tol=tol, max_iter=max_iter)

# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

# Use scaler to rescale X and assign it to rescaledX
rescaledX = scaler.fit_transform(X)

# Fit data to grid_model
grid_model_result = grid_model.fit(rescaledX, y)

# Summarize results
best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_score, best_params))

Best: 0.852174 using {'max_iter': 100, 'tol': 0.01}


## Voila!