<h1><center>Credit Risk Analysis</center></h1>
 

### imports

In [None]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 999
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_curve, roc_auc_score
from scipy.stats import chi2_contingency,ttest_ind
from sklearn.utils import shuffle
import time

import warnings
warnings.filterwarnings('ignore')


In [None]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}


## Load Dataset


Let's take a quick look at the dataset.


In [None]:
# Read the customer history CSV from the notebook-relative data directory.
cust_pd_full = pd.read_csv('./data/CUST_HISTORY_1000.csv')

# `cust_pd` is the working name used by the following cells; it refers to the
# same DataFrame object as `cust_pd_full` (no row subsetting, no copy).
cust_pd = cust_pd_full

n_observations, n_variables = cust_pd_full.shape
print(f"There are {n_observations} observations in the customer history dataset.")
print(f"There are {n_variables} variables in the dataset.")


# Data Preparation

In [None]:
# Preview the first five rows of the raw customer frame.
cust_pd.head(n=5)

## Split Dataframe into Features and Label

In [None]:
# Label frame: a single-column DataFrame holding IS_DEFAULT.
# `.copy()` makes it an independent frame, so the in-place column assignment done
# in the "Transform Label" cell is not a chained assignment on a selection of
# `cust_pd` (which pandas flags with SettingWithCopyWarning).
cust_pd_Y = cust_pd[['IS_DEFAULT']].copy()

# Feature frame: every column except the label. `drop` returns a new DataFrame.
cust_pd_X = cust_pd.drop(['IS_DEFAULT'], axis=1)

print('cust_pd_X.shape=%s, cust_pd_Y.shape=%s'% (cust_pd_X.shape, cust_pd_Y.shape))


## Transform Label

In [None]:
# Preview the label column before it is encoded.
cust_pd_Y.head(n=5)

In [None]:
# Encode the IS_DEFAULT label values as integer class ids; `le` keeps the fitted
# mapping so the ids can be translated back later.
le = LabelEncoder()
encoded_default = le.fit_transform(cust_pd_Y['IS_DEFAULT'])
cust_pd_Y['IS_DEFAULT'] = encoded_default
cust_pd_Y.head()

## Transform Features

In [None]:
# Report the feature frame's dimensions, then preview its first rows.
print('features df shape = {}'.format(cust_pd_X.shape))
cust_pd_X.head()

### Label Encoder for categorical Columns

In [None]:
# Columns treated as categorical by the encoding cells below.
categoricalColumns = [
    'CREDIT_HISTORY', 'TRANSACTION_CATEGORY', 'ACCOUNT_TYPE', 'ACCOUNT_AGE',
    'STATE', 'IS_URBAN', 'IS_STATE_BORDER', 'HAS_CO_APPLICANT', 'HAS_GUARANTOR',
    'OWN_REAL_ESTATE', 'OTHER_INSTALMENT_PLAN',
    'OWN_RESIDENCE', 'RFM_SCORE', 'OWN_CAR', 'SHIP_INTERNATIONAL',
]

# Positional index of each categorical column inside the feature frame, in the
# same order as `categoricalColumns`. A missing column raises KeyError here.
cat_indexes = np.asarray([cust_pd_X.columns.get_loc(name) for name in categoricalColumns])

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# labelEncoderList: column name -> fitted LabelEncoder for that column.
# labelList: one array per categorical column holding "<COLUMN>_<class>" names
#            (spaces in the class value replaced by underscores), in encoder class order.
labelEncoderList = {}
labelList = []
for col in categoricalColumns:
    labenc = LabelEncoder()
    # Replaces the column in `cust_pd_X` with its integer codes (in place).
    cust_pd_X[col] = labenc.fit_transform(cust_pd_X[col])
    labelEncoderList[col] = labenc
    class_names = ["{}_{}".format(col, str(clas).replace(' ', '_')) for clas in labenc.classes_]
    labelList.append(np.asarray(class_names))

# Same object as `cust_pd_X`; the name is used again by the random-forest section.
labelEncoded_X = cust_pd_X
cust_pd_X.head()

In [None]:
# Flatten the per-column class-name arrays into one 1-D array of one-hot column names.
collabelList = np.concatenate(labelList)

### One hot encoding for categorical Columns

In [None]:
# One-hot encode only the categorical columns; all other columns are appended
# unchanged to the right of the encoded block.
#
# The `categorical_features` / `n_values` constructor arguments exist only in older
# scikit-learn releases; newer releases reject them with a TypeError. In that case
# build the equivalent encoder with ColumnTransformer, so the cell runs on both.
try:
    OH_enc = OneHotEncoder(categorical_features=cat_indexes, handle_unknown='ignore', n_values="auto")
except TypeError:
    from sklearn.compose import ColumnTransformer

    OH_enc = ColumnTransformer(
        transformers=[(
            'onehot',
            OneHotEncoder(handle_unknown='ignore'),
            # Sorted so the encoded columns follow their positional order in the
            # input, which is how the legacy encoder laid out its selected columns.
            sorted(int(idx) for idx in cat_indexes),
        )],
        remainder='passthrough',
        # Keep the stacked result sparse (as the legacy encoder did), so that
        # downstream `.toarray()` calls keep working.
        sparse_threshold=1.0,
    )
OH_enc.fit(cust_pd_X.values)

In [None]:
# Column names for the encoded matrix: the one-hot class columns first, then the
# columns that were not one-hot encoded. The latter are derived from the
# label-encoded frame (in its column order) instead of being hard-coded, so the
# names cannot fall out of sync with the columns actually present in the data.
passthrough_cols = [name for name in labelEncoded_X.columns if name not in categoricalColumns]
newcols = np.append(collabelList, passthrough_cols)

In [None]:
# Apply the fitted encoder, then densify the sparse result into a labelled DataFrame.
cust_pd_X_enc = OH_enc.transform(cust_pd_X)
dense_encoded = cust_pd_X_enc.toarray()
cust_pd_X_df = pd.DataFrame(data=dense_encoded, columns=newcols)
cust_pd_X_df.head()

### Feature Normalization 

In [None]:
# Step 1: rescale every encoded column with a min-max scaler.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed further down, so test rows influence the scaling.
min_max_scaler = MinMaxScaler()
scaled_features = min_max_scaler.fit_transform(cust_pd_X_df)

# Step 2: L1-normalise each row (axis=1).
features = normalize(scaled_features, norm='l1', axis=1)

# From here on `cust_pd_X` refers to the normalised one-hot feature frame.
cust_pd_X = pd.DataFrame(features, columns=newcols)
cust_pd_X.head()

## Split Train and Test Dataset

In [None]:
# Plain arrays for scikit-learn: labels as an (n, 1) column, features as an (n, d) matrix.
features = cust_pd_X.values
label = cust_pd_Y.values.reshape(-1, 1)

# 70/30 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.3,
    random_state=42,
    stratify=label,
)
print(f'X_train.shape={X_train.shape} Y_train.shape={y_train.shape}')
print(f'X_test.shape={X_test.shape} Y_test.shape={y_test.shape}')

# Sklearn Training

In [None]:
# Logistic-regression classifier with verbose=1; it is fitted in the next cell.
from sklearn.linear_model import LogisticRegression

sklearn_lr = LogisticRegression(
    verbose=1,
)

In [None]:
# TRAIN
# `y_train` is an (n, 1) column vector; estimators expect 1-D targets, so flatten
# it for the fit call (`y_train` itself is left untouched for later cells).
# perf_counter() is a monotonic clock intended for measuring elapsed intervals.
t0 = time.perf_counter()
sklearn_lr.fit(X_train, y_train.ravel())
print("[sklearn] Training time (s):  {0:.5f}".format(time.perf_counter()-t0))


In [None]:
# Optional log-loss evaluation on the test set (disabled):
# proba_test = sklearn_lr.predict_proba(X_test)
# from sklearn.metrics import log_loss
# logloss_sklearn = log_loss(y_test, proba_test)
# print("[sklearn] Logarithmic loss:   {0:.4f}".format(logloss_sklearn))

# Hard class predictions on the held-out split, scored by plain accuracy.
sklearn_prediction = sklearn_lr.predict(X_test)
sklearn_accuracy = accuracy_score(y_test, sklearn_prediction)
print(f'sklearn ml accuracy score = {sklearn_accuracy}')

## Further Analysis

### Check how good our model is

In [None]:
# Column names of the label-encoded frame, in order; used to name features below.
features_order = list(labelEncoded_X.columns)
labelEncoded_X.head()

In [None]:
# Tree-model branch: skip one-hot encoding and work from the label-encoded frame,
# applying the same min-max scaling and row-wise L1 normalisation as before.
min_max_scaler_ = MinMaxScaler()
scaled_label_encoded = min_max_scaler_.fit_transform(labelEncoded_X)
cust_pd_X_ = pd.DataFrame(
    normalize(scaled_label_encoded, axis=1, norm='l1'),
    columns=features_order,
)

features_ = cust_pd_X_.values
# Flatten the single-column label frame to a 1-D array.
label_ = cust_pd_Y.values.reshape(-1)

# Same 70/30 stratified split and seed as the one-hot branch.
X_train_, X_test_, y_train_, y_test_ = train_test_split(
    features_,
    label_,
    test_size=0.3,
    random_state=42,
    stratify=label_,
)

In [None]:
# `sklearn.cross_validation` no longer exists in scikit-learn; the splitter lives in
# `sklearn.model_selection`, takes `n_splits`, and yields index pairs from `.split(X, y)`.
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the fold scores are reproducible across kernel restarts.
random_forest = RandomForestClassifier(random_state=42)

# Stratified 5-fold cross-validation on the training split only:
# refit on four folds, report mean accuracy on the held-out fold.
skf = StratifiedKFold(n_splits=5)
for counter, (train_fold, test_fold) in enumerate(skf.split(X_train_, y_train_), start=1):
    random_forest.fit(X_train_[train_fold], y_train_[train_fold])
    print( str(counter) + ": ", random_forest.score(X_train_[test_fold], y_train_[test_fold]))

### List the top features that influence the model

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Fit an extra-trees ensemble (250 trees, fixed seed) on the label-encoded training split.
forest = ExtraTreesClassifier(random_state=0, n_estimators=250)
forest.fit(X_train_, y_train_)

# Ensemble-level importances, plus their per-feature spread across the individual trees.
importances = forest.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances, axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Print the five highest-ranked features by name.
print("Feature ranking:")

for f in range(5):
    feature_pos = indices[f]
    print("%d. feature %s (%f)" % (f + 1, features_order[feature_pos], importances[feature_pos]))