# Micro-Credit Defaulter Model Project:

In [None]:
#Imported required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

#Importing Boosting models
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier

#Importing error metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,roc_curve,auc
from sklearn.model_selection import GridSearchCV,cross_val_score

#Display all the columns of the dataframe
pd.pandas.set_option('display.max_columns',None)

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw dataset. The location is a hard-coded absolute Windows path
# (raw string, so the backslashes are taken literally); adjust it when running
# on another machine.
df=pd.read_csv(r"G:\Users\Lenovo\Desktop\Micro-Credit-Project\Micro Credit Project\Data file.csv")
# Bare expression: the notebook renders the loaded frame as the cell output.
df

We can see that label is the target column: for each loan transaction, we have to predict the likelihood that the customer repays the lent amount within five days of the loan's issuance. Label "1" denotes a repaid loan (a non-defaulter), while label "0" denotes a loan that has not been repaid (a defaulter). It is therefore a binary classification problem.

# Features information:

- label : Flag indicating whether the user paid back the credit amount within 5 days of issuing the loan{1:success, 0:failure}
- msisdn : mobile number of user
- aon : age on cellular network in days
- daily_decr30 : Daily amount spent from main account, averaged over last 30 days (in Indonesian Rupiah)
- daily_decr90 : Daily amount spent from main account, averaged over last 90 days (in Indonesian Rupiah)
- rental30 : Average main account balance over last 30 days
- rental90 : Average main account balance over last 90 days
- last_rech_date_ma : Number of days till last recharge of main account
- last_rech_date_da: Number of days till last recharge of data account
- last_rech_amt_ma : Amount of last recharge of main account (in Indonesian Rupiah)
- cnt_ma_rech30 : Number of times main account got recharged in last 30 days
- fr_ma_rech30 : Frequency of main account recharged in last 30 days
- sumamnt_ma_rech30 : Total amount of recharge in main account over last 30 days (in Indonesian Rupiah)
- medianamnt_ma_rech30 : Median of amount of recharges done in main account over last 30 days at user level (in Indonesian Rupiah)
- medianmarechprebal30 : Median of main account balance just before recharge in last 30 days at user level (in Indonesian Rupiah)
- cnt_ma_rech90 : Number of times main account got recharged in last 90 days
- fr_ma_rech90 : Frequency of main account recharged in last 90 days
- sumamnt_ma_rech90 : Total amount of recharge in main account over last 90 days (in Indonesian Rupiah)
- medianamnt_ma_rech90 : Median of amount of recharges done in main account over last 90 days at user level (in Indonesian Rupiah)
- medianmarechprebal90 : Median of main account balance just before recharge in last 90 days at user level (in Indonesian Rupiah)
- cnt_da_rech30 : Number of times data account got recharged in last 30 days
- fr_da_rech30: Frequency of data account recharged in last 30 days
- cnt_da_rech90 : Number of times data account got recharged in last 90 days
- fr_da_rech90 : Frequency of data account recharged in last 90 days
- cnt_loans30 : Number of loans taken by user in last 30 days
- amnt_loans30: Total amount of loans taken by user in last 30 days
- maxamnt_loans30 : maximum amount of loan taken by the user in last 30 days
- medianamnt_loans30 : Median of amounts of loan taken by the user in last 30 days
- cnt_loans90 : Number of loans taken by user in last 90 days
- amnt_loans90 : Total amount of loans taken by user in last 90 days
- maxamnt_loans90 : maximum amount of loan taken by the user in last 90 days
- medianamnt_loans90 : Median of amounts of loan taken by the user in last 90 days
- payback30 : Average payback time in days over last 30 days
- payback90 : Average payback time in days over last 90 days
- pcircle : telecom circle
- pdate : date

# Preprocessing and EDA

In [None]:
df.shape

Observation: There are 209593 rows and 37 columns present in the dataset.

In [None]:
df.columns

# Features:

- Variable : Definition -> comment
- label : Flag indicating whether the user paid back the credit amount within 5 days of issuing the loan{1:success, 0:failure}
- msisdn : mobile number of user
- aon : age on cellular network in days
- daily_decr30 : Daily amount spent from main account, averaged over last 30 days (in Indonesian Rupiah)
- daily_decr90 : Daily amount spent from main account, averaged over last 90 days (in Indonesian Rupiah)
- rental30 : Average main account balance over last 30 days -> Unsure of given definition
- rental90 : Average main account balance over last 90 days -> Unsure of given definition
- last_rech_date_ma : Number of days till last recharge of main account
- last_rech_date_da : Number of days till last recharge of data account
- last_rech_amt_ma : Amount of last recharge of main account (in Indonesian Rupiah)
- cnt_ma_rech30 : Number of times main account got recharged in last 30 days
- fr_ma_rech30 : Frequency of main account recharged in last 30 days -> Unsure of given definition
- sumamnt_ma_rech30 : Total amount of recharge in main account over last 30 days (in Indonesian Rupiah)
- medianamnt_ma_rech30 : Median of amount of recharges done in main account over last 30 days at user level (in Indonesian Rupiah)
- medianmarechprebal30 : Median of main account balance just before recharge in last 30 days at user level (in Indonesian Rupiah)
- cnt_ma_rech90 : Number of times main account got recharged in last 90 days
- fr_ma_rech90 : Frequency of main account recharged in last 90 days -> Unsure of given definition
- sumamnt_ma_rech90 : Total amount of recharge in main account over last 90 days (in Indonesian Rupiah)
- medianamnt_ma_rech90 : Median of amount of recharges done in main account over last 90 days at user level (in Indonesian Rupiah)
- medianmarechprebal90 : Median of main account balance just before recharge in last 90 days at user level (in Indonesian Rupiah)
- cnt_da_rech30 : Number of times data account got recharged in last 30 days
- fr_da_rech30 : Frequency of data account recharged in last 30 days
- cnt_da_rech90 : Number of times data account got recharged in last 90 days
- fr_da_rech90 : Frequency of data account recharged in last 90 days
- cnt_loans30 : Number of loans taken by user in last 30 days
- amnt_loans30 : Total amount of loans taken by user in last 30 days
- maxamnt_loans30 : maximum amount of loan taken by the user in last 30 days -> There are only two options: 5 & 10 Rs., for which the user needs to pay back 6 & 12 Rs. respectively
- medianamnt_loans30 : Median of amounts of loan taken by the user in last 30 days
- cnt_loans90 : Number of loans taken by user in last 90 days
- amnt_loans90 : Total amount of loans taken by user in last 90 days
- maxamnt_loans90 : maximum amount of loan taken by the user in last 90 days
- medianamnt_loans90 : Median of amounts of loan taken by the user in last 90 days
- payback30 : Average payback time in days over last 30 days
- payback90 : Average payback time in days over last 90 days
- pcircle : telecom circle
- pdate : date

In [None]:
df.dtypes

#### Observation:

- There are two types of data present in the dataset categorical and numerical.

- msisdn, pcircle and pdate are categorical (object) columns; all other columns are numerical.

In [None]:
df.info()

# Checking missing values

In [None]:
df.isnull().values.any()

In [None]:
df.isnull().sum()

There are no missing values present in the dataset. We can also confirm this via visualization with the help of heatmap.

In [None]:
plt.figure(figsize=(22,5))
sns.heatmap(df.isnull(),yticklabels=False,cbar=False)

observation: We confirm that there are no missing values present in the dataset through heatmap.

# Exploring categorical columns

In [None]:
# Walk the dtype table and report on every object-typed column: first its
# distinct values, then how often each value occurs.
for name, kind in df.dtypes.items():
    if kind != object:
        continue
    values = df[name]
    print(f"{name} : {values.unique()}")
    print(values.value_counts())
    print('*******************************************************************************************************')
    print('\n')

Observation:

In the column msisdn the users' mobile numbers recur, so to analyse it and check its correlation we will have to encode it as a numeric column.

In the column pcircle there is only one unique value 'UPW' is present so we will be dropping this column.

In [None]:
# Tally how many rows carry each distinct pdate value. Counter is imported
# here and is used again further down when the class counts are printed.
from collections import Counter
# Bare expression: the notebook renders the resulting Counter.
Counter(df['pdate'])

Observation:

1. From the pdate column we will extract the day and month into separate columns.

2. As there is only one unique year value '2016' present we will be ignoring it.

# Feature extraction

In [None]:
df['pdate'] = pd.to_datetime(df['pdate'])
df['pdate']

In [None]:
df['pdate'].dt.day

In [None]:
df['day'] = df['pdate'].dt.day

In [None]:
df['pdate'].dt.month

In [None]:
df['month'] = df['pdate'].dt.month

In [None]:
df.head()

We have extracted day and month from the pdate column

# Encoding categorical column

In [None]:
# Replace msisdn with integer codes: the values are cast to str first, then
# LabelEncoder maps every distinct string to one integer.
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['msisdn']=le.fit_transform(df['msisdn'].astype(str))

In [None]:
df.head()

# Summary statistics

In [None]:
df.describe()

Observation:

1. Maximum standard deviation is observed in aon column.


2. In the columns aon, daily_decr30, daily_decr90, rental30, rental90, last_rech_date_ma, last_rech_date_da, maxamnt_loans30, cnt_loans90, amnt_loans90 mean is considerably greater than median so the columns are positively skewed.


3. In the columns label, month median is greater than mean so the columns are negatively skewed.


4. In the columns aon, daily_decr30, daily_decr90, rental30, rental90, last_rech_date_ma, last_rech_date_da, maxamnt_loans30, cnt_loans90, payback30, payback90 there is a huge difference between the 75th percentile and the maximum, so outliers are present here.


5. Maximum aon (age on cellular network in days) observed is 999860.755168 and the minimum is -48.

# To check the correlation

In [None]:
# Pairwise correlation between the numeric columns.
# pcircle (text) and pdate (datetime) are still part of the frame at this
# point. Recent pandas releases no longer drop such columns silently inside
# corr(), so restrict the frame to numeric dtypes explicitly; this works on
# old and new pandas alike.
df_cor=df.select_dtypes(include='number').corr()
# Bare expression: the notebook renders the correlation table.
df_cor

With the aid of a heatmap, we will visualise it for easier understanding.

In [None]:
# Draw the correlation matrix as an annotated heatmap; each cell is formatted
# as a whole percentage.
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(df_cor, annot=True, fmt='.0%', cmap='cool', ax=ax)
plt.show()

Observation:

1. The columns amnt_loans90, amnt_loans30, cnt_loans20, sumamnt_ma_rech90, cnt_ma_rech90, cnt_ma_rech30, sumamnt_ma_rech30 are highly positively correlated with label column.


2. We have observed multicollinearity in between columns so we will be using PCA(Principal Component Analysis)


3. No correlation with the label has been observed for the Unnamed: 0, msisdn, last_rech_date_ma and last_rech_date_da columns, so we will be dropping these columns.


In [None]:
#checking the correlation with target variable 'label'

# Only numeric columns can be correlated with the label. The frame still
# holds pcircle (text) and pdate (datetime) here, and recent pandas raises on
# them instead of skipping them, so select the numeric features explicitly.
numeric_features = df.select_dtypes(include='number').drop(columns='label')

plt.figure(figsize=(25,10))
numeric_features.corrwith(df['label']).plot(kind='bar',grid=True)
plt.xticks(rotation='vertical')
plt.title("correaltion with target variable label")

Observation:

Target variable label is highly positively correlated with cnt_ma_rech30, cnt_ma_rech90 and negatively correlated with aon, medianmarechprebal30, fr_da_rech90.

# Dropping unnecessary columns

In [None]:
# Remove these eight columns from df in place.
columns_to_drop = [
    'Unnamed: 0',
    'msisdn',
    'pdate',
    'pcircle',
    'last_rech_date_ma',
    'last_rech_date_da',
    'fr_ma_rech30',
    'fr_da_rech30',
]
df.drop(columns_to_drop, axis=1, inplace=True)


In [None]:
df.head()

# Data Visualization

Our target variable is label

1 - Success(Non Defaulter)

0 - Failure(Defaulter)

### Univariate Analysis

In [None]:
# Class balance of the target: bar chart of the label counts, then the raw
# counts as the cell output.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="label", data=df, ax=ax)
ax.set_title("Category of label")
ax.set_xlabel(' 0 = Failure(Defaulter) and 1 = Success(Non Defaulter)')
ax.set_ylabel("count")
plt.show()

df['label'].value_counts()

Observation:

1. We observe 183431 number of Non defaulters where as 26162 number of defaulters.


2. We observe that this is a very imbalanced data set.

In [None]:
# Number of rows per month: bar chart first, then the raw counts as the cell
# output.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="month", data=df, ax=ax)
ax.set_title("Countplot of month")
ax.set_xlabel('month')
ax.set_ylabel("count")
plt.show()

df['month'].value_counts()

Observation:

   Maximum(85765) number of users has taken credit on 7th month.

In [None]:
# Number of rows per day of the month: bar chart first, then the raw counts
# as the cell output.
fig, ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="day", data=df, ax=ax)
ax.set_title("Countplot of day")
ax.set_xlabel('day')
ax.set_ylabel("count")
plt.show()

df['day'].value_counts()

Observation:

Maximum(8092) number of users have taken credit on 11th day of the month. 

In [None]:
# Violin plot of column last_rech_amt_ma

sns.set(style='whitegrid')
# Pass the column as x= by keyword: the meaning of the first positional
# argument of seaborn's categorical plots differs between seaborn versions,
# so a bare positional Series does not reliably give the horizontal violin.
sns.violinplot(x=df['last_rech_amt_ma'])
plt.show()

# Bare expression: frequency of every distinct last recharge amount.
df['last_rech_amt_ma'].value_counts()

Observation:

Maximum(56297) number of customers last_rech_amt_ma is 1539.

In [None]:
# Scatter plot of cnt_loans30 against the row index, followed by the
# frequency of each distinct value.
loans_30 = df['cnt_loans30']
plt.scatter(loans_30.index, loans_30)
plt.show()

loans_30.value_counts()

Observation:

      Maximum(83432) number of customers cnt_loans30 is 1.

In [None]:
# Scatter plot of amnt_loans90 against the row index, followed by the
# frequency of each distinct value.
loan_amount_90 = df['amnt_loans90']
plt.scatter(loan_amount_90.index, loan_amount_90)
plt.show()

loan_amount_90.value_counts()

Observation:

     Maximum(69131) number of customers took the amnt_loans90 of 6.

In [None]:
# Scatter plot of sumamnt_ma_rech30 against the row index, followed by the
# frequency of each distinct value.
recharge_sum_30 = df['sumamnt_ma_rech30']
plt.scatter(recharge_sum_30.index, recharge_sum_30)
plt.show()

recharge_sum_30.value_counts()

Observation:

    Maximum(27979) customers sumamnt_ma_rech30 is 0.

In [None]:
# Scatter plot of payback30 against the row index, followed by the frequency
# of each distinct value.
payback_30 = df['payback30']
plt.scatter(payback_30.index, payback_30)
plt.show()

payback_30.value_counts()

Observation:

Maximum(106712) customers payback30 is 0.

In [None]:
#Creating histogram of every column

# Plotting only reads the column, so draw straight from df. The original
# made a full copy of the frame on every iteration, which costs time and
# memory without changing any plot.
for col in df.describe().columns:
    df[col].hist(bins=25)
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.title(col)
    plt.show()

## Bivariate Analysis

In [None]:
#Scatter plot between all feature variables and target variable

# Plotting only reads the columns, so draw straight from df. The original
# made a full copy of the frame on every iteration, which costs time and
# memory without changing any plot.
for col in df.describe().columns:
    plt.scatter(df[col],df['label'])
    plt.xlabel(col)
    plt.ylabel('label')
    plt.show()

In [None]:
#Bar plot of label vs aon
# catplot is the current name of the deprecated factorplot, and height the
# current name of its size argument. It is a figure-level function that
# builds its own figure, so no plt.figure() call precedes it (that call only
# left an empty extra figure behind).
sns.catplot(x='label',y='aon',data=df,kind='bar',height=5,palette='muted',aspect=1)
plt.title('label vs aon')
plt.xticks(rotation='vertical')
plt.ylabel('aon')
plt.show()


# Label counts for every distinct aon value.
print(df.groupby('aon')['label'].value_counts())

Observation:

If the aon is high the number of defaulters are more.

In [None]:
#Bar plot of label vs cnt_ma_rech30
# catplot is the current name of the deprecated factorplot, and height the
# current name of its size argument. It is a figure-level function that
# builds its own figure, so no plt.figure() call precedes it (that call only
# left an empty extra figure behind).
sns.catplot(x='label',y='cnt_ma_rech30',data=df,kind='bar',height=5,palette='muted',aspect=1)
plt.title('label vs cnt_ma_rech30')
plt.xticks(rotation='vertical')
plt.ylabel('cnt_ma_rech30')
plt.show()

# Label counts for every distinct cnt_ma_rech30 value.
print(df.groupby('cnt_ma_rech30')['label'].value_counts())

Observation:

If Number of times main account got recharged in last 30 days(cnt_ma_rech30) is more then there is less chance of default.

In [None]:
#Bar plot of label vs sumamnt_ma_rech30
# catplot is the current name of the deprecated factorplot, and height the
# current name of its size argument. It is a figure-level function that
# builds its own figure, so no plt.figure() call precedes it (that call only
# left an empty extra figure behind).
sns.catplot(x='label',y='sumamnt_ma_rech30',data=df,kind='bar',height=5,palette='muted',aspect=1)
plt.title('label vs sumamnt_ma_rech30')
plt.xticks(rotation='vertical')
plt.ylabel('sumamnt_ma_rech30')
plt.show()

# Label counts for every distinct sumamnt_ma_rech30 value.
print(df.groupby('sumamnt_ma_rech30')['label'].value_counts())

Observation:

If the total amount of recharge in the main account over the last 30 days (sumamnt_ma_rech30) is higher, there is less chance of default.

In [None]:
#Bar plot of label vs cnt_ma_rech90
# catplot is the current name of the deprecated factorplot, and height the
# current name of its size argument. It is a figure-level function that
# builds its own figure, so no plt.figure() call precedes it (that call only
# left an empty extra figure behind).
sns.catplot(x='label',y='cnt_ma_rech90',data=df,kind='bar',height=5,palette='muted',aspect=1)
plt.title('label vs cnt_ma_rech90')
plt.xticks(rotation='vertical')
plt.ylabel('cnt_ma_rech90')
plt.show()

# Label counts for every distinct cnt_ma_rech90 value.
print(df.groupby('cnt_ma_rech90')['label'].value_counts())

Observation:

If Number of times main account got recharged in last 90 days(cnt_ma_rech90) is more then there is less chance of default.

In [None]:
#Bar plot of label vs sumamnt_ma_rech90
# catplot is the current name of the deprecated factorplot, and height the
# current name of its size argument. It is a figure-level function that
# builds its own figure, so no plt.figure() call precedes it (that call only
# left an empty extra figure behind).
sns.catplot(x='label',y='sumamnt_ma_rech90',data=df,kind='bar',height=5,palette='muted',aspect=1)
plt.title('label vs sumamnt_ma_rech90')
plt.xticks(rotation='vertical')
plt.ylabel('sumamnt_ma_rech90')
plt.show()

# Label counts for every distinct sumamnt_ma_rech90 value.
print(df.groupby('sumamnt_ma_rech90')['label'].value_counts())


Observation:

If the total amount of recharge in the main account over the last 90 days (sumamnt_ma_rech90) is higher, the chances of default are lower.

In [None]:
#Bar plot of label vs amnt_loans90
# catplot is the current name of the deprecated factorplot, and height the
# current name of its size argument. It is a figure-level function that
# builds its own figure, so no plt.figure() call precedes it (that call only
# left an empty extra figure behind).
sns.catplot(x='label',y='amnt_loans90',data=df,kind='bar',height=5,palette='muted',aspect=1)
plt.title('label vs amnt_loans90')
plt.xticks(rotation='vertical')
plt.ylabel('amnt_loans90')
plt.show()

# Label counts for every distinct amnt_loans90 value.
print(df.groupby('amnt_loans90')['label'].value_counts())

Observation:

If Total amount of loans taken by user in last 90 days(amnt_loans90) is high there is less chance of default.

In [None]:
#label vs medianamnt_ma_rech90

# Box plot of medianamnt_ma_rech90 per label class, with the overall column
# mean drawn as a dashed red reference line.
overall_mean = df['medianamnt_ma_rech90'].mean()
fig, ax = plt.subplots(figsize=(18, 8))
sns.boxplot(y='medianamnt_ma_rech90', x='label', data=df, palette="rainbow", ax=ax)
ax.axhline(overall_mean, color='r', linestyle='dashed', linewidth=2)
ax.set_title("medianamnt_ma_rech90 vs label", fontsize=20)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
#Exploring label vs daily_decr30 via FacetGrid

# One panel per label value, each showing the distribution of daily_decr30.
grid = sns.FacetGrid(df, col='label')
grid.map(sns.distplot, "daily_decr30")
plt.show()

In [None]:
#Exploring label vs daily_decr90 via FacetGrid

# One panel per label value, each showing the distribution of daily_decr90.
grid = sns.FacetGrid(df, col='label')
grid.map(sns.distplot, "daily_decr90")
plt.show()

In [None]:
#Violinplot of label vs cnt_ma_rech30

# Set the default figure size, then draw one violin of cnt_ma_rech30 per
# label class, addressing the columns of df by name.
sns.set(rc={'figure.figsize': (9, 9)})
sns.violinplot(x='label', y='cnt_ma_rech30', data=df)

Observation:

      If Number of times main account got recharged in last 30 days(cnt_ma_rech30) is more then there is less chance of default.

# Multivariate Analysis

In [None]:
#checking month and cnt_ma_rech90 with respect to label

# catplot is the current name of the deprecated factorplot, and height the
# current name of its size argument. The argument-less plt.xticks() call was
# dropped: without arguments it only reads the ticks and changes nothing.
sns.catplot(x='month',y='cnt_ma_rech90',hue='label',data=df,kind='violin',height=5,palette='muted',aspect=2)
plt.title('label according to month and cnt_ma_rech90')
plt.ylabel('cnt_ma_rech90')
plt.show()

In [None]:
#scatter plot between cnt_ma_rech_30 and cnt_ma_rech90 with respect to label
# lmplot is figure-level and builds its own figure, so the preceding
# plt.figure(figsize=(14,14)) only produced an empty extra figure and has
# been removed. Use lmplot's height/aspect arguments to resize the plot.
sns.lmplot(x='cnt_ma_rech30',y='cnt_ma_rech90',fit_reg=False,data=df,hue='label',markers=['+','x'])
plt.xlabel('cnt_ma_rech30')
plt.title('correaltion between cnt_ma_rech30 and cnt_ma_rech90')
plt.ylabel('cnt_ma_rech90')
plt.show()

Observation:

     As cnt_ma_rech30 and cnt_ma_rech90 are increasing the number of non defaulters are also increasing.

In [None]:
#scatter plot between cnt_loans30 and amnt_loans30 with respect to label
# The y axis now really is amnt_loans30. The original plotted cnt_loans30
# against itself and labelled the axis with a column name that does not
# exist in the data.
# lmplot is figure-level and builds its own figure, so no plt.figure() call
# precedes it (that call only left an empty extra figure behind).
sns.lmplot(x='cnt_loans30',y='amnt_loans30',fit_reg=False,data=df,hue='label',markers=['+','x'])
plt.xlabel('cnt_loans30')
plt.title('correlation between cnt_loans30 and amnt_loans30')
plt.ylabel('amnt_loans30')
plt.show()

Observation:

As cnt_loans30 and amnt_loans30 increase, the number of non-defaulters also increases.

In [None]:
#scatter plot between sumamnt_ma_rech90 and amnt_loans90 with respect to label
# lmplot is figure-level and builds its own figure, so the preceding
# plt.figure(figsize=(14,14)) only produced an empty extra figure and has
# been removed. Use lmplot's height/aspect arguments to resize the plot.
sns.lmplot(x='sumamnt_ma_rech90',y='amnt_loans90',fit_reg=False,data=df,hue='label',markers=['+','x'])
plt.xlabel('sumamnt_ma_rech90')
plt.title('correaltion between sumamnt_ma_rech90 and amnt_loans90')
plt.ylabel('amnt_loans90')
plt.show()

Observation:

As sumamnt_ma_rech90 and amnt_loans90 increase, the number of non-defaulters also increases.

# PreProcessing Pipeline

### Checking skewness

In [None]:
# Draw the distribution of every column reported by describe(), one figure
# per column, to eyeball the skew.
described_columns = df.describe().columns
for name in described_columns:
    sns.distplot(df[name], color='r')
    plt.show()

In [None]:
df.skew()

# Handling skewness and outliers through Winsorization

In [None]:
# One horizontal box plot per column, placed on a 17 x 2 grid of subplots in
# a single figure.
plt.figure(figsize=(18, 14))

for position, name in enumerate(df.columns, start=1):
    plt.subplot(17, 2, position)
    plt.boxplot(df[name], vert=False)
    plt.title(name)
plt.show()

In [None]:
# Names of the columns that are winsorized further down (the label is not in
# the list).
# NOTE(review): the list also contains the calendar columns day and month;
# capping their tails rewrites real dates - confirm this is intended.
features = ['aon','daily_decr30','daily_decr90','rental30','rental90','last_rech_amt_ma','cnt_ma_rech30','sumamnt_ma_rech30','medianamnt_ma_rech30','medianmarechprebal30','cnt_ma_rech90','fr_ma_rech90','sumamnt_ma_rech90','medianamnt_ma_rech90','medianmarechprebal90','cnt_da_rech30','cnt_da_rech90','fr_da_rech90','cnt_loans30','amnt_loans30','maxamnt_loans30','medianamnt_loans30','cnt_loans90','amnt_loans90','maxamnt_loans90','medianamnt_loans90','payback30','payback90','day','month']

In [None]:
from scipy.stats import zscore

# Keep only the rows whose absolute z-score stays below 5 in every column,
# printing the frame shape before and after the filter.
z_score = np.abs(zscore(df))
within_limits = (z_score < 5).all(axis=1)
print(df.shape)
df_final = df.loc[within_limits]
print(df_final.shape)

In [None]:
df=df_final

In [None]:
df_cap = df.copy()

In [None]:
from scipy import stats

In [None]:
def percentile_capping(df, cols, from_low_end, from_high_end):
    """Winsorize the given columns of ``df`` in place.

    For every column, the lowest ``from_low_end`` fraction of the values is
    raised to the smallest value that is kept, and the highest
    ``from_high_end`` fraction is lowered to the largest value that is kept.

    Args:
        df: DataFrame to modify; the listed columns are overwritten.
        cols: Iterable of column names to cap.
        from_low_end: Fraction (0-1) of observations capped at the low end.
        from_high_end: Fraction (0-1) of observations capped at the high end.

    Returns:
        None. ``df`` itself is modified.
    """
    limits = (from_low_end, from_high_end)
    for col in cols:
        # Winsorize a private copy and assign the result back explicitly.
        # Relying on winsorize(..., inplace=True) to write through a Series
        # only works when the array handed to scipy happens to share memory
        # with the frame, which pandas does not guarantee.
        capped = stats.mstats.winsorize(df[col].to_numpy(copy=True), limits=limits)
        # winsorize returns a masked array; store its plain data.
        df[col] = np.asarray(capped)

In [None]:
percentile_capping(df_cap, features, 0.01, 0.10)

In [None]:
# For every capped feature, compare df (before) with df_cap (after): two
# distribution plots with the skew shown in the legend, then two box plots.
for col in features:
    plt.figure(figsize=(16,4))

    plt.subplot(141)
    sns.distplot(df[col], label="skew: " + str(np.round(df[col].skew(),2)))
    plt.title('Before')
    plt.legend()

    plt.subplot(142)
    sns.distplot(df_cap[col], label="skew: " + str(np.round(df_cap[col].skew(),2)))
    plt.title('After')
    plt.legend()

    plt.subplot(143)
    # x= is passed by keyword: the meaning of the first positional argument
    # of sns.boxplot differs between seaborn versions.
    sns.boxplot(x=df[col])
    plt.title('Before')

    plt.subplot(144)
    sns.boxplot(x=df_cap[col])
    plt.title('After')
    plt.tight_layout()
    plt.show()

In [None]:
df_cap.skew()

In [None]:
df_cap.shape

# Model Training

In [None]:
df_x=df_cap.drop(columns=['label'],axis=1)

In [None]:
y=df_cap['label']

In [None]:
#Scaling input variables

from sklearn.preprocessing import StandardScaler

# Standardize every feature column, then wrap the scaled array back into a
# DataFrame carrying the original column names.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook.
sc = StandardScaler()
sc.fit(df_x)
scaled_values = sc.transform(df_x)
x = pd.DataFrame(scaled_values, columns=df_x.columns)

# Applying PCA

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA that keeps as many components as there are feature columns, so
# the variance of every component can be inspected.
n_features = x.shape[1]
covar_matrix = PCA(n_components=n_features)
covar_matrix.fit(x)

In [None]:
# Plot the explained variance of each principal component, with a dashed
# reference line at 1.
# The former plt.style.context('seaborn-whitegrid') call was removed: it only
# created a context manager that was never entered, so no style was applied.
plt.ylabel('Eigenvalues')
plt.xlabel('Number of features')
plt.title('PCA Eigenvalues')
plt.ylim(0,max(covar_matrix.explained_variance_))
plt.axhline(y=1, color='r', linestyle='--')
plt.plot(covar_matrix.explained_variance_)
plt.show()

In [None]:
variance = covar_matrix.explained_variance_ratio_
# Cumulative explained variance in percent. Accumulate the exact ratios and
# round once at the end; rounding each ratio before summing lets the rounding
# errors pile up along the curve.
var=np.round(np.cumsum(variance)*100, decimals=1)

# The former plt.style.context('seaborn-whitegrid') call was removed: it only
# created a context manager that was never entered, so no style was applied.
plt.ylabel('% Variance Explained')
plt.xlabel('Number of Features')
plt.title('PCA Variance Explained')
plt.ylim(min(var),100.5)
# Dashed reference line at 80 % explained variance.
plt.axhline(y=80, color='r', linestyle='--')
plt.plot(var)
plt.show()

In [None]:
# Project the scaled features onto the first 7 principal components (the
# count is hard-coded) and make that projection the new feature matrix x.
# From here on x is a NumPy array, no longer a DataFrame.
pca=PCA(n_components=7)
xpca=pca.fit_transform(x)
x=xpca

In [None]:
pd.DataFrame(data=x)

In [None]:
#splitting the data into training and testing data

from sklearn.model_selection import train_test_split

# Hold out 30 % of the rows for testing; the split is stratified on y and
# seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.30,
    random_state=9,
    stratify=y,
)

# Handling imbalanced data through Smote technique

In [None]:
from imblearn.combine import SMOTETomek

In [None]:
# Resample the training split only; the test split keeps its original class
# balance.
# sampling_strategy must be passed by keyword: current imbalanced-learn
# releases reject it as a positional argument. random_state makes the
# resampling repeatable, and the object is no longer named os, which shadowed
# the name of the standard-library module.
resampler=SMOTETomek(sampling_strategy=0.75,random_state=9)
x_train_ns,y_train_ns=resampler.fit_resample(x_train,y_train)
print("The number of classes before fit{}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

# Building Machine Learning Model

In [None]:
# Instantiate the candidate classifiers. Only the decision tree is given a
# random_state; KNN uses 6 neighbours; the others use library defaults.
KNN=KNeighborsClassifier(n_neighbors=6)
LR=LogisticRegression()
DT=DecisionTreeClassifier(random_state=6)
XGB=XGBClassifier()
RF=RandomForestClassifier()
ADA=AdaBoostClassifier()
GNB=GaussianNB()
GBC=GradientBoostingClassifier()
BC=BaggingClassifier()
ETC=ExtraTreesClassifier()

In [None]:
# (display name, estimator) pairs, in the order in which they are evaluated.
models = [
    ('KNeighborsClassifier', KNN),
    ('LogisticRegression', LR),
    ('DecisionTreeClassifier', DT),
    ('XGBClassifier', XGB),
    ('RandomForestClassifier', RF),
    ('AdaBoostClassifier', ADA),
    ('GaussianNB', GNB),
    ('GradientBoostingClassifier', GBC),
    ('BaggingClassifier', BC),
    ('ExtraTreesClassifier', ETC),
]

In [None]:
# Fit every candidate on the resampled training data and report, on the test
# split: accuracy, ROC AUC, classification report and confusion matrix, plus
# the mean 10-fold cross-validated accuracy on x, y. The four lists collect
# one entry per model for the summary table.
Model= []
score= []
cvs=[]
rocscore=[]
for name,model in models:
    print('******************************************',name,'********************************************************')
    print('\n')
    Model.append(name)
    model.fit(x_train_ns,y_train_ns)
    print(model)
    pre=model.predict(x_test)
    # The ROC curve needs a continuous score. Hard 0/1 predictions yield a
    # single operating point, so the curve and its area would only restate
    # one threshold. Column 1 of predict_proba is the probability of class 1.
    proba=model.predict_proba(x_test)[:,1]
    print('\n')
    AS=accuracy_score(y_test,pre)
    print('Accuracy_score = ',AS)
    score.append(AS*100)
    print('\n')
    # Stored as cv_mean rather than sc, so the StandardScaler bound to sc
    # earlier in the notebook is not overwritten.
    cv_mean= cross_val_score(model, x, y, cv=10, scoring='accuracy').mean()
    print('Cross_Val_Score = ',cv_mean)
    cvs.append(cv_mean*100)
    print('\n')
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,proba)
    roc_auc= auc(false_positive_rate, true_positive_rate)
    print ('roc_auc_score = ',roc_auc)
    rocscore.append(roc_auc*100)
    print('\n')
    print('classification_report\n',classification_report(y_test,pre))
    print('\n')
    cm=confusion_matrix(y_test,pre)
    print(cm)
    print('\n')
    # One tall figure per model: confusion-matrix heatmap in the first slot,
    # ROC curve in the second.
    plt.figure(figsize=(10,40))
    plt.subplot(911)
    plt.title(name)
    # Draw the heatmap without print(): printing it only emitted the repr of
    # the returned Axes object.
    sns.heatmap(cm,annot=True)
    plt.subplot(912)
    plt.title(name)
    plt.plot(false_positive_rate, true_positive_rate, label='AUC = %0.2f'% roc_auc)
    plt.plot([0,1],[0,1],'r--')
    plt.legend(loc='lower right')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    print('\n\n')

In [None]:
result = pd.DataFrame({'Model': Model, 'Accuracy_score': score,'Cross_val_score': cvs,'Roc_auc_curve':rocscore})
result

XGBClassifier has the best ROC AUC score together with a high recall, meaning it rarely flags customers who repay (non-defaulters) as defaulters, which is why we chose it as our final model.

# Using GridSearchCV to find the best parameters in XGBClassifier

In [None]:
# Hyper-parameter grid for the XGBoost classifier.
parameters={'n_estimators': [100, 250, 500], 'max_depth': [6,9,12], 'subsample':[0.9, 1.0]}
XGB=XGBClassifier()

# Search on the resampled training split only. Fitting on the full x, y would
# let the tuned estimator see the rows of x_test, which are used for
# evaluation afterwards.
# NOTE(review): the folds are cut from already-resampled data, so a synthetic
# sample may land in a different fold than the rows it was derived from;
# resampling inside a pipeline would avoid that.
clf=GridSearchCV(XGB, parameters, n_jobs=4)
clf.fit(x_train_ns,y_train_ns)
print(clf.best_params_)

In [None]:
#XGBClassifier with best parameters

# NOTE(review): the parameter values are hard-coded, and gamma=1 is not part
# of the grid searched above - confirm where it comes from.
XGB=XGBClassifier(max_depth=6,n_estimators=100,subsample=1.0,gamma=1)
XGB.fit(x_train_ns,y_train_ns)
# Print the training accuracy. The original computed it and threw it away
# (it was not the last expression in the cell, so nothing was shown).
print(XGB.score(x_train_ns,y_train_ns))
# Evaluate on the held-out test split.
predXGB=XGB.predict(x_test)
print(accuracy_score(y_test,predXGB))
print(confusion_matrix(y_test,predXGB))
print(classification_report(y_test,predXGB))

In [None]:
#cross validate XGBClassifier accuracy

# 5-fold cross-validated accuracy of the XGB estimator on x, y: print the
# per-fold scores, then their mean and standard deviation.
score = cross_val_score(estimator=XGB, X=x, y=y, scoring='accuracy', cv=5)

print(score)
for caption, value in (('Mean:', score.mean()), ('Std:', score.std())):
    print(caption, value)

In [None]:
#plotting the roc auc curve

from sklearn import metrics
# Score the final XGB model. The original used clf, the grid-search object,
# which in this notebook was fitted on data that includes the test rows.
y_pred_proba = XGB.predict_proba(x_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
# Stored as roc_auc: assigning to auc would replace the auc function imported
# from sklearn.metrics and break a re-run of the model comparison cell.
roc_auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(roc_auc))
plt.legend(loc=4)
plt.show()

In [None]:
# Distribution of the difference between the true test labels and the
# predicted probabilities.
residuals = y_test - y_pred_proba
sns.distplot(residuals)
plt.show()

In [None]:
#Scatter plot between test data and prediction

# True test labels on the x axis against predicted probabilities on the y
# axis, drawn half-transparent.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred_proba, alpha=0.5)
ax.set_xlabel("Y_test")
ax.set_ylabel("Y_pred_proba")
ax.set_title("Scatter plot between test data and predicted data", fontsize=15)
plt.show()

In [None]:
#saving our model

# Serialize the fitted XGB estimator with joblib.
# NOTE(review): the file carries a .csv suffix although joblib does not write
# CSV; a suffix such as .joblib or .pkl would be less misleading, but the
# load call further down has to be renamed together with it.
import joblib
joblib.dump(XGB,'XGB_MicroCreditDefaulter.csv')

In [None]:
model=joblib.load('XGB_MicroCreditDefaulter.csv')

In [None]:
#Testing our model
import sys

# Raise NumPy's print threshold so the prediction array is printed in full
# instead of being summarized. The setting stays in effect for the rest of
# the session.
np.set_printoptions(threshold=sys.maxsize)
nums = model.predict(x_test)
print(nums)

# Conclusion

#### Using several classification models, we predicted, for each loan transaction, the likelihood that the customer repays the lent amount within 5 days of the loan's issuance. Although both the random forest and gradient boosting models performed well, the best score of 0.90 was obtained with the XGBClassifier using parameters tuned via GridSearchCV.