# Research project transformed to Python based from R

Methodology of RP ML model based on R
1. Data input from "application_train.xlsx"
2. Removal of keys (id)
3. Removal of columns with > 80% missing values
?4. PCA on AMT_REQ_CREDIT_BUREAU
5. Train Test split
6. Imputation of missing values using mean of each splitted group
7. # Variable selection on key columns (can do PCA reduce further number of features)
8. Define Target variable for training and testing set
9. Use balanced or imbalanced datasets
10. Fit model with training data
11. Predict with testing data
12. Evaluate PPR and FPR using Confusion Rate
13. Create ROC Model

10-subs
a. Logistic Regressor
b. Random Tree
c. XGBoost
d. Neural Network
e. ! SMOTE for imbalanced datasets

In [2]:
# import relevant libraries
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv("application_train.csv")

In [4]:
data = data.drop(["SK_ID_CURR"], axis = 1) # removed ID column

In [5]:
# Check for missing values

# Number of rows
nrows = len(data.index)
# Number of permittable missing values
import math
miss_nums = math.floor(0.5*nrows)
_ = data.isnull().sum()
missed_cols = [i for i in _.keys() if _[i] > miss_nums]
missed_cols # 41 for 50% above missing values

['OWN_CAR_AGE',
 'EXT_SOURCE_1',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE']

In [6]:
reduced_data = data.drop(missed_cols, axis=1)

In [7]:
reduced_data.select_dtypes(include=['object']) # 16 columns with object

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE,WEEKDAY_APPR_PROCESS_START,ORGANIZATION_TYPE,EMERGENCYSTATE_MODE
0,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,WEDNESDAY,Business Entity Type 3,No
1,Cash loans,F,N,N,Family,State servant,Higher education,Married,House / apartment,Core staff,MONDAY,School,No
2,Revolving loans,M,Y,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Laborers,MONDAY,Government,
3,Cash loans,F,N,Y,Unaccompanied,Working,Secondary / secondary special,Civil marriage,House / apartment,Laborers,WEDNESDAY,Business Entity Type 3,
4,Cash loans,M,N,Y,Unaccompanied,Working,Secondary / secondary special,Single / not married,House / apartment,Core staff,THURSDAY,Religion,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,Cash loans,M,N,N,Unaccompanied,Working,Secondary / secondary special,Separated,With parents,Sales staff,THURSDAY,Services,No
307507,Cash loans,F,N,Y,Unaccompanied,Pensioner,Secondary / secondary special,Widow,House / apartment,,MONDAY,XNA,No
307508,Cash loans,F,N,Y,Unaccompanied,Working,Higher education,Separated,House / apartment,Managers,THURSDAY,School,No
307509,Cash loans,F,N,Y,Unaccompanied,Commercial associate,Secondary / secondary special,Married,House / apartment,Laborers,WEDNESDAY,Business Entity Type 1,No


In [8]:
object_cols = reduced_data.select_dtypes(include=['object'])
object_cols.nunique()

NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
EMERGENCYSTATE_MODE            2
dtype: int64

In [9]:
object_cols = reduced_data.select_dtypes(include=['object'])
_ = object_cols.nunique() == 2
bin_cols = [i for i in _.keys() if _[i]]
bin_cols

['NAME_CONTRACT_TYPE',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'EMERGENCYSTATE_MODE']

In [10]:
# label encoder for binary objects
reduced_data[bin_cols]

Unnamed: 0,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,EMERGENCYSTATE_MODE
0,Cash loans,N,Y,No
1,Cash loans,N,N,No
2,Revolving loans,Y,Y,
3,Cash loans,N,Y,
4,Cash loans,N,Y,
...,...,...,...,...
307506,Cash loans,N,N,No
307507,Cash loans,N,Y,No
307508,Cash loans,N,Y,No
307509,Cash loans,N,Y,No


In [11]:
# Import label encoder 
from sklearn import preprocessing
# label_encoder object knows how to understand word labels. 
label_encoder = preprocessing.LabelEncoder()
# Encode labels in column bin_cols.
for i in bin_cols:
    reduced_data[i]= label_encoder.fit_transform(data[i])

In [12]:
object_cols = reduced_data.select_dtypes(include=['object'])
_ = object_cols.nunique() != 2
ohe_cols = [i for i in _.keys() if _[i]]
ohe_cols

['CODE_GENDER',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE']

In [13]:
# importing one hot encoder 
from sklearn.preprocessing import OneHotEncoder
# creating one hot encoder object 
onehotencoder = OneHotEncoder()

In [14]:
ohe = OneHotEncoder(categories='auto')
feature_arr = ohe.fit_transform(reduced_data[ohe_cols]).toarray()
feature_labels = ohe.categories_

In [15]:
feature_labels=[]
import typing
def unlist(ls: list):
    for i in ls:
        if type(i) != list:
            feature_labels.append(i)
        else:
            unlist(i)
    return None

unlist([list(i) for i in ohe.categories_])
len(feature_labels)

120

In [16]:
ohe_cols_dt ={}
for i in ohe_cols:
    ohe_cols_dt[i] = len(reduced_data[i].unique())
ohe_cols_dt

{'CODE_GENDER': 3,
 'NAME_TYPE_SUITE': 8,
 'NAME_INCOME_TYPE': 8,
 'NAME_EDUCATION_TYPE': 5,
 'NAME_FAMILY_STATUS': 6,
 'NAME_HOUSING_TYPE': 6,
 'OCCUPATION_TYPE': 19,
 'WEEKDAY_APPR_PROCESS_START': 7,
 'ORGANIZATION_TYPE': 58}

In [17]:
#  TEMP NOT USING
feature_cols = []
for key in ohe_cols_dt.keys():
    i = 0
    while i < ohe_cols_dt[key]:
        feature_cols.append(str(key)+"_"+str(feature_labels.pop(0)))
        i += 1
len(feature_cols)

120

In [18]:
features = pd.DataFrame(feature_arr, columns=feature_cols)

In [19]:
dropped = reduced_data.drop(ohe_cols,axis=1)

In [20]:
numeric_data = pd.concat([dropped, features],axis = 1)

## Imputation of missing values

In [23]:
import numpy as np
from sklearn.impute import SimpleImputer
fill_Nan = SimpleImputer(missing_values=np.nan, strategy='mean')
imputed_data = pd.DataFrame(fill_Nan.fit_transform(numeric_data))
imputed_data.columns = numeric_data.columns
imputed_data.index = numeric_data.index

In [24]:
imputed_data[imputed_data.isnull().any(axis=1)]

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA


In [25]:
imputed_data.head()

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA
0,1.0,0.0,0.0,1.0,0.0,202500.0,406597.5,24700.5,351000.0,0.018801,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,270000.0,1293502.5,35698.5,1129500.0,0.003541,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,1.0,0.0,67500.0,135000.0,6750.0,135000.0,0.010032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,135000.0,312682.5,29686.5,297000.0,0.008019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,121500.0,513000.0,21865.5,513000.0,0.028663,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
Y = imputed_data['TARGET']
X = imputed_data.drop(['TARGET'], axis = 1)

## Model building

In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=23)

### Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

clf = Pipeline(steps=[('preprocessor', StandardScaler()),
                      ('classifier', LogisticRegression(solver="lbfgs",max_iter=5000))])

clf.fit(X_train, y_train)  # apply scaling on training data

Pipeline(steps=[('preprocessor', StandardScaler()),
                ('classifier', LogisticRegression(max_iter=5000))])

In [61]:
y_predicted = clf.predict(X_test)

y_predicted
#.map(lambda x: 1 if x >= 0.5 else 0)

array([0., 0., 0., ..., 0., 0., 0.])

In [63]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

matrix = confusion_matrix(y_test,y_predicted, labels=[1,0])
print('Confusion matrix : \n',matrix)

Confusion matrix : 
 [[   51  4885]
 [   62 56505]]


In [65]:
# outcome values order in sklearn
tp, fn, fp, tn = confusion_matrix(y_test,y_predicted,labels=[1,0]).reshape(-1)
print('Outcome values : \n', tp, fn, fp, tn)

# classification report for precision, recall f1-score and accuracy
matrix = classification_report(y_test,y_predicted,labels=[1,0])
print('Classification report : \n',matrix)

Outcome values : 
 51 4885 62 56505
Classification report : 
               precision    recall  f1-score   support

           1       0.45      0.01      0.02      4936
           0       0.92      1.00      0.96     56567

    accuracy                           0.92     61503
   macro avg       0.69      0.50      0.49     61503
weighted avg       0.88      0.92      0.88     61503

