# Confirmation of competition details

- Learn to use alternative data such as telecom and transactional history to broaden financial inclusion among the unbanked population and to accurately predict their ability to repay a loan.
- The file to create is the submission file: for each SK_ID_CURR in the test set it must give a predicted probability for the TARGET variable. The file must contain a header and have the following format:

SK_ID_CURR,TARGET

100001,0.1

100005,0.9

100013,0.2
etc.

- Submissions are evaluated on area under the Receiver operating characteristic (ROC) curve between the predicted probability and the observed target

# Learning and Verification

- import necessary libraries
- load relevant data
- deleting null data
- extracting target variables

In [9]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score

# Read the raw training table from the working directory.
df = pd.read_csv('application_train.csv')

# Keep only the rows that have no missing value in ANY column.
# NOTE(review): this filters on every column of the table, not just the
# three features used below, so it may discard far more rows than needed.
na_df = df.dropna(axis=0, how='any')

# Features: three amount columns. Label: the TARGET column.
x = na_df[['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY']]
y = na_df.loc[:, 'TARGET']

In [10]:
# Preview the first five rows of the raw (unfiltered) training table.
df.head(n=5)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


Splitting the data into train and test sets using sklearn
- data standardization
- data fitting and prediction

In [11]:
# Hold out 25% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both partitions.
scaler = StandardScaler()
x_train_trans = scaler.fit_transform(x_train)
x_test_trans = scaler.transform(x_test)

# Baseline: ordinary least squares fitted directly on the TARGET column.
reg = LinearRegression()
reg.fit(x_train_trans, y_train)

# Continuous scores for the held-out rows.
reg_pred = reg.predict(x_test_trans)

holdout_mse = mean_squared_error(y_true=y_test, y_pred=reg_pred)
holdout_auc = roc_auc_score(y_test, reg_pred)
print('MSE:', holdout_mse)
print('ROC', holdout_auc)

MSE: 0.052153197403226256
ROC 0.5938285747369815


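The MSE is close to zero, but that alone does not show the model is good: TARGET is a 0/1 label, so the squared error is small whenever most labels are 0 and the predictions stay near 0. The ROC AUC of about 0.59, which is the metric the competition uses, is only modestly better than random guessing (0.5).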

# Estimation for test data

In [14]:
# Score the Kaggle test set with the baseline model and write the submission.
test_df = pd.read_csv('application_test.csv')

# Keep EVERY test row: the submission needs one prediction per SK_ID_CURR.
# Missing feature values are filled with the training-split column means
# instead of dropping rows, so predictions stay aligned with their IDs.
# (The training `na_df` is deliberately not overwritten here.)
test_x = test_df.loc[:, ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY']]
test_x = test_x.fillna(x_train.mean())

# Reuse the scaler that was fitted on the training split. Refitting it on the
# test data would feed the model inputs on a different scale than it was
# trained on.
test_x_test_trans = scaler.transform(test_x)

test_reg_pred = reg.predict(test_x_test_trans)

#Submission to Kaggle

# LinearRegression output is unbounded, so clip it to the [0, 1] range of a
# probability. This replaces the former hand-written override of row 648
# (presumably an out-of-range value - TODO confirm).
target = pd.Series(test_reg_pred, index=test_df.index, name='TARGET').clip(lower=0, upper=1)

kgl_submission = pd.DataFrame({'SK_ID_CURR': test_df['SK_ID_CURR'], 'TARGET': target})

# Sanity checks: one row per test ID and no missing predictions.
assert len(kgl_submission) == len(test_df)
assert kgl_submission['TARGET'].notna().all()

kgl_submission.to_csv('kggl_submission.csv', index=False)

# Feature engineering

The following 5 perspectives were considered based on:

- which feature to use
- how to pre-process

In [19]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.5-py3-none-win_amd64.whl (1.0 MB)
     ---------------------------------------- 1.0/1.0 MB 10.9 MB/s eta 0:00:00
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.5
Note: you may need to restart the kernel to use updated packages.


In [22]:
# Pattern 1: mean imputation -> one-hot encoding -> standardization -> LightGBM

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier

# Imputation: replace missing values with the column mean (nothing is deleted).
# NOTE: `x` was built from `df.dropna()`, so it contains no missing values and
# this step currently leaves the data unchanged.
imp_mean = SimpleImputer(strategy='mean')
imp_x = imp_mean.fit_transform(x)

# One hot encoding: every distinct value of each column becomes its own
# indicator column.
# NOTE(review): the three inputs are numeric amounts, so this can produce a
# very wide matrix - confirm that one-hot encoding is intended here.
enc = OneHotEncoder(handle_unknown='ignore')
enc_imp_x = enc.fit_transform(imp_x).toarray()

# Splitting the data
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(
    enc_imp_x, y, test_size=0.25, random_state=42
)

# Data standardization (statistics learned on the training rows only)
scaler = StandardScaler()
scaler.fit(x_train_1)
x_train_trans_1 = scaler.transform(x_train_1)
x_test_trans_1 = scaler.transform(x_test_1)

# Fitting the data
lgbm = LGBMClassifier(random_state=5)
lgb = lgbm.fit(x_train_trans_1, y_train_1)

# Predicting: hard labels for accuracy, positive-class probabilities for the
# ROC AUC (the metric the competition is scored on).
reg_pred_a = lgb.predict(x_test_trans_1)
proba_a = lgb.predict_proba(x_test_trans_1)[:, 1]

print('accuracy:', accuracy_score(y_test_1, reg_pred_a))
print('ROC', roc_auc_score(y_test_1, proba_a))


accuracy: 0.9446768944676894


In [25]:
# Pattern 2: median imputation -> one-hot encoding -> standardization -> LightGBM

from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier

# Imputation: replace missing values with the column median (nothing is deleted).
# NOTE: `x` was built from `df.dropna()`, so it contains no missing values and
# this step currently leaves the data unchanged.
imp_median = SimpleImputer(strategy='median')
imp_x_a = imp_median.fit_transform(x)

# One hot encoding: use this pattern's own encoder on this pattern's own
# median-imputed data, not the encoder and mean-imputed array left over from
# Pattern 1.
enc_a = OneHotEncoder(handle_unknown='ignore')
enc_imp_x_a = enc_a.fit_transform(imp_x_a).toarray()

# Splitting the data
x_train_a, x_test_a, y_train_a, y_test_a = train_test_split(
    enc_imp_x_a, y, test_size=0.25, random_state=42
)

# Data standardization (statistics learned on the training rows only)
scaler = StandardScaler()
scaler.fit(x_train_a)
x_train_trans_a = scaler.transform(x_train_a)
x_test_trans_a = scaler.transform(x_test_a)

# Fitting the data
lgbm = LGBMClassifier(random_state=5)
lgb = lgbm.fit(x_train_trans_a, y_train_a)

# Predicting: hard labels for accuracy, positive-class probabilities for the
# ROC AUC (the metric the competition is scored on).
reg_pred_b = lgb.predict(x_test_trans_a)
proba_b = lgb.predict_proba(x_test_trans_a)[:, 1]

# Score against this cell's own hold-out labels rather than Pattern 1's.
print('accuracy:', accuracy_score(y_test_a, reg_pred_b))
print('ROC', roc_auc_score(y_test_a, proba_b))

accuracy: 0.9446768944676894


In [26]:
# Pattern 3: most-frequent imputation -> one-hot encoding -> standardization -> LightGBM

from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier

# Imputation: replace missing values with the most frequent value of each
# column (nothing is deleted).
# NOTE: `x` was built from `df.dropna()`, so it contains no missing values and
# this step currently leaves the data unchanged.
imp_mf = SimpleImputer(strategy='most_frequent')
imp_x_c = imp_mf.fit_transform(x)

# One hot encoding: use this pattern's own encoder instead of the one left
# over from an earlier cell.
enc_c = OneHotEncoder(handle_unknown='ignore')
enc_imp_x_c = enc_c.fit_transform(imp_x_c).toarray()

# Splitting the data
x_train_c, x_test_c, y_train_c, y_test_c = train_test_split(
    enc_imp_x_c, y, test_size=0.25, random_state=42
)

# Data standardization (statistics learned on the training rows only)
scaler = StandardScaler()
scaler.fit(x_train_c)
x_train_trans_c = scaler.transform(x_train_c)
x_test_trans_c = scaler.transform(x_test_c)

# Fitting the data: fit the estimator created here (`lgbm_c`), not the `lgbm`
# object that happens to remain in the kernel from a previous cell.
lgbm_c = LGBMClassifier(random_state=5)
lgb_c = lgbm_c.fit(x_train_trans_c, y_train_c)

# Predicting: hard labels for accuracy, positive-class probabilities for the
# ROC AUC (the metric the competition is scored on).
reg_pred_d = lgb_c.predict(x_test_trans_c)
proba_d = lgb_c.predict_proba(x_test_trans_c)[:, 1]

print('accuracy:', accuracy_score(y_test_c, reg_pred_d))
print('ROC', roc_auc_score(y_test_c, proba_d))

  mode = stats.mode(array)


accuracy: 0.9446768944676894


In [27]:
# Pattern 4: constant imputation -> one-hot encoding -> standardization -> LightGBM

from sklearn.preprocessing import OneHotEncoder
from lightgbm import LGBMClassifier

# Imputation: replace missing values with a constant (nothing is deleted).
# No fill_value is passed, so the imputer's default constant is used.
# NOTE: `x` was built from `df.dropna()`, so it contains no missing values and
# this step currently leaves the data unchanged.
imp_cons = SimpleImputer(strategy='constant')
imp_x_d = imp_cons.fit_transform(x)

# One hot encoding: use this pattern's own encoder instead of the one left
# over from an earlier cell.
enc_d = OneHotEncoder(handle_unknown='ignore')
enc_imp_x_d = enc_d.fit_transform(imp_x_d).toarray()

# Splitting the data
x_train_d, x_test_d, y_train_d, y_test_d = train_test_split(
    enc_imp_x_d, y, test_size=0.25, random_state=42
)

# Data standardization (statistics learned on the training rows only)
scaler = StandardScaler()
scaler.fit(x_train_d)
x_train_trans_d = scaler.transform(x_train_d)
x_test_trans_d = scaler.transform(x_test_d)

# Fitting the data
lgbm = LGBMClassifier(random_state=5)
lgb = lgbm.fit(x_train_trans_d, y_train_d)

# Predicting: hard labels for accuracy, positive-class probabilities for the
# ROC AUC (the metric the competition is scored on).
reg_pred_e = lgb.predict(x_test_trans_d)
proba_e = lgb.predict_proba(x_test_trans_d)[:, 1]

print('accuracy:', accuracy_score(y_test_d, reg_pred_e))
print('ROC', roc_auc_score(y_test_d, proba_e))

accuracy: 0.9446768944676894
