## Package Import

In [6]:
import pandas as pd
import numpy as np
import os
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.tree import plot_tree
from xgboost import XGBClassifier


import seaborn as sns
import matplotlib.pyplot as plt

## File Upload into Environment

Here, we stored our files in a Google Cloud Storage bucket so that we would not have to read directly from our local machines and change file paths during development.

In [7]:
# Public URL after making the file public in the format 'https://storage.googleapis.com/...'
# Locations of the train and test application tables that are read below.
file_url = 'https://storage.googleapis.com/home_credit_files/application_train.csv'
test_url = 'https://storage.googleapis.com/home_credit_files/application_test.csv'
# The secondary tables below are commented out and are not loaded in this cell.
# POS_CASH_balance_url = 'https://storage.googleapis.com/home_credit_files/POS_CASH_balance.csv'
# bureau_url = 'https://storage.googleapis.com/home_credit_files/bureau.csv'
# NOTE(review): the next line reuses the bureau.csv URL; presumably it should point to bureau_balance.csv. Confirm before enabling.
# bureau_balance_url = 'https://storage.googleapis.com/home_credit_files/bureau.csv'
# credit_card_balance = 'https://storage.googleapis.com/home_credit_files/credit_card_balance.csv'
# NOTE(review): the next line reuses the credit_card_balance.csv URL; presumably it should point to installments_payments.csv. Confirm before enabling.
# installments_payments = 'https://storage.googleapis.com/home_credit_files/credit_card_balance.csv'
# previous_application = 'https://storage.googleapis.com/home_credit_files/previous_application.csv'
# URL of the sample submission file; it is only stored here, not read in this cell.
sample_sub = 'https://storage.googleapis.com/home_credit_files/sample_submission.csv'


# Read the CSV directly from the URL
df = pd.read_csv(file_url)
test_df = pd.read_csv(test_url)

#print(df.head())

  df = pd.read_csv(file_url)


# EDA
Below, we perform EDA on the train data set to prepare it for modeling. We began by analyzing the initial data set and obtaining the proportion of the target variable in the train set. About 92% of the rows have a target of 0, indicating applicants who repaid their loans on time. About 8% have a target of 1, indicating applicants who did not repay on time.

Next, we explored missing values. We calculated the percentage of missing values per column and, after reviewing the description of each column, decided to remove those that were missing 10% or more of their values.

In [8]:
# Dimensions (rows, columns) of the training frame
print(df.shape)

(307511, 122)


In [9]:
# Absolute number of rows for each TARGET value
df['TARGET'].value_counts()

TARGET
0    282686
1     24825
Name: count, dtype: int64

In [10]:
# Share of each TARGET value, expressed as a percentage of all rows
df['TARGET'].value_counts(normalize = True) * 100

TARGET
0    91.927118
1     8.072882
Name: proportion, dtype: float64

# Data Preparation

### Evaluating Missingness

In [11]:
# Train data set: percentage of missing values per column

# number of rows in the training frame
total_rows_train = len(df)

# per-column count of missing cells
missing_values_train = df.isna().sum()

# convert the counts into percentages of the row total
pct_missing_train = missing_values_train.div(total_rows_train).mul(100)

# order the columns from most to least incomplete
pct_missing_sorted_train = pct_missing_train.sort_values(ascending=False)

In [12]:
# Wrap the sorted percentages in a one-column DataFrame named 'MissingPercentage'
missings = pct_missing_sorted_train.to_frame(name='MissingPercentage')

### Removing columns with greater than 10% missing values

In [13]:
# Keep only the columns whose missing percentage is below 10
below_threshold = missings['MissingPercentage'].lt(10)
filter_missings = missings.loc[below_threshold]
filter_missings.shape

(65, 1)

### Removing 'FLAG' columns

In [14]:
# Names of the columns that passed the missingness filter
columns = filter_missings.index.tolist()
# Discard every column whose name contains 'FLAG'
column_list = list(filter(lambda name: 'FLAG' not in name, columns))

### Removing additional unhelpful columns

In [15]:
list1 = column_list
list2 = ['NAME_TYPE_SUITE', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE']

# Keep the entries of list1 that do not appear in list2, preserving list1's order
items_not_in_list2 = [item for item in list1 if item not in list2]


In [16]:
# Number of column names left after the exclusions above
len(items_not_in_list2)

32

In [17]:
pd.set_option('display.max_columns', None)

# Take explicit copies: later cells assign into these frames, and assigning
# into a plain column slice of df / test_df triggers SettingWithCopyWarning.
selected_df = df[items_not_in_list2].copy()

# The test set has no TARGET column, so select every retained column except it
test_filtered_columns = [item for item in items_not_in_list2 if item != 'TARGET']
test_selected_df = test_df[test_filtered_columns].copy()

In [18]:
# Hold on to the label column before it is removed from the frame
target = selected_df.loc[:, 'TARGET']

# Rebuild the frame without the label ...
selected_df = selected_df.drop(columns='TARGET')

# ... and put the label back as the very first column
selected_df.insert(loc=0, column='TARGET', value=target)

### Fix problematic values

In [19]:
# DAYS_EMPLOYED contains the value 365243, which this notebook treats as a
# placeholder and replaces with 0.
DAYS_EMPLOYED_PLACEHOLDER = 365243
# Training rows with an income above this cap are treated as outliers and dropped.
MAX_INCOME_TOTAL = 9000000

# .assign returns a new frame, so nothing is written into a slice of df
# (avoids SettingWithCopyWarning); the trailing .copy() keeps the filtered
# result independent for the in-place edits made in later cells.
selected_df = selected_df.assign(
    DAYS_EMPLOYED=selected_df['DAYS_EMPLOYED'].replace(DAYS_EMPLOYED_PLACEHOLDER, 0)
)
selected_df = selected_df[selected_df['AMT_INCOME_TOTAL'] <= MAX_INCOME_TOTAL].copy()

# The test set gets the same value fix but is NOT row-filtered: every test
# applicant must keep a row so that a prediction can be produced for it.
test_selected_df = test_selected_df.assign(
    DAYS_EMPLOYED=test_selected_df['DAYS_EMPLOYED'].replace(DAYS_EMPLOYED_PLACEHOLDER, 0)
)

In [20]:
# Row/column counts of the train and test frames after the value fixes
print(selected_df.shape)
print(test_selected_df.shape)

(307508, 32)
(48744, 31)


test_selected_df should have one less column, TARGET, which is reflected.

In [21]:
# Columns removed from both frames so that train and test stay in step
unused_columns = ['HOUR_APPR_PROCESS_START', 'DAYS_ID_PUBLISH']
selected_df = selected_df.drop(columns=unused_columns)
test_selected_df = test_selected_df.drop(columns=unused_columns)

In [22]:
# Column names present in the test frame but absent from the train frame
set(test_selected_df)-set(selected_df)

set()

In [23]:
# Abs value of negatives: apply the same conversion to the train frame, then the test frame
day_columns = ['DAYS_LAST_PHONE_CHANGE', 'DAYS_REGISTRATION', 'DAYS_EMPLOYED', 'DAYS_BIRTH']

for frame in (selected_df, test_selected_df):
    for day_col in day_columns:
        frame[day_col] = frame[day_col].abs()

## Data Preparation Summary

Our initial approach was fairly simplistic for our first attempt. We started by dropping columns that had too much missing data, using a cutoff of 10%. We then removed all 'FLAG' columns, as we thought these weren't needed for our basic approach. We fixed a couple of columns that contained bad values, and throughout we applied the same manipulations to the test set as we did to the train set.

## Preparing Logistic Regression

In [24]:
# Numeric columns of each frame, with the SK_ID_CURR identifier excluded
numeric_cols = selected_df.drop(columns='SK_ID_CURR').select_dtypes(include='number')
test_numeric_cols = test_selected_df.drop(columns='SK_ID_CURR').select_dtypes(include='number')

# Display the numeric test columns as the cell output
test_numeric_cols

Unnamed: 0,EXT_SOURCE_2,AMT_GOODS_PRICE,AMT_ANNUITY,CNT_FAM_MEMBERS,DAYS_LAST_PHONE_CHANGE,CNT_CHILDREN,AMT_CREDIT,AMT_INCOME_TOTAL,LIVE_CITY_NOT_WORK_CITY,REG_CITY_NOT_WORK_CITY,REG_CITY_NOT_LIVE_CITY,LIVE_REGION_NOT_WORK_REGION,REG_REGION_NOT_WORK_REGION,REG_REGION_NOT_LIVE_REGION,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,DAYS_REGISTRATION,DAYS_EMPLOYED,DAYS_BIRTH,REGION_POPULATION_RELATIVE
0,0.789654,450000.0,20560.5,2.0,1740.0,0,568800.0,135000.0,0,0,0,0,0,0,2,2,5170.0,2329,19241,0.018850
1,0.291656,180000.0,17370.0,2.0,0.0,0,222768.0,99000.0,0,0,0,0,0,0,2,2,9118.0,4469,18064,0.035792
2,0.699787,630000.0,69777.0,2.0,856.0,0,663264.0,202500.0,0,0,0,0,0,0,2,2,2175.0,4458,20038,0.019101
3,0.509677,1575000.0,49018.5,4.0,1805.0,2,1575000.0,315000.0,0,0,0,0,0,0,2,2,2000.0,1866,13976,0.026392
4,0.425687,625500.0,32067.0,3.0,821.0,1,625500.0,180000.0,1,1,0,0,0,0,2,2,4000.0,2191,13040,0.010032
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48739,0.648575,270000.0,17473.5,1.0,684.0,0,412560.0,121500.0,0,0,0,0,0,0,3,3,9094.0,5169,19970,0.002042
48740,0.684596,495000.0,31909.5,4.0,0.0,2,622413.0,157500.0,1,1,0,0,0,0,2,2,3015.0,1149,11186,0.035792
48741,0.632770,315000.0,33205.5,3.0,838.0,1,315000.0,202500.0,0,0,0,0,0,0,2,2,2681.0,3037,15922,0.026392
48742,0.445701,450000.0,25128.0,2.0,2308.0,0,450000.0,225000.0,1,1,0,1,1,0,2,2,1461.0,2731,13968,0.018850


In [25]:
# Create a list of numeric column names excluding target for imputations

column_names = numeric_cols.columns.tolist()
my_list = [x for x in column_names if x != 'TARGET']
print(my_list)

# Build the test list from the TEST frame's numeric columns. Previously it was
# derived from the train columns, so a train/test mismatch would go unnoticed.
test_column_names = test_numeric_cols.columns.tolist()
test_my_list = [x for x in test_column_names if x != 'TARGET']
print(test_my_list)

# The imputer and scaler are applied column-for-column to both frames, so the
# two lists must contain the same names in the same order.
assert test_my_list == my_list, "train and test numeric feature columns differ"

['EXT_SOURCE_2', 'AMT_GOODS_PRICE', 'AMT_ANNUITY', 'CNT_FAM_MEMBERS', 'DAYS_LAST_PHONE_CHANGE', 'CNT_CHILDREN', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'LIVE_CITY_NOT_WORK_CITY', 'REG_CITY_NOT_WORK_CITY', 'REG_CITY_NOT_LIVE_CITY', 'LIVE_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_LIVE_REGION', 'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT', 'DAYS_REGISTRATION', 'DAYS_EMPLOYED', 'DAYS_BIRTH', 'REGION_POPULATION_RELATIVE']
['EXT_SOURCE_2', 'AMT_GOODS_PRICE', 'AMT_ANNUITY', 'CNT_FAM_MEMBERS', 'DAYS_LAST_PHONE_CHANGE', 'CNT_CHILDREN', 'AMT_CREDIT', 'AMT_INCOME_TOTAL', 'LIVE_CITY_NOT_WORK_CITY', 'REG_CITY_NOT_WORK_CITY', 'REG_CITY_NOT_LIVE_CITY', 'LIVE_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_WORK_REGION', 'REG_REGION_NOT_LIVE_REGION', 'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT', 'DAYS_REGISTRATION', 'DAYS_EMPLOYED', 'DAYS_BIRTH', 'REGION_POPULATION_RELATIVE']


### N/a Imputer

In [26]:
# Create a SimpleImputer instance that fills missing values with the column median
imputer = SimpleImputer(strategy='median')

# Learn the medians from the training data and fill the training frame
selected_df.loc[:, my_list] = imputer.fit_transform(selected_df.loc[:, my_list])

# Fill the test frame with the medians learned from the TRAINING data.
# Re-fitting on the test set would impute train and test with different
# statistics. my_list is used so the columns match those seen during fit.
test_selected_df.loc[:, my_list] = imputer.transform(test_selected_df.loc[:, my_list])

## Interaction Term

In [27]:
# Ratio of the credit amount to total income, added to the train frame and then the test frame
selected_df['CREDIT_TO_INCOME'] = selected_df['AMT_CREDIT'].div(selected_df['AMT_INCOME_TOTAL'])

test_selected_df['CREDIT_TO_INCOME'] = test_selected_df['AMT_CREDIT'].div(test_selected_df['AMT_INCOME_TOTAL'])

### Standard Scaler to scale dataset

In [28]:
#scalar_list = ['EXT_SOURCE_2','AMT_GOODS_PRICE','AMT_ANNUITY','AMT_CREDIT','AMT_INCOME_TOTAL','REGION_RATING_CLIENT_W_CITY','REGION_RATING_CLIENT','CREDIT_TO_INCOME']

In [29]:
scaler = StandardScaler()

# Learn each column's mean and standard deviation from the training data and scale it
selected_df[my_list] = scaler.fit_transform(selected_df[my_list])

# Scale the test frame with the statistics learned from the TRAINING data.
# Re-fitting on the test set would put train and test features on different
# scales. my_list is used so the columns match those seen during fit.
test_selected_df[my_list] = scaler.transform(test_selected_df[my_list])

print(selected_df.shape)
print(test_selected_df.shape)

(307508, 31)
(48744, 30)


### Dummy Encoder 

In [30]:
# dummy encoding data set

selected_df = pd.get_dummies(selected_df, drop_first=True)

test_selected_df = pd.get_dummies(test_selected_df, drop_first=True)

print(selected_df.shape)
print(test_selected_df.shape)

# Columns that exist in only one of the two encoded frames
print(set(test_selected_df)-set(selected_df))
print(set(selected_df)-set(test_selected_df))

# Drop the dummy columns that exist only in the train frame (categories never
# seen in the test set); TARGET is train-only by design and must be kept.
columns_to_drop = set(selected_df.columns) - set(test_selected_df.columns)
columns_to_drop.discard('TARGET')
selected_df = selected_df.drop(columns=list(columns_to_drop))

# Give the test frame exactly the train feature columns, in the same order.
# This also discards any dummy column that exists only in the test frame,
# which the one-directional drop above did not handle.
feature_columns = [col for col in selected_df.columns if col != 'TARGET']
test_selected_df = test_selected_df.reindex(columns=feature_columns)

print(selected_df.shape)
print(test_selected_df.shape)

(307508, 110)
(48744, 106)
set()
{'NAME_INCOME_TYPE_Maternity leave', 'NAME_FAMILY_STATUS_Unknown', 'CODE_GENDER_XNA', 'TARGET'}
(307508, 107)
(48744, 106)


### Splitting Target and Predictors

In [31]:
# Label and identifier are excluded from the predictors
non_feature_columns = ['TARGET', 'SK_ID_CURR']
y = selected_df['TARGET']
X = selected_df.drop(columns=non_feature_columns)

## Upsampling

We created an upsampled version of the data to test how this affected each of our models.

In [32]:
# Predictors and label taken from the prepared training frame
X = selected_df.drop(columns=['TARGET', 'SK_ID_CURR'])
y = selected_df['TARGET']

# Recombine predictors and label so whole rows can be resampled together
data = pd.concat([X, y], axis=1)

# Split the rows by class
is_minority = data['TARGET'] == 1
majority = data[data['TARGET'] == 0]
minority = data[is_minority]

# Draw minority rows with replacement until they match the majority count;
# the fixed random_state keeps the draw reproducible
minority_upsampled = resample(
    minority,
    replace=True,
    n_samples=len(majority),
    random_state=42,
)

# Stack the untouched majority rows on top of the upsampled minority rows
upsampled_data = pd.concat([majority, minority_upsampled])

# Checking the new class distribution
upsampled_data['TARGET'].value_counts()

y_upsampled = upsampled_data['TARGET']
X_upsampled = upsampled_data.drop(columns='TARGET')


In [33]:
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    `scores` must provide `.mean()` and `.std()` methods.
    """
    # Each statistic is computed only when its line is printed, in this order.
    report = (
        ("Scores:", lambda values: values),
        ("Mean:", lambda values: values.mean()),
        ("Standard deviation:", lambda values: values.std()),
    )
    for label, compute in report:
        print(label, compute(scores))

In [34]:

# Gradient-boosted tree classifier with the settings used for this first attempt
xgb_clf = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    n_estimators=100,
    max_depth=6,
)

# Fit on the full (non-upsampled) training predictors and labels
xgb_clf.fit(X, y)

# Predict on the same rows the model was fitted on, so the figures below are
# training accuracy, not an estimate of performance on unseen data
xgb_y_pred = xgb_clf.predict(X)
xgb_accuracy = accuracy_score(y, xgb_y_pred)

print("Train Accuracy:", xgb_accuracy)
print("Train Accuracy using built-in score:", xgb_clf.score(X, y))

Train Accuracy: 0.9206524708300272
Train Accuracy using built-in score: 0.9206524708300272
