In [1]:
# Import libraries

import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV



In [2]:
# Load the preprocessed training set and display it.
# NOTE(review): the displayed frame contains an "Unnamed: 0" column, which looks
# like a saved row index being read back as a feature -- confirm whether it should
# be excluded (e.g. index_col=0), consistently for every frame the model sees.
train_csv_path = "../cleaned_data/train_data.csv"
df = pd.read_csv(train_csv_path)
df

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education_Bachelor's,...,HasDependents_No,HasDependents_Yes,LoanPurpose_Auto,LoanPurpose_Business,LoanPurpose_Education,LoanPurpose_Home,LoanPurpose_Other,HasCoSigner_No,HasCoSigner_Yes,Default
0,45,72928,23252,516,72,1,24.53,24,0.79,0,...,1,0,0,0,1,0,0,0,1,0
1,41,65437,15975,328,81,4,19.00,60,0.69,0,...,1,0,0,0,0,1,0,1,0,0
2,34,147083,73467,607,67,1,8.20,36,0.66,0,...,0,1,0,1,0,0,0,0,1,0
3,25,97594,165559,489,115,4,21.12,24,0.21,0,...,0,1,0,0,0,0,1,1,0,0
4,22,143415,197038,424,15,3,12.84,36,0.61,0,...,0,1,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163416,56,95731,78549,580,53,2,13.39,48,0.54,1,...,1,0,0,0,0,0,1,0,1,0
163417,32,46410,27511,799,57,4,14.20,48,0.41,0,...,0,1,1,0,0,0,0,0,1,0
163418,54,142603,177416,519,20,3,10.69,12,0.47,0,...,1,0,0,0,0,1,0,1,0,0
163419,50,99826,161987,613,61,3,12.31,48,0.56,0,...,0,1,0,0,0,0,1,0,1,0


In [3]:
# Split the target off from the features: `pop` removes the "Default" column
# from `df` in place and returns it, so `df` holds only features afterwards.
# NOTE: like the original, this cell is not re-runnable without reloading `df`.
default = np.array(df.pop("Default"))

We now train an AdaBoost classifier on the training data, using a shallow decision tree as the base estimator. The hyperparameters are selected with `GridSearchCV`, and the best estimator is used to make the final predictions.

In [4]:
# Base learner: a depth-2 decision tree that chooses splits by entropy.
# random_state is fixed on both the tree and the ensemble for reproducibility.
tree_settings = {"criterion": "entropy", "max_depth": 2, "random_state": 42}
dtc = DecisionTreeClassifier(**tree_settings)

# Boosted ensemble built on top of the tree above.
abc = AdaBoostClassifier(estimator=dtc, random_state=42)

In [5]:
# Hyperparameter grid: 3 ensemble sizes x 2 learning rates = 6 candidates.
params = {
    "n_estimators": [50, 70, 90],
    "learning_rate": [0.1, 1],
}

# Exhaustive cross-validated search over the grid (5 folds by default, i.e.
# 30 fits in total). The fits are independent of each other, so n_jobs=-1 runs
# them on all available cores instead of one after another; every fit is seeded
# through random_state, so the selected model does not change.
# NOTE: with parallel workers the per-fit "[CV] END ..." lines may be printed to
# the terminal running Jupyter rather than to the notebook output.
grid_search = GridSearchCV(abc, params, verbose=2, n_jobs=-1)

In [6]:
# Run the cross-validated search on the training features/labels, then keep the
# estimator that GridSearchCV refitted with the best-scoring parameters.
model = grid_search.fit(df, default).best_estimator_

Fitting 5 folds for each of 6 candidates, totalling 30 fits




[CV] END .................learning_rate=0.1, n_estimators=50; total time=  29.2s




[CV] END .................learning_rate=0.1, n_estimators=50; total time=  27.9s




[CV] END .................learning_rate=0.1, n_estimators=50; total time=  27.8s




[CV] END .................learning_rate=0.1, n_estimators=50; total time=  31.2s




[CV] END .................learning_rate=0.1, n_estimators=50; total time=  29.5s




[CV] END .................learning_rate=0.1, n_estimators=70; total time=  39.9s




[CV] END .................learning_rate=0.1, n_estimators=70; total time=  41.1s




[CV] END .................learning_rate=0.1, n_estimators=70; total time=  47.9s




[CV] END .................learning_rate=0.1, n_estimators=70; total time=  41.1s




[CV] END .................learning_rate=0.1, n_estimators=70; total time=  44.1s




[CV] END .................learning_rate=0.1, n_estimators=90; total time=  53.8s




[CV] END .................learning_rate=0.1, n_estimators=90; total time=  51.1s




[CV] END .................learning_rate=0.1, n_estimators=90; total time=  50.4s




[CV] END .................learning_rate=0.1, n_estimators=90; total time=  49.5s




[CV] END .................learning_rate=0.1, n_estimators=90; total time=  51.2s




[CV] END ...................learning_rate=1, n_estimators=50; total time=  28.2s




[CV] END ...................learning_rate=1, n_estimators=50; total time=  27.5s




[CV] END ...................learning_rate=1, n_estimators=50; total time=  28.4s




[CV] END ...................learning_rate=1, n_estimators=50; total time=  28.2s




[CV] END ...................learning_rate=1, n_estimators=50; total time=  27.2s




[CV] END ...................learning_rate=1, n_estimators=70; total time=  37.8s




[CV] END ...................learning_rate=1, n_estimators=70; total time=  38.8s




[CV] END ...................learning_rate=1, n_estimators=70; total time=  38.9s




[CV] END ...................learning_rate=1, n_estimators=70; total time=  37.9s




[CV] END ...................learning_rate=1, n_estimators=70; total time=  37.3s




[CV] END ...................learning_rate=1, n_estimators=90; total time=  48.6s




[CV] END ...................learning_rate=1, n_estimators=90; total time=  48.8s




[CV] END ...................learning_rate=1, n_estimators=90; total time=  48.9s




[CV] END ...................learning_rate=1, n_estimators=90; total time=  49.5s




[CV] END ...................learning_rate=1, n_estimators=90; total time=  50.2s




In [7]:
# Display the estimator selected by the grid search.
model

In [8]:
# Validation split used to check the tuned model.
test_df = pd.read_csv("../cleaned_data/test_data.csv")

# Separate the labels from the features (`pop` removes the column from test_df).
y_true = np.array(test_df.pop("Default"))

# Fraction of validation rows whose predicted class matches the true label.
y_pred = model.predict(test_df)
accuracy_score(y_true, y_pred)

0.8844722929312708

In [9]:
# Kaggle test data (no labels). Keep the loan IDs aside for the submission file
# and remove them from the feature frame.
test_df = pd.read_csv("../data/test.csv")
ids = np.array(test_df["LoanID"])
test_df = test_df.drop(columns="LoanID")

# One-hot encode the categorical columns (this is one-hot encoding, not label
# encoding: each category value becomes its own 0/1 column).
categorical_cols = test_df.select_dtypes(include=["object", "category"]).columns
test_df = pd.get_dummies(test_df, columns=categorical_cols, dtype=int)

# get_dummies only creates columns for the categories that occur in this file,
# so nothing guarantees the same columns (or column order) the model was fitted
# on. Align explicitly to the fitted feature names: extra columns are dropped and
# absent ones are added as all-zero columns. Report the latter so that a
# zero-filled feature never goes unnoticed.
expected_cols = list(model.feature_names_in_)
absent_cols = [col for col in expected_cols if col not in test_df.columns]
if absent_cols:
    print(f"Columns missing from the test data, filled with 0: {absent_cols}")
test_df = test_df.reindex(columns=expected_cols, fill_value=0)

Now we generate the CSV file with the predictions on the actual test data. This submission achieves a score of 0.88752 on Kaggle.

In [10]:
# Predict the class for every row of the Kaggle test set.
y_pred = model.predict(test_df)

# Build the submission table directly from named columns. This keeps each
# column's own dtype (no detour through a transposed mixed-type array) and ties
# the header names to the data instead of passing them separately at write time.
submission = pd.DataFrame({"LoanID": ids, "Default": y_pred})

# Write the two-column file (LoanID, Default) without the row index.
submission.to_csv("adaboost_submission.csv", index=False)