In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
from sklearn.cluster import KMeans
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder

In [15]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

In [16]:
random = 1234

# Basic Exploratory Analysis

In [17]:
train.default_status = train.default_status.replace({'no':0,'yes':1})
train.form_field47 = train.form_field47.replace({'charge':1,'lending':0})
test.form_field47 = test.form_field47.replace({'charge':1,'lending':0})

### Cluster Analysis

<b>The Columns used where based on feature importances of them on Training at first instance</b>\

<b> The K feature was selected using the Elbow Method</b>

In [18]:
clus = ['form_field47','form_field1','form_field2', 'form_field6']
train_km = train[clus].copy()
train_km = train_km.fillna(0)
test_km = test[clus].copy()
test_km = test_km.fillna(0)

In [19]:
km = KMeans(n_clusters=4,init='k-means++',n_init=10,max_iter=300,random_state=0)
km.fit(train_km)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

In [20]:
train['cluster'] = km.predict(train_km[clus])
test['cluster'] = km.predict(test_km[clus])

In [21]:
def col(data):
    cat_col = [i for i in data.columns if train[i].dtypes == 'O']
    num_col = [i for i in data.columns.tolist() if i not in cat_col]
    return num_col

In [22]:
X = train[col(train)].drop('default_status', axis = 1)
y = train.default_status

## ML Model using a KFold of 7 proved to be the best and a iteration of 1500

In [23]:
cv_score=[]
test_pred=[]
split = 7
fold=KFold(n_splits=split, random_state=random)

for train_index, test_index in fold.split(X,y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    cat=CatBoostClassifier(iterations=1500, logging_level='Silent', od_wait=50, od_type='Iter', eval_metric='AUC')
    cat.fit(X_train,y_train,eval_set=[(X_train,y_train),(X_test, y_test)], early_stopping_rounds=100,verbose=100)
    
    # Appending predictions into list
    preds=cat.predict_proba(X_test)[:,1]
    print("ROC AUC Score: ",roc_auc_score(y_test,preds))
    cv_score.append(roc_auc_score(y_test,preds))
    test_p = cat.predict_proba(test[X.columns])[:,1]
    test_pred.append(test_p)

ROC AUC Score:  0.8362494461840382
ROC AUC Score:  0.8433676815109168
ROC AUC Score:  0.839796343368959
ROC AUC Score:  0.8407875338962245
ROC AUC Score:  0.8280431748170967
ROC AUC Score:  0.8447979083869023
ROC AUC Score:  0.8473595746028934


In [31]:
print(f'Mean of ROC Score: {np.array(cv_score).mean()}')

Mean of ROC Score: 0.8400573803952902


In [33]:
pred = np.sum(np.array(test_pred),axis=0)/split

### Creating Prediction file

In [34]:
sub = pd.DataFrame({'Applicant_ID':test.Applicant_ID, 'default_status':pred})

In [35]:
sub.to_csv('cluster_submission.csv', index = False)

In [36]:
sub.describe()

Unnamed: 0,default_status
count,24000.0
mean,0.244367
std,0.232597
min,0.001922
25%,0.046488
50%,0.16945
75%,0.384267
max,0.96818
