In [None]:
#Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from scipy.stats import skew

# Show every column when displaying wide DataFrames (no column truncation).
pd.set_option('display.max_columns', None)

import copy

In [None]:
# Reading the training data; the Applicant_ID column becomes the row index.
train = pd.read_csv('train.csv', index_col='Applicant_ID')

# Preview the first rows.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape 

In [None]:
# Reading the test data with the same Applicant_ID index as the training set.
test = pd.read_csv('test.csv', index_col='Applicant_ID')

# Preview the first rows.
test.head()

In [None]:
train.isna().sum()  # Number of missing values in each training column

In [None]:
# Handling missing values: every NaN in both datasets is replaced by the
# sentinel value -999.
train, test = (frame.fillna(-999) for frame in (train, test))

In [None]:
# Re-check the per-column missing counts after the fill above.
train.isnull().sum()

In [None]:
# True if at least one training column still contains a missing value.
train.isnull().sum().any()

In [None]:
# Stack the training features (target column removed) on top of the test
# features so that both go through the same preprocessing.
data = pd.concat(
    [train.drop(columns="default_status"), test],
    axis=0,
)

In [None]:
# Encode the categorical column form_field47 as integer labels.

from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
# Learn the label set first, then map the column onto the integer codes.
lb.fit(data['form_field47'])
data['form_field47'] = lb.transform(data['form_field47'])

data.head(3)

In [None]:
# Separating the target from the predictors.
# Double brackets keep the target as a one-column DataFrame rather than a Series.
y = train[["default_status"]]

y.head(3)

In [None]:
# Encode the target labels as integers with LabelEncoder.

from sklearn.preprocessing import LabelEncoder

lb = LabelEncoder()
# LabelEncoder expects a 1-D array. Flattening the single-column frame avoids
# the DataConversionWarning scikit-learn raises for column-vector input.
# The encoded labels are wrapped back into a DataFrame whose only column is
# named 0, which later cells rely on (`y[0]`, `y.iloc[...]`).
y = pd.DataFrame(lb.fit_transform(np.ravel(y)))

y.head(3)

In [None]:
# Split the combined frame back into its train and test parts. `data` was
# built by concatenating the training rows first and the test rows second,
# so the boundary is the number of training rows. Deriving it from `train`
# avoids hard-coded row counts that silently misalign if the files change.
n_train = len(train)
X = data.iloc[:n_train, :]
X_test = data.iloc[n_train:, :]

In [None]:
# Sanity check: shapes of the training features, test features and target.
X.shape, X_test.shape, y.shape

In [None]:
# Scaling using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Fit the scaler on the training features only, then apply that same fitted
# min/max to the test features. Re-fitting on the test set would map the two
# sets onto different scales.
# Note: wrapping the arrays in pd.DataFrame resets the row index and the
# column labels to 0..n-1.
X = pd.DataFrame(scaler.fit_transform(X))
X_test = pd.DataFrame(scaler.transform(X_test))

In [None]:
# Normal Train test split: 70% training / 30% validation with a fixed seed.
# y[0] selects the single target column (labelled 0) as a Series.
# NOTE(review): X_train / X_val / y_train / y_val are not read by the
# cross-validation function later in the visible notebook, which splits X and y itself.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y[0], test_size=0.3, random_state=42)

In [None]:
# Handling skewed features using log transformation.
# Rank the columns by the magnitude of their skewness, largest first. The
# absolute value is taken before sorting so that strongly negative skews are
# ranked alongside strongly positive ones.
skew_features = data.apply(skew).abs().sort_values(ascending=False)
skew_features[:10]  # Displaying top ten skewed features

In [None]:
# Filtering skewed features (absolute skewness above 1).
high_skew = skew_features[skew_features > 1]
# Taking indexes of high skew (the column names to transform).
skew_index = high_skew.index
# Applying log transformation, column by column, overwriting `data` in place.
# NOTE(review): missing values were replaced with -999 in an earlier cell;
#   np.log1p of any value <= -1 yields NaN or -inf, so columns that contain
#   the sentinel will get invalid entries here.
# NOTE(review): X and X_test were already sliced from `data` and rescaled in
#   earlier cells, so these changes do not reach the features the models use.
# NOTE(review): re-running this cell applies log1p a second time.
for i in skew_index:
    data[i] = np.log1p(data[i])

In [None]:
# Creating new features based on previous observations.
# 'Total14&15' is the sum of form_field14 and form_field15.
# 'Total13&14' is form_field14 minus form_field13.
# NOTE(review): the second column is named "Total" but holds a difference - confirm intent.
data = data.assign(**{
    'Total14&15': data['form_field14'] + data['form_field15'],
    'Total13&14': data['form_field14'] - data['form_field13'],
})


In [None]:
#Importing Other Necessary Libraries
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Creating the StratifiedKFold splitter used for cross-validation:
# `splits` folds, shuffled with a fixed seed.
splits = 25

skfold = StratifiedKFold(n_splits=splits , random_state=419, shuffle=True)

In [None]:
# Models used for Voting.
# cat, rfc and lgb are the estimators handed to VotingClassifier in stratified_vote.
cat = CatBoostClassifier(random_state=419, verbose=False, thread_count=-1)
rfc = RandomForestClassifier(n_estimators=500, random_state=419, verbose=False, n_jobs=-1)
# NOTE(review): no random_state is passed here, unlike cat and rfc.
lgb = LGBMClassifier(n_jobs=-1)
# NOTE(review): catnorm is not referenced by any other cell visible in this notebook.
catnorm = CatBoostClassifier(n_estimators=600, thread_count=-1, verbose=0)

In [None]:

def stratified_vote():
    """Fit a soft-voting ensemble on every stratified fold and average its test predictions.

    Reads the notebook-level objects ``X``, ``y``, ``X_test``, ``skfold``,
    ``cat``, ``rfc`` and ``lgb``. For each fold produced by ``skfold`` a new
    ``VotingClassifier`` (soft voting over the three models) is fitted on the
    training part, scored with ROC AUC on the held-out part, and used to
    predict probabilities for ``X_test``.

    Returns:
        numpy.ndarray: for each row of ``X_test``, the mean over all folds of
        column 1 of ``predict_proba``.
    """
    # scikit-learn expects a 1-D target; passing the single-column frame
    # directly triggers a DataConversionWarning on every fold.
    target = y.iloc[:, 0]

    scores, preds = [], []
    for fold, (train_idx, val_idx) in enumerate(skfold.split(X, target), start=1):
        # Fold-local names are kept distinct from the global X_test, which is
        # the real test set predicted below.
        x_tr, x_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = target.iloc[train_idx], target.iloc[val_idx]

        vote = VotingClassifier(
            estimators=[('model1', cat), ('model3', rfc), ('model4', lgb)],
            voting='soft',
        )
        vote.fit(x_tr, y_tr)

        # To see how the model performs on the held-out part of this fold
        score = roc_auc_score(y_val, vote.predict_proba(x_val)[:, 1])
        scores.append(score)
        preds.append(vote.predict_proba(X_test)[:, 1])
        print('Roc for {} split: '.format(fold), score)

    print('final absolute Roc: ', np.mean(scores))
    # Average the per-fold test predictions row by row.
    return np.mean(preds, axis=0)


final_predictions = stratified_vote()
        

In [None]:
# Load the sample submission file; it is filled with predictions and saved below.
output = pd.read_csv("SampleSubmission.csv")

output.head(3)

In [None]:
# Store the averaged fold predictions in the submission's target column.
# Bracket assignment always writes a DataFrame column; attribute-style
# assignment would only set a plain Python attribute if the column were absent.
output["default_status"] = final_predictions

In [None]:
# Write the submission file without the DataFrame index column.
output.to_csv("Submission7.csv", index=False)

In [None]:
# Preview the first rows of the saved submission.
output.head()