In [18]:
# Required Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,f1_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer,StandardScaler,LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import seaborn as sns

In [19]:
# reading the imputed csv files of both training and test
df = pd.read_csv("data1.csv")
df_test = pd.read_csv("data1_test.csv")
df.head()
df.shape

(83000, 43)

In [20]:
#encoding the categorical variable 'mvar47'
le=LabelEncoder()
df['mvar47']=le.fit_transform(df['mvar47'])
df_test['mvar47'] = le.fit_transform(df_test['mvar47'])

In [21]:
# getting the info about features and their datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83000 entries, 0 to 82999
Data columns (total 43 columns):
Unnamed: 0         83000 non-null int64
application_key    83000 non-null int64
mvar1              83000 non-null float64
mvar2              83000 non-null float64
mvar3              83000 non-null float64
mvar4              83000 non-null float64
mvar5              83000 non-null float64
mvar6              83000 non-null float64
mvar8              83000 non-null float64
mvar9              83000 non-null float64
mvar10             83000 non-null float64
mvar11             83000 non-null float64
mvar12             83000 non-null float64
mvar13             83000 non-null float64
mvar14             83000 non-null float64
mvar15             83000 non-null float64
mvar17             83000 non-null float64
mvar19             83000 non-null float64
mvar21             83000 non-null float64
mvar22             83000 non-null float64
mvar24             83000 non-null float64
mvar25       

In [22]:
# maintaining consistency with the train data
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47000 entries, 0 to 46999
Data columns (total 43 columns):
Unnamed: 0         47000 non-null int64
application_key    47000 non-null int64
mvar1              47000 non-null float64
mvar2              47000 non-null float64
mvar3              47000 non-null float64
mvar4              47000 non-null float64
mvar5              47000 non-null float64
mvar6              47000 non-null float64
mvar8              47000 non-null float64
mvar9              47000 non-null float64
mvar10             47000 non-null float64
mvar11             47000 non-null float64
mvar12             47000 non-null float64
mvar13             47000 non-null float64
mvar14             47000 non-null float64
mvar15             47000 non-null float64
mvar17             47000 non-null float64
mvar19             47000 non-null float64
mvar21             47000 non-null float64
mvar22             47000 non-null float64
mvar24             47000 non-null float64
mvar25       

#### Datatypes of some features are changed as they do not align with the description of the features given with the original dataset.

In [23]:
# to get statistical description of the features in the dataset
df.describe()

Unnamed: 0.1,Unnamed: 0,application_key,mvar1,mvar2,mvar3,mvar4,mvar5,mvar6,mvar8,mvar9,...,mvar39,mvar40,mvar41,mvar42,mvar43,mvar44,mvar45,mvar47,mvar48,default_ind
count,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0,...,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0,83000.0
mean,41499.5,318146.93106,1743.085114,1.023918,5.3723,0.458599,1.077351,1589.356741,5642.634571,29494.529998,...,0.084155,88.94187,72.471249,0.342021,6.494181,0.72773,0.10746,0.372976,556.021277,0.28741
std,23960.180509,97179.489955,95.820622,1.511977,11.062063,1.699125,5.725989,3346.828417,9143.236258,48303.003341,...,0.408517,24.42697,21.40933,0.384921,6.266396,0.244145,0.240348,0.483599,1106.953295,0.452557
min,0.0,230032.0,1477.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.014,0.0,0.0,0.0,0.06066,0.0,0.0,2.0,0.0
25%,20749.75,250781.75,1674.85,0.1351,0.0,0.0,0.0,60.0,357.0,1784.0,...,0.0,71.575765,59.256818,0.0,2.0,0.536545,0.0,0.0,8.0,0.0
50%,41499.5,271531.5,1739.0,0.50995,0.309685,0.0,0.0,375.655,1856.81,9910.0,...,0.0,92.132145,76.10799,0.2,5.0,0.778615,0.0,0.0,17.0,0.0
75%,62249.25,362296.25,1809.0,1.3443,6.536,0.0,0.0,1514.6175,6963.0,36937.25,...,0.0,102.177328,90.494535,0.58824,10.0,0.96205,0.11,1.0,111.0,1.0
max,82999.0,578068.0,1950.0,31.0181,399.334,25.754,165.492,94302.0,291810.0,840658.0,...,21.0,631.36,182.111,2.0,91.0,1.0,7.0,1.0,3247.0,1.0


In [24]:
# dropping the unnecessary columns in both train and test datasets
i_d_test = df_test['application_key']
df = df.drop(['application_key','Unnamed: 0'], axis = 1)
df_test = df_test.drop(['application_key','Unnamed: 0'], axis = 1)
df.columns

Index(['mvar1', 'mvar2', 'mvar3', 'mvar4', 'mvar5', 'mvar6', 'mvar8', 'mvar9',
       'mvar10', 'mvar11', 'mvar12', 'mvar13', 'mvar14', 'mvar15', 'mvar17',
       'mvar19', 'mvar21', 'mvar22', 'mvar24', 'mvar25', 'mvar26', 'mvar28',
       'mvar29', 'mvar30', 'mvar31', 'mvar32', 'mvar33', 'mvar34', 'mvar35',
       'mvar36', 'mvar38', 'mvar39', 'mvar40', 'mvar41', 'mvar42', 'mvar43',
       'mvar44', 'mvar45', 'mvar47', 'mvar48', 'default_ind'],
      dtype='object')

In [25]:
# checking for the class ratio in the dependent variable in 'default_ind'
df.default_ind.value_counts()

0    59145
1    23855
Name: default_ind, dtype: int64

In [26]:
# the final features with the dataframe is shown below
df.head()

Unnamed: 0,mvar1,mvar2,mvar3,mvar4,mvar5,mvar6,mvar8,mvar9,mvar10,mvar11,...,mvar39,mvar40,mvar41,mvar42,mvar43,mvar44,mvar45,mvar47,mvar48,default_ind
0,1696.0,1.6541,0.0,0.0,0.0,0.0,322.0,40369.0,18414.0,693.35,...,1.0,73.78,82.547,0.08696,10.0,0.63899,0.0,0,10,0
1,1846.0,0.8095,0.0,0.0,0.0,102.0,3171.0,18234.0,13664.0,502.49,...,0.0,99.129,61.30928,0.0,13.0,0.63836,0.0,1,732,1
2,1745.0,0.4001,0.0,0.0,0.0,1934.97,2345.03,9390.42,2536.0,1820.46,...,0.0,85.19551,29.29,0.0,1.0,1.0,0.0,0,89,1
3,1739.0,0.2193,0.0,0.0,0.0,1982.0,4955.0,20316.0,37013.0,466.65,...,0.0,96.272,66.03354,0.15385,3.0,0.53241,0.0,1,3,0
4,1787.0,0.0118,0.225,0.0,0.0,5451.0,5494.0,7987.0,4696.0,1737.55,...,0.0,115.019,80.26384,0.0,1.0,0.92665,0.0,1,5,0


#### So, for modelling we are considering all the features because of two reasons-
- one due to the lack of domain knowledge we can't intuitively tell the feature importance and also it seems all the features are important based on the description.
- Performing Hypothesis testing, subset selection methods haven't improved the model much so keeping in mind that removing features or samples leads to loss in information so we decided to go with all the features and removed a few samples and features containing high proportion of missing values.

# Modelling

In modelling we prioritized the importance of default class i.e. 1. So, we aimed at high recall i.e good F1-Score and also good balanced accuracy. So, we fixed an objective function which is the sum of accuracy, balanced accuracy, F1-score which we want to maximize and dsince F1-score metric is more important so we added a weighted F1-score in the objective function.

In [27]:
# Accessing the dependent and independent features and storing them
x = df.drop(['default_ind'], axis=1)
# due to the presence of NAN values in the default_ind feature
xtest = df_test.drop(['default_ind'], axis=1) 
y = df['default_ind']

# Scaling the high magnitude features to reduce model bias towards their magnitude
Std = StandardScaler()
x_scaled = Std.fit_transform(x)
x_sc = pd.DataFrame(x_scaled,columns=df.drop(['default_ind'],axis=1).columns)
# transforming the test data in a similar fashion as train dara
xtest_scaled =  Std.transform(xtest)
xtest_sc = pd.DataFrame(xtest_scaled,columns=df_test.drop(['default_ind'],axis=1).columns)

# Splitting the data into train and validation sets for training and model selection purposes.
X_train, X_test, y_train, y_test = train_test_split(x_sc, y, test_size=0.20, random_state=13)

## XGBoost Classifier

In [29]:
# XGBoost Classifier
model1 = XGBClassifier(max_depth=3,n_estimators=250,scale_pos_weight = 1.9)
model1.fit(X_train,y_train)
y_pred_valid1 = model1.predict(X_test)
y_prob1 = model1.predict_proba(X_test)

ytest_prob1 = model1.predict_proba(xtest_sc)

print('XGBClassifier')
print("Accuracy :", accuracy_score(y_pred_valid1, y_test))
print("f1score : ", f1_score(y_pred_valid1, y_test))
print("Balanced Accuracy :", balanced_accuracy_score(y_pred_valid1, y_test))
# The objective function considered to identify the best model
print("score :", accuracy_score(y_pred_valid1, y_test) + 1.3*f1_score(y_pred_valid1, y_test) + balanced_accuracy_score(y_pred_valid1, y_test))

XGBClassifier
Accuracy : 0.7487349397590362
f1score :  0.5884558460779477
Balanced Accuracy : 0.6987524802393108
score : 2.212480019899679


# AdaBoost Classifier

In [30]:
# AdaBoost Classifier
model2 = AdaBoostClassifier(n_estimators = 100)
model2.fit(X_train,y_train)
y_pred_valid2 = model2.predict(X_test)
y_prob2 = model2.predict_proba(X_test)
ytest_prob2 = model2.predict_proba(xtest_sc)

print('AdaBoostClassifier')
print("Accuracy:", balanced_accuracy_score(y_pred_valid2, y_test))
print("f1score:", f1_score(y_pred_valid2, y_test))
print("Balanced Accuracy:", balanced_accuracy_score(y_pred_valid2, y_test))
print("score:", accuracy_score(y_pred_valid2, y_test)+1.3*f1_score(y_pred_valid2, y_test)+balanced_accuracy_score(y_pred_valid2, y_test))

AdaBoostClassifier
Accuracy: 0.7223638850066874
f1score: 0.5088276387654007
Balanced Accuracy: 0.7223638850066874
score: 2.1508880081727924


# Gradient Boost Classifier

In [31]:
# GradientBoosting classifier
model3 = GradientBoostingClassifier()
model3.fit(X_train,y_train)
y_pred_valid3 = model3.predict(X_test)
y_prob3 = model3.predict_proba(X_test)

ytest_prob3 = model3.predict_proba(xtest_sc)
print('GradientBoostClassifier')
print("Accuracy:", accuracy_score(y_pred_valid3, y_test))
print("f1score:", f1_score(y_pred_valid3, y_test))
print("Balanced Accuracy:", balanced_accuracy_score(y_pred_valid3, y_test))
print("score:", accuracy_score(y_pred_valid3, y_test)+1.3*f1_score(y_pred_valid3, y_test)+balanced_accuracy_score(y_pred_valid3, y_test))

GradientBoostClassifier
Accuracy: 0.7703012048192771
f1score: 0.5133375877472879
Balanced Accuracy: 0.7283405498665467
score: 2.165980618757298


# LightGradientBoost Classifier - 1

In [32]:
# Light Gradient Boosting Classifier - 1
model6 = LGBMClassifier(boosting_type='dart',n_estimators=110,class_weight = {0:1,1:2})
model6.fit(X_train,y_train)
y_pred_valid6 = model6.predict(X_test)
y_prob6 = model6.predict_proba(X_test)
ytest_prob6 = model6.predict_proba(xtest_sc)

print('LGBClassifier- 1')
print("Accuracy:", accuracy_score(y_pred_valid6, y_test))
print("f1score:", f1_score(y_pred_valid6, y_test))
print("Balanced Accuracy:", balanced_accuracy_score(y_pred_valid6, y_test))
print("score:", accuracy_score(y_pred_valid6, y_test)+1.3*f1_score(y_pred_valid6, y_test)+balanced_accuracy_score(y_pred_valid6, y_test))

LGBClassifier- 1
Accuracy: 0.743855421686747
f1score: 0.5902871458855272
Balanced Accuracy: 0.6957418574244287
score: 2.2069705687623613


# Random Forest Classifier

In [33]:
# Random Forest Classifier
model4 = RandomForestClassifier(n_estimators = 150, class_weight = {0:1,1:2}, max_depth = 12, min_samples_leaf = 20)
model4.fit(X_train,y_train)
y_pred_valid4 = model4.predict(X_test)
y_prob4 = model4.predict_proba(X_test)
ytest_prob4 = model4.predict_proba(xtest_sc)

print('Random Forest Classifier')
print("Accuracy:", accuracy_score(y_pred_valid4, y_test))
print("f1score:", f1_score(y_pred_valid4, y_test))
print("Balanced Accuracy:", balanced_accuracy_score(y_pred_valid4, y_test))
print("score:", accuracy_score(y_pred_valid4, y_test)+1.3*f1_score(y_pred_valid4, y_test)+balanced_accuracy_score(y_pred_valid4, y_test))

Random Forest Classifier
Accuracy: 0.7509036144578313
f1score: 0.5852141639081152
Balanced Accuracy: 0.6997630972613426
score: 2.2114451247997238


# LightGradientBoost Classifier- 2

In [34]:
#Light Gradient Boosting - 2
model5 = LGBMClassifier(boosting_type='gbdt',n_estimators=150,class_weight = {0:1,1:2})
model5.fit(X_train,y_train)
y_pred_valid5 = model5.predict(X_test)
y_prob5 = model5.predict_proba(X_test)
ytest_prob5 = model5.predict_proba(xtest_sc)

print('LGBClassifier- 2')
print("Accuracy:", accuracy_score(y_pred_valid5, y_test))
print("f1score:", f1_score(y_pred_valid5, y_test))
print("Balanced Accuracy:", balanced_accuracy_score(y_pred_valid5, y_test))
print("score:", accuracy_score(y_pred_valid5, y_test)+1.3*f1_score(y_pred_valid5, y_test)+balanced_accuracy_score(y_pred_valid5, y_test))

LGBClassifier- 2
Accuracy: 0.7448192771084338
f1score: 0.5860855970295096
Balanced Accuracy: 0.6952747306924997
score: 2.202005283939296


# Stacking/ Ensembling

In [85]:
# Stacking of the above five models to enhance performance or improve the objective function value
# A manual gridsearch of weights to be assigned to each model so as to get the maximum score is what the followiwng code performs...
# This is done on the validation set and the optimal weights are then used for prediction on the test set.
j1=[]
k1=[]
l1=[]
m1=[]
n1=[]
o1=[]
accu=[]
f1_s = []
baccu = []
final_score = []
for j in np.arange(0,2,0.4):
    for k in np.arange(0,2,0.4):
        for l in np.arange(0,2,0.4):
            for m in np.arange(0,2,0.4):
                for n in np.arange(0,2,0.4):
                    for o in np.arange(0,2,0.4):
                        f = y_prob1*j + y_prob2*k + l*y_prob3 + m*y_prob4 + n*y_prob5 + o*y_prob6
                        y_pred_val = np.argmax(f,axis=1)
                        acc = accuracy_score(y_test, y_pred_val)
                        f1 = f1_score(y_test, y_pred_val)
                        bal_acc = balanced_accuracy_score(y_test, y_pred_val)
                        f_s = acc + 1.3 * f1 + bal_acc
                        j1.append(j)
                        k1.append(k)
                        l1.append(l)
                        m1.append(m)
                        n1.append(n)
                        o1.append(o)
                        accu.append(acc)
                        f1_s.append(f1)
                        baccu.append(bal_acc)
                        final_score.append(f_s)
            
Result_df = pd.DataFrame({'j':j1, 'k':k1,'l':l1, 'm':m1, 'n':n1, 'o': o1, 'Accuracy':accu, 'f1-score':f1_s, 'Bal_Accuracy':baccu, 'Final Score':final_score})
Result_df = Result_df.sort_values(by='Final Score', axis=0, ascending=False)
Result_df

  'precision', 'predicted', average, warn_for)


Unnamed: 0,j,k,l,m,n,o,Accuracy,f1-score,Bal_Accuracy,Final Score
10702,1.2,0.8,0.0,1.2,0.0,0.8,0.750482,0.591035,0.712892,2.231719
11327,1.2,1.2,0.0,1.2,0.0,0.8,0.750482,0.590954,0.712830,2.231552
6303,0.8,0.0,0.0,0.8,0.0,1.2,0.749217,0.591342,0.713178,2.231139
10077,1.2,0.4,0.0,1.2,0.0,0.8,0.750241,0.590801,0.712723,2.231005
6928,0.8,0.4,0.0,0.8,0.0,1.2,0.749217,0.591181,0.713054,2.230806
...,...,...,...,...,...,...,...,...,...,...
2500,0.0,1.6,0.0,0.0,0.0,0.0,0.767048,0.508828,0.663317,2.091841
625,0.0,0.4,0.0,0.0,0.0,0.0,0.767048,0.508828,0.663317,2.091841
1250,0.0,0.8,0.0,0.0,0.0,0.0,0.767048,0.508828,0.663317,2.091841
1875,0.0,1.2,0.0,0.0,0.0,0.0,0.767048,0.508828,0.663317,2.091841


In [None]:
# obtaining the predictions from the optimal weights obtained above....
f = ytest_prob1*1.6 + ytest_prob2*0.0 + 0.0*ytest_prob3 + 1.2*ytest_prob4 + 0.4*ytest_prob5 + 2*ytest_prob6
y_pred_val_test_org = np.argmax(f,axis=1)

In [None]:
# storing the submission  as .csv file
i_d_test.values.reshape(-1,1)
y_pred_val_test_org.reshape(-1,1).shape
submission = np.concatenate((i_d_test.values.reshape(-1,1), y_pred_val_test_org.reshape(-1,1)), axis = 1)
np.savetxt("N.csv", submission, delimiter=",")
print('processed')