In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

In [3]:
# Loading data
df_train = pd.read_csv('home_credit_train_engineered.csv')

df_train.dropna(inplace=True)


In [4]:
# Obtaining features
features = [f for f in df_train.columns if f not in ['Unnamed: 0','TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]


In [5]:
# Getting train test splits and scaling data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


x_train, x_test, y_train, y_test = train_test_split(
df_train[features], df_train['TARGET'], test_size=0.33, random_state=42)

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [19]:
# Getting undersampled data
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42, replacement=True)# fit predictor and target variable
x_u, y_u = rus.fit_resample(x_train, y_train)

print('Original train set shape:', len(x_train))
print('Resample train set shape :', len(x_r))


Original train set shape: 6951
Resample train set shape : 1460


In [20]:
# Getting oversampled data

from imblearn.over_sampling import SMOTE 

sm = SMOTE(random_state=42)

x_o, y_o = sm.fit_resample(x_train, y_train)

print('Original train set shape:', len(x_train))
print('Resample train set shape :', len(x_o))



# print('\nBalance of positive and negative classes (%):')
# y_sm.value_counts(normalize=True) * 100

Original train set shape: 6951
Resample train set shape : 12442


In [21]:
# Experimenting with classifiers

In [22]:
import joblib

In [40]:
def normal_classifier(clf,x_train = x_train,y_train = y_train,x_test = x_test,y_test = y_test):
    
    print("Classification on data from dataset\n")
    clf.fit(x_train,y_train)
    print("\n\nTraining report")
    train_report = classification_report(y_train,clf.predict(x_train))
    print(train_report)
    print("\n\nTesting report")
    test_report = classification_report(y_test,clf.predict(x_test))
    print(test_report)


In [41]:
def undersampled_classifier(clf,x_u = x_u,y_u = y_u,x_test = x_test,y_test = y_test):
    print("Classification on undersampled data\n")
    clf.fit(x_u,y_u)
    print("\n\nTraining report")
    train_report = classification_report(y_u,clf.predict(x_u))
    print(train_report)
    print("\n\nTesting report")
    test_report = classification_report(y_test,clf.predict(x_test))
    print(test_report)


In [42]:
def oversampled_classifier(clf,x_o=x_o,y_=y_o,x_test=x_test,y_test=y_test):
    print("\n\n For Oversampled data\n")
    clf.fit(x_o,y_o)
    print("\n\nTraining report")
    train_report = classification_report(y_o,clf.predict(x_o))
    print(train_report)
    print("\n\nTesting report")
    test_report = classification_report(y_test,clf.predict(x_test))
    print(test_report)



In [45]:
# LightGBM classifier
import lightgbm as ltb
model = ltb.LGBMClassifier()

In [46]:
normal_classifier(model)

Classification on data from dataset



Training report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      6221
         1.0       1.00      0.97      0.98       730

    accuracy                           1.00      6951
   macro avg       1.00      0.98      0.99      6951
weighted avg       1.00      1.00      1.00      6951



Testing report
              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95      3094
         1.0       0.42      0.05      0.10       331

    accuracy                           0.90      3425
   macro avg       0.66      0.52      0.52      3425
weighted avg       0.86      0.90      0.87      3425



In [47]:
undersampled_classifier(model)

Classification on undersampled data



Training report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       730
         1.0       1.00      1.00      1.00       730

    accuracy                           1.00      1460
   macro avg       1.00      1.00      1.00      1460
weighted avg       1.00      1.00      1.00      1460



Testing report
              precision    recall  f1-score   support

         0.0       0.95      0.64      0.76      3094
         1.0       0.17      0.69      0.27       331

    accuracy                           0.64      3425
   macro avg       0.56      0.67      0.52      3425
weighted avg       0.88      0.64      0.72      3425



In [48]:
oversampled_classifier(model)



 For Oversampled data



Training report
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      6221
         1.0       1.00      0.98      0.99      6221

    accuracy                           0.99     12442
   macro avg       0.99      0.99      0.99     12442
weighted avg       0.99      0.99      0.99     12442



Testing report
              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95      3094
         1.0       0.39      0.08      0.14       331

    accuracy                           0.90      3425
   macro avg       0.65      0.54      0.54      3425
weighted avg       0.86      0.90      0.87      3425



In [49]:
# SVM

In [50]:
from sklearn.svm import SVC
model = SVC(gamma='auto')

In [51]:
normal_classifier(model)

Classification on data from dataset



Training report
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95      6221
         1.0       1.00      0.02      0.03       730

    accuracy                           0.90      6951
   macro avg       0.95      0.51      0.49      6951
weighted avg       0.91      0.90      0.85      6951



Testing report
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95      3094
         1.0       0.00      0.00      0.00       331

    accuracy                           0.90      3425
   macro avg       0.45      0.50      0.47      3425
weighted avg       0.82      0.90      0.86      3425

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [52]:
undersampled_classifier(model)

Classification on undersampled data



Training report
              precision    recall  f1-score   support

         0.0       0.84      0.82      0.83       730
         1.0       0.83      0.85      0.84       730

    accuracy                           0.83      1460
   macro avg       0.84      0.83      0.83      1460
weighted avg       0.84      0.83      0.83      1460



Testing report
              precision    recall  f1-score   support

         0.0       0.95      0.63      0.76      3094
         1.0       0.17      0.72      0.28       331

    accuracy                           0.64      3425
   macro avg       0.56      0.67      0.52      3425
weighted avg       0.88      0.64      0.71      3425



In [53]:
oversampled_classifier(model)



 For Oversampled data



Training report
              precision    recall  f1-score   support

         0.0       0.97      0.92      0.94      6221
         1.0       0.92      0.97      0.95      6221

    accuracy                           0.94     12442
   macro avg       0.95      0.94      0.94     12442
weighted avg       0.95      0.94      0.94     12442



Testing report
              precision    recall  f1-score   support

         0.0       0.92      0.86      0.89      3094
         1.0       0.21      0.34      0.26       331

    accuracy                           0.81      3425
   macro avg       0.57      0.60      0.57      3425
weighted avg       0.85      0.81      0.83      3425



In [54]:
# Decision trees

In [55]:
from sklearn import tree
model = tree.DecisionTreeClassifier()

In [56]:
normal_classifier(model)

Classification on data from dataset



Training report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      6221
         1.0       1.00      1.00      1.00       730

    accuracy                           1.00      6951
   macro avg       1.00      1.00      1.00      6951
weighted avg       1.00      1.00      1.00      6951



Testing report
              precision    recall  f1-score   support

         0.0       0.91      0.89      0.90      3094
         1.0       0.15      0.18      0.16       331

    accuracy                           0.82      3425
   macro avg       0.53      0.54      0.53      3425
weighted avg       0.84      0.82      0.83      3425



In [57]:
undersampled_classifier(model)

Classification on undersampled data



Training report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       730
         1.0       1.00      1.00      1.00       730

    accuracy                           1.00      1460
   macro avg       1.00      1.00      1.00      1460
weighted avg       1.00      1.00      1.00      1460



Testing report
              precision    recall  f1-score   support

         0.0       0.93      0.54      0.69      3094
         1.0       0.13      0.61      0.21       331

    accuracy                           0.55      3425
   macro avg       0.53      0.58      0.45      3425
weighted avg       0.85      0.55      0.64      3425



In [58]:
oversampled_classifier(model)



 For Oversampled data



Training report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      6221
         1.0       1.00      1.00      1.00      6221

    accuracy                           1.00     12442
   macro avg       1.00      1.00      1.00     12442
weighted avg       1.00      1.00      1.00     12442



Testing report
              precision    recall  f1-score   support

         0.0       0.91      0.82      0.87      3094
         1.0       0.15      0.28      0.19       331

    accuracy                           0.77      3425
   macro avg       0.53      0.55      0.53      3425
weighted avg       0.84      0.77      0.80      3425



In [59]:
# Neural Networks

In [97]:
import tensorflow as tf
# Without sampling
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=x_train[0].shape),
    tf.keras.layers.Dense(100,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(50,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])
model.compile(metrics = 'accuracy',optimizer='adam',loss='binary_crossentropy')

In [98]:
history = model.fit(x_train,y_train,validation_data=(x_test,y_test),epochs=10,verbose = 4)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [99]:
c = classification_report(y_train,model.predict_classes(x_train))
print("\n\nTraining\n",c)
c = classification_report(y_test,model.predict_classes(x_test))
print("Testing\n",c)



Training
               precision    recall  f1-score   support

         0.0       0.94      1.00      0.97      6221
         1.0       0.95      0.45      0.61       730

    accuracy                           0.94      6951
   macro avg       0.94      0.72      0.79      6951
weighted avg       0.94      0.94      0.93      6951

Testing
               precision    recall  f1-score   support

         0.0       0.91      0.98      0.94      3094
         1.0       0.33      0.09      0.14       331

    accuracy                           0.89      3425
   macro avg       0.62      0.53      0.54      3425
weighted avg       0.85      0.89      0.87      3425



In [100]:
# With undersampling

model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=x_u[0].shape),
    tf.keras.layers.Dense(100,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(50,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])


model.compile(metrics = 'accuracy',optimizer='adam',loss='binary_crossentropy')
model.fit(x_u,y_u,validation_data=(x_test,y_test),epochs=10,verbose = 5)
 



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1ff1cb958c8>

In [101]:
c = classification_report(y_u,model.predict_classes(x_u))
print("Training\n",c)
c = classification_report(y_test,model.predict_classes(x_test))
print("Testing\n",c)

Training
               precision    recall  f1-score   support

         0.0       0.92      0.96      0.94       730
         1.0       0.96      0.91      0.94       730

    accuracy                           0.94      1460
   macro avg       0.94      0.94      0.94      1460
weighted avg       0.94      0.94      0.94      1460

Testing
               precision    recall  f1-score   support

         0.0       0.94      0.67      0.78      3094
         1.0       0.17      0.63      0.26       331

    accuracy                           0.66      3425
   macro avg       0.56      0.65      0.52      3425
weighted avg       0.87      0.66      0.73      3425



In [103]:
# With oversampling

model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=x_o[0].shape),
    tf.keras.layers.Dense(100,activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(50,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])


model.compile(metrics = 'accuracy',optimizer='adam',loss='binary_crossentropy')
model.fit(x_o,y_o,validation_data=(x_test,y_test),epochs=10,verbose = 5)
 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1ff1cf1d708>

In [104]:
c = classification_report(y_o,model.predict_classes(x_o))
print("Training\n",c)
c = classification_report(y_test,model.predict_classes(x_test))
print("Testing\n",c)

Training
               precision    recall  f1-score   support

         0.0       1.00      0.99      1.00      6221
         1.0       0.99      1.00      1.00      6221

    accuracy                           1.00     12442
   macro avg       1.00      1.00      1.00     12442
weighted avg       1.00      1.00      1.00     12442

Testing
               precision    recall  f1-score   support

         0.0       0.92      0.92      0.92      3094
         1.0       0.24      0.24      0.24       331

    accuracy                           0.85      3425
   macro avg       0.58      0.58      0.58      3425
weighted avg       0.85      0.85      0.85      3425



In [106]:
# Logistic regression

In [108]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [109]:
normal_classifier(model)

Classification on data from dataset



Training report
              precision    recall  f1-score   support

         0.0       0.90      0.99      0.95      6221
         1.0       0.62      0.11      0.18       730

    accuracy                           0.90      6951
   macro avg       0.76      0.55      0.56      6951
weighted avg       0.87      0.90      0.87      6951



Testing report
              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95      3094
         1.0       0.40      0.08      0.13       331

    accuracy                           0.90      3425
   macro avg       0.65      0.53      0.54      3425
weighted avg       0.86      0.90      0.87      3425

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit

In [110]:
undersampled_classifier(model)

Classification on undersampled data



Training report
              precision    recall  f1-score   support

         0.0       0.75      0.77      0.76       730
         1.0       0.76      0.75      0.76       730

    accuracy                           0.76      1460
   macro avg       0.76      0.76      0.76      1460
weighted avg       0.76      0.76      0.76      1460



Testing report
              precision    recall  f1-score   support

         0.0       0.95      0.64      0.76      3094
         1.0       0.17      0.68      0.27       331

    accuracy                           0.64      3425
   macro avg       0.56      0.66      0.51      3425
weighted avg       0.87      0.64      0.71      3425

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit

In [111]:
oversampled_classifier(model)



 For Oversampled data



Training report
              precision    recall  f1-score   support

         0.0       0.78      0.74      0.76      6221
         1.0       0.75      0.79      0.77      6221

    accuracy                           0.76     12442
   macro avg       0.77      0.76      0.76     12442
weighted avg       0.77      0.76      0.76     12442



Testing report
              precision    recall  f1-score   support

         0.0       0.95      0.70      0.81      3094
         1.0       0.18      0.62      0.28       331

    accuracy                           0.69      3425
   macro avg       0.56      0.66      0.54      3425
weighted avg       0.87      0.69      0.75      3425

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/s

In [112]:
# Random Forest

In [113]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [114]:
normal_classifier(model)

Classification on data from dataset



Training report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      6221
         1.0       1.00      1.00      1.00       730

    accuracy                           1.00      6951
   macro avg       1.00      1.00      1.00      6951
weighted avg       1.00      1.00      1.00      6951



Testing report
              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95      3094
         1.0       0.00      0.00      0.00       331

    accuracy                           0.90      3425
   macro avg       0.45      0.50      0.47      3425
weighted avg       0.82      0.90      0.86      3425

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [115]:
undersampled_classifier(model)

Classification on undersampled data



Training report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       730
         1.0       1.00      1.00      1.00       730

    accuracy                           1.00      1460
   macro avg       1.00      1.00      1.00      1460
weighted avg       1.00      1.00      1.00      1460



Testing report
              precision    recall  f1-score   support

         0.0       0.95      0.64      0.77      3094
         1.0       0.17      0.70      0.28       331

    accuracy                           0.65      3425
   macro avg       0.56      0.67      0.52      3425
weighted avg       0.88      0.65      0.72      3425



In [116]:
oversampled_classifier(model)



 For Oversampled data



Training report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      6221
         1.0       1.00      1.00      1.00      6221

    accuracy                           1.00     12442
   macro avg       1.00      1.00      1.00     12442
weighted avg       1.00      1.00      1.00     12442



Testing report
              precision    recall  f1-score   support

         0.0       0.91      0.98      0.94      3094
         1.0       0.31      0.08      0.12       331

    accuracy                           0.89      3425
   macro avg       0.61      0.53      0.53      3425
weighted avg       0.85      0.89      0.86      3425

