In [1]:
#Importing all the library
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix , classification_report

In [2]:
df_encoded = pd.read_csv('df_encoded.csv')

In [3]:
df_true = df_encoded[df_encoded['not.fully.paid'] == 0]
df_false = df_encoded[df_encoded['not.fully.paid'] == 1]

In [4]:
print(df_true.shape)
print(df_false.shape)

(5635, 20)
(924, 20)


In [5]:
#Dividing into features and label 
y, X = df_encoded['not.fully.paid'], df_encoded.drop('not.fully.paid', axis = 'columns')

In [6]:
# Spliting into train data and test data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15, stratify=y)

In [26]:
y_train.value_counts()

0    4508
1     739
Name: not.fully.paid, dtype: int64

In [8]:
# model1 --> class1(739) + class0(0, 739)

# model2 --> class1(739) + class0(740, 1478)

# model3 --> class1(739) + class0(1479, 2217)

# model4 --> class1(739) + class0(2218, 2956)

# model5 --> class1(739) + class0(2957, 3695)

# model6 --> class1(739) + class0(3696, 4434)

In [9]:
x = X_train.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
X_train_scaled = min_max_scaler.fit_transform(x)
df_X_train_scaled = pd.DataFrame(X_train_scaled)

In [10]:
#scaling the test data
x = X_test.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
X_test_scaled = min_max_scaler.fit_transform(x)
df_X_test_scaled = pd.DataFrame(X_test_scaled)

In [27]:
y_train.value_counts()

0    4508
1     739
Name: not.fully.paid, dtype: int64

In [11]:
df3 = df_X_train_scaled.copy()
df3['not_fully_paid'] = y_train

In [12]:
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,not_fully_paid
0,1.0,0.705716,0.541013,0.358999,0.49699,0.243902,0.228367,0.076424,0.504699,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.555399,0.506579,0.45318,0.858194,0.487805,0.395841,0.284403,0.460526,0.25,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.800282,0.614788,0.439329,0.549833,0.170732,0.14209,0.108216,0.864662,0.375,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.460833,0.351749,0.468028,0.112709,0.487805,0.206883,0.069514,0.296053,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.534227,0.102518,0.260121,0.585284,0.317073,0.210604,0.074933,0.81015,0.125,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 

In [13]:
df3_class0 = df3[df3.not_fully_paid==0]
df3_class1 = df3[df3.not_fully_paid==1]

In [25]:
df3_class1.shape

(498, 20)

In [14]:
def get_train_batch(df_majority, df_minority, start, end):
    df_train = pd.concat([df_majority[start:end], df_minority], axis=0)

    X_train = df_train.drop('not_fully_paid', axis='columns')
    y_train = df_train.not_fully_paid
    return X_train, y_train 

In [15]:
X_train1, y_train1 = get_train_batch(df3_class0, df3_class1, 0, 739)
y_train1.value_counts()

0.0    739
1.0    498
Name: not_fully_paid, dtype: int64

In [16]:
X_train1, y_train1 = get_train_batch(df3_class0, df3_class1, 0, 739)
modelLR = LogisticRegression()
modelLR.fit(X_train1, y_train1)
y_pred1 = modelLR.predict(df_X_test_scaled)

In [17]:
X_train2, y_train2 = get_train_batch(df3_class0, df3_class1, 740, 1478)
modelLR = LogisticRegression()
modelLR.fit(X_train2, y_train2)
y_pred2 = modelLR.predict(df_X_test_scaled)

In [18]:
X_train3, y_train3 = get_train_batch(df3_class0, df3_class1, 1479, 2217)
modelLR = LogisticRegression()
modelLR.fit(X_train3, y_train3)
y_pred3 = modelLR.predict(df_X_test_scaled)

In [19]:
X_train4, y_train4 = get_train_batch(df3_class0, df3_class1, 2218, 2956)
modelLR = LogisticRegression()
modelLR.fit(X_train4, y_train4)
y_pred4 = modelLR.predict(df_X_test_scaled)

In [20]:
X_train5, y_train5 = get_train_batch(df3_class0, df3_class1, 2957, 3695)
modelLR = LogisticRegression()
modelLR.fit(X_train5, y_train5)
y_pred5 = modelLR.predict(df_X_test_scaled)

In [21]:
# X_train6, y_train6 = get_train_batch(df3_class0, df3_class1, 3696, 4434)
# modelLR = LogisticRegression()
# modelLR.fit(X_train6, y_train6)
# y_pred6 = modelLR.predict(df_X_test_scaled)

In [22]:
y_pred_final = y_pred1.copy()
for i in range(len(y_pred1)):
    n_ones = y_pred1[i] + y_pred2[i] + y_pred3[i] + y_pred4[i] + y_pred5[i]
    if n_ones>2:
        y_pred_final[i] = 1
    else:
        y_pred_final[i] = 0

In [23]:
cl_rep = classification_report(y_test, y_pred_final)
print("Ensamble_logistic: \n", cl_rep)

Ensamble_logistic: 
               precision    recall  f1-score   support

           0       0.86      1.00      0.92      1127
           1       0.17      0.01      0.01       185

    accuracy                           0.86      1312
   macro avg       0.51      0.50      0.47      1312
weighted avg       0.76      0.86      0.79      1312

