In [2]:
import pandas as pd

train = pd.read_csv("Datasets/filtered_data_sk.csv")
test = pd.read_csv("Datasets/final_test_data.csv")

In [3]:
train.head()

Unnamed: 0,No_of_prevloans,Total_amount_due,Average_termdays,loannumber,good_bad_flag,age,bank_account_type_Current,loan_approval_time,loan_firstPayment_time
0,1.0,11500.0,15.0,2.0,1,50.0,0,1.018611,15.0
1,1.0,13000.0,30.0,2.0,0,38.0,0,1.018889,32.0
2,3.0,36000.0,20.0,4.0,1,37.0,0,1.001944,32.0
3,1.0,11500.0,15.0,2.0,1,32.0,0,1.018889,33.0
4,5.0,88000.0,30.0,6.0,1,33.0,0,1.018889,24.0


In [4]:
test['bank_account_type_Current'] = test['bank_account_type_Current'].astype(int)

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3264 entries, 0 to 3263
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   No_of_prevloans            3264 non-null   float64
 1   Total_amount_due           3264 non-null   float64
 2   Average_termdays           3264 non-null   float64
 3   loannumber                 3264 non-null   float64
 4   good_bad_flag              3264 non-null   int64  
 5   age                        3264 non-null   float64
 6   bank_account_type_Current  3264 non-null   int64  
 7   loan_approval_time         3264 non-null   float64
 8   loan_firstPayment_time     3264 non-null   float64
dtypes: float64(7), int64(2)
memory usage: 229.6 KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   No_of_prevloans            384 non-null    float64
 1   Total_amount_due           384 non-null    float64
 2   Average_termdays           384 non-null    float64
 3   loannumber                 384 non-null    float64
 4   age                        384 non-null    float64
 5   bank_account_type_Current  384 non-null    int32  
 6   loan_approval_time         384 non-null    float64
 7   loan_firstPayment_time     384 non-null    float64
dtypes: float64(7), int32(1)
memory usage: 22.6 KB


Both datasets do not have any null values since both were already cleaned and only the most relevant features are included in both since feature selection was already done  as in the "training by simon" notebook

In [7]:
# we shall be evaluating the models - LogisticRegression, KNeighborsClassifier, DecisionTreeClassifier, SVC, GaussianNB, RandomForestClassifier and GradientBoostingClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [8]:
x_train = train.drop(columns=['good_bad_flag'])
y_train = train['good_bad_flag']

In [9]:
x_train.head()

Unnamed: 0,No_of_prevloans,Total_amount_due,Average_termdays,loannumber,age,bank_account_type_Current,loan_approval_time,loan_firstPayment_time
0,1.0,11500.0,15.0,2.0,50.0,0,1.018611,15.0
1,1.0,13000.0,30.0,2.0,38.0,0,1.018889,32.0
2,3.0,36000.0,20.0,4.0,37.0,0,1.001944,32.0
3,1.0,11500.0,15.0,2.0,32.0,0,1.018889,33.0
4,5.0,88000.0,30.0,6.0,33.0,0,1.018889,24.0


In [10]:
y_train

0       1
1       0
2       1
3       1
4       1
       ..
3259    0
3260    1
3261    1
3262    1
3263    1
Name: good_bad_flag, Length: 3264, dtype: int64

In [11]:
x_test = test

In [12]:
# Standardize the features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [13]:
# Classification Algorithms
models = [] # list to contain the models
models.append(('LogisticR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('SVM', SVC()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))
models.append(('GB', GradientBoostingClassifier()))

In [14]:
# evaluation of each model
results = []
names = []
for name, model in models:
    cv_results = cross_val_score(model, x_train, y_train, cv=10, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print(f"{name}: {cv_results.mean()} ({cv_results.std()})")


LogisticR: 0.7861512917206056 (0.004730069565549709)
KNN: 0.7555064632933717 (0.017002822872004924)
CART: 0.6798324609294385 (0.020555273982730715)
SVM: 0.7846203635954299 (0.005464804479577677)
NB: 0.7362507270032458 (0.06094162290373558)
RF: 0.7647061030749892 (0.011514489603893072)
GB: 0.7815575692763738 (0.012002111007974296)


Logistic Regression performs best with the least standard deviation and highest accuracy score.

In [15]:
# prediction using Logistic Regression

predictions = {}
for name, model in models:
    if name == 'LogisticR':
      model.fit(x_train, y_train)
      y_pred = model.predict(x_test)
      predictions['good_bad_flag'] = y_pred

In [16]:
# Save predictions
predictions_df = pd.DataFrame(predictions)
predictions_df.head()

Unnamed: 0,good_bad_flag
0,1
1,1
2,1
3,1
4,1


In [17]:
original_test_data = pd.read_csv('Datasets/final_test_data_temp.csv')
original_test_data.head()

Unnamed: 0,customerid,No_of_prevloans,Total_amount_due,Average_termdays,loannumber,age,bank_account_type_Current,loan_approval_time,loan_firstPayment_time
0,8a858f305c8dd672015c93b1db645db4,1.0,13000.0,30.0,2.0,47.0,False,1.018611,31.0
1,8a858fde56eb02280156eb6dafc128ac,13.0,412800.0,23.076923,14.0,42.0,False,1.001944,29.0
2,8a858e695775665c015779a1a5cc1192,7.0,89500.0,27.857143,8.0,35.0,False,1.001944,32.0
3,8a858fcf5b39c3ba015b3d9f215c3922,2.0,26000.0,30.0,3.0,45.0,False,1.001667,39.0
4,8a858edc5ceea2e6015ceea77c5c0300,1.0,13000.0,30.0,2.0,36.0,False,1.018889,13.0


In [18]:
print(original_test_data.shape, predictions_df.shape, test.shape)

(384, 9) (384, 1) (384, 8)


In [19]:
customerid = original_test_data[['customerid']]

In [20]:
customerid.head()

Unnamed: 0,customerid
0,8a858f305c8dd672015c93b1db645db4
1,8a858fde56eb02280156eb6dafc128ac
2,8a858e695775665c015779a1a5cc1192
3,8a858fcf5b39c3ba015b3d9f215c3922
4,8a858edc5ceea2e6015ceea77c5c0300


In [21]:
# merge the test and results data frames
submission_file = pd.concat([ customerid,predictions_df], axis=1)
submission_file.to_csv('Datasets/submission.csv',index=False)

In [22]:
import pandas as pd
sample_submission =pd.read_csv('Datasets/SampleSubmission.csv')
submission_file=pd.read_csv('Datasets/submission.csv')
test_submission=pd.concat([sample_submission,submission_file],axis=0)
test_submission.head()


Unnamed: 0,customerid,Good_Bad_flag,good_bad_flag
0,8a28afc7474813a40147639ec637156b,1.0,
1,8a3735d5518aba7301518ac34413010d,1.0,
2,8a76e7d443e6e97c0143ed099d102b1d,1.0,
3,8a818823525dceef01525deda2480384,1.0,
4,8a818926522ea5ef01523aff15c37482,1.0,


In [23]:
final_test =test_submission.drop('Good_Bad_flag',axis=1)


In [24]:
final_test.to_csv('Datasets/final_submission.csv',index=False)

In [25]:
# Imputing the NaN values with the modes
mode = final_test['good_bad_flag'].mode()[0]
mode

1.0

In [26]:
final_test['good_bad_flag'].fillna(mode, inplace=True)

In [27]:
final_test.to_csv('Datasets/final_submission_version_2.csv',index=False)