In [79]:
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [80]:
# Read the training data, test data, and solution CSV files
path = "./datasets_kaggle/loan-10k/"
train_data = pd.read_csv("%sloan-10k.lrn.csv" % path)
competition_data = pd.read_csv("%sloan-10k.tes.csv" % path)
#solution = pd.read_csv("%sloan-10k.tes.csv" % path)

train_data

Unnamed: 0,ID,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,home_ownership,annual_inc,...,debt_settlement_flag,issue_d_month,issue_d_year,earliest_cr_line_month,earliest_cr_line_year,last_pymnt_d_month,last_pymnt_d_year,last_credit_pull_d_month,last_credit_pull_d_year,grade
0,24341,12500.0,12500.0,12500.0,36 months,7.21,387.17,< 1 year,MORTGAGE,81000.0,...,N,6,2018,6,2000,2,2019,2,2019,A
1,67534,33850.0,33850.0,33775.0,60 months,20.99,915.57,1 year,MORTGAGE,80000.0,...,N,10,2015,9,1984,2,2019,2,2019,E
2,35080,10000.0,10000.0,10000.0,60 months,20.00,264.94,< 1 year,RENT,36580.0,...,N,9,2017,10,2006,1,2018,11,2018,D
3,4828,20250.0,20250.0,20250.0,36 months,14.31,695.15,9 years,RENT,48700.0,...,N,0,2015,6,1996,6,2016,9,2017,C
4,59259,25000.0,25000.0,25000.0,36 months,14.99,866.52,1 year,MORTGAGE,85000.0,...,N,11,2016,0,2002,2,2019,2,2019,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,6644,20000.0,20000.0,20000.0,60 months,16.02,486.58,5 years,OWN,54000.0,...,N,8,2017,11,1991,2,2019,2,2019,C
9996,25910,7500.0,7500.0,7500.0,36 months,9.49,240.22,10+ years,RENT,46386.0,...,N,10,2014,4,2004,3,2017,6,2018,B
9997,95698,20750.0,20750.0,20750.0,60 months,15.05,494.19,< 1 year,RENT,185000.0,...,N,4,2017,2,2004,2,2019,2,2019,C
9998,27371,11000.0,11000.0,11000.0,60 months,16.29,269.20,1 year,RENT,38500.0,...,N,9,2014,8,2006,3,2016,10,2016,D


In [81]:
for column in train_data.columns:
    # check if column is numeric
    if np.issubdtype(train_data[column].dtype, np.number):
        min_value = train_data[column].min()
        max_value = train_data[column].max()
        #print(f"{column}: min = {min_value}, max = {max_value}")
    else:
        # if column is non-numeric, keep it as it is
        unique_values = train_data[column].apply(lambda x: x.decode() if isinstance(x, bytes) else x).unique()
        print(column + ": " + str(unique_values))

term: [' 36 months' ' 60 months']
emp_length: ['< 1 year' '1 year' '9 years' '10+ years' '3 years' '4 years' '7 years'
 '2 years' '5 years' '6 years' '8 years']
home_ownership: ['MORTGAGE' 'RENT' 'OWN' 'ANY' 'OTHER']
verification_status: ['Not Verified' 'Source Verified' 'Verified']
loan_status: ['Current' 'Fully Paid' 'Charged Off' 'Late (31-120 days)'
 'In Grace Period' 'Late (16-30 days)']
pymnt_plan: ['n' 'y']
purpose: ['debt_consolidation' 'car' 'credit_card' 'other' 'major_purchase'
 'home_improvement' 'small_business' 'medical' 'vacation' 'moving' 'house'
 'renewable_energy' 'wedding']
addr_state: ['IL' 'OK' 'AZ' 'CA' 'FL' 'TX' 'PA' 'MA' 'NC' 'NV' 'SC' 'GA' 'NY' 'OR'
 'AL' 'VT' 'MD' 'KY' 'CO' 'IN' 'LA' 'MN' 'MI' 'WA' 'NJ' 'VA' 'WI' 'OH'
 'TN' 'CT' 'DC' 'HI' 'MO' 'DE' 'AR' 'KS' 'ME' 'MS' 'NM' 'AK' 'UT' 'WV'
 'MT' 'NH' 'SD' 'ID' 'NE' 'WY' 'ND' 'RI']
initial_list_status: ['w' 'f']
application_type: ['Individual' 'Joint App']
hardship_flag: ['N' 'Y']
disbursement_method: ['Cash' 'Di

In [82]:
feature_mappings = {    # all mappings for ordinal features
    'term': {' 36 months': 0, ' 60 months': 1},
    'grade': {'A': 6, 'E': 5, 'D': 4, 'C': 3, 'B': 2, 'G': 1, 'F': 0},
    'emp_length': {'10+ years': 10, '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4, '5 years': 5, '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9},
}


one_hot_encoding_limit = 5

label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder()

for feature in train_data.columns:
    print(feature)
    if train_data[feature].dtype == 'object':  # Check if the feature contains categorical data
        unique_values = train_data[feature].nunique()  # Count the number of unique values in the feature

        # Drop any feature which only has 1 distinct value (adds no information)
        if unique_values == 1:
            train_data.drop(feature)
            continue

        # If feature mapping is defined, use ordinal encoder
        if feature in feature_mappings:
            mapping = feature_mappings[feature]
            train_data[feature] = train_data[feature].map(mapping)
        else: 
            if unique_values <= one_hot_encoding_limit:
                encoded_features = one_hot_encoder.fit_transform(train_data[[feature]])
                encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=one_hot_encoder.get_feature_names_out([feature]))
                train_data = pd.concat([train_data, encoded_features_df], axis=1)
                train_data.drop([feature], axis=1, inplace=True)
            else:
                train_data[feature] = label_encoder.fit_transform(train_data[feature])
                

train_data

ID
loan_amnt
funded_amnt
funded_amnt_inv
term
int_rate
installment
emp_length
home_ownership
annual_inc
verification_status
loan_status
pymnt_plan
purpose
addr_state
dti
delinq_2yrs
fico_range_low
fico_range_high
inq_last_6mths
open_acc
pub_rec
revol_bal
revol_util
total_acc
initial_list_status
out_prncp
out_prncp_inv
total_pymnt
total_pymnt_inv
total_rec_prncp
total_rec_int
total_rec_late_fee
recoveries
collection_recovery_fee
last_pymnt_amnt
last_fico_range_high
last_fico_range_low
collections_12_mths_ex_med
policy_code
application_type
acc_now_delinq
tot_coll_amt
tot_cur_bal
total_rev_hi_lim
acc_open_past_24mths
avg_cur_bal
bc_open_to_buy
bc_util
chargeoff_within_12_mths
delinq_amnt
mo_sin_old_il_acct
mo_sin_old_rev_tl_op
mo_sin_rcnt_rev_tl_op
mo_sin_rcnt_tl
mort_acc
mths_since_recent_bc
mths_since_recent_inq
num_accts_ever_120_pd
num_actv_bc_tl
num_actv_rev_tl
num_bc_sats
num_bc_tl
num_il_tl
num_op_rev_tl
num_rev_accts
num_rev_tl_bal_gt_0
num_sats
num_tl_120dpd_2m
num_tl_30dpd
num_

Unnamed: 0,ID,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,loan_status,...,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,disbursement_method_Cash,disbursement_method_DirectPay,debt_settlement_flag_N,debt_settlement_flag_Y
0,24341,12500.0,12500.0,12500.0,0,7.21,387.17,0,81000.0,1,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,67534,33850.0,33850.0,33775.0,1,20.99,915.57,1,80000.0,1,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,35080,10000.0,10000.0,10000.0,1,20.00,264.94,0,36580.0,2,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,4828,20250.0,20250.0,20250.0,0,14.31,695.15,9,48700.0,0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
4,59259,25000.0,25000.0,25000.0,0,14.99,866.52,1,85000.0,1,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,6644,20000.0,20000.0,20000.0,1,16.02,486.58,5,54000.0,1,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
9996,25910,7500.0,7500.0,7500.0,0,9.49,240.22,10,46386.0,0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
9997,95698,20750.0,20750.0,20750.0,1,15.05,494.19,0,185000.0,1,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
9998,27371,11000.0,11000.0,11000.0,1,16.29,269.20,1,38500.0,0,...,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [83]:
# split dataframe to features and target
X = train_data.drop('grade', axis=1)  # Features
y = train_data[('grade')] 

# split into training and test set
X_train, X_test, y_train, y_test = train_test_split(train_data, y, test_size=0.2, random_state=42)

In [85]:

X_train = X_train.drop(columns=['ID',])


X_test = X_test.drop(columns=['ID',])



In [86]:
# scale numeric features
scaler = MinMaxScaler()
#scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)

X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)


# Neural Network

In [71]:
clf = MLPClassifier(solver='adam', activation='relu', alpha=1e-5, hidden_layer_sizes=(28, 28, 28,), random_state=69, max_iter=2000, verbose=True, tol=1e-4)

# Here we use 5-fold cross-validation
cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=4)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

clf.fit(X_train_scaled, y_train)

Iteration 1, loss = 1.86069370
Iteration 2, loss = 1.49013612
Iteration 3, loss = 1.31278903
Iteration 4, loss = 1.17502473
Iteration 5, loss = 1.02385535
Iteration 6, loss = 0.86717050
Iteration 7, loss = 0.72770083
Iteration 8, loss = 0.61126829
Iteration 9, loss = 0.52618939
Iteration 10, loss = 0.46042791
Iteration 11, loss = 0.40964440
Iteration 12, loss = 0.37456949
Iteration 13, loss = 0.34916383
Iteration 14, loss = 0.32625200
Iteration 15, loss = 0.30785549
Iteration 16, loss = 0.28524925
Iteration 17, loss = 0.26988269
Iteration 18, loss = 0.25890839
Iteration 19, loss = 0.24914898
Iteration 20, loss = 0.23526224
Iteration 21, loss = 0.22962608
Iteration 22, loss = 0.22164007
Iteration 23, loss = 0.20866979
Iteration 24, loss = 0.20054873
Iteration 25, loss = 0.19472424
Iteration 26, loss = 0.18517917
Iteration 27, loss = 0.18085716
Iteration 28, loss = 0.17473860
Iteration 29, loss = 0.16779193
Iteration 30, loss = 0.16212513
Iteration 31, loss = 0.15682848
Iteration 32, los



Iteration 1, loss = 1.85798167
Iteration 2, loss = 1.49081640
Iteration 3, loss = 1.31024258
Iteration 4, loss = 1.17889700
Iteration 5, loss = 1.03265666
Iteration 6, loss = 0.87983074
Iteration 7, loss = 0.74288282
Iteration 8, loss = 0.62705504
Iteration 9, loss = 0.53664096
Iteration 10, loss = 0.46592649
Iteration 11, loss = 0.41661259
Iteration 12, loss = 0.38034276




Cross-validation scores: [0.8736 0.8724 0.8688 0.8216]
Mean cross-validation score: 0.8591
Iteration 1, loss = 1.78552053
Iteration 2, loss = 1.39777532
Iteration 3, loss = 1.20423068
Iteration 4, loss = 1.00857933
Iteration 5, loss = 0.81969918
Iteration 6, loss = 0.66397647
Iteration 7, loss = 0.54472211
Iteration 8, loss = 0.46339978
Iteration 9, loss = 0.40549850
Iteration 10, loss = 0.36791210
Iteration 11, loss = 0.33708614
Iteration 12, loss = 0.31058128
Iteration 13, loss = 0.29400437




In [87]:
from sklearn.metrics import accuracy_score, f1_score

clf = MLPClassifier(solver='sgd', activation='tanh', alpha=1e-5, hidden_layer_sizes=(42, 42, 42, 42, 42), random_state=69, max_iter=3000, verbose=True, tol=1e-5)
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

Iteration 1, loss = 1.88017774
Iteration 2, loss = 1.66423732
Iteration 3, loss = 1.59943306
Iteration 4, loss = 1.56560488
Iteration 5, loss = 1.53935152
Iteration 6, loss = 1.51288753
Iteration 7, loss = 1.48263387
Iteration 8, loss = 1.44690979
Iteration 9, loss = 1.40591036
Iteration 10, loss = 1.36108941
Iteration 11, loss = 1.31460681
Iteration 12, loss = 1.26805982
Iteration 13, loss = 1.22194401
Iteration 14, loss = 1.17717922
Iteration 15, loss = 1.13338279
Iteration 16, loss = 1.09036807
Iteration 17, loss = 1.04707994
Iteration 18, loss = 1.00534099
Iteration 19, loss = 0.96301495
Iteration 20, loss = 0.92082567
Iteration 21, loss = 0.87782202
Iteration 22, loss = 0.83430522
Iteration 23, loss = 0.78949109
Iteration 24, loss = 0.74511573
Iteration 25, loss = 0.70040612
Iteration 26, loss = 0.65589849
Iteration 27, loss = 0.61258219
Iteration 28, loss = 0.57121269
Iteration 29, loss = 0.53148211
Iteration 30, loss = 0.49464378
Iteration 31, loss = 0.46020945
Iteration 32, los

ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].