In [176]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


In [177]:
# Load the labelled training set and the unlabelled competition set.
path = "./datasets_kaggle/loan-10k/"

# Placeholder tokens that should be treated as missing values.
missing_markers = ['NONE', '?', '', 'None', b'']

train_data = pd.read_csv("%sloan-10k.lrn.csv" % path).replace(missing_markers, pd.NA)
competition_data = pd.read_csv("%sloan-10k.tes.csv" % path).replace(missing_markers, pd.NA)

In [178]:
# The 'ID' column is removed from both frames before any modelling step.
train_data = train_data.drop(columns=['ID'])
competition_data = competition_data.drop(columns=['ID'])

In [179]:
# Print the distinct values of every non-numeric column so the categorical
# levels can be inspected before choosing an encoding.
for column in train_data.columns:
    # Numeric columns are not listed here.
    if np.issubdtype(train_data[column].dtype, np.number):
        continue
    # Byte strings are decoded so the printed values are plain text.
    unique_values = train_data[column].apply(lambda x: x.decode() if isinstance(x, bytes) else x).unique()
    print(column + ": " + str(unique_values))

term: [' 36 months' ' 60 months']
emp_length: ['< 1 year' '1 year' '9 years' '10+ years' '3 years' '4 years' '7 years'
 '2 years' '5 years' '6 years' '8 years']
home_ownership: ['MORTGAGE' 'RENT' 'OWN' 'ANY' 'OTHER']
verification_status: ['Not Verified' 'Source Verified' 'Verified']
loan_status: ['Current' 'Fully Paid' 'Charged Off' 'Late (31-120 days)'
 'In Grace Period' 'Late (16-30 days)']
pymnt_plan: ['n' 'y']
purpose: ['debt_consolidation' 'car' 'credit_card' 'other' 'major_purchase'
 'home_improvement' 'small_business' 'medical' 'vacation' 'moving' 'house'
 'renewable_energy' 'wedding']
addr_state: ['IL' 'OK' 'AZ' 'CA' 'FL' 'TX' 'PA' 'MA' 'NC' 'NV' 'SC' 'GA' 'NY' 'OR'
 'AL' 'VT' 'MD' 'KY' 'CO' 'IN' 'LA' 'MN' 'MI' 'WA' 'NJ' 'VA' 'WI' 'OH'
 'TN' 'CT' 'DC' 'HI' 'MO' 'DE' 'AR' 'KS' 'ME' 'MS' 'NM' 'AK' 'UT' 'WV'
 'MT' 'NH' 'SD' 'ID' 'NE' 'WY' 'ND' 'RI']
initial_list_status: ['w' 'f']
application_type: ['Individual' 'Joint App']
hardship_flag: ['N' 'Y']
disbursement_method: ['Cash' 'Di

In [180]:
# Separate the target column ('grade') from the predictor columns.
X = train_data.drop(columns='grade')
y = train_data['grade']

# Hold out 30% of the labelled rows for evaluation; the seed keeps the split fixed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

# The competition frame is used as-is: it has no 'grade' column to split off.
X_comp = competition_data
print(X_train.dtypes)

loan_amnt                   float64
funded_amnt                 float64
funded_amnt_inv             float64
term                         object
int_rate                    float64
                             ...   
earliest_cr_line_year         int64
last_pymnt_d_month            int64
last_pymnt_d_year             int64
last_credit_pull_d_month      int64
last_credit_pull_d_year       int64
Length: 90, dtype: object


In [181]:
# Total number of missing cells in each frame before imputation.
for label, frame in (
    ("null values in train data: ", X_train),
    ("null values in test data: ", X_test),
    ("null values in competition data: ", X_comp),
):
    print(label + str(frame.isnull().sum().sum()))

null values in train data: 0
null values in test data: 0
null values in competition data: 1


In [182]:
def _impute_keep_dtypes(fitted_imputer, frame):
    """Fill missing values in ``frame`` and return a DataFrame with the same layout.

    The imputer's ``transform`` result is wrapped back into a DataFrame that
    reuses the index and column labels of ``frame``, and every column is cast
    back to the dtype it had in ``frame``.

    Parameters
    ----------
    fitted_imputer : an already fitted imputer exposing ``transform``.
    frame : pandas.DataFrame to impute.

    Returns
    -------
    pandas.DataFrame with the index, columns and dtypes of ``frame``.
    """
    filled = fitted_imputer.transform(frame)
    return pd.DataFrame(filled, index=frame.index, columns=frame.columns).astype(frame.dtypes)


# The most frequent value of each column is learned from the training split
# only and then applied to all three frames.
imputer = SimpleImputer(strategy='most_frequent', missing_values=pd.NA)
imputer.fit(X_train)

X_train = _impute_keep_dtypes(imputer, X_train)
X_test = _impute_keep_dtypes(imputer, X_test)
X_comp = _impute_keep_dtypes(imputer, X_comp)

In [183]:
# Total number of missing cells in each frame after imputation.
for label, frame in (
    ("null values in train data: ", X_train),
    ("null values in test data: ", X_test),
    ("null values in competition data: ", X_comp),
):
    print(label + str(frame.isnull().sum().sum()))

null values in train data: 0
null values in test data: 0
null values in competition data: 0


In [184]:
# find all numerical columns in X_train
numerical_columns = X_train.select_dtypes(include="number").columns
# convert numerical columns to list
numerical_columns = numerical_columns.tolist()
numerical_columns

['loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'int_rate',
 'installment',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'last_fico_range_high',
 'last_fico_range_low',
 'collections_12_mths_ex_med',
 'policy_code',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'total_rev_hi_lim',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths_since_recent_bc',
 'mths_since_recent_inq',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_bc_tl',
 'num_il_tl',
 'num_op_rev

In [185]:
# Category order for each ordinal feature, listed from lowest to highest level.
feature_mappings = {
    'term': [' 36 months', ' 60 months'],
    'emp_length': [
        '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
        '6 years', '7 years', '8 years', '9 years', '10+ years',
    ],
}

# Ordinal columns in the order they appear in X_train, with their category
# lists aligned position by position.
ordinal_columns = [name for name in X_train.columns if name in feature_mappings]
ordinal_categories = [feature_mappings[name] for name in ordinal_columns]
ordinal_columns

['term', 'emp_length']

In [186]:
# Columns with at most this many distinct training values are one-hot encoded.
one_hot_encoding_limit = 5

# Categorical columns are those that are neither ordinal nor numeric.
categorical_columns = [
    col for col in X_train.columns
    if not (col in ordinal_columns or col in numerical_columns)
]

# Partition the categorical columns by cardinality, keeping column order.
one_hot_columns, label_columns = [], []
for col in categorical_columns:
    bucket = one_hot_columns if X_train[col].nunique() <= one_hot_encoding_limit else label_columns
    bucket.append(col)

print("one hot columns: ", one_hot_columns)
print("label columns: ", label_columns)

one hot columns:  ['home_ownership', 'verification_status', 'pymnt_plan', 'initial_list_status', 'application_type', 'hardship_flag', 'disbursement_method', 'debt_settlement_flag']
label columns:  ['loan_status', 'purpose', 'addr_state']


In [187]:
# Sorted list of every column assigned to one of the four groups, shown for
# visual inspection. It is deliberately not named `all`, which would shadow
# the built-in all() for the rest of the kernel session.
all_columns = sorted(numerical_columns + ordinal_columns + one_hot_columns + label_columns)
all_columns

['acc_now_delinq',
 'acc_open_past_24mths',
 'addr_state',
 'annual_inc',
 'application_type',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'collection_recovery_fee',
 'collections_12_mths_ex_med',
 'debt_settlement_flag',
 'delinq_2yrs',
 'delinq_amnt',
 'disbursement_method',
 'dti',
 'earliest_cr_line_month',
 'earliest_cr_line_year',
 'emp_length',
 'fico_range_high',
 'fico_range_low',
 'funded_amnt',
 'funded_amnt_inv',
 'hardship_flag',
 'home_ownership',
 'initial_list_status',
 'inq_last_6mths',
 'installment',
 'int_rate',
 'issue_d_month',
 'issue_d_year',
 'last_credit_pull_d_month',
 'last_credit_pull_d_year',
 'last_fico_range_high',
 'last_fico_range_low',
 'last_pymnt_amnt',
 'last_pymnt_d_month',
 'last_pymnt_d_year',
 'loan_amnt',
 'loan_status',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths_since_recent_bc',
 'mths_since_recent_inq',
 'num_accts_ever_120_pd',
 'num_actv

In [188]:
# Column-wise encoding. Columns not listed in any transformer below are
# forwarded unchanged through remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        # Ordered categories are mapped to their position in the given lists.
        ('ordinal', OrdinalEncoder(categories=ordinal_categories), ordinal_columns),
        # Low-cardinality categoricals become indicator columns; categories
        # not seen during fit are ignored instead of raising.
        ('categorical', OneHotEncoder(handle_unknown="ignore"), one_hot_columns),
        # Remaining categoricals become integer codes; categories not seen
        # during fit are encoded as NaN.
        ('label', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan), label_columns),
    ],
    remainder='passthrough',
)

In [189]:
# Full preprocessing chain: encode the columns, fill any NaN left by the
# encoders with the most frequent value, then standardise every feature.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('scaler', StandardScaler()),
])

# The pipeline is fitted on the training split only; the held-out and
# competition frames are transformed with the fitted steps.
# NOTE: the three names are overwritten with the transformed data, so re-run
# the earlier cells before executing this cell a second time.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)
X_comp = pipeline.transform(X_comp)

In [190]:
# Convert the grade labels to integer class codes. The encoder is fitted on
# the training labels and reused for the held-out labels.
label_encoder = LabelEncoder().fit(y_train)
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)

# Neural Network

In [191]:

# Multi-layer perceptron with two hidden layers of 80 tanh units, trained with Adam.
clf = MLPClassifier(
    hidden_layer_sizes=(80, 80),
    activation='tanh',
    solver='adam',
    alpha=1e-4,
    tol=5e-5,
    max_iter=3000,
    random_state=69,
    verbose=True,
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate on the held-out split with support-weighted averages.
# zero_division=0 keeps the default numeric result (0) for a class that is
# never predicted, without emitting an undefined-metric warning.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

Iteration 1, loss = 1.57879205
Iteration 2, loss = 1.12893027
Iteration 3, loss = 0.91737868
Iteration 4, loss = 0.76336364
Iteration 5, loss = 0.63583763
Iteration 6, loss = 0.53879528
Iteration 7, loss = 0.46624455
Iteration 8, loss = 0.41590962
Iteration 9, loss = 0.37583773
Iteration 10, loss = 0.34753114
Iteration 11, loss = 0.32314776
Iteration 12, loss = 0.30187841
Iteration 13, loss = 0.28575396
Iteration 14, loss = 0.26690964
Iteration 15, loss = 0.26040245
Iteration 16, loss = 0.24456459
Iteration 17, loss = 0.23264191
Iteration 18, loss = 0.21870385
Iteration 19, loss = 0.20807902
Iteration 20, loss = 0.19925212
Iteration 21, loss = 0.19664929
Iteration 22, loss = 0.18518790
Iteration 23, loss = 0.17655062
Iteration 24, loss = 0.16580022
Iteration 25, loss = 0.15659877
Iteration 26, loss = 0.15059743
Iteration 27, loss = 0.14436577
Iteration 28, loss = 0.13648288
Iteration 29, loss = 0.12946863
Iteration 30, loss = 0.12504426
Iteration 31, loss = 0.11999162
Iteration 32, los

array([[498,  43,   0,   0,   0,   0,   0],
       [ 47, 778,  38,   0,   0,   0,   0],
       [  0,  51, 797,  45,   0,   0,   0],
       [  1,   0,  33, 374,  31,   2,   0],
       [  0,   0,   0,  57, 125,  11,   5],
       [  0,   0,   0,   0,  13,  30,   3],
       [  0,   0,   0,   0,   0,  10,   8]])

In [164]:
# Random forest baseline with 100 trees and a fixed seed.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Evaluate on the held-out split with support-weighted averages.
# Some classes may receive no predictions at all, which makes their precision
# undefined; zero_division=0 keeps the default numeric result (0) for that
# case without emitting the undefined-metric warning.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

Accuracy: 0.837
Recall: 0.837
Precision: 0.8291426198358631
F1 Score: 0.8194771490028775


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


array([[332,  25,   0,   0,   0,   0,   0],
       [ 13, 523,  45,   0,   0,   0,   0],
       [  0,  21, 560,   6,   0,   0,   0],
       [  0,   3,  63, 226,   1,   0,   0],
       [  0,   1,   8,  93,  31,   0,   0],
       [  0,   0,   0,  18,  15,   2,   0],
       [  0,   0,   0,   4,  10,   0,   0]])

In [165]:
# Gaussian naive Bayes baseline on the preprocessed features.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

# Evaluate on the held-out split with support-weighted averages.
# zero_division=0 keeps the default numeric result (0) for a class that is
# never predicted, without emitting an undefined-metric warning.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

Accuracy: 0.0495
Recall: 0.0495
Precision: 0.34030555444425514
F1 Score: 0.06424394668771045


array([[ 72,   1,   0,   1,  11,   1, 271],
       [ 27,   3,   0,   0,  28,   6, 517],
       [ 12,   0,   0,   2,  50,   8, 515],
       [  0,   0,   1,   0,  29,   2, 261],
       [  0,   0,   0,   2,  10,   4, 117],
       [  0,   0,   0,   0,   1,   2,  32],
       [  0,   0,   0,   0,   2,   0,  12]])

In [174]:
# Support vector classifier with an RBF kernel.
# SVC is imported in the import cell at the top of the notebook.
svm_classifier = SVC(kernel='rbf')
svm_classifier.fit(X_train, y_train)
y_pred = svm_classifier.predict(X_test)

# Evaluate on the held-out split with support-weighted averages.
# Some classes may receive no predictions at all, which makes their precision
# undefined; zero_division=0 keeps the default numeric result (0) for that
# case without emitting the undefined-metric warning.
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, y_pred)

Accuracy: 0.7765
Recall: 0.7765
Precision: 0.7566652023591621
F1 Score: 0.7649579872147863


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


array([[293,  61,   3,   0,   0,   0,   0],
       [ 37, 478,  66,   0,   0,   0,   0],
       [  0,  56, 514,  17,   0,   0,   0],
       [  0,   1,  74, 210,   8,   0,   0],
       [  0,   0,   3,  72,  58,   0,   0],
       [  0,   0,   0,   8,  27,   0,   0],
       [  0,   0,   0,   1,  12,   1,   0]])