In [405]:
from scipy.io import arff
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [406]:
# Load the labelled training set and the unlabelled competition set from CSV.
path = "./datasets_kaggle/loan-10k/"
train_data = pd.read_csv(f"{path}loan-10k.lrn.csv")
competition_data = pd.read_csv(f"{path}loan-10k.tes.csv")

# Turn the placeholder strings 'NONE', '?' and '' into pd.NA in both frames.
for frame in (train_data, competition_data):
    frame.replace(['NONE', '?', ''], pd.NA, inplace=True)

In [407]:
# Remove the 'ID' column from both frames so it never reaches the feature matrix.
for frame in (train_data, competition_data):
    frame.drop(columns=['ID'], inplace=True)

In [408]:
# Print the distinct values of every non-numeric column in the training data.
for column in train_data.columns:
    # Numeric columns are skipped: only the category labels are of interest here.
    if np.issubdtype(train_data[column].dtype, np.number):
        continue
    # Values stored as bytes are decoded so they print as plain strings.
    unique_values = train_data[column].apply(lambda x: x.decode() if isinstance(x, bytes) else x).unique()
    print(column + ": " + str(unique_values))

term: [' 36 months' ' 60 months']
emp_length: ['< 1 year' '1 year' '9 years' '10+ years' '3 years' '4 years' '7 years'
 '2 years' '5 years' '6 years' '8 years']
home_ownership: ['MORTGAGE' 'RENT' 'OWN' 'ANY' 'OTHER']
verification_status: ['Not Verified' 'Source Verified' 'Verified']
loan_status: ['Current' 'Fully Paid' 'Charged Off' 'Late (31-120 days)'
 'In Grace Period' 'Late (16-30 days)']
pymnt_plan: ['n' 'y']
purpose: ['debt_consolidation' 'car' 'credit_card' 'other' 'major_purchase'
 'home_improvement' 'small_business' 'medical' 'vacation' 'moving' 'house'
 'renewable_energy' 'wedding']
addr_state: ['IL' 'OK' 'AZ' 'CA' 'FL' 'TX' 'PA' 'MA' 'NC' 'NV' 'SC' 'GA' 'NY' 'OR'
 'AL' 'VT' 'MD' 'KY' 'CO' 'IN' 'LA' 'MN' 'MI' 'WA' 'NJ' 'VA' 'WI' 'OH'
 'TN' 'CT' 'DC' 'HI' 'MO' 'DE' 'AR' 'KS' 'ME' 'MS' 'NM' 'AK' 'UT' 'WV'
 'MT' 'NH' 'SD' 'ID' 'NE' 'WY' 'ND' 'RI']
initial_list_status: ['w' 'f']
application_type: ['Individual' 'Joint App']
hardship_flag: ['N' 'Y']
disbursement_method: ['Cash' 'Di

In [409]:
# Separate the target column 'grade' from the feature columns.
y = train_data['grade']
X = train_data.drop(columns='grade')

# Hold out 20% of the labelled rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The competition data carries no target column, so the whole frame is features.
X_comp = competition_data
print(X_train.dtypes)

loan_amnt                   float64
funded_amnt                 float64
funded_amnt_inv             float64
term                         object
int_rate                    float64
                             ...   
earliest_cr_line_year         int64
last_pymnt_d_month            int64
last_pymnt_d_year             int64
last_credit_pull_d_month      int64
last_credit_pull_d_year       int64
Length: 90, dtype: object


In [410]:
# Report the total number of missing cells in each feature set.
for split_name, frame in (("train", X_train), ("test", X_test), ("competition", X_comp)):
    print(f"null values in {split_name} data: {frame.isnull().sum().sum()}")

null values in train data: 0
null values in test data: 0
null values in competition data: 1


In [411]:
# Replace pd.NA cells with each column's most frequent value, learned from the training split only.
imputer = SimpleImputer(strategy='most_frequent', missing_values=pd.NA)
imputer.fit(X_train)

# Captured before X_train is overwritten; used to cast the imputed frames back to these dtypes.
original_dtypes = X_train.dtypes


def _impute(frame):
    """Return `frame` with missing cells filled, keeping its index and columns and casting to the training dtypes."""
    filled = pd.DataFrame(imputer.transform(frame), index=frame.index, columns=frame.columns)
    return filled.astype(original_dtypes)


X_train = _impute(X_train)
X_test = _impute(X_test)
X_comp = _impute(X_comp)

In [412]:
# Re-count the missing cells in each feature set.
feature_sets = {"train": X_train, "test": X_test, "competition": X_comp}
for split_name, frame in feature_sets.items():
    print(f"null values in {split_name} data: {frame.isnull().sum().sum()}")

null values in train data: 0
null values in test data: 0
null values in competition data: 0


In [413]:
# find all numerical columns in X_train
numerical_columns = X_train.select_dtypes(include="number").columns
# convert numerical columns to list
numerical_columns = numerical_columns.tolist()
numerical_columns

['loan_amnt',
 'funded_amnt',
 'funded_amnt_inv',
 'int_rate',
 'installment',
 'annual_inc',
 'dti',
 'delinq_2yrs',
 'fico_range_low',
 'fico_range_high',
 'inq_last_6mths',
 'open_acc',
 'pub_rec',
 'revol_bal',
 'revol_util',
 'total_acc',
 'out_prncp',
 'out_prncp_inv',
 'total_pymnt',
 'total_pymnt_inv',
 'total_rec_prncp',
 'total_rec_int',
 'total_rec_late_fee',
 'recoveries',
 'collection_recovery_fee',
 'last_pymnt_amnt',
 'last_fico_range_high',
 'last_fico_range_low',
 'collections_12_mths_ex_med',
 'policy_code',
 'acc_now_delinq',
 'tot_coll_amt',
 'tot_cur_bal',
 'total_rev_hi_lim',
 'acc_open_past_24mths',
 'avg_cur_bal',
 'bc_open_to_buy',
 'bc_util',
 'chargeoff_within_12_mths',
 'delinq_amnt',
 'mo_sin_old_il_acct',
 'mo_sin_old_rev_tl_op',
 'mo_sin_rcnt_rev_tl_op',
 'mo_sin_rcnt_tl',
 'mort_acc',
 'mths_since_recent_bc',
 'mths_since_recent_inq',
 'num_accts_ever_120_pd',
 'num_actv_bc_tl',
 'num_actv_rev_tl',
 'num_bc_sats',
 'num_bc_tl',
 'num_il_tl',
 'num_op_rev

In [414]:
# Explicit category order for the features that are encoded as ordinal.
feature_mappings = {
    'term': [' 36 months', ' 60 months'],
    'emp_length': [
        '< 1 year', '1 year', '2 years', '3 years', '4 years', '5 years',
        '6 years', '7 years', '8 years', '9 years', '10+ years',
    ],
}

# Ordinal columns in X_train's column order, with their category lists in the same order.
ordinal_columns = [col for col in X_train.columns if col in feature_mappings]
ordinal_categories = list(map(feature_mappings.get, ordinal_columns))
ordinal_columns

['term', 'emp_length']

In [415]:
# Columns that are neither ordinal nor numeric are the remaining categorical features.
already_assigned = set(ordinal_columns) | set(numerical_columns)
categorical_columns = [col for col in X_train.columns if col not in already_assigned]

# Columns with at most this many distinct values go to one_hot_columns; the rest go to label_columns.
one_hot_encoding_limit = 10
one_hot_columns = []
label_columns = []
for col in categorical_columns:
    if X_train[col].nunique() <= one_hot_encoding_limit:
        one_hot_columns.append(col)
    else:
        label_columns.append(col)


In [416]:
# Per-column encoders; columns not listed in any transformer are passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        # Integer codes that follow the explicit category order in ordinal_categories.
        ('ordinal', OrdinalEncoder(categories=ordinal_categories), ordinal_columns),
        # Categories unseen during fit are ignored instead of raising.
        ('categorical', OneHotEncoder(handle_unknown="ignore"), one_hot_columns),
        # Integer codes for the high-cardinality columns. A category that appears only in
        # the test or competition data is encoded as -1 instead of raising in transform().
        ('label', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1), label_columns),
    ],
    remainder='passthrough',  # passthrough columns not listed in any transformer
)

In [417]:
# Encode the columns with the preprocessor, then standardise the resulting features.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
])

# Fit on the training split only, then apply the fitted pipeline to the other two sets.
X_train = pipeline.fit_transform(X_train)
X_test, X_comp = (pipeline.transform(part) for part in (X_test, X_comp))

In [418]:
# Convert the target labels to integer class ids; the mapping is learned from the training labels.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train, y_test = label_encoder.transform(y_train), label_encoder.transform(y_test)

# Neural Network

In [None]:
# MLP with three hidden layers of 28 ReLU units each, trained with Adam.
clf = MLPClassifier(
    solver='adam',
    activation='relu',
    alpha=1e-5,
    hidden_layer_sizes=(28, 28, 28,),
    random_state=69,
    max_iter=2000,
    verbose=True,
    tol=1e-4,
)

# Here we use 4-fold cross-validation on the training split (cv=4).
cv_scores = cross_val_score(clf, X_train, y_train, cv=4)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

# Fit the estimator on the full training split.
clf.fit(X_train, y_train)

In [423]:
from sklearn.metrics import accuracy_score, f1_score

# MLP with tanh activations and hidden layers of 82, 82, 82, 41 and 20 units.
clf = MLPClassifier(
    hidden_layer_sizes=(82, 82, 82, 41, 20),
    activation='tanh',
    solver='adam',
    alpha=1e-5,
    max_iter=1000,
    random_state=69,
    verbose=True,
)
clf.fit(X_train, y_train)

# Predict the held-out split and score the predictions.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Weighted F1: per-class F1 averaged with class-support weights.
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

Iteration 1, loss = 1.39812485
Iteration 2, loss = 0.87581696
Iteration 3, loss = 0.61340219
Iteration 4, loss = 0.48953635
Iteration 5, loss = 0.41529986
Iteration 6, loss = 0.37186995
Iteration 7, loss = 0.33035869
Iteration 8, loss = 0.30002630
Iteration 9, loss = 0.28551984
Iteration 10, loss = 0.25626617
Iteration 11, loss = 0.23605300
Iteration 12, loss = 0.22037542
Iteration 13, loss = 0.19648267
Iteration 14, loss = 0.18064842
Iteration 15, loss = 0.17339789
Iteration 16, loss = 0.15622064
Iteration 17, loss = 0.13709980
Iteration 18, loss = 0.12925479
Iteration 19, loss = 0.12897187
Iteration 20, loss = 0.10731117
Iteration 21, loss = 0.11207149
Iteration 22, loss = 0.10500144
Iteration 23, loss = 0.10384489
Iteration 24, loss = 0.09651901
Iteration 25, loss = 0.08541802
Iteration 26, loss = 0.07292397
Iteration 27, loss = 0.08096710
Iteration 28, loss = 0.06830596
Iteration 29, loss = 0.09051802
Iteration 30, loss = 0.07863376
Iteration 31, loss = 0.06184478
Iteration 32, los

In [425]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 50 trees and a fixed seed, scored on the same held-out split.
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=50)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.827
