In [8]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.io import arff
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Preprocessing

In [9]:
data = arff.loadarff('./datasets/credit-g/dataset_31_credit-g.arff')
data = pd.DataFrame(data[0])

for column in data.columns:
    # replace missing values with pd.NA
    data[column] = data[column].replace(b'?', pd.NA)
    data[column] = data[column].replace(b'', pd.NA)
    data[column] = data[column].replace(b'NONE', pd.NA)
    # decode bytes to string for each column
    data[column] = data[column].apply(lambda x: x.decode() if isinstance(x, bytes) else x)

In [10]:
data

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12.0,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3.0,female div/dep/mar,none,...,real estate,31.0,none,own,1.0,unskilled resident,1.0,none,yes,good
996,<0,30.0,existing paid,used car,3857.0,<100,1<=X<4,4.0,male div/sep,none,...,life insurance,40.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes,good
997,no checking,12.0,existing paid,radio/tv,804.0,<100,>=7,4.0,male single,none,...,car,38.0,none,own,1.0,skilled,1.0,none,yes,good
998,<0,45.0,existing paid,radio/tv,1845.0,<100,1<=X<4,4.0,male single,none,...,no known property,23.0,none,for free,1.0,skilled,1.0,yes,yes,bad


In [11]:
y = data[('class')] 
X = data.drop('class', axis=1)  # Features


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
print("null values in train data: "+ str(X_train.isnull().sum().sum()))
print("null values in test data: "+ str(X_test.isnull().sum().sum()))

null values in train data: 0
null values in test data: 0


In [13]:
# find all numerical columns in X_train
numerical_columns = X_train.select_dtypes(include="number").columns
# convert numerical columns to list
numerical_columns = numerical_columns.tolist()
numerical_columns

['duration',
 'credit_amount',
 'installment_commitment',
 'residence_since',
 'age',
 'existing_credits',
 'num_dependents']

In [14]:
feature_mappings = {
    'checking_status': ['no checking', '<0', '0<=X<200', '>=200'],
    'saving_status': ['no known savings', '<100', '100<=X<500', '500<=X<1000', '>=1000'],
    'employment': ['unemployed', '<1', '1<=X<4', '4<=X<7', '>=7'],
    'own_telephone': ['none', 'yes'],
    'foreign_worker': ['no', 'yes'],
}

ordinal_columns = [col for col in X_train.columns if col in feature_mappings.keys()]
ordinal_categories = [feature_mappings[col] for col in ordinal_columns]
ordinal_columns

['checking_status', 'employment', 'own_telephone', 'foreign_worker']

In [15]:
categorical_columns = [col for col in X_train.columns if col not in ordinal_columns and col not in numerical_columns] 

one_hot_encoding_limit = 10
one_hot_columns = [col for col in categorical_columns if X_train[col].nunique() <= one_hot_encoding_limit]

label_columns = [col for col in categorical_columns if col not in one_hot_columns]

In [16]:
preprocessor = ColumnTransformer([
    ('ordinal', OrdinalEncoder(categories=ordinal_categories), ordinal_columns),
    ('categorical', OneHotEncoder(handle_unknown="ignore"), one_hot_columns),
    ('label', OrdinalEncoder(), label_columns)
],
    remainder='passthrough'  # passthrough columns not listed in any pipeline
)

In [17]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())
])

# Preprocess training data
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [18]:
# Encode target variable
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Neural Network

In [19]:
clf = MLPClassifier(solver='sgd', activation='tanh', alpha=1e-5, hidden_layer_sizes=(42, 42, 42, 42, 42), random_state=69, max_iter=3000, verbose=True, tol=1e-5)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

Iteration 1, loss = 0.68221461
Iteration 2, loss = 0.67755450
Iteration 3, loss = 0.67063108
Iteration 4, loss = 0.66296165
Iteration 5, loss = 0.65543025
Iteration 6, loss = 0.64837737
Iteration 7, loss = 0.64176066
Iteration 8, loss = 0.63539397
Iteration 9, loss = 0.63002879
Iteration 10, loss = 0.62524780
Iteration 11, loss = 0.62113344
Iteration 12, loss = 0.61736335
Iteration 13, loss = 0.61413769
Iteration 14, loss = 0.61127626
Iteration 15, loss = 0.60852881
Iteration 16, loss = 0.60607819
Iteration 17, loss = 0.60398344
Iteration 18, loss = 0.60201802
Iteration 19, loss = 0.60009480
Iteration 20, loss = 0.59831830
Iteration 21, loss = 0.59672158
Iteration 22, loss = 0.59519982
Iteration 23, loss = 0.59374363
Iteration 24, loss = 0.59221689
Iteration 25, loss = 0.59089260
Iteration 26, loss = 0.58959496
Iteration 27, loss = 0.58830492
Iteration 28, loss = 0.58715042
Iteration 29, loss = 0.58586506
Iteration 30, loss = 0.58473283
Iteration 31, loss = 0.58356975
Iteration 32, los

In [20]:
clf = MLPClassifier(solver='sgd', activation='tanh', alpha=1e-5, hidden_layer_sizes=(42, 42, 42,), random_state=69, max_iter=2000, verbose=True, tol=1e-4)

# Here we use 5-fold cross-validation
cv_scores = cross_val_score(clf, X_train, y_train, cv=4)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Iteration 1, loss = 0.74273948
Iteration 2, loss = 0.73852487
Iteration 3, loss = 0.73190717
Iteration 4, loss = 0.72444167
Iteration 5, loss = 0.71590772
Iteration 6, loss = 0.70718959
Iteration 7, loss = 0.69861143
Iteration 8, loss = 0.68991461
Iteration 9, loss = 0.68131191
Iteration 10, loss = 0.67390615
Iteration 11, loss = 0.66621901
Iteration 12, loss = 0.65915837
Iteration 13, loss = 0.65257807
Iteration 14, loss = 0.64661661
Iteration 15, loss = 0.64100222
Iteration 16, loss = 0.63582553
Iteration 17, loss = 0.63072822
Iteration 18, loss = 0.62621387
Iteration 19, loss = 0.62198118
Iteration 20, loss = 0.61804855
Iteration 21, loss = 0.61410304
Iteration 22, loss = 0.61092368
Iteration 23, loss = 0.60728485
Iteration 24, loss = 0.60417534
Iteration 25, loss = 0.60140473
Iteration 26, loss = 0.59853493
Iteration 27, loss = 0.59602282
Iteration 28, loss = 0.59344890
Iteration 29, loss = 0.59108634
Iteration 30, loss = 0.58893749
Iteration 31, loss = 0.58663030
Iteration 32, los

In [10]:
# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)

Confusion Matrix:
     0   1
0  100  41
1   23  36


# Random Forrest

In [11]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=4)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_train)


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)


Cross-validation scores: [0.725 0.695 0.735 0.77 ]
Mean cross-validation score: 0.73125
Accuracy: 0.78
F1 Score: 0.5416666666666666
Confusion Matrix:
     0   1
0  130  11
1   33  26


# Naive Bayes

In [12]:
nb_classifier = MultinomialNB()
cv_scores = cross_val_score(nb_classifier, X_train, y_train, cv=4)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_train)


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)
# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)

Cross-validation scores: [0.735 0.665 0.72  0.735]
Mean cross-validation score: 0.71375
Accuracy: 0.725
F1 Score: 0.42105263157894735
Confusion Matrix:
     0   1
0  125  16
1   39  20
