In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.io import arff
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Preprocessing

In [2]:
data = arff.loadarff('./datasets/credit-g/dataset_31_credit-g.arff')
data = pd.DataFrame(data[0])

for column in data.columns:
    # decode bytes to string for each column
    data[column] = data[column].apply(lambda x: x.decode() if isinstance(x, bytes) else x)

In [3]:
data

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12.0,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3.0,female div/dep/mar,none,...,real estate,31.0,none,own,1.0,unskilled resident,1.0,none,yes,good
996,<0,30.0,existing paid,used car,3857.0,<100,1<=X<4,4.0,male div/sep,none,...,life insurance,40.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes,good
997,no checking,12.0,existing paid,radio/tv,804.0,<100,>=7,4.0,male single,none,...,car,38.0,none,own,1.0,skilled,1.0,none,yes,good
998,<0,45.0,existing paid,radio/tv,1845.0,<100,1<=X<4,4.0,male single,none,...,no known property,23.0,none,for free,1.0,skilled,1.0,yes,yes,bad


In [4]:
feature_mappings = {
    'checking_status': {'no checking': 0, '<0': 1, '0<=X<200': 2, '>=200': 3},
    'saving_status': {'no known savings': 0, '<100': 1, '100<=X<500': 2, '500<=X<1000': 3, '>=1000': 4},
    'employment': {'unemployed': 0, '<1': 1, '1<=X<4': 2, '4<=X<7': 3, '>=7': 4},
    'own_telephone': {'none': 0, 'yes': 1},
    'foreign_worker': {'no': 0, 'yes': 1},
    'class': {'good': 0, 'bad': 1}
}

In [5]:
one_hot_encoding_limit = 10

label_encoder = LabelEncoder()
one_hot_encoder = OneHotEncoder()

for feature in data.columns:
    if data[feature].dtype == 'object':  # Check if the feature contains categorical data
        unique_values = data[feature].nunique()  # Count the number of unique values in the feature

        # Drop any feature which only has 1 distinct value (adds no information)
        if unique_values == 1:
            data.drop(feature)
            continue
        
        # If feature mapping is defined, use ordinal encoder
        if feature in feature_mappings:
            mapping = feature_mappings[feature]
            data[feature] = data[feature].map(mapping)
        else: 
            if unique_values <= one_hot_encoding_limit:
                encoded_features = one_hot_encoder.fit_transform(data[[feature]])
                encoded_features_df = pd.DataFrame(encoded_features.toarray(), columns=one_hot_encoder.get_feature_names_out([feature]))
                data = pd.concat([data, encoded_features_df], axis=1)
                data.drop([feature], axis=1, inplace=True)
            else:
                data[feature] = label_encoder.fit_transform(data[feature])
            

In [6]:
data

Unnamed: 0,checking_status,duration,credit_amount,employment,installment_commitment,residence_since,age,existing_credits,num_dependents,own_telephone,...,other_payment_plans_bank,other_payment_plans_none,other_payment_plans_stores,housing_for free,housing_own,housing_rent,job_high qualif/self emp/mgmt,job_skilled,job_unemp/unskilled non res,job_unskilled resident
0,1,6.0,1169.0,4,4.0,4.0,67.0,2.0,1.0,1,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,2,48.0,5951.0,2,2.0,2.0,22.0,1.0,1.0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0,12.0,2096.0,3,2.0,3.0,49.0,1.0,2.0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,1,42.0,7882.0,3,2.0,4.0,45.0,1.0,2.0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1,24.0,4870.0,2,3.0,4.0,53.0,2.0,2.0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,12.0,1736.0,3,3.0,4.0,31.0,1.0,1.0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
996,1,30.0,3857.0,2,4.0,4.0,40.0,1.0,1.0,1,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
997,0,12.0,804.0,4,4.0,4.0,38.0,1.0,1.0,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
998,1,45.0,1845.0,2,4.0,4.0,23.0,1.0,1.0,1,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [7]:
# split dataframe to features and target
X = data.drop('class', axis=1)  # Features
y = data['class']            #target 

# split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# scale numeric features
scaler = MinMaxScaler()
#scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=X_train.columns)

X_test_scaled = scaler.transform(X_test)
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=X_test.columns)

# Neural Network

In [9]:
clf = MLPClassifier(solver='sgd', activation='tanh', alpha=1e-5, hidden_layer_sizes=(42, 42, 42, 42, 42), random_state=69, max_iter=3000, verbose=True, tol=1e-5)
clf.fit(X_train_scaled, y_train)

y_pred = clf.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

Iteration 1, loss = 0.95933614
Iteration 2, loss = 0.88799355
Iteration 3, loss = 0.79956058
Iteration 4, loss = 0.72236578
Iteration 5, loss = 0.66955071
Iteration 6, loss = 0.63910636
Iteration 7, loss = 0.62380877
Iteration 8, loss = 0.61515934
Iteration 9, loss = 0.61286047
Iteration 10, loss = 0.61203718
Iteration 11, loss = 0.61232865
Iteration 12, loss = 0.61233844
Iteration 13, loss = 0.61218328
Iteration 14, loss = 0.61187363
Iteration 15, loss = 0.61140206
Iteration 16, loss = 0.61101438
Iteration 17, loss = 0.61036809
Iteration 18, loss = 0.60998756
Iteration 19, loss = 0.60962021
Iteration 20, loss = 0.60935789
Iteration 21, loss = 0.60894536
Iteration 22, loss = 0.60862641
Iteration 23, loss = 0.60835764
Iteration 24, loss = 0.60810143
Iteration 25, loss = 0.60781065
Iteration 26, loss = 0.60755569
Iteration 27, loss = 0.60725488
Iteration 28, loss = 0.60700470
Iteration 29, loss = 0.60672288
Iteration 30, loss = 0.60647390
Iteration 31, loss = 0.60620992
Iteration 32, los

In [117]:
clf = MLPClassifier(solver='sgd', activation='tanh', alpha=1e-5, hidden_layer_sizes=(42, 42, 42,), random_state=69, max_iter=2000, verbose=True, tol=1e-4)

# Here we use 5-fold cross-validation
cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=4)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

Iteration 1, loss = 0.76263406
Iteration 2, loss = 0.75804082
Iteration 3, loss = 0.75094014
Iteration 4, loss = 0.74290888
Iteration 5, loss = 0.73377879
Iteration 6, loss = 0.72441395
Iteration 7, loss = 0.71519559
Iteration 8, loss = 0.70642658
Iteration 9, loss = 0.69696630
Iteration 10, loss = 0.68890419
Iteration 11, loss = 0.68109423
Iteration 12, loss = 0.67380374
Iteration 13, loss = 0.66705541
Iteration 14, loss = 0.66086686
Iteration 15, loss = 0.65480942
Iteration 16, loss = 0.64938580
Iteration 17, loss = 0.64425980
Iteration 18, loss = 0.63948947
Iteration 19, loss = 0.63508995
Iteration 20, loss = 0.63098494
Iteration 21, loss = 0.62701498
Iteration 22, loss = 0.62360972
Iteration 23, loss = 0.61993195
Iteration 24, loss = 0.61664294
Iteration 25, loss = 0.61362783
Iteration 26, loss = 0.61064709
Iteration 27, loss = 0.60799318
Iteration 28, loss = 0.60532805
Iteration 29, loss = 0.60285042
Iteration 30, loss = 0.60049357
Iteration 31, loss = 0.59815956
Iteration 32, los

In [10]:
# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)

Confusion Matrix:
     0   1
0  100  41
1   23  36


# Random Forrest

In [11]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
cv_scores = cross_val_score(rf_classifier, X_train_scaled, y_train, cv=4)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

rf_classifier.fit(X_train_scaled, y_train)
y_pred = rf_classifier.predict(X_test_scaled)


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)


Cross-validation scores: [0.725 0.695 0.735 0.77 ]
Mean cross-validation score: 0.73125
Accuracy: 0.78
F1 Score: 0.5416666666666666
Confusion Matrix:
     0   1
0  130  11
1   33  26


# Naive Bayes

In [12]:
nb_classifier = MultinomialNB()
cv_scores = cross_val_score(nb_classifier, X_train_scaled, y_train, cv=4)

print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

nb_classifier.fit(X_train_scaled, y_train)
y_pred = nb_classifier.predict(X_test_scaled)


# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)
# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(cm)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm_df)

Cross-validation scores: [0.735 0.665 0.72  0.735]
Mean cross-validation score: 0.71375
Accuracy: 0.725
F1 Score: 0.42105263157894735
Confusion Matrix:
     0   1
0  125  16
1   39  20
