In [8]:
# XGBoost Classifier on Wallet Dataset

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import xgboost as xgb
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Render every float in pandas output with exactly three decimal places.
pd.options.display.float_format = '{:.3f}'.format

In [9]:
## Step 2: Load Dataset
# Read the workbook from the notebook's working directory and preview it.
df = pd.read_excel("Wallet.xlsx")
print(df.head())

## Step 3: Preprocess Data
# Every column except the target is a feature; the target is shifted down by
# one so its labels start at zero.
X = df.drop(columns=["wallet"])
y = df["wallet"].sub(1)  # convert to 0,1,2

# Split data: hold out 25% for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=6721,
)

   wallet  male  business  punish  explain
0       2     0         0       2        0
1       2     0         0       2        1
2       3     0         0       1        1
3       3     0         0       2        0
4       1     1         0       1        1


In [15]:
## Step 4: Train XGBoost Classifier
# Baseline model: 100 shallow (depth-2) boosted trees with a multiclass
# softmax objective over the three wallet classes (labels 0, 1, 2).
#
# `use_label_encoder` is deliberately not passed: the installed XGBoost
# ignores it and only emits a "Parameters ... are not used" warning.
model = XGBClassifier(
    learning_rate=0.1,
    max_depth=2,
    n_estimators=100,
    objective="multi:softmax",
    num_class=3,
    eval_metric="mlogloss",
)

model.fit(X_train, y_train)

## Step 5: Evaluate Model
def evaluate_model(y_true, y_pred, dataset_name):
    """Print a classification summary for one data split.

    Writes to stdout, in order: a header naming the split, accuracy,
    macro-averaged precision / recall / F1 (each rounded to three decimals),
    the confusion matrix, and the full classification report.

    Parameters
    ----------
    y_true : ground-truth class labels.
    y_pred : predicted class labels, aligned with ``y_true``.
    dataset_name : label shown in the header line (e.g. "Training Set").

    Returns
    -------
    None
    """
    print(f"--- {dataset_name} ---")
    print("Accuracy:", round(accuracy_score(y_true, y_pred), 3))

    # The three macro-averaged metrics share one call shape, so drive them
    # from a small table; each score is computed right before it is printed.
    macro_metrics = (
        ("Precision (macro):", precision_score),
        ("Recall (macro):", recall_score),
        ("F1 Score (macro):", f1_score),
    )
    for label, scorer in macro_metrics:
        print(label, round(scorer(y_true, y_pred, average='macro'), 3))

    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    print("Classification Report:")
    print(classification_report(y_true, y_pred))
    
# Predict on each split with the fitted model and report on it straight away.
# The training report is still printed before the test report.
train_pred = model.predict(X_train)
evaluate_model(y_true=y_train, y_pred=train_pred, dataset_name="Training Set")

test_pred = model.predict(X_test)
evaluate_model(y_true=y_test, y_pred=test_pred, dataset_name="Test Set")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- Training Set ---
Accuracy: 0.678
Precision (macro): 0.702
Recall (macro): 0.519
F1 Score (macro): 0.514
Confusion Matrix:
[[10  0 10]
 [ 2  4 30]
 [ 4  1 85]]
Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.50      0.56        20
           1       0.80      0.11      0.20        36
           2       0.68      0.94      0.79        90

    accuracy                           0.68       146
   macro avg       0.70      0.52      0.51       146
weighted avg       0.70      0.68      0.61       146

--- Test Set ---
Accuracy: 0.571
Precision (macro): 0.286
Recall (macro): 0.374
F1 Score (macro): 0.324
Confusion Matrix:
[[ 1  0  3]
 [ 3  0 11]
 [ 1  3 27]]
Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.25      0.22         4
           1       0.00      0.00      0.00        14
           2       0.66      0.87      0.75        31

    accuracy                     

In [16]:
## Step 6: Analyze Learning Rate and Max Depth
# Fit one classifier per (learning rate, max depth) combination and record
# accuracy and macro-F1 on both splits, so over-fitting (train >> test) is
# visible side by side in a single table.
results = []
for lr in [0.1, 0.5]:
    for depth in [2, 5]:
        # num_class is 3 to match the target labels (0, 1, 2) and the Step 4
        # model; the previous value of 4 described a class that does not exist.
        clf = xgb.XGBClassifier(learning_rate=lr, max_depth=depth, n_estimators=100,
                                objective="multi:softmax", num_class=3, eval_metric="mlogloss")
        clf.fit(X_train, y_train)

        # Predict once per split and reuse the result for both metrics.
        pred = clf.predict(X_test)
        pred_train = clf.predict(X_train)

        acc_test = accuracy_score(y_test, pred)
        acc_train = accuracy_score(y_train, pred_train)
        f1_test = f1_score(y_test, pred, average='macro')
        f1_train = f1_score(y_train, pred_train, average='macro')
        results.append((lr, depth, acc_test, f1_test, acc_train, f1_train))

results_df = pd.DataFrame(results, columns=["Learning Rate", "Max Depth", "Test - Accuracy", "Test - F1 Score", "Train - Accuracy", "Train - F1 Score"])
print("\nComparison of Different Learning Rates and Tree Depths:")
# Keep the six-column table on one line instead of wrapping it.
with pd.option_context('expand_frame_repr', False):
    print(results_df)
print("\nNames (ID):")
print("Huzaifa Mohammed (40242080),\nMohammed Shurrab (40323793),\nOleksandr Yasinovskyy (40241188)")


# ## Step 7: Visualize Results
# # test
# sns.barplot(data=results_df[0:4], x="Learning Rate", y="Test Accuracy", hue="Max Depth")
# plt.title("Test Accuracy by Learning Rate and Max Depth")
# plt.show()

# sns.barplot(data=results_df[0:4], x="Learning Rate", y="F1 Score", hue="Max Depth")
# plt.title("F1 Score by Learning Rate and Max Depth")
# plt.show()


Comparison of Different Learning Rates and Tree Depths:
   Learning Rate  Max Depth  Test - Accuracy  Test - F1 Score  Train - Accuracy  Train - F1 Score
0          0.100          2            0.571            0.324             0.678             0.514
1          0.100          5            0.551            0.318             0.692             0.543
2          0.500          2            0.571            0.324             0.678             0.525
3          0.500          5            0.551            0.318             0.692             0.580

Names (ID):
Huzaifa Mohammed (40242080),
Mohammed Shurrab (40323793),
Oleksandr Yasinovskyy (40241188)
