<a href="https://colab.research.google.com/github/Kellozr/XGboost-Multiclass-Classification/blob/main/XGboost_Multiclass_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install category_encoders scikit-optimize

# Imports
import pandas as pd
from sklearn.pipeline import Pipeline
from category_encoders.target_encoder import TargetEncoder
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from skopt import BayesSearchCV
from skopt.space import Real, Integer

# Step 1: Load and preprocess data
# engine='python' with on_bad_lines='skip' discards malformed rows instead of
# raising, so the loaded frame can contain fewer rows than the file has lines.
df = pd.read_csv('/content/simulated_transaction.csv', engine='python', on_bad_lines='skip')

# Clean and rename columns: drop the identifier and shorten the feature names.
df = df.drop(columns=['customer_id']).rename(columns={
    'industry': 'industry_ID',
    'total_inflow': 'totalin',
    'avg_inflow': 'avgin',
    'num_inflows': 'noofin',
    'total_outflow': 'totalout',
    'avg_outflow': 'avgout',
    'num_outflows': 'noout',
    'avg_old_balance': 'oldbal',
    'avg_new_balance': 'newbal'
})

# Convert numeric fields
num_cols = ['totalin', 'avgin', 'noofin', 'avgout', 'noout', 'oldbal', 'newbal']
# 'totalout' is coerced together with the feature columns: it is the input to
# pd.qcut below, and a single non-numeric token would otherwise leave it as an
# object column that dropna() cannot clean and that cannot be quantile-binned.
for col in [*num_cols, 'totalout']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop NaNs: one pass over every column also covers 'totalout', including the
# values that the coercion above turned into NaN.
df = df.dropna()

# Step 2: Convert totalout into multiclass label
# Terciles of total outflow become the integer classes 0 (lowest) to 2 (highest).
# NOTE(review): the bin edges are computed on the full frame, before the
# train/test split, so the test rows influence the label definition.
df['outflow_class'] = pd.qcut(df['totalout'], q=3, labels=[0, 1, 2]).astype(int)

# Step 3: Features and target split
# NOTE(review): 'avgout' and 'noout' stay in X. If total_outflow is derived from
# them (e.g. average x count), the label is recoverable from the features and the
# reported accuracy overstates real predictive power - confirm against the data.
X = df.drop(columns=['totalout', 'outflow_class'])
y = df['outflow_class']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Pipeline with target encoder + XGBoost
# The encoder is a pipeline step, so it is re-fitted on the training part of
# every cross-validation fold rather than on the whole training set.
# `use_label_encoder` is intentionally not passed: the installed XGBoost ignores
# it and logs 'Parameters: { "use_label_encoder" } are not used.' on every fit.
pipeline = Pipeline([
    ('encoder', TargetEncoder(cols=['industry_ID'])),
    ('classifier', XGBClassifier(
        objective='multi:softprob',
        num_class=3,
        eval_metric='mlogloss',
        random_state=42,
    )),
])

# Step 5: Hyperparameter tuning space
# Keys use the '<step>__<param>' form so each value reaches the classifier step.
search_space = {
    'classifier__n_estimators': Integer(100, 500),
    'classifier__max_depth': Integer(3, 10),
    # Sampled on a log scale so small learning rates are explored as well.
    'classifier__learning_rate': Real(0.01, 0.3, prior='log-uniform'),
    'classifier__subsample': Real(0.5, 1.0),
    'classifier__colsample_bytree': Real(0.5, 1.0)
}

# Step 6: Bayesian Optimization
# 20 candidate settings, each scored by 3-fold cross-validated accuracy.
opt = BayesSearchCV(
    estimator=pipeline,
    search_spaces=search_space,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

# Step 7: Fit model
opt.fit(x_train, y_train)

# Step 8: Predict and evaluate on the held-out split
y_pred = opt.predict(x_test)

print("✅ Best Parameters:", opt.best_params_)
print("🎯 Accuracy:", round(accuracy_score(y_test, y_pred), 4))
print("📋 Classification Report:\n", classification_report(y_test, y_pred))




Parameters: { "use_label_encoder" } are not used.



✅ Best Parameters: OrderedDict([('classifier__colsample_bytree', 0.9127725236332915), ('classifier__learning_rate', 0.29999999999999993), ('classifier__max_depth', 3), ('classifier__n_estimators', 100), ('classifier__subsample', 0.7119199561300377)])
🎯 Accuracy: 0.9989
📋 Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     66642
           1       1.00      1.00      1.00     66726
           2       1.00      1.00      1.00     66632

    accuracy                           1.00    200000
   macro avg       1.00      1.00      1.00    200000
weighted avg       1.00      1.00      1.00    200000

