In [2]:
import joblib
import numpy as np

X_reduced = joblib.load("../artifacts/X_reduced.pkl")  # shape (31420, 150)
Y = joblib.load("../artifacts/Y.pkl")                  # shape (31420, 3694)

print("Loaded reduced features and labels")
print(f"X_reduced shape: {X_reduced.shape}")
print(f"Y shape: {Y.shape}")


Loaded reduced features and labels
X_reduced shape: (31420, 150)
Y shape: (31420, 3694)


In [3]:
# Quick sanity check
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, hamming_loss, accuracy_score

X_train, X_test, Y_train, Y_test = train_test_split(X_reduced, Y, test_size=0.2, random_state=42)

model = MultiOutputClassifier(LGBMClassifier(n_estimators=50, random_state=42))
model.fit(X_train, Y_train)

Y_pred = model.predict(X_test)

print("✨ Initial Results")
print(classification_report(Y_test, Y_pred, zero_division=0))
print("Hamming Loss:", hamming_loss(Y_test, Y_pred))
print("Subset Accuracy:", accuracy_score(Y_test, Y_pred))


[LightGBM] [Info] Number of positive: 14, number of negative: 25122
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009520 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38250
[LightGBM] [Info] Number of data points in the train set: 25136, number of used features: 150
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000557 -> initscore=-7.492442
[LightGBM] [Info] Start training from score -7.492442
[LightGBM] [Info] Number of positive: 13, number of negative: 25123
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009459 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 38250
[LightGBM] [Info] Number of data points in the train set: 25136, number of used features: 150
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.000517 -> initscore=-7.566590
[LightGBM] [Info] Start training from score -7.566590
[LightGBM] [In

KeyboardInterrupt: 

In [8]:
import pandas as pd

# Show the number of positive samples for each label in Y
label_counts = pd.Series(Y.sum(axis=0))
print(label_counts)

# Show the number of positive samples for each label in X_reduced
feature_counts = pd.Series(X_reduced.sum(axis=0))
print(feature_counts)


0       16
1       16
2       20
3       17
4       16
        ..
3689     7
3690     8
3691     7
3692     7
3693     7
Length: 3694, dtype: int64
0      4.936482e+08
1     -2.051534e+06
2      5.079715e+05
3      7.630596e+04
4      1.382950e+05
           ...     
145   -9.010126e+01
146    1.976139e+01
147   -1.374587e+02
148   -3.276409e+02
149   -5.425242e+01
Length: 150, dtype: float64


In [9]:
import numpy as np

# Threshold: keep labels that have at least 50 positive samples
min_label_count = 50
label_mask = np.array(label_counts >= min_label_count)

# Filter Y
Y_filtered = Y[:, label_mask]

# Check how many labels remain
print("Labels after filtering:", Y_filtered.shape[1])


Labels after filtering: 18


In [10]:
# Identify samples (rows) that still have at least one label
row_mask = Y_filtered.sum(axis=1) > 0

# Apply the mask to filter both X and Y
X_filtered = X_reduced[row_mask]
Y_filtered = Y_filtered[row_mask]

# Check the new shapes
print("Filtered X shape:", X_filtered.shape)
print("Filtered Y shape:", Y_filtered.shape)


Filtered X shape: (31194, 150)
Filtered Y shape: (31194, 18)


In [11]:
from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, Y_train, Y_test = train_test_split(
    X_filtered, Y_filtered, test_size=0.2, random_state=42
)

print("Training samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])


Training samples: 24955
Testing samples: 6239


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, classification_report

# Initialize One-vs-Rest Logistic Regression
model = OneVsRestClassifier(LogisticRegression(solver='liblinear'))

# Train the model
model.fit(X_train, Y_train)

# Predict on test set
Y_pred = model.predict(X_test)

# Evaluate
print("F1 Score (micro):", f1_score(Y_test, Y_pred, average='micro'))
print("F1 Score (macro):", f1_score(Y_test, Y_pred, average='macro'))

# Optional detailed report
print("\nClassification report:")
print(classification_report(Y_test, Y_pred))


F1 Score (micro): 0.97574253462493
F1 Score (macro): 0.9627964528955424

Classification report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       724
           1       0.95      0.94      0.95       122
           2       1.00      0.98      0.99       694
           3       1.00      0.92      0.96        13
           4       1.00      0.88      0.94        17
           5       0.91      0.90      0.90       118
           6       0.97      0.96      0.97       133
           7       0.98      0.95      0.97       111
           8       0.99      0.95      0.97       131
           9       0.99      0.96      0.97       732
          10       0.95      0.91      0.93       126
          11       0.99      0.97      0.98       739
          12       0.97      0.90      0.93       123
          13       1.00      0.98      0.99       774
          14       0.99      0.98      0.98       126
          15       0.97      0.96      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score

# Initialize One-vs-Rest Random Forest
rf_model = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, n_jobs=-1))

# Train
rf_model.fit(X_train, Y_train)

# Predict
Y_pred_rf = rf_model.predict(X_test)

# Evaluate
print("Random Forest F1 Score (micro):", f1_score(Y_test, Y_pred_rf, average='micro'))
print("Random Forest F1 Score (macro):", f1_score(Y_test, Y_pred_rf, average='macro'))


Random Forest F1 Score (micro): 0.9781425205167791
Random Forest F1 Score (macro): 0.9367462732543592


In [14]:
%pip install xgboost

from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier

xgb_model = OneVsRestClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_jobs=-1))

xgb_model.fit(X_train, Y_train)

Y_pred_xgb = xgb_model.predict(X_test)

print("XGBoost F1 Score (micro):", f1_score(Y_test, Y_pred_xgb, average='micro'))
print("XGBoost F1 Score (macro):", f1_score(Y_test, Y_pred_xgb, average='macro'))


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
    --------------------------------------- 2.9/150.0 MB 33.2 MB/s eta 0:00:05
   - -------------------------------------- 4.2/150.0 MB 13.9 MB/s eta 0:00:11
   - -------------------------------------- 5.5/150.0 MB 9.9 MB/s eta 0:00:15
   - -------------------------------------- 6.6/150.0 MB 8.4 MB/s eta 0:00:18
   -- ------------------------------------- 7.6/150.0 MB 7.7 MB/s eta 0:00:19
   -- ------------------------------------- 8.7/150.0 MB 7.1 MB/s eta 0:00:21
   -- ------------------------------------- 9.7/150.0 MB 6.9 MB/s eta 0:00:21
   -- ------------------------------------- 10.7/150.0 MB 6.7 MB/s eta 0:00:21
   --- ------------------------------------ 11.3/150.0 MB 6.3 MB/s eta 0:00:22
   --- ------------------------------------ 11.5/150.0 MB 6.0 MB/s eta 0:00

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


XGBoost F1 Score (micro): 0.9950526651771465
XGBoost F1 Score (macro): 0.9853669410282397


In [15]:
from sklearn.metrics import classification_report

print("Classification Report for XGBoost:")
print(classification_report(Y_test, Y_pred_xgb))


Classification Report for XGBoost:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       724
           1       0.97      0.98      0.98       122
           2       1.00      1.00      1.00       694
           3       1.00      1.00      1.00        13
           4       1.00      0.88      0.94        17
           5       0.98      0.97      0.98       118
           6       0.99      0.95      0.97       133
           7       0.99      0.98      0.99       111
           8       0.99      0.98      0.98       131
           9       1.00      1.00      1.00       732
          10       0.98      0.94      0.96       126
          11       1.00      0.99      1.00       739
          12       0.99      0.95      0.97       123
          13       1.00      1.00      1.00       774
          14       1.00      0.98      0.99       126
          15       0.99      0.97      0.98       118
          16       1.00      1.00      1.00   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
import joblib

# Save the model
joblib.dump(xgb_model, 'xgb_multilabel_model.pkl')

# To load it later
# loaded_model = joblib.load('xgb_multilabel_model.pkl')


['xgb_multilabel_model.pkl']

In [18]:
import os
import joblib

# Create a directory in the project root
save_path = "../saved_model"
os.makedirs(save_path, exist_ok=True)

# Define model save location
model_path = os.path.join(save_path, "xgb_multilabel_model.pkl")

# Save the trained XGBoost model
joblib.dump(xgb_model, model_path)

print(f"Model saved successfully to: {model_path}")


Model saved successfully to: ../saved_model\xgb_multilabel_model.pkl
