In [1]:
# src/train_classifier.py

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score


In [2]:
# 1. Read the leads table from CSV (path is relative to the notebook)
df = pd.read_csv(r"../data/leads.csv")

# Target (y) is the heat label; the feature matrix (X) is every remaining
# column except the row identifier.
y = df["heat_label"]
X = df.drop(["heat_label", "lead_id"], axis=1)


In [3]:
# 2. Preprocessing: one-hot encode the categorical columns and forward the
#    numeric columns untouched. Categories that were not seen during fit are
#    ignored at transform time rather than raising.
categorical_features = ["source", "region", "role", "campaign", "last_touch"]
numeric_features = ["recency_days", "page_views", "prior_course_interest"]

preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

# 3. Train/Val/Test split: hold out 30% of the rows, then cut that hold-out
#    in half to get validation and test sets. Both cuts are stratified on the
#    label and use a fixed seed so the partition is reproducible.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)



In [4]:
# 4. Baseline model: preprocessing followed by a class-weighted
#    logistic regression
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)

# 5. Fit on the training split only
clf.fit(X_train, y_train)

# 6. Predict the validation split and report per-class metrics
y_val_pred = clf.predict(X_val)

print(
    "\nClassification Report (Validation):",
    classification_report(y_val, y_val_pred),
    sep="\n",
)
print(
    "\nConfusion Matrix (Validation):",
    confusion_matrix(y_val, y_val_pred),
    sep="\n",
)

# 7. Macro-averaged F1 on the validation split
macro_f1 = f1_score(y_val, y_val_pred, average="macro")
print(f"\nMacro F1 Score (Validation): {macro_f1:.3f}")



Classification Report (Validation):
              precision    recall  f1-score   support

        Cold       0.74      0.88      0.80        90
         Hot       0.70      0.93      0.80        28
        Warm       0.92      0.79      0.85       182

    accuracy                           0.83       300
   macro avg       0.79      0.86      0.82       300
weighted avg       0.84      0.83      0.83       300


Confusion Matrix (Validation):
[[ 79   0  11]
 [  0  26   2]
 [ 28  11 143]]

Macro F1 Score (Validation): 0.816


In [9]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels only, then
# apply that same mapping to every split.
le = LabelEncoder().fit(y_train)
y_train_enc = le.transform(y_train)
y_val_enc = le.transform(y_val)
y_test_enc = le.transform(y_test)  # for later

# The encoded arrays now hold 0,1,2 instead of 'Cold','Hot','Warm'


In [10]:
from sklearn.base import clone
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Build XGBoost pipeline.
# - The preprocessor is cloned: Pipeline fits its steps in place, so passing
#   the same ColumnTransformer object that `clf` holds would re-fit the
#   transformer inside the already-trained logistic-regression pipeline.
# - `use_label_encoder` is intentionally not passed: the installed xgboost
#   does not recognise it and only emits a "Parameters ... are not used"
#   warning. Labels are encoded explicitly with `le` instead.
xgb_clf = Pipeline(
    steps=[
        ("preprocessor", clone(preprocessor)),
        ("classifier", XGBClassifier(
            objective="multi:softmax",
            eval_metric="mlogloss",
            random_state=42
        )),
    ]
)

# Train XGBoost with the integer-encoded labels
xgb_clf.fit(X_train, y_train_enc)

# Predictions come back as encoded class ids
y_val_pred_xgb = xgb_clf.predict(X_val)

# Convert predictions back to original string labels so the reports are
# directly comparable with the logistic-regression ones
y_val_pred_xgb_labels = le.inverse_transform(y_val_pred_xgb)

# Classification report
print("\nXGBoost Classification Report (Validation):")
print(classification_report(y_val, y_val_pred_xgb_labels))

# Confusion matrix
print("\nXGBoost Confusion Matrix (Validation):")
print(confusion_matrix(y_val, y_val_pred_xgb_labels))

# Macro F1 score
macro_f1_xgb = f1_score(y_val, y_val_pred_xgb_labels, average="macro")
print(f"\nXGBoost Macro F1 Score (Validation): {macro_f1_xgb:.3f}")



XGBoost Classification Report (Validation):
              precision    recall  f1-score   support

        Cold       1.00      0.99      0.99        90
         Hot       1.00      1.00      1.00        28
        Warm       0.99      1.00      1.00       182

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300


XGBoost Confusion Matrix (Validation):
[[ 89   0   1]
 [  0  28   0]
 [  0   0 182]]

XGBoost Macro F1 Score (Validation): 0.997


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
