### Install libraries

In [56]:
!pip install pandas scikit-learn joblib matplotlib seaborn xgboost lightgbm catboost shap imbalanced-learn optuna


Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.3-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.41-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting PyYAML (from optuna)
  Using cached PyYAML-6.0.2-cp311-cp311-win_amd64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.2.3-cp311-cp311-win_amd64.whl.metadata (4.2 kB)
Collecting MarkupSafe>=0.9.2 (from Mako->alembic>=1.5.0->optuna)
  Using cached MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl.metadata (4.1 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
   ---------------------------------------- 0.0/395.9 kB ? eta -:--:-


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## Imports


In [22]:
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
import shap
import optuna

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, log_loss, f1_score
from imblearn.over_sampling import SMOTENC
from sklearn.feature_selection import VarianceThreshold


from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE

## Preprocessing & Feature Engineering

In [23]:
def preprocess_features(df):
    df = df.copy()
    drop_cols = [
        'LeadID', 'CustomerID', 'OBSFullName', 'OBSEmail', 'Domain',
        'InFinanceProcessSystemApp', 'FinanceApplied', 'FinanceApproved'
    ]
    df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True, errors='ignore')

    if 'DTLeadCreated' in df.columns:
        df['DTLeadCreated'] = pd.to_datetime(df['DTLeadCreated'])
        df['lead_hour'] = df['DTLeadCreated'].dt.hour
        df['lead_dayofweek'] = df['DTLeadCreated'].dt.dayofweek
        df['lead_weekend'] = (df['lead_dayofweek'] >= 5).astype(int)

    for col in df.select_dtypes(include='object').columns:
        freq = df[col].value_counts(normalize=True)
        df[f"{col}_freq"] = df[col].map(freq)

    if {'Province', 'SourceCategory'}.issubset(df.columns):
        df['Province_Source'] = df['Province'] + "_" + df['SourceCategory']

    cat_cols = df.select_dtypes(include='object').columns.tolist()
    num_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

    return df, cat_cols, num_cols

## Transformation Pipeline

In [24]:
def fit_transform_pipeline(df, cat_cols, num_cols):
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    df[cat_cols] = encoder.fit_transform(df[cat_cols].astype(str))

    imputer = SimpleImputer(strategy='mean')
    df[num_cols] = imputer.fit_transform(df[num_cols])

    scaler = StandardScaler()
    df[num_cols] = scaler.fit_transform(df[num_cols])

    return df, encoder, scaler, imputer

def transform_pipeline(df, cat_cols, num_cols, encoder, scaler, imputer):
    df[cat_cols] = encoder.transform(df[cat_cols].astype(str))
    df[num_cols] = imputer.transform(df[num_cols])
    df[num_cols] = scaler.transform(df[num_cols])
    return df

## SMOTENC Balancing Function

In [25]:
def balance_data(X, y, cat_cols):
    cat_indices = [X.columns.get_loc(col) for col in cat_cols if col in X.columns]
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X, y)
    return X_res, y_res


## Optuna Hyperparameter Tuning Functions

In [26]:
def objective(trial, X, y):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 20, 200),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'class_weight': 'balanced',
        'random_state': 42
    }
    model = LGBMClassifier(**params)
    score = cross_val_score(model, X, y, scoring='average_precision', cv=5, n_jobs=-1)
    return score.mean()

def tune_model_with_optuna(X, y):
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X, y), n_trials=30)
    return study.best_params


## Stacking Ensemble Function

In [27]:
def get_stacked_model(xgb, lgbm, cat, gbc):
    base_models = [
        ('xgb', xgb),
        ('lgbm', lgbm),
        ('cat', cat),
        ('gbc', gbc)
    ]
    stacker = LogisticRegression(class_weight='balanced', solver='liblinear')
    stacked_model = StackingClassifier(estimators=base_models, final_estimator=stacker, cv=5, passthrough=True)
    return stacked_model


## SHAP Feature Selection Function

In [39]:
def select_top_features_with_shap_from_stacker(stacker, X, model_name='lgbm', top_n=15):
    # Get the base model
    base_model = stacker.named_estimators_[model_name]

    # Ensure the input columns match training
    assert all(col in X.columns for col in base_model.feature_name_), "Mismatch in feature columns"

    # Create SHAP explainer
    explainer = shap.Explainer(base_model, X)

    # Disable additivity check to avoid numerical assertion errors
    shap_values = explainer(X, check_additivity=False)

    # Compute mean absolute SHAP values
    mean_shap = np.abs(shap_values.values).mean(axis=0)

    # Select top N important features
    selected_indices = np.argsort(mean_shap)[-top_n:]
    selected_features = X.columns[selected_indices]

    return selected_features


## Threshold Optimization Function

In [29]:
def find_best_threshold(y_true, y_probs, metric='pr'):
    thresholds = np.linspace(0.01, 0.99, 100)
    scores = []
    for t in thresholds:
        preds = (y_probs >= t).astype(int)
        if metric == 'f1':
            scores.append(f1_score(y_true, preds))
        else:
            scores.append(average_precision_score(y_true, preds))
    best_idx = np.argmax(scores)
    return thresholds[best_idx], scores[best_idx]


## Preparing training data

In [30]:
train_df = pd.read_csv("https://www.mxhackathon.co.za/docs/TrainData.csv")
y = train_df['VehicleSold']
X = train_df.drop(columns=['VehicleSold'])

### Preprocess

In [31]:
X, cat_cols, num_cols = preprocess_features(X)
X, encoder, scaler, imputer = fit_transform_pipeline(X, cat_cols, num_cols)

### Balance

In [32]:
datetime_cols = X.select_dtypes(include='datetime64[ns]').columns.tolist()
if datetime_cols:
    print("Dropping datetime columns before SMOTENC:", datetime_cols)
    X = X.drop(columns=datetime_cols)

X_bal, y_bal = balance_data(X, y, cat_cols)

Dropping datetime columns before SMOTENC: ['DTLeadCreated']


### Tune LightGBM

In [33]:
best_params = tune_model_with_optuna(X_bal, y_bal)
lgbm_model = LGBMClassifier(**best_params)

[I 2025-07-10 23:38:34,224] A new study created in memory with name: no-name-b1b5247f-9253-478f-9329-d532ab9f75e5
[I 2025-07-10 23:38:59,413] Trial 0 finished with value: 0.9615860983110007 and parameters: {'num_leaves': 67, 'max_depth': 3, 'learning_rate': 0.032153267040331425, 'n_estimators': 433}. Best is trial 0 with value: 0.9615860983110007.
[I 2025-07-10 23:39:20,725] Trial 1 finished with value: 0.9753869403937283 and parameters: {'num_leaves': 75, 'max_depth': 11, 'learning_rate': 0.22718682405971538, 'n_estimators': 442}. Best is trial 1 with value: 0.9753869403937283.
[I 2025-07-10 23:39:37,137] Trial 2 finished with value: 0.976406486280362 and parameters: {'num_leaves': 60, 'max_depth': 9, 'learning_rate': 0.11567706373835887, 'n_estimators': 419}. Best is trial 2 with value: 0.976406486280362.
[I 2025-07-10 23:40:03,453] Trial 3 finished with value: 0.9772920895568976 and parameters: {'num_leaves': 42, 'max_depth': 7, 'learning_rate': 0.14663740504001274, 'n_estimators': 

### Other base models

In [34]:
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=3, random_state=42)
cat_model = CatBoostClassifier(verbose=0, class_weights=[1, 3], random_state=42)
gbc_model = GradientBoostingClassifier(random_state=42)

### Stacked ensemble

In [35]:
stacked_model = get_stacked_model(xgb_model, lgbm_model, cat_model, gbc_model)
stacked_model.fit(X_bal, y_bal)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 70058, number of negative: 70058
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003276 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4883
[LightGBM] [Info] Number of data points in the train set: 140116, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 56046, number of negative: 56046
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005653 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4883
[LightGBM] [Info] Number of data points in the train set: 112092, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 56047, number of negative: 56046
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007851 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4883
[LightGBM] [Info] Number of data points in the train set: 112093, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500004 -> initscore=0.000018
[LightGBM] [Info] Start training from score 0.000018
[LightGBM] [Info] Number of positive: 56047, number of negative: 56

### SHAP feature selection

In [40]:
selected_features = select_top_features_with_shap_from_stacker(stacked_model, X_bal, model_name='lgbm', top_n=15)
X_selected = X_bal[selected_features]



## Retrain on selected features

In [41]:
stacked_model.fit(X_selected, y_bal)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 70058, number of negative: 70058
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 140116, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 56046, number of negative: 56046
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001026 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 112092, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 56047, number of negative: 56046
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 112093, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500004 -> initscore=0

### Cross-validation predictions

In [42]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
y_oof = np.zeros(len(X_selected))

In [43]:
for train_idx, val_idx in skf.split(X_selected, y_bal):
    X_train, X_val = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
    y_train, y_val = y_bal[train_idx], y_bal[val_idx]
    stacked_model.fit(X_train, y_train)
    y_oof[val_idx] = stacked_model.predict_proba(X_val)[:, 1]

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 56046, number of negative: 56046
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000876 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 112092, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 44837, number of negative: 44836
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003055 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89673, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500006 -> initscore=0.000022
[LightGBM] [Info] Start training from score 0.000022
[LightGBM] [Info] Number of positive: 44836, number of negative: 44837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002779 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89673, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499994 -> initscore=-0.000022
[LightGBM] [Info] Start training from score -0.000022
[LightGBM] [In

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 56047, number of negative: 56046
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 112093, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500004 -> initscore=0.000018
[LightGBM] [Info] Start training from score 0.000018


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 44838, number of negative: 44836
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000687 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89674, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500011 -> initscore=0.000045
[LightGBM] [Info] Start training from score 0.000045
[LightGBM] [Info] Number of positive: 44837, number of negative: 44837
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001046 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89674, number of used features: 15
[LightGBM] [Info] [b

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 56047, number of negative: 56046
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000948 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 112093, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500004 -> initscore=0.000018
[LightGBM] [Info] Start training from score 0.000018


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 44838, number of negative: 44836
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001899 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89674, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500011 -> initscore=0.000045
[LightGBM] [Info] Start training from score 0.000045
[LightGBM] [Info] Number of positive: 44837, number of negative: 44837
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89674, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[Ligh

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 56046, number of negative: 56047
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 112093, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499996 -> initscore=-0.000018
[LightGBM] [Info] Start training from score -0.000018


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 44837, number of negative: 44837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89674, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 44837, number of negative: 44837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89674, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 44836, number of negative: 44838
[LightGBM] [Info] Auto-choosing col-wise multi-thre

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 56046, number of negative: 56047
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000828 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 112093, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499996 -> initscore=-0.000018
[LightGBM] [Info] Start training from score -0.000018


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 44837, number of negative: 44837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001860 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89674, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 44837, number of negative: 44837
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000680 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3594
[LightGBM] [Info] Number of data points in the train set: 89674, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 44836, number of neg

## Evaluation

In [44]:
print("📈 PR-AUC:", average_precision_score(y_bal, y_oof))
print("📉 Log Loss:", log_loss(y_bal, y_oof))

📈 PR-AUC: 0.9850177334920769
📉 Log Loss: 0.1101132781765245


## Threshold Optimization

In [45]:
best_thresh, best_score = find_best_threshold(y_bal, y_oof)
print(f"🔍 Best Threshold: {best_thresh} with score {best_score}")

🔍 Best Threshold: 0.792020202020202 with score 0.9742906507350121


## Preparing Test Data

In [46]:
test_df = pd.read_csv("https://www.mxhackathon.co.za/docs/TestData.csv")

## Predicting Test Probabilities

In [48]:
lead_ids = test_df['LeadID']

X_test, _, _ = preprocess_features(test_df)
X_test = transform_pipeline(X_test, cat_cols, num_cols, encoder, scaler, imputer)
X_test = X_test[selected_features]

final_probs = stacked_model.predict_proba(X_test)[:, 1]
final_preds = (final_probs >= best_thresh).astype(int)

submission = pd.DataFrame({
    "LeadID": lead_ids,
    "VehicleSoldProbability": final_probs,
})

submission.to_csv("ThouCSV1.csv", index=False)