### Package Installation

In [27]:
!pip install -r requirements.txt
!pip install -q optuna




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


## Imports

In [28]:
import numpy as np
import pandas as pd
import warnings
import random
import os
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import average_precision_score
import lightgbm as lgb
import optuna

from lightgbm import early_stopping, log_evaluation

In [29]:
warnings.filterwarnings('ignore')
seed = 42
random.seed(seed)
np.random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

## Loading Train and Test Data

In [61]:
URL_TRAIN = "https://www.mxhackathon.co.za/docs/TrainData.csv"
URL_TEST  = "https://www.mxhackathon.co.za/docs/TestData.csv"
dfTrain = pd.read_csv(URL_TRAIN)
dfTest  = pd.read_csv(URL_TEST)
print("Data successfully loaded:", dfTrain.shape, dfTest.shape)

Data successfully loaded: (73286, 21) (15611, 17)


##  Targeting and dropping Leakage Columns

In [31]:
leak_cols = ["InFinanceProcessSystemApp", "FinanceApplied", "FinanceApproved"]
dfTrain.drop(columns=leak_cols, inplace=True, errors="ignore")

## Feature Engineering

In [32]:
def feature_engineering(df):
    df = df.copy()
    df["DTLeadCreated"] = pd.to_datetime(df["DTLeadCreated"], errors='coerce')
    df["DTLeadAllocated"] = pd.to_datetime(df["DTLeadAllocated"], errors='coerce')
    
    # Enhanced temporal features
    df["LeadCreationHour"] = df["DTLeadCreated"].dt.hour
    df["LeadCreationDay"] = df["DTLeadCreated"].dt.dayofweek
    df["LeadCreationMonth"] = df["DTLeadCreated"].dt.month
    df["LeadCreationDayOfMonth"] = df["DTLeadCreated"].dt.day
    df["IsWeekend"] = df["LeadCreationDay"].isin([5, 6]).astype(int)
    df["IsMorning"] = (df["LeadCreationHour"] >= 8) & (df["LeadCreationHour"] <= 12)
    df["IsAfternoon"] = (df["LeadCreationHour"] > 12) & (df["LeadCreationHour"] <= 17)
    df["IsEvening"] = (df["LeadCreationHour"] > 17) | (df["LeadCreationHour"] < 8)
    
    # Time difference features
    df["AllocationLagHours"] = (df["DTLeadAllocated"] - df["DTLeadCreated"]).dt.total_seconds() / 3600
    df["AllocationLagMinutes"] = (df["DTLeadAllocated"] - df["DTLeadCreated"]).dt.total_seconds() / 60
    
    # Email domain features
    df["EmailDomainLength"] = df["Domain"].astype(str).apply(len)
    df["IsGmailOrYahoo"] = df["Domain"].str.lower().isin(["gmail.com", "yahoo.com"]).astype(int)
    
    # Dropping datetime and text-heavy columns
    drop_cols = ["DTLeadCreated", "DTLeadAllocated", "OBSFullName", "OBSEmail", "CustomerID"]
    df.drop(columns=drop_cols, inplace=True, errors="ignore")
    return df

### Applying feature_engineering to datasets

In [33]:
dfTrain = feature_engineering(dfTrain)
dfTest = feature_engineering(dfTest)

## Adding frequency encoding for important categorical features

In [34]:
for col in ["LeadCreationHour", "LeadCreationDay", "Domain", "LeadSource", "Make", "Model"]:
    if col in dfTrain.columns:
        freq = dfTrain[col].value_counts() / len(dfTrain)
        dfTrain[col + "_freq"] = dfTrain[col].map(freq)
        # Using training distributions for test data to avoid data leakage
        dfTest[col + "_freq"] = dfTest[col].map(freq).fillna(0)

## Creating interaction features

In [35]:
dfTrain["Lag_per_EmailLen"] = dfTrain["AllocationLagHours"] / (dfTrain["EmailDomainLength"] + 1)
dfTest["Lag_per_EmailLen"] = dfTest["AllocationLagHours"] / (dfTest["EmailDomainLength"] + 1)

if "Make" in dfTrain.columns and "Model" in dfTrain.columns:
    dfTrain["Make_Model_Interaction"] = dfTrain["Make"].astype(str) + "_" + dfTrain["Model"].astype(str)
    dfTest["Make_Model_Interaction"] = dfTest["Make"].astype(str) + "_" + dfTest["Model"].astype(str)

In [36]:
all_data = pd.concat([dfTrain.drop(columns="VehicleSold"), dfTest], axis=0)
cat_cols = all_data.select_dtypes(include="object").columns

## Encoding categorical features

In [37]:
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    all_data[col] = le.fit_transform(all_data[col].astype(str))
    encoders[col] = le

X_train = all_data.iloc[:len(dfTrain), :]
X_test = all_data.iloc[len(dfTrain):, :]
y_train = dfTrain["VehicleSold"]

### Train/Test split

In [38]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=seed)

In [39]:
def objective(trial):
    params = {
        "objective": "binary",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "random_state": seed,
        "n_estimators": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5)
    }

    model = lgb.LGBMClassifier(**params)

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric="average_precision",
        callbacks=[
            early_stopping(stopping_rounds=50),
            log_evaluation(period=0)
        ]
    )

    preds = model.predict_proba(X_val)[:, 1]
    score = average_precision_score(y_val, preds)
    return score


## Running Optimization

In [40]:
# this will take about 5 min
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=45)

[I 2025-07-14 17:36:38,718] A new study created in memory with name: no-name-52a69656-9d61-4752-ae94-0a90ec13c096


Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:39,034] Trial 0 finished with value: 0.07886955063893974 and parameters: {'learning_rate': 0.09057914218578224, 'num_leaves': 46, 'min_child_samples': 19, 'feature_fraction': 0.8042082363356532, 'bagging_fraction': 0.8012015072878704, 'bagging_freq': 3, 'reg_alpha': 3.176542763261815, 'reg_lambda': 2.5295023877215788}. Best is trial 0 with value: 0.07886955063893974.


Early stopping, best iteration is:
[18]	valid_0's average_precision: 0.0788696	valid_0's binary_logloss: 0.174586
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:39,590] Trial 1 finished with value: 0.08028379602642843 and parameters: {'learning_rate': 0.08140109882828014, 'num_leaves': 177, 'min_child_samples': 98, 'feature_fraction': 0.773822291842254, 'bagging_fraction': 0.8815219843799699, 'bagging_freq': 1, 'reg_alpha': 4.831712819343149, 'reg_lambda': 1.1513995366184133}. Best is trial 1 with value: 0.08028379602642843.


Early stopping, best iteration is:
[32]	valid_0's average_precision: 0.0802838	valid_0's binary_logloss: 0.174205
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:40,491] Trial 2 finished with value: 0.08407486215079951 and parameters: {'learning_rate': 0.024974932987905013, 'num_leaves': 114, 'min_child_samples': 26, 'feature_fraction': 0.639724767565023, 'bagging_fraction': 0.8906119646303012, 'bagging_freq': 7, 'reg_alpha': 4.160645345788222, 'reg_lambda': 4.349885825802383}. Best is trial 2 with value: 0.08407486215079951.


Early stopping, best iteration is:
[105]	valid_0's average_precision: 0.0840749	valid_0's binary_logloss: 0.173852
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:41,176] Trial 3 finished with value: 0.08009229260431183 and parameters: {'learning_rate': 0.021152467698541046, 'num_leaves': 89, 'min_child_samples': 59, 'feature_fraction': 0.9313590449966008, 'bagging_fraction': 0.9795322145519638, 'bagging_freq': 8, 'reg_alpha': 3.4086739322168236, 'reg_lambda': 1.9217353585697965}. Best is trial 2 with value: 0.08407486215079951.


Early stopping, best iteration is:
[55]	valid_0's average_precision: 0.0800923	valid_0's binary_logloss: 0.174791
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:41,613] Trial 4 finished with value: 0.07736451403627158 and parameters: {'learning_rate': 0.0809162276464322, 'num_leaves': 98, 'min_child_samples': 68, 'feature_fraction': 0.7005890873502845, 'bagging_fraction': 0.7763056100167179, 'bagging_freq': 10, 'reg_alpha': 0.6781785178634903, 'reg_lambda': 2.8369133762931824}. Best is trial 2 with value: 0.08407486215079951.


Early stopping, best iteration is:
[26]	valid_0's average_precision: 0.0773645	valid_0's binary_logloss: 0.174158
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:42,691] Trial 5 finished with value: 0.08166907817018491 and parameters: {'learning_rate': 0.010526420006706852, 'num_leaves': 142, 'min_child_samples': 54, 'feature_fraction': 0.635148463736331, 'bagging_fraction': 0.7630762225809322, 'bagging_freq': 7, 'reg_alpha': 0.3790499154089433, 'reg_lambda': 4.267283699343372}. Best is trial 2 with value: 0.08407486215079951.


Early stopping, best iteration is:
[115]	valid_0's average_precision: 0.0816691	valid_0's binary_logloss: 0.174829
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:43,216] Trial 6 finished with value: 0.07844307011949977 and parameters: {'learning_rate': 0.08110371712720046, 'num_leaves': 70, 'min_child_samples': 20, 'feature_fraction': 0.8819567488968163, 'bagging_fraction': 0.853527877827262, 'bagging_freq': 7, 'reg_alpha': 3.635945641208029, 'reg_lambda': 3.674256777563904}. Best is trial 2 with value: 0.08407486215079951.


Early stopping, best iteration is:
[50]	valid_0's average_precision: 0.0784431	valid_0's binary_logloss: 0.174231
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:43,648] Trial 7 finished with value: 0.07867917228731028 and parameters: {'learning_rate': 0.07827995252072179, 'num_leaves': 63, 'min_child_samples': 19, 'feature_fraction': 0.8630524170492018, 'bagging_fraction': 0.8628606438977373, 'bagging_freq': 6, 'reg_alpha': 1.9554619022624042, 'reg_lambda': 3.1407421439739522}. Best is trial 2 with value: 0.08407486215079951.


Early stopping, best iteration is:
[43]	valid_0's average_precision: 0.0786792	valid_0's binary_logloss: 0.173735
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:44,209] Trial 8 finished with value: 0.08441214916196588 and parameters: {'learning_rate': 0.061503873955098765, 'num_leaves': 100, 'min_child_samples': 99, 'feature_fraction': 0.6514564127326442, 'bagging_fraction': 0.8827946786278817, 'bagging_freq': 2, 'reg_alpha': 2.195634618382334, 'reg_lambda': 1.845241969413674}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[34]	valid_0's average_precision: 0.0844121	valid_0's binary_logloss: 0.173848
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:44,720] Trial 9 finished with value: 0.07583762466333374 and parameters: {'learning_rate': 0.08590999679976516, 'num_leaves': 138, 'min_child_samples': 37, 'feature_fraction': 0.8373105508750986, 'bagging_fraction': 0.7709066025605449, 'bagging_freq': 10, 'reg_alpha': 0.16996636959875244, 'reg_lambda': 2.045807327788141}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[21]	valid_0's average_precision: 0.0758376	valid_0's binary_logloss: 0.174476
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:45,416] Trial 10 finished with value: 0.07334589547705457 and parameters: {'learning_rate': 0.04886886449287388, 'num_leaves': 248, 'min_child_samples': 99, 'feature_fraction': 0.9900185384934804, 'bagging_fraction': 0.6480219508415882, 'bagging_freq': 3, 'reg_alpha': 1.9524687959207887, 'reg_lambda': 0.27408533034223126}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[37]	valid_0's average_precision: 0.0733459	valid_0's binary_logloss: 0.175117
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:46,251] Trial 11 finished with value: 0.08382847982826501 and parameters: {'learning_rate': 0.04890599649391322, 'num_leaves': 181, 'min_child_samples': 79, 'feature_fraction': 0.6136607947321815, 'bagging_fraction': 0.9494719645640437, 'bagging_freq': 4, 'reg_alpha': 4.976060663055681, 'reg_lambda': 4.793111802661341}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[69]	valid_0's average_precision: 0.0838285	valid_0's binary_logloss: 0.173736
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:46,988] Trial 12 finished with value: 0.08204116958593871 and parameters: {'learning_rate': 0.03555788043317811, 'num_leaves': 27, 'min_child_samples': 38, 'feature_fraction': 0.7068995430625553, 'bagging_fraction': 0.927083176106584, 'bagging_freq': 1, 'reg_alpha': 2.469358433538285, 'reg_lambda': 1.5169460560853505}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[161]	valid_0's average_precision: 0.0820412	valid_0's binary_logloss: 0.173371
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:47,466] Trial 13 finished with value: 0.07657271803830476 and parameters: {'learning_rate': 0.06364556350111826, 'num_leaves': 111, 'min_child_samples': 39, 'feature_fraction': 0.6883430185154122, 'bagging_fraction': 0.9013556571971678, 'bagging_freq': 5, 'reg_alpha': 4.178176271070168, 'reg_lambda': 0.7203922902768793}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[15]	valid_0's average_precision: 0.0765727	valid_0's binary_logloss: 0.175525
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:48,055] Trial 14 finished with value: 0.0729257014623462 and parameters: {'learning_rate': 0.06440308378424053, 'num_leaves': 183, 'min_child_samples': 84, 'feature_fraction': 0.7520642597566632, 'bagging_fraction': 0.6886222618675149, 'bagging_freq': 8, 'reg_alpha': 1.2763548832086846, 'reg_lambda': 3.6796516652468796}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[20]	valid_0's average_precision: 0.0729257	valid_0's binary_logloss: 0.175246
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:48,854] Trial 15 finished with value: 0.08186070612761534 and parameters: {'learning_rate': 0.034365720937716046, 'num_leaves': 115, 'min_child_samples': 10, 'feature_fraction': 0.6433833538018608, 'bagging_fraction': 0.997846652506627, 'bagging_freq': 2, 'reg_alpha': 2.757018950024221, 'reg_lambda': 4.688269663917691}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[76]	valid_0's average_precision: 0.0818607	valid_0's binary_logloss: 0.173544
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:49,676] Trial 16 finished with value: 0.08162262347498052 and parameters: {'learning_rate': 0.033798361988639586, 'num_leaves': 159, 'min_child_samples': 49, 'feature_fraction': 0.6760886100730755, 'bagging_fraction': 0.8166358657973902, 'bagging_freq': 5, 'reg_alpha': 4.136196630085417, 'reg_lambda': 3.4687097974959245}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[73]	valid_0's average_precision: 0.0816226	valid_0's binary_logloss: 0.174055
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:50,169] Trial 17 finished with value: 0.07712282829190548 and parameters: {'learning_rate': 0.06325024446449076, 'num_leaves': 235, 'min_child_samples': 83, 'feature_fraction': 0.6010375465611106, 'bagging_fraction': 0.7282530457186692, 'bagging_freq': 4, 'reg_alpha': 1.4614847288214632, 'reg_lambda': 2.1197423540733156}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[11]	valid_0's average_precision: 0.0771228	valid_0's binary_logloss: 0.176248
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:50,996] Trial 18 finished with value: 0.08002220808828799 and parameters: {'learning_rate': 0.02327585326703116, 'num_leaves': 209, 'min_child_samples': 29, 'feature_fraction': 0.7443327734671431, 'bagging_fraction': 0.8368919186716313, 'bagging_freq': 9, 'reg_alpha': 2.5739019502056157, 'reg_lambda': 4.1700863303935725}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[37]	valid_0's average_precision: 0.0800222	valid_0's binary_logloss: 0.175772
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:51,714] Trial 19 finished with value: 0.0765640540747595 and parameters: {'learning_rate': 0.09675596327344278, 'num_leaves': 85, 'min_child_samples': 68, 'feature_fraction': 0.660056707547333, 'bagging_fraction': 0.9104774474632444, 'bagging_freq': 6, 'reg_alpha': 4.162298878682263, 'reg_lambda': 1.3253890510377282}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[30]	valid_0's average_precision: 0.0765641	valid_0's binary_logloss: 0.17434
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:52,729] Trial 20 finished with value: 0.07957600374981533 and parameters: {'learning_rate': 0.052778705887596804, 'num_leaves': 124, 'min_child_samples': 47, 'feature_fraction': 0.7272781067306273, 'bagging_fraction': 0.9553628611443961, 'bagging_freq': 3, 'reg_alpha': 2.04863760348229, 'reg_lambda': 0.095816050683184}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[47]	valid_0's average_precision: 0.079576	valid_0's binary_logloss: 0.173971
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:53,865] Trial 21 finished with value: 0.08430401648846306 and parameters: {'learning_rate': 0.04019214168146862, 'num_leaves': 202, 'min_child_samples': 89, 'feature_fraction': 0.6038831005713898, 'bagging_fraction': 0.9323561065751248, 'bagging_freq': 4, 'reg_alpha': 4.993768566129774, 'reg_lambda': 4.70225560321455}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[73]	valid_0's average_precision: 0.084304	valid_0's binary_logloss: 0.173596
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:54,554] Trial 22 finished with value: 0.08403912272977089 and parameters: {'learning_rate': 0.038438516993644194, 'num_leaves': 155, 'min_child_samples': 91, 'feature_fraction': 0.6342592432237072, 'bagging_fraction': 0.8834857983981855, 'bagging_freq': 2, 'reg_alpha': 4.67172918858401, 'reg_lambda': 4.970187367228483}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[22]	valid_0's average_precision: 0.0840391	valid_0's binary_logloss: 0.175924
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:55,664] Trial 23 finished with value: 0.08295096268661337 and parameters: {'learning_rate': 0.04278772803151829, 'num_leaves': 213, 'min_child_samples': 92, 'feature_fraction': 0.6096734520042835, 'bagging_fraction': 0.9302137158957187, 'bagging_freq': 4, 'reg_alpha': 4.385240334189578, 'reg_lambda': 4.345289005424987}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[71]	valid_0's average_precision: 0.082951	valid_0's binary_logloss: 0.173726
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:57,395] Trial 24 finished with value: 0.08336243134742892 and parameters: {'learning_rate': 0.02203503086342514, 'num_leaves': 202, 'min_child_samples': 75, 'feature_fraction': 0.6646346221730252, 'bagging_fraction': 0.9645594142216012, 'bagging_freq': 2, 'reg_alpha': 3.724482102162914, 'reg_lambda': 3.9523251075139867}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[115]	valid_0's average_precision: 0.0833624	valid_0's binary_logloss: 0.17359
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:36:58,641] Trial 25 finished with value: 0.08214777627829632 and parameters: {'learning_rate': 0.028492503082956967, 'num_leaves': 105, 'min_child_samples': 92, 'feature_fraction': 0.6012364770098371, 'bagging_fraction': 0.8260563592516666, 'bagging_freq': 7, 'reg_alpha': 3.0303519579070164, 'reg_lambda': 3.125788205232295}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[100]	valid_0's average_precision: 0.0821478	valid_0's binary_logloss: 0.173827
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:00,752] Trial 26 finished with value: 0.0801188748198406 and parameters: {'learning_rate': 0.011037009193486848, 'num_leaves': 128, 'min_child_samples': 100, 'feature_fraction': 0.7157981034224259, 'bagging_fraction': 0.6021552975628935, 'bagging_freq': 5, 'reg_alpha': 4.509131417570429, 'reg_lambda': 4.615311844080217}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[187]	valid_0's average_precision: 0.0801189	valid_0's binary_logloss: 0.174678
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:01,440] Trial 27 finished with value: 0.07954329418077785 and parameters: {'learning_rate': 0.07280833699269168, 'num_leaves': 65, 'min_child_samples': 67, 'feature_fraction': 0.650116965829396, 'bagging_fraction': 0.8672160437744415, 'bagging_freq': 4, 'reg_alpha': 3.7344389246908314, 'reg_lambda': 1.6529563698733867}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[43]	valid_0's average_precision: 0.0795433	valid_0's binary_logloss: 0.174117
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:02,270] Trial 28 finished with value: 0.07903130805878868 and parameters: {'learning_rate': 0.058241537823468706, 'num_leaves': 154, 'min_child_samples': 88, 'feature_fraction': 0.7813362385226509, 'bagging_fraction': 0.9114836766304598, 'bagging_freq': 6, 'reg_alpha': 1.4161203562241165, 'reg_lambda': 2.3352005284707387}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[27]	valid_0's average_precision: 0.0790313	valid_0's binary_logloss: 0.174309
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:03,042] Trial 29 finished with value: 0.08052287397883534 and parameters: {'learning_rate': 0.04291219497790991, 'num_leaves': 39, 'min_child_samples': 29, 'feature_fraction': 0.6873853878543755, 'bagging_fraction': 0.8920768741038223, 'bagging_freq': 3, 'reg_alpha': 3.141577733143262, 'reg_lambda': 2.6281628041922245}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[79]	valid_0's average_precision: 0.0805229	valid_0's binary_logloss: 0.173834
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:04,117] Trial 30 finished with value: 0.07483745057581179 and parameters: {'learning_rate': 0.07226721132033086, 'num_leaves': 230, 'min_child_samples': 75, 'feature_fraction': 0.6305024275358232, 'bagging_fraction': 0.9355537227104931, 'bagging_freq': 3, 'reg_alpha': 2.3013555050231664, 'reg_lambda': 0.7215399737611121}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[28]	valid_0's average_precision: 0.0748375	valid_0's binary_logloss: 0.174706
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:04,880] Trial 31 finished with value: 0.08325980913826041 and parameters: {'learning_rate': 0.04067467604137394, 'num_leaves': 159, 'min_child_samples': 92, 'feature_fraction': 0.6299298954885422, 'bagging_fraction': 0.8769892455483538, 'bagging_freq': 2, 'reg_alpha': 4.62506874195639, 'reg_lambda': 4.936637467634596}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[29]	valid_0's average_precision: 0.0832598	valid_0's binary_logloss: 0.175123
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:06,548] Trial 32 finished with value: 0.08264044779304017 and parameters: {'learning_rate': 0.026492916606183945, 'num_leaves': 191, 'min_child_samples': 93, 'feature_fraction': 0.6613686192803305, 'bagging_fraction': 0.8421404639281656, 'bagging_freq': 1, 'reg_alpha': 4.7698792431797985, 'reg_lambda': 4.961552765187654}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[113]	valid_0's average_precision: 0.0826404	valid_0's binary_logloss: 0.173748
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:07,572] Trial 33 finished with value: 0.08270574455658791 and parameters: {'learning_rate': 0.03826649767890426, 'num_leaves': 171, 'min_child_samples': 85, 'feature_fraction': 0.6299671456176547, 'bagging_fraction': 0.8045782655285142, 'bagging_freq': 2, 'reg_alpha': 4.987554046886839, 'reg_lambda': 4.417832555323083}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[47]	valid_0's average_precision: 0.0827057	valid_0's binary_logloss: 0.174594
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:09,033] Trial 34 finished with value: 0.0822153324919816 and parameters: {'learning_rate': 0.01636931420237011, 'num_leaves': 96, 'min_child_samples': 96, 'feature_fraction': 0.8130698784058433, 'bagging_fraction': 0.8847609173192997, 'bagging_freq': 1, 'reg_alpha': 4.406445253555685, 'reg_lambda': 4.567692268565029}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[111]	valid_0's average_precision: 0.0822153	valid_0's binary_logloss: 0.174145
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:10,564] Trial 35 finished with value: 0.08219104875575532 and parameters: {'learning_rate': 0.030039519699662485, 'num_leaves': 148, 'min_child_samples': 63, 'feature_fraction': 0.681363146280002, 'bagging_fraction': 0.9739462452783733, 'bagging_freq': 8, 'reg_alpha': 4.055387509913395, 'reg_lambda': 4.040588938021424}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[95]	valid_0's average_precision: 0.082191	valid_0's binary_logloss: 0.173667
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:11,532] Trial 36 finished with value: 0.08166057462214975 and parameters: {'learning_rate': 0.04676028863441558, 'num_leaves': 77, 'min_child_samples': 76, 'feature_fraction': 0.6219405990910496, 'bagging_fraction': 0.7869451841625293, 'bagging_freq': 2, 'reg_alpha': 3.406705784488558, 'reg_lambda': 3.835467030216008}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[55]	valid_0's average_precision: 0.0816606	valid_0's binary_logloss: 0.174243
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:12,276] Trial 37 finished with value: 0.0819507222632399 and parameters: {'learning_rate': 0.0576698751328818, 'num_leaves': 123, 'min_child_samples': 57, 'feature_fraction': 0.6500538621753311, 'bagging_fraction': 0.8534214934589985, 'bagging_freq': 1, 'reg_alpha': 3.878652451371659, 'reg_lambda': 3.318546052008541}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[36]	valid_0's average_precision: 0.0819507	valid_0's binary_logloss: 0.174135
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:14,660] Trial 38 finished with value: 0.082071898218694 and parameters: {'learning_rate': 0.015432391076966234, 'num_leaves': 168, 'min_child_samples': 87, 'feature_fraction': 0.9157051663097402, 'bagging_fraction': 0.9984155789118252, 'bagging_freq': 9, 'reg_alpha': 4.740922706438963, 'reg_lambda': 2.784121654679477}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[170]	valid_0's average_precision: 0.0820719	valid_0's binary_logloss: 0.173821
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:15,444] Trial 39 finished with value: 0.07845746327211216 and parameters: {'learning_rate': 0.05226358196954604, 'num_leaves': 54, 'min_child_samples': 11, 'feature_fraction': 0.7334705983339621, 'bagging_fraction': 0.9137436753658011, 'bagging_freq': 7, 'reg_alpha': 0.9833634124363004, 'reg_lambda': 1.8385185697989717}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[56]	valid_0's average_precision: 0.0784575	valid_0's binary_logloss: 0.173881
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:16,788] Trial 40 finished with value: 0.08054724081966154 and parameters: {'learning_rate': 0.04674608359055871, 'num_leaves': 136, 'min_child_samples': 80, 'feature_fraction': 0.7011045199765774, 'bagging_fraction': 0.946393122898772, 'bagging_freq': 3, 'reg_alpha': 2.9670011265405902, 'reg_lambda': 4.4448649972448}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[54]	valid_0's average_precision: 0.0805472	valid_0's binary_logloss: 0.173933
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:17,930] Trial 41 finished with value: 0.08102348037808323 and parameters: {'learning_rate': 0.050394255865250884, 'num_leaves': 192, 'min_child_samples': 79, 'feature_fraction': 0.616184193491297, 'bagging_fraction': 0.9447726599661836, 'bagging_freq': 4, 'reg_alpha': 4.958691748863952, 'reg_lambda': 4.792548363041343}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[51]	valid_0's average_precision: 0.0810235	valid_0's binary_logloss: 0.174066
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:19,064] Trial 42 finished with value: 0.07896630806777222 and parameters: {'learning_rate': 0.05647107934162469, 'num_leaves': 180, 'min_child_samples': 96, 'feature_fraction': 0.6403436364961427, 'bagging_fraction': 0.887532663974317, 'bagging_freq': 5, 'reg_alpha': 4.641824243805602, 'reg_lambda': 4.938817514797199}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[35]	valid_0's average_precision: 0.0789663	valid_0's binary_logloss: 0.174347
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:20,306] Trial 43 finished with value: 0.08385273902822113 and parameters: {'learning_rate': 0.03799187763199631, 'num_leaves': 223, 'min_child_samples': 88, 'feature_fraction': 0.6018539385131686, 'bagging_fraction': 0.9239752040448184, 'bagging_freq': 4, 'reg_alpha': 4.963226993715288, 'reg_lambda': 4.20910134720847}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[70]	valid_0's average_precision: 0.0838527	valid_0's binary_logloss: 0.173795
Training until validation scores don't improve for 50 rounds


[I 2025-07-14 17:37:21,543] Trial 44 finished with value: 0.08380879505799238 and parameters: {'learning_rate': 0.030248775622088206, 'num_leaves': 255, 'min_child_samples': 88, 'feature_fraction': 0.6752727266271814, 'bagging_fraction': 0.868272259292451, 'bagging_freq': 4, 'reg_alpha': 3.402778526945394, 'reg_lambda': 4.164819112158622}. Best is trial 8 with value: 0.08441214916196588.


Early stopping, best iteration is:
[32]	valid_0's average_precision: 0.0838088	valid_0's binary_logloss: 0.175239


### Cross-Fold Ensembling with Best Parameters

In [41]:
best_params = study.best_params
best_params.update({"objective": "binary", "metric": "average_precision", "verbosity": -1, "seed": seed})

## Calculating class weight for imbalance handling

In [42]:
from collections import Counter
counter = Counter(y_train)
scale_pos_weight = counter[0] / counter[1]
print("Scale pos weight:", scale_pos_weight)

# Adding scale_pos_weight to best params
best_params["scale_pos_weight"] = scale_pos_weight

Scale pos weight: 21.703221809169765


### Training with cross-validation and create model ensemble

In [43]:
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
cv_models = []
cv_scores = []
oof_preds = np.zeros(len(X_train))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train)):
    X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    fold_model = lgb.LGBMClassifier(**best_params)
    fold_model.fit(
        X_fold_train, y_fold_train,
        eval_set=[(X_fold_val, y_fold_val)],
        eval_metric="average_precision",
        callbacks=[
            early_stopping(stopping_rounds=100),
            log_evaluation(period=0)
        ]
    )
    
    # Store out-of-fold predictions for threshold optimization
    oof_preds[val_idx] = fold_model.predict_proba(X_fold_val)[:, 1]
    
    fold_preds = fold_model.predict_proba(X_fold_val)[:, 1]
    fold_score = average_precision_score(y_fold_val, fold_preds)
    print(f"Fold {fold+1} PR-AUC: {fold_score:.6f}")
    
    cv_models.append(fold_model)
    cv_scores.append(fold_score)

print(f"Mean CV PR-AUC: {np.mean(cv_scores):.6f} ± {np.std(cv_scores):.6f}")

Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[10]	valid_0's average_precision: 0.0792678
Fold 1 PR-AUC: 0.079268
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[42]	valid_0's average_precision: 0.0749151
Fold 2 PR-AUC: 0.074915
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[22]	valid_0's average_precision: 0.0729026
Fold 3 PR-AUC: 0.072903
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[24]	valid_0's average_precision: 0.0790928
Fold 4 PR-AUC: 0.079093
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[10]	valid_0's average_precision: 0.073329
Fold 5 PR-AUC: 0.073329
Mean CV PR-AUC: 0.075901 ± 0.002760


### Training single model on all data

In [44]:
final_model = lgb.LGBMClassifier(**best_params)
final_model.fit(X_train, y_train)

## Generating ensemble predictions (average of CV models)

In [45]:
cv_test_preds = np.zeros(len(X_test))
for model in cv_models:
    cv_test_preds += model.predict_proba(X_test)[:, 1]
cv_test_preds /= len(cv_models)

## Generating single model predictions

In [46]:
single_test_preds = final_model.predict_proba(X_test)[:, 1]

### Trying different ensemble weights to find optimal blend

In [47]:
blend_weights = [0.5, 0.6, 0.7, 0.8, 0.9]
blend_preds_dict = {}
for weight in blend_weights:
    blend_preds_dict[weight] = weight * cv_test_preds + (1-weight) * single_test_preds

### Defaulting to 0.8 weight for ensemble

In [48]:
probabilities = 0.8 * cv_test_preds + 0.2 * single_test_preds

## Feature importance analysis

In [49]:
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': final_model.feature_importances_
}).sort_values(by='Importance', ascending=False)
print("Top 20 features by importance:")
print(feature_importance.head(20))

Top 20 features by importance:
                   Feature  Importance
0                   LeadID         993
6            InterestModel         896
1                   Dealer         756
2               LeadSource         653
8               CellPrefix         623
27         LeadSource_freq         620
10           HourOfEnquiry         615
11            DayOfEnquiry         610
5             InterestMake         585
28        Lag_per_EmailLen         553
20      AllocationLagHours         550
24   LeadCreationHour_freq         499
13         LeadCreationDay         332
15  LeadCreationDayOfMonth         257
7                   Domain         216
26             Domain_freq         197
25    LeadCreationDay_freq         191
3                 LeadType         186
21    AllocationLagMinutes         163
12        LeadCreationHour         150


## Predicting On Test Data

In [50]:
dfTest["LeadID"] = dfTest["LeadID"].astype(int)

In [51]:
def save_submission(df: pd.DataFrame, probs: pd.Series | np.ndarray):
    """
    Writes the CSV exactly as the leaderboard expects.
    """
    out = pd.DataFrame(
        {
            "LeadID":                 df["LeadID"].values,
            "VehicleSoldProbability": probs
        }
    )

    # minimal sanity checks
    assert len(out) == len(df),                "Length mismatch."
    assert out["VehicleSoldProbability"].between(0, 1).all(), "Probs out of range."
    assert out["LeadID"].is_unique,            "Duplicate LeadID values."
    assert not out.isna().any().any(),         "NaNs detected."

    fname = f"submission.csv"
    out.to_csv(fname, index=False)

save_submission(dfTest, probabilities)