# **Model Building**

### **Importing Libraries**

In [2]:
!pip install lightgbm xgboost catboost polars numpy matplotlib seaborn

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

### **Load final datasets**

In [4]:
# Directory holding the final parquet files.
# Set the AMEX_DATA_DIR environment variable to point somewhere else; when it
# is not set, the original local path below is used unchanged.
DATA_DIR = os.environ.get(
    "AMEX_DATA_DIR",
    r"D:\Case studies competitions\Amex\CSV\final_data",
)
print(f"Data directory set to: {DATA_DIR}")

Data directory set to: D:\Case studies competitions\Amex\CSV\final_data


In [5]:
# Build the train/test parquet locations underneath the configured DATA_DIR.
train_path, test_path = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("final_train.parquet", "final_test.parquet")
)


In [6]:
# Load the final train and test frames.
# A missing file stops the notebook here with a clear error: every later cell
# needs `train`, so continuing would only fail further down with a NameError.
if not os.path.exists(train_path):
    raise FileNotFoundError(f"Train file not found at: {train_path}")
print(f"Loading train data from: {train_path}")
train = pl.read_parquet(train_path)

if not os.path.exists(test_path):
    raise FileNotFoundError(f"Test file not found at: {test_path}")
print(f"Loading test data from: {test_path}")
test = pl.read_parquet(test_path)

Loading train data from: D:\Case studies competitions\Amex\CSV\final_data\final_train.parquet
Loading test data from: D:\Case studies competitions\Amex\CSV\final_data\final_test.parquet


In [7]:
# Name of the label column in `train`.
target_col = "y"

# Pull the label column out first, then keep everything else as features.
y_train = train[target_col]
X_train = train.drop(target_col)


In [8]:
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split

# Build a NumPy feature matrix from the allowed-dtype columns of X_train and
# hold out 20% of the rows for validation.

# 1. Dtypes that are kept: Float64, Int64, Int32, Boolean
allowed_types = (pl.Float64, pl.Int64, pl.Int32, pl.Boolean)

# 2. Keep only the columns whose dtype is one of the allowed types.
#    The membership test compares dtypes by equality, which works whether the
#    schema stores dtype classes or dtype instances. `isinstance` only matches
#    instances, so on a schema of dtype classes it would drop every column.
X_train_flat = X_train.select([
    col for col, dtype in X_train.schema.items()
    if dtype in allowed_types
])
if not X_train_flat.columns:
    # Fail here rather than hand an empty matrix to the split and the model.
    raise ValueError("No columns with an allowed dtype were found in X_train")

# 3. Convert the filtered frame and the label series to NumPy
X_np = X_train_flat.to_numpy()
y_np = y_train.to_numpy().ravel()

# 4. Split (80/20, fixed seed)
X_tr, X_val, y_tr, y_val = train_test_split(X_np, y_np, test_size=0.2, random_state=42)

# Optional: Print shape to confirm
print("Train shape:", X_tr.shape)
print("Validation shape:", X_val.shape)


Train shape: (616131, 223)
Validation shape: (154033, 223)


#### **Confirming Target**

In [9]:
# Label column
target_col = "y"

# Identifier columns stay out of the feature set; they are kept separately so
# predictions can be mapped back later.
id_cols = ["id1", "id2", "id3"]
excluded_cols = [*id_cols, target_col]

# Every other column of `train` is treated as a feature.
feature_cols = list(filter(lambda name: name not in excluded_cols, train.columns))

print(f"Total Features Used: {len(feature_cols)}")
print("Sample Feature Columns:", feature_cols[:10])

# Polars frames holding the features and the label
X_train_pl = train.select(feature_cols)
y_train_pl = train.select([target_col])

# Identifier frame, saved for later mapping
id_df = train.select(id_cols)


Total Features Used: 725
Sample Feature Columns: ['id4', 'id5', 'f1', 'f2', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10']


#### **Setting up LightGBM**

In [10]:
import lightgbm as lgb

# Wrap the split arrays as LightGBM datasets; the validation dataset is tied
# to the training dataset through `reference`.
train_data = lgb.Dataset(data=X_tr, label=y_tr)
val_data = lgb.Dataset(data=X_val, label=y_val, reference=train_data)


#### **Feature group mapping**

In [11]:
# Step 1: Named groups of feature columns (group name -> column names)
field_groups = dict(
    # example user features
    user=["id2", "f2", "f29", "f42", "f43"],
    # offer-related features
    offer=["id3", "f5", "f6", "f7", "f8"],
    # interaction/event
    event=["id10", "f11", "f12"],
    # time info
    time=["id4", "id5", "id12_year", "id13_day"],
    # derived behavior
    behavior=["f90", "f91", "f92", "f93"],
    # combo encodings
    interaction=["id2_id3_combo", "id2_id10_combo", "id3_id10_combo"],
    # conversion features
    conversion=["conversion_ratio_user", "conversion_ratio_offer"],
)

In [12]:
from collections import defaultdict

# Step 2: Every column of `train` except the label is a candidate feature
all_feature_cols = [name for name in train.columns if name != "y"]

# Grouped features that actually exist in `train` get their group's name;
# group entries that are not columns of `train` are skipped.
feature_to_field = {
    feat: field
    for field, features in field_groups.items()
    for feat in features
    if feat in all_feature_cols
}

# Step 3: Anything not claimed by a group is labelled "other"
for name in all_feature_cols:
    feature_to_field.setdefault(name, "other")

# Step 4: Invert the mapping (field -> features) and print a short summary
field_summary = defaultdict(list)
for feat, field in feature_to_field.items():
    field_summary[field].append(feat)

for field, feats in field_summary.items():
    print(f"🟩 {field.upper()} ({len(feats)} features): {feats[:10]}{'...' if len(feats) > 10 else ''}")


🟩 USER (5 features): ['id2', 'f2', 'f29', 'f42', 'f43']
🟩 OFFER (5 features): ['id3', 'f5', 'f6', 'f7', 'f8']
🟩 EVENT (3 features): ['id10', 'f11', 'f12']
🟩 TIME (4 features): ['id4', 'id5', 'id12_year', 'id13_day']
🟩 BEHAVIOR (3 features): ['f90', 'f91', 'f93']
🟩 INTERACTION (3 features): ['id2_id3_combo', 'id2_id10_combo', 'id3_id10_combo']
🟩 OTHER (705 features): ['id1', 'f1', 'f9', 'f10', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27']...


In [25]:
import polars as pl
import lightgbm as lgb
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import early_stopping, log_evaluation
import time

In [29]:
# STEP 2: Columns withheld from the model (leakage-prone or non-numeric),
# assembled from three sub-lists: raw ids plus target, combo encodings, and
# conversion ratios.
_id_and_target_cols = ["id1", "id2", "id3", "id10", "id12", "id13", "y"]
_combo_cols = ["id2_id3_combo", "id2_id10_combo", "id3_id10_combo"]
_conversion_cols = ["conversion_ratio_user", "conversion_ratio_offer"]

leakage_cols = [*_id_and_target_cols, *_combo_cols, *_conversion_cols]

In [30]:
# STEP 3: Select numeric-only + safe features
# Dtypes accepted as model inputs.
numeric_types = (pl.Float32, pl.Float64, pl.Int32, pl.Int64)

# Keep a column when it is not listed in `leakage_cols` and its dtype is one
# of `numeric_types`. The membership test compares dtypes by equality, which
# works whether the schema stores dtype classes or dtype instances;
# `isinstance` only matches instances and would otherwise select nothing.
valid_cols = [
    col for col, dtype in train.schema.items()
    if col not in leakage_cols and dtype in numeric_types
]

In [31]:
# STEP 4: Materialise the selected features and the label as NumPy arrays
X = train.select(valid_cols)
X_np = X.to_numpy()

# Label column as a flattened array
y = train["y"].to_numpy().ravel()


In [32]:
# STEP 5: Hold out 20% of the rows for validation (fixed seed)
X_train_np, X_val_np, y_train_np, y_val_np = train_test_split(
    X_np,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:

# STEP 6: Wrap both splits as LightGBM datasets; the validation dataset is
# tied to the training dataset through `reference`.
lgb_train = lgb.Dataset(data=X_train_np, label=y_train_np)
lgb_val = lgb.Dataset(data=X_val_np, label=y_val_np, reference=lgb_train)

In [34]:
# STEP 7: LightGBM training configuration
params = dict(
    objective="binary",    # binary objective
    metric="auc",          # metric reported during training
    verbosity=-1,
    boosting_type="gbdt",
    learning_rate=0.05,
    num_leaves=64,
    seed=42,
)

In [35]:
# STEP 8: Fit the booster, evaluating on both the training and validation sets
print("⚙️ Starting LightGBM training...")
start = time.time()

# Stop after 20 rounds without improvement; log the metrics every 20 rounds.
training_callbacks = [
    early_stopping(stopping_rounds=20),
    log_evaluation(period=20),
]

model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=300,
    valid_sets=[lgb_train, lgb_val],
    valid_names=["train", "val"],
    callbacks=training_callbacks,
)

print(f"✅ Training completed in {time.time() - start:.2f} seconds")


⚙️ Starting LightGBM training...
Training until validation scores don't improve for 20 rounds
[20]	train's auc: 0.991589	val's auc: 0.990221
[40]	train's auc: 0.992333	val's auc: 0.990615
[60]	train's auc: 0.992998	val's auc: 0.99094
[80]	train's auc: 0.993576	val's auc: 0.991177
[100]	train's auc: 0.994101	val's auc: 0.99131
[120]	train's auc: 0.994584	val's auc: 0.991389
[140]	train's auc: 0.994931	val's auc: 0.991451
[160]	train's auc: 0.995294	val's auc: 0.991478
[180]	train's auc: 0.995654	val's auc: 0.991542
[200]	train's auc: 0.996033	val's auc: 0.991541
Early stopping, best iteration is:
[183]	train's auc: 0.995714	val's auc: 0.991558
✅ Training completed in 41.13 seconds


In [37]:
# Cast the validation labels to integers; `y_val_np` is rebound to the converted array.
y_val_np = y_val_np.astype(int)

In [38]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Integer labels for the metric functions
y_val_np = y_val_np.astype(int)

# Model scores on the validation rows, thresholded at 0.5 for hard labels
y_pred_proba = model.predict(X_val_np)
y_pred = (y_pred_proba >= 0.5).astype(int)

# Score-based metrics use the raw predictions; the rest use the hard labels
auc_value = roc_auc_score(y_val_np, y_pred_proba)
accuracy_value = accuracy_score(y_val_np, y_pred)
precision_value = precision_score(y_val_np, y_pred)
recall_value = recall_score(y_val_np, y_pred)
f1_value = f1_score(y_val_np, y_pred)
log_loss_value = log_loss(y_val_np, y_pred_proba)
conf_matrix = confusion_matrix(y_val_np, y_pred)

# Report
print("🔍 Evaluation Metrics on Validation Set:")
print(f"AUC:          {auc_value:.5f}")
print(f"Accuracy:     {accuracy_value:.5f}")
print(f"Precision:    {precision_value:.5f}")
print(f"Recall:       {recall_value:.5f}")
print(f"F1 Score:     {f1_value:.5f}")
print(f"Log Loss:     {log_loss_value:.5f}")
print(f"Confusion Matrix:\n{conf_matrix}")


🔍 Evaluation Metrics on Validation Set:
AUC:          0.99156
Accuracy:     0.98341
Precision:    0.88024
Recall:       0.74862
F1 Score:     0.80911
Log Loss:     0.04319
Confusion Matrix:
[[146060    737]
 [  1819   5417]]
