In [None]:
import pandas as pd
import numpy as np

# Read OrdCat as strings ("object") instead of letting pandas infer a dtype.
# It is converted to a pandas categorical further down, together with the
# other categorical columns.
dtype_dict = {
    "OrdCat": "object",
}

# Treat '?' as a missing-value marker while parsing. Handling it inside
# read_csv (rather than replacing it afterwards) lets pandas infer a numeric
# dtype for any column whose only non-numeric token is '?', instead of leaving
# it as an object column of numeric-looking strings. It also avoids mutating
# the frame in place after loading.
df = pd.read_csv(
    "data/Insurance_claim_train.csv",
    dtype=dtype_dict,
    na_values="?",
    low_memory=False,
)




In [None]:
# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
df.info()

In [None]:
# Remove the Row_ID column.
# NOTE(review): the earlier comment here also named Household_ID, but only
# Row_ID is dropped; Household_ID (if present) is left in place. Confirm intent.
df = df.drop(columns=["Row_ID"])

# Binary target: 1 when the claim amount is strictly positive, otherwise 0.
has_claim = df["Claim_Amount"].gt(0)
df["Claim_Binary"] = has_claim.astype(int)



In [None]:
# Columns converted to pandas' category dtype.
# NOTE(review): the earlier comment said columns such as Blind_Make were
# excluded, but Blind_Make / Blind_Model / Blind_Submodel are in this list.
# Confirm whether they should stay.
categorical_cols = [
    'Blind_Make', 'Blind_Model', 'Blind_Submodel',
    'Cat1', 'Cat2', 'Cat3', 'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8',
    'Cat9', 'Cat10', 'Cat11', 'Cat12', 'NVCat', 'OrdCat',
]

# Cast once on the full frame, before the train/test split further down, so
# every later subset shares the same category levels.
df[categorical_cols] = df[categorical_cols].astype('category')

# info() prints its own report and returns None; print(df.info()) would add a
# trailing "None" line.
df.info()

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Split data
# Features: every column except the raw claim amount and the binary label
# derived from it.
X = df.drop(columns=["Claim_Amount", "Claim_Binary"])
y = df["Claim_Binary"]  # Target

# Stratified 80/20 hold-out split. The seed is fixed (same value the models
# below use for random_state) so the split, and every metric computed on it,
# is identical on each run of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
import time
import lightgbm as lgb

# Settings shared by both LightGBM configurations
_common_lgb_settings = dict(
    learning_rate=0.1,
    n_estimators=1000,  # upper bound; the early-stopping callback may end training sooner
    subsample=1.0,
    random_state=42,
    verbose=-1,
)

# Baseline: 'gbdt' boosting with enable_bundle switched off
params_baseline = {'boosting_type': 'gbdt', 'enable_bundle': False, **_common_lgb_settings}

# Comparison model: 'goss' boosting with enable_bundle switched on
params_efb_goss = {'boosting_type': 'goss', 'enable_bundle': True, **_common_lgb_settings}

# Create models
model_baseline = lgb.LGBMClassifier(**params_baseline)
model_efb_goss = lgb.LGBMClassifier(**params_efb_goss)

# Per-model containers: evaluation history, and one timestamp per boosting round
evals_result_baseline, evals_result_efb_goss = {}, {}
time_per_iter_baseline, time_per_iter_efb_goss = [], []

# Factory for a per-iteration callback that timestamps each round and shows progress
def tracking_callback(time_list, model_name):
    """Build a callback that records a timestamp and prints progress.

    Every call of the returned function appends the current ``time.time()``
    value to ``time_list`` and rewrites the current console line with
    ``"<model_name>: Iteration <env.iteration>/<env.end_iteration>"``.

    Args:
        time_list: List that receives one timestamp per call.
        model_name: Label shown in the progress line.

    Returns:
        A one-argument function taking the callback environment ``env``.
    """
    def _on_iteration(env):
        round_index = env.iteration
        time_list.append(time.time())
        progress = f"{model_name}: Iteration {round_index}/{env.end_iteration}"
        print(progress, end='\r', flush=True)
    return _on_iteration

# NOTE(review): the hold-out set (X_test, y_test) drives early stopping here
# and is also the set the notebook reports AUC on, so the reported AUC is
# optimistically biased. A separate validation split would avoid that.

# Train baseline model with tracking
print("Training baseline model...")
start_time_baseline = time.time()
model_baseline.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[
        lgb.record_evaluation(evals_result_baseline),
        lgb.early_stopping(stopping_rounds=10, verbose=True),
        tracking_callback(time_per_iter_baseline, "Baseline Model")
    ]
)
training_time_baseline = time.time() - start_time_baseline
best_iter_baseline = model_baseline.best_iteration_

# Train EFB+GOSS model with tracking
print("\nTraining EFB+GOSS model...")
start_time_efb_goss = time.time()
model_efb_goss.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[
        lgb.record_evaluation(evals_result_efb_goss),
        lgb.early_stopping(stopping_rounds=10, verbose=True),
        tracking_callback(time_per_iter_efb_goss, "EFB+GOSS Model")
    ]
)
training_time_efb_goss = time.time() - start_time_efb_goss
best_iter_efb_goss = model_efb_goss.best_iteration_

# Convert the per-iteration timestamps into seconds elapsed since fit() was
# called. Measuring from the fit start (instead of from the first callback)
# keeps the first boosting round and any pre-training setup on the time axis,
# so the first AUC point is no longer plotted at t = 0. The callback records
# time.time() values, the same clock used for the start times above.
time_points_baseline = [t - start_time_baseline for t in time_per_iter_baseline]
time_points_efb_goss = [t - start_time_efb_goss for t in time_per_iter_efb_goss]

# Extract AUC values only up to the best iteration
auc_baseline = evals_result_baseline['valid_0']['auc'][:best_iter_baseline]
auc_efb_goss = evals_result_efb_goss['valid_0']['auc'][:best_iter_efb_goss]

# Truncate both models to the same number of iterations so the curves cover
# the same iteration range
min_iters = min(best_iter_baseline, best_iter_efb_goss)
auc_baseline = auc_baseline[:min_iters]
auc_efb_goss = auc_efb_goss[:min_iters]
time_points_baseline = time_points_baseline[:min_iters]
time_points_efb_goss = time_points_efb_goss[:min_iters]

# Print results
print(f"\nBaseline Model: Best Iteration = {best_iter_baseline}, Training Time = {training_time_baseline:.2f}s")
print(f"EFB+GOSS Model: Best Iteration = {best_iter_efb_goss}, Training Time = {training_time_efb_goss:.2f}s")


In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score

# Evaluate final models
def evaluate_model(name, model, training_time=None):
    """Print hold-out accuracy, ROC AUC and training time for a fitted model.

    Predictions are made on the notebook-level ``X_test`` and scored against
    ``y_test``.

    Args:
        name: Label used in the printed report.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        training_time: Training time in seconds to report. When omitted, the
            notebook-level ``training_time_baseline`` is reported for the
            label ``'Baseline'`` and ``training_time_efb_goss`` for every
            other label. Pass the value explicitly for any additional model,
            otherwise it would be reported with the EFB+GOSS time.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]  # probability of the positive class
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)
    print(f"\n{name} Results:")
    print(f"Accuracy: {acc:.4f}")
    print(f"ROC AUC: {auc:.4f}")
    if training_time is None:
        # Backward-compatible fallback: look the time up by label.
        training_time = training_time_baseline if name == 'Baseline' else training_time_efb_goss
    print(f"Training Time: {training_time:.2f}s")

# Report hold-out metrics for both fitted LightGBM models
for label, fitted_model in (("Baseline", model_baseline), ("EFB+GOSS", model_efb_goss)):
    evaluate_model(label, fitted_model)

In [None]:
import matplotlib.pyplot as plt

# Hold-out AUC against elapsed training time, one curve per LightGBM model
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(time_points_baseline, auc_baseline, label='lgb_baseline', color='orange')
ax.plot(time_points_efb_goss, auc_efb_goss, label='LightGBM', color='blue')
ax.set_xlabel('Training Time (seconds)')
ax.set_ylabel('Test Set AUC')
ax.set_title('AUC Progression During Training')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
import time
import lightgbm as lgb
import xgboost as xgb

# ========== LightGBM Parameters & Models ========== #
# Baseline configuration: 'gbdt' boosting, enable_bundle off
params_baseline = dict(
    boosting_type='gbdt',
    enable_bundle=False,
    learning_rate=0.1,
    n_estimators=1000,
    subsample=1.0,
    random_state=42,
    verbose=-1,
)

# Same settings, switched to 'goss' boosting with enable_bundle on
params_efb_goss = dict(params_baseline, boosting_type='goss', enable_bundle=True)

model_baseline = lgb.LGBMClassifier(**params_baseline)
model_efb_goss = lgb.LGBMClassifier(**params_efb_goss)

# ========== XGBoost Parameters & Models ========== #
# The two XGBoost configurations differ only in tree_method:
# 'hist' (histogram-based method) versus 'exact' (exact greedy method).
params_xgb_hist, params_xgb_exact = (
    {
        'objective': 'binary:logistic',
        'tree_method': method,
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'subsample': 1.0,
        'random_state': 42,
        'eval_metric': 'auc',
    }
    for method in ('hist', 'exact')
)

model_xgb_hist = xgb.XGBClassifier(**params_xgb_hist)
model_xgb_exact = xgb.XGBClassifier(**params_xgb_exact)

# ========== Tracking Callbacks ========== #
# LightGBM Callback
def tracking_callback(time_list, model_name):
    """Return a callback that logs a timestamp and a progress line per round.

    The returned function appends ``time.time()`` to ``time_list`` and prints
    ``"<model_name>: Iteration <env.iteration>/<env.end_iteration>"``,
    ending the line with a carriage return so it is overwritten in place.
    """
    def _log_round(env):
        iteration_now = env.iteration
        time_list.append(time.time())
        message = f"{model_name}: Iteration {iteration_now}/{env.end_iteration}"
        print(message, end='\r', flush=True)
    return _log_round

# XGBoost Callback (Custom Implementation)
class XGBTimeCallback(xgb.callback.TrainingCallback):
    """Record the elapsed training time after every XGBoost boosting round.

    After each iteration the number of seconds elapsed since
    ``before_training`` ran is appended to ``time_list``, and a progress line
    is printed that overwrites itself via a carriage return.

    Args:
        time_list: List that receives one elapsed-seconds value per iteration.
        model_name: Label shown in the progress line.
    """

    def __init__(self, time_list, model_name):
        super().__init__()
        self.time_list = time_list
        self.model_name = model_name
        # Set in before_training; perf_counter reference point, not a wall-clock time.
        self.start_time = None

    def before_training(self, model):
        """Store the reference point for elapsed-time measurement; return the model unchanged."""
        # perf_counter is monotonic, so elapsed values cannot jump or go
        # negative if the system clock is adjusted mid-run.
        self.start_time = time.perf_counter()
        return model

    def after_iteration(self, model, epoch, evals_log):
        """Append seconds elapsed since before_training and print progress."""
        current_time = time.perf_counter() - self.start_time
        self.time_list.append(current_time)
        print(f"{self.model_name}: Iteration {epoch}", end='\r', flush=True)
        return False  # Do not stop training

# ========== Train Models & Track Results ========== #
# One entry per benchmarked model
model_keys = ('baseline', 'efb_goss', 'xgb_hist', 'xgb_exact')

# Evaluation history for each model (each key gets its own empty dict)
evals_result = {key: {} for key in model_keys}

# Per-iteration timing samples for each model (each key gets its own empty list)
time_tracking = {key: [] for key in model_keys}

# Train both LightGBM models with the same procedure: evaluate AUC on the
# hold-out set each round, stop after 10 rounds without improvement, and log
# a timestamp per round.
lgb_runs = (
    ("Training LightGBM Baseline...", model_baseline, 'baseline', "Baseline"),
    ("\nTraining LightGBM EFB+GOSS...", model_efb_goss, 'efb_goss', "EFB+GOSS"),
)
lgb_fit_seconds = {}
for banner, booster, key, tag in lgb_runs:
    print(banner)
    start_time = time.time()
    booster.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='auc',
        callbacks=[
            lgb.record_evaluation(evals_result[key]),
            lgb.early_stopping(stopping_rounds=10),
            tracking_callback(time_tracking[key], tag)
        ]
    )
    lgb_fit_seconds[key] = time.time() - start_time

# Expose the results under the names the rest of the notebook reads
training_time_baseline = lgb_fit_seconds['baseline']
best_iter_baseline = model_baseline.best_iteration_
training_time_efb_goss = lgb_fit_seconds['efb_goss']
best_iter_efb_goss = model_efb_goss.best_iteration_

# Train the two XGBoost variants (histogram and exact).
def _xgb_features(frame):
    """Return a copy of ``frame`` with category columns replaced by float codes.

    XGBoost rejects pandas ``category`` columns unless ``enable_categorical``
    is set, and the ``exact`` tree method has no categorical support at all.
    Feeding both variants the integer category codes keeps them runnable and
    comparable. The category dtype was assigned on the full frame before the
    train/test split, so codes mean the same thing in both subsets. Missing
    values (code -1) become NaN so XGBoost treats them as missing.
    """
    encoded = frame.copy()
    for col in encoded.select_dtypes(include="category").columns:
        codes = encoded[col].cat.codes
        encoded[col] = codes.astype("float64").where(codes >= 0)
    return encoded


def _fit_xgb(model, key, label):
    """Fit one XGBoost model with early stopping.

    Stores the evaluation history in ``evals_result[key]`` and returns
    ``(elapsed_seconds, best_iteration)``.
    """
    # early_stopping_rounds and callbacks are estimator parameters (XGBoost
    # >= 1.6); passing them to fit() is deprecated and rejected by recent
    # releases. set_params also installs a fresh callback on every re-run.
    model.set_params(
        early_stopping_rounds=10,
        callbacks=[XGBTimeCallback(time_tracking[key], label)],
    )
    started = time.time()
    model.fit(
        X_train_xgb, y_train,
        eval_set=[(X_test_xgb, y_test)],
        verbose=False,
    )
    elapsed = time.time() - started
    # fit() has no evals_result argument in the scikit-learn wrapper; the
    # history is read back from the fitted model instead.
    evals_result[key] = model.evals_result()
    return elapsed, model.best_iteration


X_train_xgb = _xgb_features(X_train)
X_test_xgb = _xgb_features(X_test)

# Train XGBoost Histogram
print("\nTraining XGBoost Histogram...")
training_time_xgb_hist, best_iter_xgb_hist = _fit_xgb(model_xgb_hist, 'xgb_hist', "XGB Hist")

# Train XGBoost Exact
print("\nTraining XGBoost Exact...")
training_time_xgb_exact, best_iter_xgb_exact = _fit_xgb(model_xgb_exact, 'xgb_exact', "XGB Exact")

# ========== Process Results ========== #
# Extract the validation AUC history up to and including each model's best
# iteration (LightGBM uses 'valid_0', XGBoost uses 'validation_0').
# LightGBM's best_iteration_ is a 1-based count, so it is already the correct
# slice end. XGBoost's best_iteration is a 0-based index, so the slice end
# must be one past it, otherwise the best round itself is dropped.
auc_baseline = evals_result['baseline']['valid_0']['auc'][:best_iter_baseline]
auc_efb_goss = evals_result['efb_goss']['valid_0']['auc'][:best_iter_efb_goss]
auc_xgb_hist = evals_result['xgb_hist']['validation_0']['auc'][:best_iter_xgb_hist + 1]
auc_xgb_exact = evals_result['xgb_exact']['validation_0']['auc'][:best_iter_xgb_exact + 1]

# Convert time tracking to cumulative seconds.
# NOTE(review): the two libraries use different origins here. LightGBM values
# are offsets from the first recorded callback timestamp (so the first round
# sits at t = 0), while the XGBoost callback already stores seconds elapsed
# since before_training. Align the origins before comparing curves on one axis.
time_points = {
    'baseline': [t - time_tracking['baseline'][0] for t in time_tracking['baseline']],
    'efb_goss': [t - time_tracking['efb_goss'][0] for t in time_tracking['efb_goss']],
    'xgb_hist': time_tracking['xgb_hist'],  # Already cumulative
    'xgb_exact': time_tracking['xgb_exact']
}

# ========== Print Results ========== #
# One summary line per model: best iteration and total fit time in seconds
summary_lines = [
    f"\nLightGBM Baseline: Best Iter={best_iter_baseline}, Time={training_time_baseline:.2f}s",
    f"LightGBM EFB+GOSS: Best Iter={best_iter_efb_goss}, Time={training_time_efb_goss:.2f}s",
    f"XGBoost Histogram: Best Iter={best_iter_xgb_hist}, Time={training_time_xgb_hist:.2f}s",
    f"XGBoost Exact:     Best Iter={best_iter_xgb_exact}, Time={training_time_xgb_exact:.2f}s",
]
print("\n".join(summary_lines))