In [1]:
import pandas as pd
import numpy as np

# Read OrdCat as object dtype instead of letting pandas infer a type for it.
dtype_dict = dict(OrdCat="object")

# Load the training file, then swap every literal "?" cell for NaN so that
# pandas treats those cells as missing values.
df = pd.read_csv(
    "data/Insurance_claim_train.csv",
    dtype=dtype_dict,
    low_memory=False,
).replace("?", np.nan)




In [2]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 35 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   Row_ID          1000000 non-null  int64  
 1   Household_ID    1000000 non-null  int64  
 2   Vehicle         1000000 non-null  int64  
 3   Calendar_Year   1000000 non-null  int64  
 4   Model_Year      1000000 non-null  int64  
 5   Blind_Make      999609 non-null   object 
 6   Blind_Model     999609 non-null   object 
 7   Blind_Submodel  999609 non-null   object 
 8   Cat1            997475 non-null   object 
 9   Cat2            615951 non-null   object 
 10  Cat3            999664 non-null   object 
 11  Cat4            556566 non-null   object 
 12  Cat5            556202 non-null   object 
 13  Cat6            997475 non-null   object 
 14  Cat7            429802 non-null   object 
 15  Cat8            999785 non-null   object 
 16  Cat9            1000000 non-null  obj

In [3]:
# Drop the Row_ID and Household_ID columns.
# errors="ignore" plus a plain reassignment keeps this cell safe to re-run:
# once the columns are gone a second execution is a no-op instead of a KeyError.
df = df.drop(columns=["Row_ID", "Household_ID"], errors="ignore")

# Binary target: 1 when Claim_Amount is strictly positive, 0 otherwise.
df["Claim_Binary"] = (df["Claim_Amount"] > 0).astype(int)



In [4]:
# Columns to store with pandas' 'category' dtype.
# Note: the Blind_Make / Blind_Model / Blind_Submodel columns ARE part of this
# list, alongside Cat1-Cat12, NVCat and OrdCat.
categorical_cols = [
    'Blind_Make', 'Blind_Model', 'Blind_Submodel',
    'Cat1', 'Cat2', 'Cat3', 'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8',
    'Cat9', 'Cat10', 'Cat11', 'Cat12', 'NVCat', 'OrdCat',
]

df[categorical_cols] = df[categorical_cols].astype('category')

# DataFrame.info() prints on its own and returns None; calling it directly
# avoids the extra "None" line that print(df.info()) would emit.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 34 columns):
 #   Column          Non-Null Count    Dtype   
---  ------          --------------    -----   
 0   Vehicle         1000000 non-null  int64   
 1   Calendar_Year   1000000 non-null  int64   
 2   Model_Year      1000000 non-null  int64   
 3   Blind_Make      999609 non-null   category
 4   Blind_Model     999609 non-null   category
 5   Blind_Submodel  999609 non-null   category
 6   Cat1            997475 non-null   category
 7   Cat2            615951 non-null   category
 8   Cat3            999664 non-null   category
 9   Cat4            556566 non-null   category
 10  Cat5            556202 non-null   category
 11  Cat6            997475 non-null   category
 12  Cat7            429802 non-null   category
 13  Cat8            999785 non-null   category
 14  Cat9            1000000 non-null  category
 15  Cat10           999700 non-null   category
 16  Cat11           997

In [5]:
# Names of all columns of df whose dtype is 'category'.
categorical_columns = list(df.select_dtypes(include="category").columns)
print("Categorical columns:", categorical_columns)

Categorical columns: ['Blind_Make', 'Blind_Model', 'Blind_Submodel', 'Cat1', 'Cat2', 'Cat3', 'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8', 'Cat9', 'Cat10', 'Cat11', 'Cat12', 'OrdCat', 'NVCat']


In [6]:
# One-hot encode the categorical columns only; every other column of df is
# carried over as-is. The indicator columns are stored sparsely as uint8
# (1 byte per stored value) to keep memory use down. dummy_na is left at its
# default, so no separate indicator column is requested for missing values.
df_encoded = pd.get_dummies(df, columns=categorical_columns, dtype="uint8", sparse=True)

# Report the size of the encoded frame.
print(f"Encoded shape: {df_encoded.shape}")
print(f"Memory usage: {df_encoded.memory_usage(deep=True).sum() / 1e9:.2f} GB")

Encoded shape: (1000000, 3767)
Memory usage: 0.21 GB


In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Relies on df_encoded produced by the one-hot encoding cell.
# Target: the binary claim indicator.
y = df_encoded["Claim_Binary"]
# Features: everything except the raw claim amount and the target derived from it.
X = df_encoded.drop(columns=["Claim_Amount", "Claim_Binary"])

# 80/20 split with a fixed seed; stratifying on y keeps the proportion of
# positive and negative labels the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [None]:
import lightgbm as lgb
import time

# Booster parameters for the "EFB enabled" run: binary objective, AUC as the
# evaluation metric, and GOSS as the data sampling strategy.
# NOTE(review): the "EFB disabled" run in this notebook does not set
# data_sample_strategy, so the two runs differ in more than EFB alone --
# confirm whether that is intended before comparing them.
params_with_efb = dict(
    objective='binary',
    metric='auc',
    verbose=1,
    data_sample_strategy='goss',
)

# Datasets with feature bundling switched on. The held-out split doubles as
# the validation set that drives early stopping.
train_data_with_efb = lgb.Dataset(X_train, label=y_train, params={'enable_bundle': True})
valid_data_with_efb = lgb.Dataset(X_test, label=y_test, params={'enable_bundle': True})

# Filled in by the record_evaluation callback with the validation AUC history.
evals_result_with_efb = {}

print("Training LightGBM Model with EFB Enabled...")
start_time_with_efb = time.time()

# Up to 1000 boosting rounds; early stopping uses a patience of 10 rounds on
# the validation metric.
model_with_efb = lgb.train(
    params_with_efb,
    train_data_with_efb,
    num_boost_round=1000,
    valid_sets=[valid_data_with_efb],
    valid_names=['validation'],
    callbacks=[
        lgb.record_evaluation(evals_result_with_efb),
        lgb.early_stopping(stopping_rounds=10, verbose=True),
    ],
)

# Wall-clock duration of the lgb.train call, and the round it settled on.
training_time_with_efb = time.time() - start_time_with_efb
best_iter_with_efb = model_with_efb.best_iteration

print(f"\nLightGBM Model with EFB Enabled Training Complete. Best Iteration: {best_iter_with_efb}")
print(f"Training Time: {training_time_with_efb:.4f} seconds")


In [None]:
# Booster parameters for the "EFB disabled" run: binary objective with AUC as
# the evaluation metric. No data sampling strategy is set here.
params_without_efb = dict(
    objective='binary',
    metric='auc',
    verbose=1,
)

# Datasets with feature bundling switched off and use_missing set explicitly.
# The held-out split doubles as the validation set that drives early stopping.
train_data_without_efb = lgb.Dataset(
    X_train,
    label=y_train,
    params={'enable_bundle': False, 'use_missing': True},
)
valid_data_without_efb = lgb.Dataset(
    X_test,
    label=y_test,
    params={'enable_bundle': False, 'use_missing': True},
)

# Filled in by the record_evaluation callback with the validation AUC history.
evals_result_without_efb = {}

print("Training LightGBM Model with EFB Disabled...")
start_time_without_efb = time.time()

# Up to 1000 boosting rounds; early stopping uses a patience of 10 rounds on
# the validation metric.
model_without_efb = lgb.train(
    params_without_efb,
    train_data_without_efb,
    num_boost_round=1000,
    valid_sets=[valid_data_without_efb],
    valid_names=['validation'],
    callbacks=[
        lgb.record_evaluation(evals_result_without_efb),
        lgb.early_stopping(stopping_rounds=10, verbose=True),
    ],
)

# Wall-clock duration of the lgb.train call, and the round it settled on.
training_time_without_efb = time.time() - start_time_without_efb
best_iter_without_efb = model_without_efb.best_iteration

print(f"\nLightGBM Model with EFB Disabled Training Complete. Best Iteration: {best_iter_without_efb}")
print(f"Training Time: {training_time_without_efb:.4f} seconds")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Validation AUC history recorded during each LightGBM run.
auc_with_efb = np.asarray(evals_result_with_efb['validation']['auc'])
auc_without_efb = np.asarray(evals_result_without_efb['validation']['auc'])

# Per-round timestamps were not measured. They are approximated by spacing the
# recorded rounds evenly from 0 up to each run's total training time, so the
# x-axis is an estimate rather than a measurement.
time_per_iter_with_efb = np.linspace(0, training_time_with_efb, num=len(auc_with_efb))
time_per_iter_without_efb = np.linspace(0, training_time_without_efb, num=len(auc_without_efb))

# AUC against (approximate) elapsed training time, one line per run.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(time_per_iter_with_efb, auc_with_efb, linestyle='-', label='EFB Enabled')
ax.plot(time_per_iter_without_efb, auc_without_efb, linestyle='--', label='EFB Disabled')

# Labels, legend and grid.
ax.set_xlabel("Training Time (seconds)")
ax.set_ylabel("Validation AUC")
ax.set_title("AUC Over Training Time (EFB Enabled vs. Disabled)")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import xgboost as xgb
import time  # ensure you have imported time

# XGBoost parameters for the histogram-based tree method: logistic objective
# for binary labels, AUC as the evaluation metric.
params_xgb_hist = dict(
    objective='binary:logistic',
    eval_metric='auc',
    verbosity=1,
    tree_method='hist',
)

# Wrap the train / held-out splits as DMatrix objects. The held-out split is
# used as the validation set that drives early stopping.
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_test, label=y_test)

# Receives the per-round validation AUC history from xgb.train.
evals_result_xgb_hist = {}

print("Training XGBoost Model Histogram...")
start_time_xgb = time.time()

# Up to 1000 boosting rounds with an early-stopping patience of 10 rounds.
model_xgb = xgb.train(
    params_xgb_hist,
    dtrain,
    num_boost_round=1000,
    evals=[(dvalid, 'validation')],
    evals_result=evals_result_xgb_hist,
    early_stopping_rounds=10,
    verbose_eval=True,
)

# Wall-clock duration of the xgb.train call, and the booster's best iteration.
training_time_xgb = time.time() - start_time_xgb
best_iter_xgb = model_xgb.best_iteration

print(f"\nXGBoost Model Training Complete. Best Iteration: {best_iter_xgb}")
print(f"Training Time: {training_time_xgb:.4f} seconds")


In [None]:
# Same XGBoost setup as the histogram run, but with the exact tree method.
params_xgb_exact = dict(
    objective='binary:logistic',
    eval_metric='auc',
    verbosity=1,
    tree_method='exact',
)

# Receives the per-round validation AUC history from xgb.train.
evals_result_xgb_exact = {}

print("Training XGBoost Model Exact...")
start_time_xgb_exact = time.time()

# Reuses the dtrain / dvalid DMatrix objects built in the histogram cell.
# Up to 1000 boosting rounds with an early-stopping patience of 10 rounds.
model_xgb_exact = xgb.train(
    params_xgb_exact,
    dtrain,
    num_boost_round=1000,
    evals=[(dvalid, 'validation')],
    evals_result=evals_result_xgb_exact,
    early_stopping_rounds=10,
    verbose_eval=True,
)

# Wall-clock duration of the xgb.train call, and the booster's best iteration.
training_time_xgb_exact = time.time() - start_time_xgb_exact
best_iter_xgb_exact = model_xgb_exact.best_iteration

print(f"\nXGBoost Model Training Complete. Best Iteration: {best_iter_xgb_exact}")
print(f"Training Time: {training_time_xgb_exact:.4f} seconds")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Validation AUC history for each XGBoost run. Per-round timestamps were not
# measured: the recorded rounds are spaced evenly from 0 up to each run's total
# training time, so the x-axis is an estimate.

# Histogram-based tree method.
auc_hist = np.asarray(evals_result_xgb_hist['validation']['auc'])
time_per_iter_hist = np.linspace(0, training_time_xgb, num=len(auc_hist))

# Exact tree method.
auc_exact = np.asarray(evals_result_xgb_exact['validation']['auc'])
time_per_iter_exact = np.linspace(0, training_time_xgb_exact, num=len(auc_exact))

# Both curves on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(time_per_iter_hist, auc_hist, color='blue', linestyle='-', label='XGBoost Histogram Splitting')
ax.plot(time_per_iter_exact, auc_exact, color='red', linestyle='--', label='XGBoost Exact Splitting')

ax.set_xlabel("Training Time (seconds)")
ax.set_ylabel("Validation AUC")
ax.set_title("XGBoost Splitting Methods Comparison")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Rebuild the AUC histories and approximate time axes for all four runs from
# the results recorded by the training cells. Per-round timestamps were not
# measured: rounds are spaced evenly from 0 up to each run's total training
# time, so the x-axis is an estimate.

# LightGBM, EFB enabled / disabled.
auc_with_efb = np.asarray(evals_result_with_efb['validation']['auc'])
time_per_iter_with_efb = np.linspace(0, training_time_with_efb, num=len(auc_with_efb))

auc_without_efb = np.asarray(evals_result_without_efb['validation']['auc'])
time_per_iter_without_efb = np.linspace(0, training_time_without_efb, num=len(auc_without_efb))

# XGBoost, histogram / exact tree method.
auc_hist = np.asarray(evals_result_xgb_hist['validation']['auc'])
time_per_iter_hist = np.linspace(0, training_time_xgb, num=len(auc_hist))

auc_exact = np.asarray(evals_result_xgb_exact['validation']['auc'])
time_per_iter_exact = np.linspace(0, training_time_xgb_exact, num=len(auc_exact))

# One (elapsed time, AUC, label, colour, line style) entry per curve, listed in
# drawing order: LightGBM first, then XGBoost.
auc_curves = [
    (time_per_iter_with_efb, auc_with_efb, 'LightGBM EFB Enabled', 'green', '-'),
    (time_per_iter_without_efb, auc_without_efb, 'LightGBM EFB Disabled', 'orange', '--'),
    (time_per_iter_hist, auc_hist, 'XGBoost Histogram Splitting', 'blue', '-'),
    (time_per_iter_exact, auc_exact, 'XGBoost Exact Splitting', 'red', '--'),
]

fig, ax = plt.subplots(figsize=(12, 8))
for curve_time, curve_auc, curve_label, curve_color, curve_style in auc_curves:
    ax.plot(curve_time, curve_auc, label=curve_label, color=curve_color, linestyle=curve_style)

ax.set_xlabel("Training Time (seconds)")
ax.set_ylabel("Validation AUC")
ax.set_title("LightGBM vs. XGBoost Model Comparison")
ax.legend()
ax.grid(True)
plt.show()
