In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import root_mean_squared_error

from pathlib import Path
from xgboost import XGBRegressor

# -----------------------------
# Paths — CHANGE THESE
# -----------------------------
DATA_DIR = Path("../data")
DATA_SOURCE_DIR = Path("../data/playground-series-s4e12")
TRAIN_CSV = DATA_DIR / "train.csv"
TARGET = "Premium Amount"
RANDOM_STATE = 42

## Split the data

In [3]:
df = pd.read_csv(DATA_SOURCE_DIR / "train.csv", index_col=0)

# Split into train, validation, and test sets, # 60% train, 40% temp
train, temp = train_test_split(df, test_size=0.4, random_state=42)
# 20% validation, 20% test
validation, test = train_test_split(temp, test_size=0.5, random_state=42)

# Display the sizes of the splits
print("Length of train set:", len(train))
print("Length of validation set:", len(validation))
print("Length of test set:", len(test))

train.to_csv(DATA_DIR / "train.csv", index=True)
validation.to_csv(DATA_DIR / "validation.csv", index=True)
test.to_csv(DATA_DIR / "test.csv", index=True)

Length of train set: 720000
Length of validation set: 240000
Length of test set: 240000


## View Data Stats

In [4]:
df = train
df.describe()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Premium Amount
count,708729.0,692939.0,654158.0,675469.0,501681.0,719996.0,637368.0,719999.0,720000.0
mean,41.158674,32761.44175,2.007617,25.617142,1.003373,9.570049,593.053023,5.015958,1102.863665
std,13.538379,32201.003672,1.417488,12.201577,0.98417,5.776467,149.87316,2.594761,865.591104
min,18.0,1.0,0.0,2.012237,0.0,0.0,300.0,1.0,20.0
25%,30.0,8020.0,1.0,15.923832,0.0,5.0,468.0,3.0,514.0
50%,41.0,23957.0,2.0,24.586569,1.0,10.0,595.0,5.0,872.0
75%,53.0,44637.0,3.0,34.534997,2.0,15.0,721.0,7.0,1508.0
max,64.0,149997.0,4.0,58.975914,8.0,19.0,849.0,9.0,4999.0


In [5]:
display(df.head())
print("\nShape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nMissing values per column:\n", df.isna().sum())

Unnamed: 0_level_0,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
476061,53.0,Female,18116.0,Single,0.0,PhD,Employed,34.727739,Rural,Premium,,19.0,423.0,8.0,2023-07-26 15:21:39.083497,Poor,Yes,Rarely,Condo,1442.0
186666,29.0,Male,,Married,,Bachelor's,Self-Employed,29.684751,Suburban,Premium,,4.0,,4.0,2024-02-12 15:21:39.237118,Good,No,Weekly,Condo,417.0
742870,25.0,Male,3895.0,Single,,Master's,Self-Employed,12.953414,Urban,Basic,2.0,12.0,551.0,5.0,2024-04-17 15:21:39.133866,Poor,No,Monthly,Condo,397.0
173677,56.0,Male,40347.0,Single,1.0,Master's,,47.739073,Rural,Basic,0.0,4.0,323.0,6.0,2023-06-14 15:21:39.082499,,No,Weekly,House,993.0
215979,26.0,Male,1326.0,Married,2.0,PhD,,22.087884,Rural,Comprehensive,,18.0,,8.0,2023-04-06 15:21:39.194829,,Yes,Monthly,House,2472.0



Shape: (720000, 20)

Columns: ['Age', 'Gender', 'Annual Income', 'Marital Status', 'Number of Dependents', 'Education Level', 'Occupation', 'Health Score', 'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type', 'Premium Amount']

Missing values per column:
 Age                      11271
Gender                       0
Annual Income            27061
Marital Status           11160
Number of Dependents     65842
Education Level              0
Occupation              214718
Health Score             44531
Location                     0
Policy Type                  0
Previous Claims         218319
Vehicle Age                  4
Credit Score             82632
Insurance Duration           1
Policy Start Date            0
Customer Feedback        46648
Smoking Status               0
Exercise Frequency           0
Property Type                0
Premium A

## Separate features & Preprocessing

In [6]:
# Separate features/target
y = df[TARGET].astype(float)
X = df.drop(columns=[TARGET])

# Infer column types
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print("Categorical cols:", cat_cols)
print("Numeric cols:", num_cols)

Categorical cols: ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']
Numeric cols: ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration']


In [7]:
# -----------------------------
# Preprocessing & Model
# -----------------------------

# Numeric: median impute (XGBoost doesn't require scaling)
num_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
    ]
)

# Categorical: impute missing with constant and OHE with 'ignore' for unseen
cat_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=True)),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", num_pipe, num_cols),
        ("cat", cat_pipe, cat_cols),
    ]
)

# A conservative baseline XGBoost config (adjust freely)
xgb = XGBRegressor(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="reg:squarederror",
    random_state=RANDOM_STATE,
    n_jobs=-1,
    tree_method="hist",  # fast histogram algorithm
)

model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("xgb", xgb),
    ]
)

## Train model

In [9]:
# -----------------------------
# Train/Validation Split & Fit
# -----------------------------

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

model.fit(X_train, y_train)

# Evaluate
pred_valid = model.predict(X_valid)
rmse = root_mean_squared_error(y_valid, pred_valid)
print(f"Validation RMSE: {rmse:,.3f}")

Validation RMSE: 850.062
