In [22]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import TweedieRegressor
from sklearn.metrics import d2_tweedie_score, mean_absolute_error


In [None]:
df_freq = pq.read_table("clean_business_claims_freq.parquet").to_pandas()
df_sev = pq.read_table("clean_business_claims_sev.parquet").to_pandas()


In [None]:
df_freq.sort_values(by="solar_system", inplace=True)
df_sev.sort_values(by="solar_system", inplace=True)

In [None]:
#Grouping sev dataset based on policy id and summing claim_amount
agg_dict = {col: 'first' for col in df_sev.columns if col not in ['claim_id', 'claim_seq', 'policy_id', 'claim_amount']}
agg_dict['claim_amount'] = 'sum'

df_grouped_business_claims_sev = (
    df_sev.drop(columns=['claim_id', 'claim_seq'])
    .groupby('policy_id', as_index=False)
    .agg(agg_dict)
)

df_grouped_business_claims_sev.head(10)

In [21]:
#merging the two datasets on policy_id
sev_claim_amounts = df_sev.groupby('policy_id', as_index=False)['claim_amount'].sum()

df_combined = df_freq.merge(
    sev_claim_amounts, 
    on='policy_id', 
    how='left'
)

df_combined['claim_amount'] = df_combined['claim_amount'].fillna(0)

df_combined.columns

df_combined.to_csv("bussiness_clams_model_data.csv", index=False)

In [23]:
f_combined = df_combined[df_combined['exposure'] > 0].copy()
df_combined['pure_premium'] = df_combined['claim_amount'] / df_combined['exposure']

# 2. Separate Features, Target, and Weights
features = [
    'solar_system', 'production_load', 'energy_backup_score', 
    'supply_chain_index', 'avg_crew_exp', 'maintenance_freq', 
    'safety_compliance'
]

X = df_combined[features]
y = df_combined['pure_premium']
weights = df_combined['exposure']

# 3. Preprocessing Setup
# One-hot encode categorical data and standardize numerical data (crucial for regularization)
categorical_cols = ['solar_system']
numerical_cols = [col for col in features if col not in categorical_cols]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_cols)
    ])

# 4. Create Tweedie GLM Pipeline
# power=1.5 is standard for pure premium. alpha=0.1 applies mild L2 regularization.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', TweedieRegressor(power=1.5, alpha=0.1, max_iter=1000))
])

# 5. Train/Test Split to validate against overfitting
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    X, y, weights, test_size=0.2, random_state=42
)

# 6. Fit the Model (passing exposure as weights)
model_pipeline.fit(X_train, y_train, regressor__sample_weight=w_train)

# 7. Evaluate
y_pred = model_pipeline.predict(X_test)

# The D^2 score measures the fraction of Tweedie deviance explained by the model
d2_score = d2_tweedie_score(y_test, y_pred, sample_weight=w_test, power=1.5)
mae = mean_absolute_error(y_test, y_pred, sample_weight=w_test)

print(f"D^2 Tweedie Score (Deviance Explained): {d2_score:.4f}")
print(f"Weighted Mean Absolute Error: {mae:.2f}")

D^2 Tweedie Score (Deviance Explained): -0.0025
Weighted Mean Absolute Error: 1651107.47
