In [9]:
import os
import sys

# Step 1: Get the parent directory of your current working directory
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Step 2: Add project root to sys.path if not already present
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Step 3: Confirm
print("Project root added to sys.path:", project_root in sys.path)
print("src.data_loader readable:", os.path.exists(os.path.join(project_root, "src", "data_loader.py")))



Project root added to sys.path: True
src.data_loader readable: True


## 1. Introduction
- Predict total claims and optimal premium values.

In [10]:
import sys
sys.path.append('../src')
import pandas as pd
import src.data_loader as data_loader
import src.modeling as modeling
import src.utils as utils

## 2. Data Loading

In [13]:
df = data_loader.load_data('../data/MachineLearningRating_v3.txt')
df.head(10)

Unnamed: 0,UnderwrittenCoverID|PolicyID|TransactionMonth|IsVATRegistered|Citizenship|LegalType|Title|Language|Bank|AccountType|MaritalStatus|Gender|Country|Province|PostalCode|MainCrestaZone|SubCrestaZone|ItemType|mmcode|VehicleType|RegistrationYear|make|Model|Cylinders|cubiccapacity|kilowatts|bodytype|NumberOfDoors|VehicleIntroDate|CustomValueEstimate|AlarmImmobiliser|TrackingDevice|CapitalOutstanding|NewVehicle|WrittenOff|Rebuilt|Converted|CrossBorder|NumberOfVehiclesInFleet|SumInsured|TermFrequency|CalculatedPremiumPerTerm|ExcessSelected|CoverCategory|CoverType|CoverGroup|Section|Product|StatutoryClass|StatutoryRiskType|TotalPremium|TotalClaims
0,145249|12827|2015-03-01 00:00:00|True| |Close...
1,145249|12827|2015-05-01 00:00:00|True| |Close...
2,145249|12827|2015-07-01 00:00:00|True| |Close...
3,145255|12827|2015-05-01 00:00:00|True| |Close...
4,145255|12827|2015-07-01 00:00:00|True| |Close...
5,145247|12827|2015-01-01 00:00:00|True| |Close...
6,145247|12827|2015-04-01 00:00:00|True| |Close...
7,145247|12827|2015-06-01 00:00:00|True| |Close...
8,145247|12827|2015-08-01 00:00:00|True| |Close...
9,145245|12827|2015-03-01 00:00:00|True| |Close...


## 3. Feature Engineering
- Encode categorical variables for modeling.

In [None]:
cat_cols = ['Province', 'VehicleType', 'Gender', 'CoverType', 'Product']
df_enc = utils.encode_categorical(df, cat_cols)
df_enc.head()

KeyError: 'VehicleType'

 Fit Linear Regression Model

In [None]:
features = ['PostalCode', 'SumInsured', 'CalculatedPremiumPerTerm']
target = 'TotalClaims'
model, preds, r2, mse = modeling.fit_linear_regression(df_enc, features, target)
print(f'R2: {r2:.3f}, MSE: {mse:.2f}')

## 5. ML Model: Predict Optimal Premium Values

In [None]:
features = ['SumInsured', 'TotalClaims', 'PostalCode', 'Province', 'VehicleType', 'Gender']
target = 'TotalPremium'
model, X_test, y_test, y_pred, importances = modeling.fit_random_forest(df_enc, features, target)
from sklearn.metrics import mean_squared_error, r2_score
print(f'R2: {r2_score(y_test, y_pred):.3f}, MSE: {mean_squared_error(y_test, y_pred):.2f}')

### Feature Importances

In [None]:
import matplotlib.pyplot as plt
import numpy as np
feat_names = features
plt.figure(figsize=(8,4))
plt.barh(feat_names, importances)
plt.xlabel('Importance')
plt.title('Feature Importances for Premium Prediction')
plt.show()