In [1]:
# src/read_from_db.py
import sqlite3
import pandas as pd812
from cryptography.fernet import Fernet
import pandas as pd

DB_FILE = "telematics.db"
KEY_FILE = "secret.key"

# ------------------------
# Connect to database
# ------------------------
conn = sqlite3.connect(DB_FILE)
try:
    # Read drivers table
    df = pd.read_sql("SELECT * FROM drivers", conn)

    # Read trips table
    features = pd.read_sql("SELECT * FROM trips", conn)

    # Read telemetry (lat/lon are stored encrypted and decrypted further down)
    telemetry = pd.read_sql("SELECT * FROM telemetry_secure", conn)
finally:
    # Always release the SQLite handle, even when one of the queries fails
    # (e.g. a missing table); previously an exception skipped close().
    conn.close()

print("✅ Loaded from DB")
print("Drivers:", df.shape)
print("Trips:", features.shape)
print("Telemetry:", telemetry.shape)

# ------------------------
# Example: decryption for analysis
# ------------------------
def load_key(key_file=KEY_FILE):
    """Return the raw bytes stored in ``key_file``.

    The file is opened in binary mode and read in full; the bytes are
    returned unchanged (no stripping or decoding).
    """
    with open(key_file, mode="rb") as key_handle:
        key_bytes = key_handle.read()
    return key_bytes

def decrypt_value(value, fernet):
    """Decrypt one encrypted cell and return it as a float.

    Parameters
    ----------
    value : str | bytes | None
        The encrypted token. ``str`` tokens are encoded to bytes before
        decryption; ``bytes`` tokens are passed through unchanged.
        Missing values (``None``, NaN, ``pd.NA``) are returned as ``None``.
    fernet : object
        Any object exposing ``decrypt(token) -> bytes`` (a ``Fernet``
        instance in this notebook).

    Returns
    -------
    float | None
        The decrypted number, or ``None`` when the cell was missing.
    """
    # Depending on the column dtype, pandas may hand a missing cell over as
    # None, NaN or pd.NA; none of those have .encode(), so filter them first.
    if value is None or pd.isna(value):
        return None
    # Tokens stored as BLOBs arrive as bytes and must not be re-encoded.
    token = value.encode() if isinstance(value, str) else value
    return float(fernet.decrypt(token).decode())

# Build the Fernet cipher from the key stored on disk
key = load_key()
fernet = Fernet(key)

# Add plaintext coordinate columns next to the encrypted originals;
# the cipher is forwarded to decrypt_value as an extra positional argument.
telemetry["lat_dec"] = telemetry["lat"].apply(decrypt_value, args=(fernet,))
telemetry["lon_dec"] = telemetry["lon"].apply(decrypt_value, args=(fernet,))

# Show the telemetry frame, now including the decrypted lat/lon columns
display(telemetry)


✅ Loaded from DB
Drivers: (30, 23)
Trips: (300, 12)
Telemetry: (209104, 11)


Unnamed: 0,id,timestamp,trip_id,driver_id,lat,lon,speed,acceleration,road_type,engine_on,geohash,lat_dec,lon_dec
0,1,2025-09-07 01:17:00,driver_1_trip_1,driver_1,gAAAAABowmWylTbbXDlUHl-zlmxBNt-jwaK-tanrmV6slS...,gAAAAABowmW1Yr8susleq1X8ZsWJiK1U_Q-2e_VshY3dv9...,9.905877,-2.343964,residential,1,9muz8,33.678795,-117.062931
1,2,2025-09-07 01:17:05,driver_1_trip_1,driver_1,gAAAAABowmWyT5Ax185kRaethjl1hllfBMcHPoRj3vBWve...,gAAAAABowmW12Wy3gScW8eSQq5_8b1NZIsJLGy-JQygtku...,11.761901,0.371205,residential,1,9ub15,26.901010,-101.083936
2,3,2025-09-07 01:17:10,driver_1_trip_1,driver_1,gAAAAABowmWy-3FjlbDklgqPgBw9ukct7WmBWsLJZhKxa0...,gAAAAABowmW1YbH3HF1EaHZOmg8i414WiefDODSCmPCIaV...,12.201238,0.087868,residential,1,dq7rh,36.428386,-74.003414
3,4,2025-09-07 01:17:15,driver_1_trip_1,driver_1,gAAAAABowmWyl2Cg91Sa2nFSaUn5nDSZ9cHZ5KXKwRzlsV...,gAAAAABowmW1bWa1sIFAuWVfjpOkx_uXdZxxdZOsnS8Vyg...,20.073079,1.574368,residential,1,9mu5p,32.871377,-117.781032
4,5,2025-09-07 01:17:20,driver_1_trip_1,driver_1,gAAAAABowmWyrsNscrmYnHEENxY4VJ9ZbbL9tujzXW0ep8...,gAAAAABowmW18Ob4NkSDACi9udXQrZVTsoMyAj5Y0frJLh...,8.429803,-2.328655,residential,1,dr7v2,41.743242,-73.433907
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209099,209100,2025-09-05 05:19:05,driver_30_trip_10,driver_30,gAAAAABowmYJXRybYpeS4ggCAw46VZmsCcANcJJ6C5TBBd...,gAAAAABowmYLtqlnXoaSp4g7Wxs4yVPtdfJ8mJHoVEYdsI...,49.637773,-3.356191,highway,1,c2e34,48.015401,-119.085536
209100,209101,2025-09-05 05:19:10,driver_30_trip_10,driver_30,gAAAAABowmYJbdE1Nf0VaI2ibziWNzZqgFJTPR0j5V-mMI...,gAAAAABowmYLp5Q5fpoo7Y0PmLL3ZqEojkutdC5NojKqDa...,60.835609,2.239567,highway,1,f2wcr,48.057874,-68.942512
209101,209102,2025-09-05 05:19:15,driver_30_trip_10,driver_30,gAAAAABowmYJxuUeVBiH9u9GWLrlwgPVLNouuWq7NuHorc...,gAAAAABowmYLSmJiWyzwd4xzW4OxnuyX71UVk3ijOm4d4r...,53.181163,-1.530889,highway,1,dnky8,36.303880,-83.298447
209102,209103,2025-09-05 05:19:20,driver_30_trip_10,driver_30,gAAAAABowmYJ2G9TQc4XyPQjh5NESRTjMlTCo8srUI93PX...,gAAAAABowmYL2Y7YZT22RM0W5Qe-gdk4OTFGAr2wzZDlob...,53.840610,0.131889,highway,1,f0jm1,45.888556,-82.568962


In [2]:
import numpy as np

def enhanced_risk_score(df, weights=None, vehicle_risk_map=None):
    """Compute a heuristic 0-100 risk score for every row of ``df``.

    Each numeric feature named in ``weights`` is min-max normalised to
    [0, 1] using the values observed in ``df`` and multiplied by its weight.
    The vehicle type is mapped to risk points, normalised over the range of
    ``vehicle_risk_map`` and given whatever weight is left to reach 100.

    Because the normalisation uses the min/max of the frame passed in, the
    scores are relative to that frame: scoring a different set of rows can
    change the score of the same driver.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``weights`` plus ``vehicle_type``.
        The frame is not modified.
    weights : dict, optional
        Column name -> percentage points. The default weights sum to 90,
        which leaves 10 points for the vehicle type.
    vehicle_risk_map : dict, optional
        Vehicle type -> risk points. Types missing from the map are scored
        as 0 points.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(df)``, clipped to [0, 100]. A missing
        feature value makes that row's score NaN.

    Raises
    ------
    ValueError
        If a weight is negative or the weights sum to more than 100.
    KeyError
        If ``df`` lacks one of the required columns.
    """
    if weights is None:
        weights = {
            'total_harsh_brakes': 20,
            'total_harsh_accels': 15,
            'max_speed_overall': 20,
            'night_trip_pct_overall': 10,
            'claims_weighted_score': 25,
        }
    if vehicle_risk_map is None:
        vehicle_risk_map = {'Sedan': 0, 'SUV': 5, 'Sports Car': 15, 'Truck': 10, 'Electric': -5}

    total_weight = sum(weights.values())
    if any(weight < 0 for weight in weights.values()) or total_weight > 100:
        raise ValueError("weights must be non-negative and sum to at most 100")
    # Whatever the numeric features do not use is assigned to the vehicle type.
    vehicle_weight = 100 - total_weight

    score = np.zeros(len(df), dtype=float)

    # Numeric features: min-max scale against this frame, then weight.
    for feature, weight in weights.items():
        column = df[feature]
        min_val = column.min()
        max_val = column.max()
        # A constant (or all-missing) column has no spread; divide by 1 so it
        # contributes 0 instead of raising a division-by-zero warning.
        range_val = max_val - min_val if max_val > min_val else 1
        normalised = (column - min_val) / range_val
        score += normalised.to_numpy(dtype=float, na_value=np.nan) * weight

    # Vehicle type: derive the scaling bounds from the map itself rather than
    # hard-coding them, so a customised map stays correctly normalised.
    risk_points = list(vehicle_risk_map.values())
    lowest = min(risk_points, default=0)
    highest = max(risk_points, default=0)
    span = highest - lowest if highest > lowest else 1
    vehicle_risk_raw = df['vehicle_type'].map(vehicle_risk_map).fillna(0)
    # The 0-point fallback for unknown types can fall outside a custom map's
    # range, hence the clip (a no-op for the default map).
    vehicle_risk_norm = ((vehicle_risk_raw - lowest) / span).clip(0, 1)
    score += vehicle_risk_norm.to_numpy(dtype=float, na_value=np.nan) * vehicle_weight

    # Ensure score within bounds 0-100
    return score.clip(0, 100)


In [3]:
# Attach the heuristic score to the drivers frame as a new column.
# Re-running this cell is safe: enhanced_risk_score never reads this column.
df["enhanced_risk_score"] = enhanced_risk_score(df)

In [4]:
# Inspect the drivers frame, which now includes the enhanced_risk_score column.
df

Unnamed: 0,driver_id,num_trips,total_miles,total_drive_time_min,avg_trip_duration_min,avg_trip_miles,avg_speed_overall,max_speed_overall,total_harsh_brakes,total_harsh_accels,...,urban_pct_overall,highway_pct_overall,years_driving,num_claims,num_violations,vehicle_age,vehicle_type,insurance_policy_length_years,claims_weighted_score,enhanced_risk_score
0,driver_10,10,108.77693,260.75,26.075,10.877693,24.966993,90.0,8,17,...,0.400128,0.198786,7,0,4,8,SUV,3.0,60.0,62.036752
1,driver_11,10,151.539064,363.333333,36.333333,15.153906,25.032543,90.0,15,18,...,0.400229,0.198624,20,0,1,14,Sedan,1.0,15.0,56.930106
2,driver_12,10,120.937551,287.833333,28.783333,12.093755,25.009732,90.0,9,14,...,0.399826,0.19861,29,0,0,14,Sedan,5.0,0.0,33.319291
3,driver_13,10,123.171401,294.5,29.45,12.31714,25.089498,87.146745,14,19,...,0.39983,0.198925,15,0,0,9,Sedan,1.0,0.0,28.860512
4,driver_14,10,137.031277,328.833333,32.883333,13.703128,24.855586,90.0,8,18,...,0.399899,0.198936,11,0,0,9,Sedan,8.0,0.0,36.750652
5,driver_15,10,88.125554,212.083333,21.208333,8.812555,25.090529,90.0,6,13,...,0.4,0.198035,8,0,0,11,Sedan,1.0,0.0,26.905214
6,driver_16,10,115.045247,276.333333,27.633333,11.504525,24.960005,90.0,13,15,...,0.400784,0.19813,29,0,0,12,Truck,1.0,0.0,48.959493
7,driver_17,10,139.531925,336.333333,33.633333,13.953193,24.800056,90.0,11,19,...,0.399901,0.198959,21,0,1,2,Sedan,2.0,15.0,49.204517
8,driver_18,10,145.438654,347.583333,34.758333,14.543865,25.053614,90.0,5,25,...,0.399664,0.199233,7,0,0,14,Sedan,2.0,0.0,42.094869
9,driver_19,10,101.456238,241.0,24.1,10.145624,24.955506,90.0,14,14,...,0.400415,0.198824,26,1,0,6,Sedan,6.0,25.0,51.878343


In [5]:
# import pandas as pd
# import numpy as np
# from xgboost import XGBRegressor
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.metrics import mean_squared_error, r2_score
# from sklearn.metrics import mean_absolute_error

# # Assuming df is your dataframe
# X = df.drop(columns=["claims_weighted_score", "driver_id", "enhanced_risk_score"])
# y = df["enhanced_risk_score"]

# # Train/test split with validation set for early stopping
# X_train_full, X_test, y_train_full, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42
# )
# X_train, X_val, y_train, y_val = train_test_split(
#     X_train_full, y_train_full, test_size=0.2, random_state=42
# )

# categorical = ["vehicle_type"]
# numeric = [col for col in X.columns if col not in categorical]

# preprocessor = ColumnTransformer([
#     ("num", StandardScaler(), numeric),
#     ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
# ])

# xgb_model = XGBRegressor(
#     random_state=42,
#     n_jobs=-1,
#     objective='reg:squarederror'
# )

# model = Pipeline([
#     ("preprocess", preprocessor),
#     ("xgb", xgb_model)
# ])

# param_dist = {
#     'xgb__n_estimators': [100, 200, 300, 400, 500],
#     'xgb__learning_rate': [0.001, 0.01, 0.05, 0.1],
#     'xgb__max_depth': [3, 4, 5, 6, 8, 10],
#     'xgb__min_child_weight': [1, 2, 3, 5],
#     'xgb__gamma': [0, 0.1, 0.3, 0.5, 1],
#     'xgb__subsample': [0.6, 0.8, 1.0],
#     'xgb__colsample_bytree': [0.6, 0.8, 1.0],
#     'xgb__reg_alpha': [0, 0.1, 0.5, 1],
#     'xgb__reg_lambda': [1, 1.5, 2, 3]
# }

# random_search = RandomizedSearchCV(
#     estimator=model,
#     param_distributions=param_dist,
#     n_iter=50,
#     scoring='neg_mean_squared_error',
#     cv=5,
#     verbose=2,
#     random_state=42,
#     n_jobs=-1
# )

# # Custom fit to use early stopping with validation
# def fit_with_early_stopping(random_search, X_train, y_train, X_val, y_val):
#     best_score = float('inf')
#     best_model = None
#     for params in random_search.param_distributions:
#         model.set_params(**params)
#         model.fit(
#             X_train, y_train,
#             xgb__early_stopping_rounds=10,
#             xgb__eval_set=[(X_val, y_val)],
#             xgb__verbose=False
#         )
#         preds = model.predict(X_val)
#         rmse = mean_squared_error(y_val, preds, squared=False)
#         if rmse < best_score:
#             best_score = rmse
#             best_model = model
#     return best_model

# # Fit RandomizedSearchCV normally (early stopping inside pipeline not directly supported)
# random_search.fit(X_train_full, y_train_full)

# print("Best hyperparameters:", random_search.best_params_)

# y_pred = random_search.predict(X_test)
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # mean_squared_error alone returns MSE, not RMSE
# r2 = r2_score(y_test, y_pred)
# mae = mean_absolute_error(y_test, y_pred)

# print(f"Test MAE: {mae}")
# print(f"Test RMSE: {rmse}")
# print(f"Test R²: {r2}")


In [6]:
# import pandas as pd
# from catboost import CatBoostRegressor
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# # Assume df is loaded with data and feature engineered as before

# X = df.drop(columns=['driver_id', 'enhanced_risk_score'])
# y = df['enhanced_risk_score']

# categorical_features = ['vehicle_type']  # update with all categorical columns

# # Train/test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # CatBoost Regressor instance (silent)
# catboost_model = CatBoostRegressor(
#     random_seed=42,
#     cat_features=categorical_features,
#     verbose=0
# )

# # Hyperparameter grid
# param_grid = {
#     'depth': [4, 6, 8],
#     'learning_rate': [0.005, 0.01, 0.05],
#     'l2_leaf_reg': [1, 3, 5, 7],
#     'iterations': [500, 1000, 1500],
#     'subsample': [0.7, 0.85, 1.0],
#     'random_strength': [1, 5, 10]
# }

# # Grid search with 3-fold CV
# grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid,
#                            scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)

# grid_search.fit(X_train, y_train)

# print("Best parameters:", grid_search.best_params_)

# # Evaluate best model on test set
# best_catboost = grid_search.best_estimator_

# y_pred = best_catboost.predict(X_test)
# mae = mean_absolute_error(y_test, y_pred)
# rmse = mean_squared_error(y_test, y_pred) ** 0.5  # mean_squared_error alone returns MSE, not RMSE
# r2 = r2_score(y_test, y_pred)

# print(f"Test MAE: {mae:.4f}")
# print(f"Test RMSE: {rmse:.4f}")
# print(f"Test R²: {r2:.4f}")


In [7]:
# import pandas as pd
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# from catboost import CatBoostRegressor, Pool
# import optuna

# # -------------------------------
# # 1) Load Data
# # -------------------------------

# X = df.drop(columns=["driver_id", "enhanced_risk_score"])
# y = df["enhanced_risk_score"]

# cat_feats = ["vehicle_type"]

# # -------------------------------
# # 2) Train/test split
# # -------------------------------
# RANDOM_SEED = 42
# TEST_SIZE = 0.2

# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=TEST_SIZE, random_state=RANDOM_SEED
# )

# # -------------------------------
# # 3) CatBoost Baseline
# # -------------------------------
# cat_model = CatBoostRegressor(
#     random_seed=RANDOM_SEED,
#     cat_features=cat_feats,
#     verbose=0
# )

# cat_model.fit(X_train, y_train)
# y_pred = cat_model.predict(X_test)

# print("\n=== Baseline CatBoost ===")
# print(f"MAE: {mean_absolute_error(y_test, y_pred):.4f}")
# print(f"RMSE: {mean_squared_error(y_test, y_pred):.4f}")
# print(f"R²: {r2_score(y_test, y_pred):.4f}")

# # -------------------------------
# # 4) Optuna Hyperparameter Tuning
# # -------------------------------
# def objective(trial):
#     params = {
#         "depth": trial.suggest_int("depth", 4, 8),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
#         "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 1, 10),
#         "iterations": trial.suggest_int("iterations", 500, 2000),
#         "random_strength": trial.suggest_int("random_strength", 1, 10),
#         "subsample": trial.suggest_float("subsample", 0.7, 1.0),
#     }
    
#     model = CatBoostRegressor(
#         random_seed=RANDOM_SEED,
#         cat_features=cat_feats,
#         verbose=0,
#         **params
#     )
#     model.fit(X_train, y_train)
#     preds = model.predict(X_test)
#     return mean_squared_error(y_test, preds)  # NOTE: this is MSE, not RMSE, and it is scored on the test split

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=25, show_progress_bar=True)

# print("\nBest Trial:", study.best_trial.params)

# # -------------------------------
# # 5) Final CatBoost with Best Params
# # -------------------------------
# best_params = study.best_trial.params
# final_model = CatBoostRegressor(
#     random_seed=RANDOM_SEED,
#     cat_features=cat_feats,
#     verbose=0,
#     **best_params
# )
# final_model.fit(X_train, y_train)
# y_pred_final = final_model.predict(X_test)

# print("\n=== Tuned CatBoost ===")
# print(f"MAE: {mean_absolute_error(y_test, y_pred_final):.4f}")
# print(f"RMSE: {mean_squared_error(y_test, y_pred_final):.4f}")
# print(f"R²: {r2_score(y_test, y_pred_final):.4f}")


In [10]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# -----------------------------
# Load Data
# -----------------------------
# Target is the heuristic score; every other column except driver_id is a feature.
# NOTE(review): X still contains the columns enhanced_risk_score is computed
# from (e.g. claims_weighted_score, vehicle_type), so the models below are
# fitted to reproduce a deterministic formula.
y = df['enhanced_risk_score']
X = df.drop(columns=['driver_id', 'enhanced_risk_score'])

# Split the feature columns by type (column order of X is preserved)
categorical_features = ['vehicle_type']
numeric_features = [name for name in X.columns if name not in categorical_features]

# -----------------------------
# Preprocessing
# -----------------------------
# One-hot encode the categorical columns; numeric columns pass through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features),
    ]
)

# -----------------------------
# Define Models
# -----------------------------
# CatBoost is fitted on the raw frame (no pipeline), so it is told which
# column positions of X are categorical.
catboost_model = CatBoostRegressor(
    cat_features=[X.columns.get_loc(col_name) for col_name in categorical_features],
    random_seed=42,
    verbose=0,
)

# Base learners, all seeded for repeatability
rf_model = RandomForestRegressor(random_state=42)
xgb_model = XGBRegressor(random_state=42, verbosity=0)
gbr_model = GradientBoostingRegressor(random_state=42)

# Stacking ensemble: a Ridge meta-learner on top of the three base learners.
# passthrough=True also feeds the preprocessed input features to the Ridge.
estimators = [('rf', rf_model), ('xgb', xgb_model), ('gbr', gbr_model)]
stack_model = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(alpha=1.0),
    passthrough=True,
    n_jobs=-1,
)

# -----------------------------
# Pipelines (for models needing preprocessing)
# -----------------------------
# Every sklearn-style model gets the same one-hot preprocessing step in front.
rf_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', rf_model)])
xgb_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', xgb_model)])
gbr_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', gbr_model)])
stack_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', stack_model)])

# Display name -> estimator; insertion order is the order results are reported in
models = {
    "CatBoost": catboost_model,
    "RandomForest": rf_pipeline,
    "XGBoost": xgb_pipeline,
    "GradientBoosting": gbr_pipeline,
    "StackingEnsemble": stack_pipeline,
}

# -----------------------------
# Train/Test Split (for final evaluation)
# -----------------------------
# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -----------------------------
# Cross-Validation Setup
# -----------------------------
# Shuffled 5-fold splitter with a fixed seed, so every model sees the same folds
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Error metrics are registered with greater_is_better=False, so sklearn
# reports them negated; cross_validate_model flips the sign back.
scoring = dict(
    MAE=make_scorer(mean_absolute_error, greater_is_better=False),
    RMSE=make_scorer(
        lambda y_true, y_hat: np.sqrt(mean_squared_error(y_true, y_hat)),
        greater_is_better=False,
    ),
    R2=make_scorer(r2_score),
)

def cross_validate_model(model, X, y):
    """Cross-validate ``model`` and return its fold-averaged metrics.

    All three scorers in the module-level ``scoring`` dict are evaluated in a
    single ``cross_validate`` run over the module-level ``kf`` splitter, so
    each fold is fitted once (previously every metric triggered its own full
    round of fits).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible regressor or pipeline.
    X, y : array-like
        Features and target passed straight to ``cross_validate``.

    Returns
    -------
    dict
        ``{"MAE": ..., "RMSE": ..., "R²": ...}`` averaged over the folds.
        MAE and RMSE are returned as positive numbers.
    """
    fold_scores = cross_validate(model, X, y, cv=kf, scoring=scoring, n_jobs=-1)
    return {
        # The error scorers are negated by sklearn; flip them back.
        "MAE": -np.mean(fold_scores["test_MAE"]),
        "RMSE": -np.mean(fold_scores["test_RMSE"]),
        "R²": np.mean(fold_scores["test_R2"]),
    }

# -----------------------------
# Fit Models & Evaluate
# -----------------------------
# Cross-validate every model on the full X / y (this includes the rows that
# are later used as the hold-out set) and collect the averaged metrics.
print("=== Cross-Validation Results (5-Fold Average) ===")
cv_results = {}  # model name -> {"MAE", "RMSE", "R²"} as returned by cross_validate_model
for name, model in models.items():
    print(f"Running CV for {name}...")
    cv_results[name] = cross_validate_model(model, X, y)

# Transpose so that each model is a row and each metric a column
cv_df = pd.DataFrame(cv_results).T
print(cv_df)

# Refit every model on the training split and score it once on the hold-out split.
# NOTE(review): the drivers table printed 30 rows when loaded, which would make
# the 20% hold-out only about 6 rows, so these numbers are likely very noisy
# (confirm the row count before relying on them).
print("\n=== Hold-Out Test Results ===")
for name, model in models.items():
    print(f"Training {name}...")
    # fit() mutates the estimator object stored in `models` in place
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"{name} -> MAE: {mae:.4f}, RMSE: {rmse:.4f}, R²: {r2:.4f}")


=== Cross-Validation Results (5-Fold Average) ===
Running CV for CatBoost...
Running CV for RandomForest...
Running CV for XGBoost...
Running CV for GradientBoosting...
Running CV for StackingEnsemble...
                       MAE       RMSE        R²
CatBoost          8.488642   9.852129 -0.021176
RandomForest      8.839239  10.292795 -0.152049
XGBoost           9.269547  10.936334 -0.384808
GradientBoosting  9.895270  11.180327 -0.489254
StackingEnsemble  2.676506   3.157082  0.801570

=== Hold-Out Test Results ===
Training CatBoost...
CatBoost -> MAE: 5.9872, RMSE: 6.5334, R²: 0.6254
Training RandomForest...
RandomForest -> MAE: 6.2625, RMSE: 7.1456, R²: 0.5519
Training XGBoost...
XGBoost -> MAE: 9.0612, RMSE: 9.9547, R²: 0.1303
Training GradientBoosting...
GradientBoosting -> MAE: 9.1663, RMSE: 9.7184, R²: 0.1711
Training StackingEnsemble...
StackingEnsemble -> MAE: 1.8475, RMSE: 2.2762, R²: 0.9545
