In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

def enhanced_risk_score(df):
    base_score = (
        df['total_harsh_brakes'] * 20 +
        df['total_harsh_accels'] * 15 +
        (df['max_speed_overall'] / 2) +
        (df['night_trip_pct_overall'] * 10)
    )
    vehicle_risk_map = {'Sedan': 0, 'SUV': 5, 'Sports Car': 15, 'Truck': 10, 'Electric': -5}
    total_score = base_score + df['claims_weighted_score'] + df['vehicle_type'].map(vehicle_risk_map).fillna(0)
    return np.clip(total_score, 0, 100)

In [2]:
def train_xgboost_risk_model(driver_df):
    # Encode vehicle_type categorical to numeric
    vehicle_type_map = {'Sedan': 0, 'SUV': 1, 'Sports Car': 2, 'Truck': 3, 'Electric': 4}
    driver_df['vehicle_type_num'] = driver_df['vehicle_type'].map(vehicle_type_map).fillna(-1)
    
    # Select all numeric features except driver_id and vehicle_type
    exclude_cols = ['driver_id']
    features = [col for col in driver_df.columns if col not in exclude_cols]
    
    X = driver_df[features]
    y = enhanced_risk_score(driver_df)
    
    # Train/test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    model = XGBRegressor(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
        verbosity=1
    )
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    print(f"Test MSE: {mean_squared_error(y_test, y_pred):.4f}")
    print(f"Test R2: {r2_score(y_test, y_pred):.4f}")
    
    # Add predicted risk scores as a new column clipped between 0 and 100
    driver_df['risk_score'] = np.clip(model.predict(X), 0, 100)
    
    return model, driver_df




In [3]:
if __name__ == "__main__":
    # Load your existing driver features CSV
    driver_df = pd.read_csv('driver_data.csv')
    
    # Train XGBoost model and append risk score column
    model, driver_df = train_xgboost_risk_model(driver_df)
    
    # Save updated DataFrame with new risk_score column
    driver_df.to_csv('driver_data_with_risk_score.csv', index=False)
    print("Risk score column added and saved to driver_data_with_risk_score.csv")

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:vehicle_type: object

In [14]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

def enhanced_risk_score(df):
    base_score = (
        df['total_harsh_brakes'] * 20 +
        df['total_harsh_accels'] * 15 +
        (df['max_speed_overall'] / 2) +
        (df['night_trip_pct_overall'] * 10)
    )
    vehicle_risk_map = {'Sedan': 0, 'SUV': 5, 'Sports Car': 15, 'Truck': 10, 'Electric': -5}
    total_score = base_score + df['claims_weighted_score'] + df['vehicle_type'].map(vehicle_risk_map).fillna(0)
    return np.clip(total_score, 0, 100)

In [15]:
df = pd.read_csv('driver_data.csv')


In [16]:
df["enhanced_risk_score"] = enhanced_risk_score(df)

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

# Features: include enhanced_risk_score
X = df.drop(columns=["claims_weighted_score", "driver_id"])
y = df["claims_weighted_score"]

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define categorical & numeric columns
categorical = ["vehicle_type"]
numeric = [col for col in X.columns if col not in categorical]

# Preprocessor: scale numeric, one-hot encode categorical
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical)
])

# XGBoost model
xgb_model = XGBRegressor(
    n_estimators=50,       # keep small
    learning_rate=0.1,     # stable step size
    max_depth=3,           # small trees, but enough splits
    subsample=0.8,         # some randomness
    colsample_bytree=0.8,  # some randomness
    random_state=42,
    n_jobs=-1
)

# Pipeline
model = Pipeline([
    ("preprocess", preprocessor),
    ("xgb", xgb_model)
])

# Train
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("XGBoost RMSE:", rmse)
print("R² Score:", r2)

XGBoost RMSE: 48.0313835144043
R² Score: 0.4951446056365967
