<a href="https://colab.research.google.com/github/Srideep-Kundu/MLProjects/blob/main/AccidentRiskPredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas
import json

with open('/content/traffic.json', 'r') as json_data :
  data = json.load(json_data)

df = pandas.DataFrame(data['records']).head(1000)
df.head()

Unnamed: 0,trip_id,driver_id,route,state,distance_km,duration_minutes,avg_speed_kmh,behavior_class,behavior_label,sensor_features,safety_score,grade,weather,time_of_day,traffic_density,accident_risk,timestamp
0,T000001,D23133,NH-1 Delhi-Amritsar,Gujarat,184.8,260,42.7,0,normal_driving,"[0.158, 0, 0, 0.372, -0.085, 0.456, 0.036, 0.1...",36,F,fog,night,high,0.15,1730821520
1,T000002,D29720,NH-8 Delhi-Mumbai,Uttar Pradesh,130.3,169,46.1,0,normal_driving,"[0.078, 0, 0, 0.228, -0.07, 0.298, 0.036, 0.07...",39,F,clear,peak_evening,high,0.17,1742426943
2,T000003,D08650,NH-3 Agra-Mumbai,Delhi,125.7,156,48.5,0,normal_driving,"[0.127, 0, 0, 0.374, -0.086, 0.459, 0.045, 0.1...",30,F,fog,night,high,0.14,1738703905
3,T000004,D31719,Yamuna Expressway,Rajasthan,165.0,212,46.6,2,sharp_turn,"[0.153, 0, 0, 0.4, -0.096, 0.496, 4.209, 13.80...",37,F,heavy_rain,midday,high,0.06,1745921559
4,T000005,D26385,Yamuna Expressway,Maharashtra,129.0,128,60.5,3,aggressive_acceleration,"[2.796, 0, 0, 2.959, 2.64, 0.319, 0.176, 0.028...",50,F,cloudy,night,high,0.06,1751267102


Selecting only relevant features

In [31]:
relevant_features = [
    'distance_km',
    'duration_minutes',
    'avg_speed_kmh',
    'behavior_class',
    'safety_score',
    'weather',
    'time_of_day',
    'traffic_density',
    'accident_risk',
    'timestamp'
]

df_processed = df[relevant_features].copy()

# Handle missing values (example: fill with median for numerical, mode for categorical)
for col in df_processed.columns:
    if df_processed[col].dtype in ['int64', 'float64']:
        df_processed[col] = df_processed[col].fillna(df_processed[col].median())
    else:
        df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])

# Encode categorical features (example: one-hot encoding)
categorical_cols = ['weather', 'time_of_day', 'traffic_density']
df_processed = pandas.get_dummies(df_processed, columns=categorical_cols, drop_first=True)

# Convert timestamp to datetime and extract features (this will be part of Feature Engineering, but we can start here)
df_processed['timestamp'] = pandas.to_datetime(df_processed['timestamp'], unit='s')
df_processed['hour'] = df_processed['timestamp'].dt.hour
df_processed['day_of_week'] = df_processed['timestamp'].dt.dayofweek
df_processed.drop('timestamp', axis=1, inplace=True) # Drop original timestamp

display(df_processed.head())

Unnamed: 0,distance_km,duration_minutes,avg_speed_kmh,behavior_class,safety_score,accident_risk,weather_cloudy,weather_fog,weather_heavy_rain,weather_light_rain,time_of_day_night,time_of_day_peak_evening,time_of_day_peak_morning,traffic_density_low,traffic_density_medium,traffic_density_very_high,hour,day_of_week
0,184.8,260,42.7,0,36,0.15,False,True,False,False,True,False,False,False,False,False,15,1
1,130.3,169,46.1,0,39,0.17,False,False,False,False,False,True,False,False,False,False,23,2
2,125.7,156,48.5,0,30,0.14,False,True,False,False,True,False,False,False,False,False,21,1
3,165.0,212,46.6,2,37,0.06,False,False,True,False,False,False,False,False,False,False,10,1
4,129.0,128,60.5,3,50,0.06,True,False,False,False,True,False,False,False,False,False,7,0


Feature Engineering

In [40]:
df_processed['safety_risk_interaction'] = df_processed['safety_score'] * df_processed['accident_risk']



print(f"Number of columns after one-hot encoding: {df_processed.shape[1]}")
print(f"Columns in df_processed: {df_processed.columns.tolist()}")


initial_selection = [
    'distance_km',
    'duration_minutes',
    'avg_speed_kmh',
    'behavior_class',
    'safety_score',
    'safety_risk_interaction',
    'hour',
    'day_of_week',
]


potential_categorical_features = [
    'weather_fog', 'weather_clear', 'traffic_density_medium', 'traffic_density_low', 'traffic_density_very_high',
    'weather_cloudy', 'weather_heavy_rain', 'weather_light_rain', 'weather_snowy', 'weather_windy',
    'time_of_day_night', 'time_of_day_peak_evening', 'time_of_day_peak_morning', 'time_of_day_daytime'
]

for feature in potential_categorical_features:
    if feature in df_processed.columns and len(initial_selection) < 7:
        initial_selection.append(feature)

if len(initial_selection) < 7:
    numerical_cols = [col for col in df_processed.columns if df_processed[col].dtype in ['int64', 'float64'] and col not in initial_selection]
    for col in numerical_cols:
        if len(initial_selection) < 7:
            initial_selection.append(col)


final_features = initial_selection[:7] # Ensure we don't exceed 7 features

print(f"Final number of features selected: {len(final_features)}")
print(f"Selected features: {final_features}")

df_engineered = df_processed[final_features].copy()

display(df_engineered.head())

Number of columns after one-hot encoding: 19
Columns in df_processed: ['distance_km', 'duration_minutes', 'avg_speed_kmh', 'behavior_class', 'safety_score', 'accident_risk', 'weather_cloudy', 'weather_fog', 'weather_heavy_rain', 'weather_light_rain', 'time_of_day_night', 'time_of_day_peak_evening', 'time_of_day_peak_morning', 'traffic_density_low', 'traffic_density_medium', 'traffic_density_very_high', 'hour', 'day_of_week', 'safety_risk_interaction']
Final number of features selected: 7
Selected features: ['distance_km', 'duration_minutes', 'avg_speed_kmh', 'behavior_class', 'safety_score', 'safety_risk_interaction', 'hour']


Unnamed: 0,distance_km,duration_minutes,avg_speed_kmh,behavior_class,safety_score,safety_risk_interaction,hour
0,184.8,260,42.7,0,36,5.4,15
1,130.3,169,46.1,0,39,6.63,23
2,125.7,156,48.5,0,30,4.2,21
3,165.0,212,46.6,2,37,2.22,10
4,129.0,128,60.5,3,50,3.0,7


In [41]:
from sklearn.model_selection import train_test_split

TARGET_VARIABLE = 'accident_risk'


if TARGET_VARIABLE not in df_processed.columns:
    print(f"Error: Target variable '{TARGET_VARIABLE}' not found in the processed dataframe.")
else:

    y = df_processed[TARGET_VARIABLE]
    X_processed = df_processed.drop(TARGET_VARIABLE, axis=1)


    initial_selection_X = [
        'distance_km',
        'duration_minutes',
        'avg_speed_kmh',
        'behavior_class',
        'safety_score',
        'safety_risk_interaction',
        'hour',
        'day_of_week',
    ]

    potential_categorical_features_X = [
        'weather_fog', 'weather_clear', 'traffic_density_medium', 'traffic_density_low', 'traffic_density_very_high',
        'weather_cloudy', 'weather_heavy_rain', 'weather_light_rain', 'weather_snowy', 'weather_windy',
        'time_of_day_night', 'time_of_day_peak_evening', 'time_of_day_peak_morning', 'time_of_day_daytime'
    ]

    for feature in potential_categorical_features_X:
        if feature in X_processed.columns and len(initial_selection_X) < 10:
            initial_selection_X.append(feature)

    if len(initial_selection_X) < 10:
        numerical_cols_X = [col for col in X_processed.columns if X_processed[col].dtype in ['int64', 'float64'] and col not in initial_selection_X]
        for col in numerical_cols_X:
            if len(initial_selection_X) < 10:
                initial_selection_X.append(col)

    final_features_X = initial_selection_X[:10] # Ensure we don't exceed 10 features for X

    # Use the selected features for X
    X = X_processed[final_features_X]


    # Split into training and the rest (for test and cross-validation)
    # 60% train, 40% temp
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)


    X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    print("Data splitting complete.")
    print(f"Training set shape: {X_train.shape}, {y_train.shape}")
    print(f"Cross-validation set shape: {X_cv.shape}, {y_cv.shape}")
    print(f"Test set shape: {X_test.shape}, {y_test.shape}")

Data splitting complete.
Training set shape: (600, 10), (600,)
Cross-validation set shape: (200, 10), (200,)
Test set shape: (200, 10), (200,)


In [42]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_cv_scaled = scaler.transform(X_cv)
X_test_scaled = scaler.transform(X_test)

print("Data standardization complete.")
print("Scaled training data sample:")
print(X_train_scaled[:5])

Data standardization complete.
Scaled training data sample:
[[-1.37424166 -1.20794094 -0.15316752  1.41409024 -0.91171073  0.10168034
   0.69127932  1.57950708 -0.18569534 -0.67289264]
 [ 1.45752135  0.61658114  0.89066223 -0.65783685 -2.00000206 -1.10299504
  -1.29393309 -1.43067052 -0.18569534 -0.67289264]
 [-0.03452005  0.96538683 -1.6928164   2.45005378  0.6887177  -0.14473053
  -1.1521322  -0.42727799 -0.18569534 -0.67289264]
 [ 1.01094747  0.71049036  0.07299559  0.37812669 -0.59162504 -0.68839897
   0.12407578 -1.43067052 -0.18569534  1.48612118]
 [ 0.39128295 -0.01395223  0.55141756 -0.65783685 -1.23179642 -1.56452651
   0.12407578  0.07441828  5.38516481 -0.67289264]]


Training Random Forest Regressor Model

In [43]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


model = RandomForestRegressor(n_estimators=100, random_state=42)


model.fit(X_train_scaled, y_train)

print("Random Forest Regressor model training complete.")


y_cv_pred = model.predict(X_cv_scaled)
mse_cv = mean_squared_error(y_cv, y_cv_pred)
rmse_cv = np.sqrt(mse_cv)
r2_cv = r2_score(y_cv, y_cv_pred)

print(f"Cross-validation RMSE: {rmse_cv}")
print(f"Cross-validation R-squared: {r2_cv}")


y_train_pred = model.predict(X_train_scaled)
mse_train = mean_squared_error(y_train, y_train_pred)
rmse_train = np.sqrt(mse_train)
r2_train = r2_score(y_train, y_train_pred)

print(f"Training RMSE: {rmse_train}")
print(f"Training R-squared: {r2_train}")

Random Forest Regressor model training complete.
Cross-validation RMSE: 0.005370046554733014
Cross-validation R-squared: 0.9760442440567298
Training RMSE: 0.0022100188536149064
Training R-squared: 0.9959692136359917


In [44]:
df_analysis = df.copy()


accident_risk_by_area = df_analysis.groupby(['state', 'route'])['accident_risk'].mean().reset_index()


accident_risk_by_area_sorted = accident_risk_by_area.sort_values(by='accident_risk', ascending=False)

print("Areas with the highest average accident risk:")
display(accident_risk_by_area_sorted.head(10))


accident_risk_by_traffic_density = df_analysis.groupby('traffic_density')['accident_risk'].mean().reset_index()
accident_risk_by_traffic_density_sorted = accident_risk_by_traffic_density.sort_values(by='accident_risk', ascending=False)

print("\nAverage accident risk by traffic density:")
display(accident_risk_by_traffic_density_sorted)

Areas with the highest average accident risk:


Unnamed: 0,state,route,accident_risk
8,Delhi,NH-8 Delhi-Mumbai,0.17
58,Tamil Nadu,NH-8 Delhi-Mumbai,0.17
78,West Bengal,NH-8 Delhi-Mumbai,0.17
68,Uttar Pradesh,NH-8 Delhi-Mumbai,0.17
48,Rajasthan,NH-8 Delhi-Mumbai,0.17
38,Maharashtra,NH-8 Delhi-Mumbai,0.17
28,Karnataka,NH-8 Delhi-Mumbai,0.17
18,Gujarat,NH-8 Delhi-Mumbai,0.17
24,Karnataka,NH-4 Chennai-Thane,0.16
4,Delhi,NH-4 Chennai-Thane,0.16



Average accident risk by traffic density:


Unnamed: 0,traffic_density,accident_risk
0,high,0.133896
2,medium,0.109774
1,low,0.09
3,very_high,0.08


Hyperparameter tuning

In [45]:
# Hyperparameter Tuning using GridSearchCV

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the GridSearchCV object
# We will use the scaled training data for tuning and cross-validation within the grid search
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                           param_grid=param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

print("Starting GridSearchCV...")


grid_search.fit(X_train_scaled, y_train)

print("GridSearchCV complete.")


best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best hyperparameters found: {best_params}")


y_cv_pred_tuned = best_model.predict(X_cv_scaled)
mse_cv_tuned = mean_squared_error(y_cv, y_cv_pred_tuned)
rmse_cv_tuned = np.sqrt(mse_cv_tuned)
r2_cv_tuned = r2_score(y_cv, y_cv_pred_tuned)

print(f"Cross-validation RMSE (tuned model): {rmse_cv_tuned}")
print(f"Cross-validation R-squared (tuned model): {r2_cv_tuned}")


y_train_pred_tuned = best_model.predict(X_train_scaled)
mse_train_tuned = mean_squared_error(y_train, y_train_pred_tuned)
rmse_train_tuned = np.sqrt(mse_train_tuned)
r2_train_tuned = r2_score(y_train, y_train_pred_tuned)

print(f"Training RMSE (tuned model): {rmse_train_tuned}")
print(f"Training R-squared (tuned model): {r2_train_tuned}")

Starting GridSearchCV...
GridSearchCV complete.
Best hyperparameters found: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
Cross-validation RMSE (tuned model): 0.00530505000500045
Cross-validation R-squared (tuned model): 0.976620633335018
Training RMSE (tuned model): 0.002173214038989036
Training R-squared (tuned model): 0.9961023500957815


Training a Linear Regression model

In [46]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


linear_model = LinearRegression()


linear_model.fit(X_train_scaled, y_train)

print("Linear Regression model training complete.")

# Evaluate the Linear Regression model on the cross-validation set
y_cv_pred_linear = linear_model.predict(X_cv_scaled)
mse_cv_linear = mean_squared_error(y_cv, y_cv_pred_linear)
rmse_cv_linear = np.sqrt(mse_cv_linear)
r2_cv_linear = r2_score(y_cv, y_cv_pred_linear)

print(f"Linear Regression Cross-validation RMSE: {rmse_cv_linear}")
print(f"Linear Regression Cross-validation R-squared: {r2_cv_linear}")

# Evaluate the Linear Regression model on the training set
y_train_pred_linear = linear_model.predict(X_train_scaled)
mse_train_linear = mean_squared_error(y_train, y_train_pred_linear)
rmse_train_linear = np.sqrt(mse_train_linear)
r2_train_linear = r2_score(y_train, y_train_pred_linear)

print(f"Linear Regression Training RMSE: {rmse_train_linear}")
print(f"Linear Regression Training R-squared: {r2_train_linear}")

Linear Regression model training complete.
Linear Regression Cross-validation RMSE: 0.010488831412126086
Linear Regression Cross-validation R-squared: 0.9086080406121539
Linear Regression Training RMSE: 0.010423397179830365
Linear Regression Training R-squared: 0.9103363535510975


Making a predictive System

In [47]:
example_index = 126
example_features_scaled = X_test_scaled[example_index].reshape(1, -1)

actual_accident_risk = y_test.iloc[example_index]

predicted_accident_risk = model.predict(example_features_scaled)

print(f"Using example from index {example_index} of the test set:")
print(f"Actual Accident Risk: {actual_accident_risk}")
print(f"Predicted Accident Risk: {predicted_accident_risk[0]}")

difference = abs(actual_accident_risk - predicted_accident_risk[0])
print(f"Difference between Actual and Predicted: {difference}")

Using example from index 126 of the test set:
Actual Accident Risk: 0.17
Predicted Accident Risk: 0.16040000000000007
Difference between Actual and Predicted: 0.009599999999999942
