# Accident Risk Prediction with AutoML (PyCaret)

**Project Goal:** Predict accident risk (continuous value 0-1) based on road and traffic conditions.

**Dataset:** Simulated Roads Accident dataset from Kaggle

### 1. Setup and Data Loading

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pycaret.regression import compare_models, evaluate_model, finalize_model, predict_model, setup, tune_model

# Plot defaults used by every figure in this notebook
sns.set_style(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

# Read the train and test CSVs, using the "id" column as the row index
train_df, test_df = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

### 2. Initial Data Exploration

In [None]:
# Report (rows, columns) for both datasets, one per line
print(
    f"Training set shape: {train_df.shape}",
    f"Test set shape: {test_df.shape}",
    sep="\n",
)

# Preview the first five training rows (rendered as the cell output)
train_df.head(n=5)

In [None]:
# Print a concise schema overview of the training set: column names, dtypes,
# non-null counts and memory usage (handy for spotting missing values).
train_df.info()

In [None]:
# Summary statistics for every column; include="all" also covers non-numeric
# columns (count/unique/top/freq), leaving NaN where a statistic does not apply.
train_df.describe(include="all")

### 3. Separating Features and Target

In [None]:
# Target: the accident risk column we want to predict
y = train_df["accident_risk"]

# Features: every remaining column of the training frame
X = train_df.drop(columns=["accident_risk"])


### 4. PyCaret Setup and Training

In [None]:
# Initialise the PyCaret experiment.
# 2-fold k-fold CV and a single worker (n_jobs=1) keep the run feasible on
# weaker hardware; session_id pins the random seed for reproducibility.
setup(
    data=X,
    target=y,
    session_id=42,
    fold_strategy="kfold",
    fold=2,
    n_jobs=1,
    verbose=False,
)

# Cross-validate the candidate regressors and keep the three best.
# Because n_select > 1, best_model is a list of models (best first).
best_model = compare_models(
    n_select=3,
    fold=2,
    verbose=True,
)

In [None]:
# Show PyCaret's evaluation plots for the top-ranked candidate; best_model is a
# list because compare_models was called with n_select=3, hence the [0].
evaluate_model(best_model[0])

### 6. Model Tuning and Finalization

In [None]:
# Tune the best model
tuned_model = tune_model(best_model[0], optimize='RMSE', n_iter=10, verbose=True)

# Evaluate tuned model
evaluate_model(tuned_model)

In [None]:
# Finalize model
# Refit the tuned pipeline on the entire dataset passed to setup (including the
# hold-out split) so the model used for submission has seen all labelled rows.
final_model = finalize_model(tuned_model)

### 7. Create Submission File


In [None]:
# Make predictions on Kaggle test set
kaggle_predictions = predict_model(final_model, data=test_df)

# Create submission file - reset index to avoid duplication
submission_df = pd.DataFrame({
    'id': test_df.index,
    'accident_risk': kaggle_predictions['prediction_label'].values  # Use .values to get only values
})

# Ensure predictions are in valid range
submission_df['accident_risk'] = submission_df['accident_risk'].clip(0, 1)

# Save to CSV
submission_df.to_csv('submission_pycaret.csv', index=False)

print("âœ… Submission file created: submission_pycaret.csv")
print(f"ðŸ“Š Predictions range: [{submission_df['accident_risk'].min():.4f}, {submission_df['accident_risk'].max():.4f}]")

# Show sample
print("\nSample predictions:")
display(submission_df.head(10))