# The code provides both time-series forecasts of future accident rates and severity levels, plus classification models to predict accident severity based on various features.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from imblearn.over_sampling import SMOTE
from tqdm import tqdm

In [None]:
# Load the raw accident records from the local CSV export.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
csv_path = r'F:\Education\third year/2 term\ML\total section\tasks\Road Accident Data\dataset\Road Accident Data.csv'
df = pd.read_csv(csv_path)
# Preview the first rows.
df.head()

In [None]:
# Preview the last rows.
df.tail()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics (numeric columns by default).
df.describe()

In [None]:
# Column labels.
df.columns

In [None]:
# (rows, columns)
df.shape

# EDA (Data Cleaning)


In [None]:
# Remove the two columns that the rest of the notebook never uses.
print("\nCleaning data...")
columns_to_drop = ['Accident_Index', 'Carriageway_Hazards']
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Drop duplicated rows in place (the first occurrence is kept by default).
df.drop_duplicates(inplace=True)

In [None]:
# Missing-value count per column, listed in reverse alphabetical column order.
df.isnull().sum().sort_index(ascending=False)

# Replacing missing values with most frequent data

In [None]:
from sklearn.impute import SimpleImputer

# Fill the gaps in each listed column with that column's most frequent value.
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
columns_to_impute = ['Road_Surface_Conditions', 'Weather_Conditions', 'Time', 'Road_Type']
for column_name in columns_to_impute:
    # The imputer is refit on every column, so each one gets its own mode.
    filled = imputer.fit_transform(df[[column_name]])
    df[column_name] = filled.ravel()


In [None]:
# Re-check the missing-value counts after imputation.
df.isnull().sum().sort_index(ascending=False)

In [None]:
# Column labels after cleaning.
df.columns

# Data Visualisation

In [None]:
# Accident count per month, bars ordered from the busiest month down.
month_order = df['Month'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Month', order=month_order, ax=ax)
ax.set_title('Number of Accidents per Month')
ax.set_xlabel('Month')
ax.set_ylabel('Accident Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Accident count per light condition, split into one bar per severity level.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Light_Conditions', hue='Accident_Severity', ax=ax)
ax.set_title('Accident Severity by Light Conditions')
ax.set_xlabel('Light Conditions')
ax.set_ylabel('Accident Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()


In [None]:

# Pairwise correlations between the int64/float64 columns, drawn as an annotated heatmap.
numeric_cols = df.select_dtypes(include=['int64', 'float64'])
corr_matrix = numeric_cols.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Numerical Features')
fig.tight_layout()
plt.show()


In [None]:
# Speed-limit distribution for each severity level.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Accident_Severity', y='Speed_limit', ax=ax)
ax.set_title('Speed Limit Distribution by Accident Severity')
ax.set_xlabel('Accident Severity')
ax.set_ylabel('Speed Limit')
fig.tight_layout()
plt.show()


In [None]:
# Share of accidents per road-surface condition.
road_surface_counts = df['Road_Surface_Conditions'].value_counts()
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    road_surface_counts,
    labels=road_surface_counts.index,
    autopct='%1.1f%%',
    startangle=140,
)
ax.set_title('Distribution of Road Surface Conditions')
fig.tight_layout()
plt.show()


In [None]:
# Feature engineering
print("\nCreating new features...")
# Hour of day parsed from the time-of-day strings.
df['Hour'] = pd.to_datetime(df['Time']).dt.hour
# Night = before 06:00 or after the 18:00 hour. The dtype is pinned to int64 so the
# flag is picked up by the later select_dtypes(['int64', 'float64']) calls on every platform.
df['Is_Night'] = ((df['Hour'] < 6) | (df['Hour'] > 18)).astype('int64')
# Flag rain/snow/fog by case-insensitive substring rather than exact equality:
# an exact match against 'Rain'/'Snow'/'Fog' leaves the feature at 0 whenever the
# labels are descriptive phrases.
# NOTE(review): presumably the labels look like 'Raining no high winds' / 'Fog or mist' —
# confirm against df['Weather_Conditions'].unique().
bad_weather_pattern = 'rain|snow|fog'
df['Bad_Weather'] = (
    df['Weather_Conditions']
    .astype(str)
    .str.contains(bad_weather_pattern, case=False, regex=True)
    .astype('int64')
)

In [None]:
# List the columns now present (including the engineered ones).
print("Columns in DataFrame:", df.columns.tolist())

In [None]:
# Convert accident date to datetime
# NOTE(review): no explicit format/dayfirst is passed, so pandas infers the date layout —
# confirm the inferred day/month order matches the CSV.
print("\nProcessing dates...")
df['Accident Date'] = pd.to_datetime(df['Accident Date'])

In [None]:
# Time Series Analysis
print("\nPerforming time series analysis...")
# Number of accidents in each calendar month.
monthly_accidents = df.resample('M', on='Accident Date').size()
# Month x severity table of accident counts; combinations that never occur come out as NaN.
month_bins = pd.Grouper(key='Accident Date', freq='M')
severity_counts = df.groupby([month_bins, 'Accident_Severity']).size()
severity_trends = severity_counts.unstack()

In [None]:
# ARIMA Modeling for accident prediction
print("\nBuilding ARIMA model...")
# (p, d, q) for the non-seasonal part and (P, D, Q, s) for the seasonal part.
arima_order = (5, 1, 0)
arima_seasonal_order = (1, 1, 1, 12)
model = ARIMA(monthly_accidents, order=arima_order, seasonal_order=arima_seasonal_order)
model_fit = model.fit()

In [None]:
# 5-month forecast
forecast_steps = 5
forecast = model_fit.get_forecast(steps=forecast_steps)
# Build one extra date starting at the last observed month and discard it,
# which leaves only the future month-end dates.
last_observed = monthly_accidents.index[-1]
forecast_index = pd.date_range(start=last_observed, periods=forecast_steps + 1, freq='M')[1:]
forecast_values = forecast.predicted_mean
conf_int = forecast.conf_int()

In [None]:
# Severity level forecasting
# One ARIMA(3,1,1) model per severity column; months with no accidents of that
# severity are treated as 0.
severity_forecasts = {}
for severity in severity_trends.columns:
    model_sev = ARIMA(severity_trends[severity].fillna(0), order=(3,1,1))
    model_fit_sev = model_sev.fit()
    predicted = model_fit_sev.get_forecast(steps=forecast_steps).predicted_mean
    # Keep the raw values only: the frame below is indexed positionally with
    # forecast_index, so a forecast that comes back with a different index
    # cannot be silently re-aligned into NaN.
    severity_forecasts[severity] = np.asarray(predicted)

# Same horizon (forecast_steps) and dates as the total-accident forecast.
forecast_severity = pd.DataFrame(severity_forecasts, index=forecast_index)

In [None]:
# Visualization
print("\nGenerating visualizations...")
fig, (ax_total, ax_severity) = plt.subplots(2, 1, figsize=(15, 10))

# Top panel: observed monthly totals, the forecast, and its confidence band.
lower_bound = conf_int.iloc[:, 0]
upper_bound = conf_int.iloc[:, 1]
ax_total.plot(monthly_accidents.index, monthly_accidents, label='Historical Data')
ax_total.plot(forecast_index, forecast_values, color='red', label='Forecast')
ax_total.fill_between(forecast_index, lower_bound, upper_bound, color='pink', alpha=0.3)
ax_total.set_title('5-Month Accident Forecast')
ax_total.set_ylabel('Number of Accidents')
ax_total.legend()
ax_total.grid(True)

# Bottom panel: stacked area of the per-severity forecasts.
forecast_severity.plot(kind='area', stacked=True, ax=ax_severity)
ax_severity.set_title('Predicted Accident Severity Distribution')
ax_severity.set_ylabel('Number of Accidents')
ax_severity.set_xlabel('Date')
ax_severity.grid(True)

fig.tight_layout()
plt.show()


In [None]:
# Print forecast summary
print("\nForecast Summary:")
print("1. Total Accident Forecast:")
# Half the width of the confidence interval, reported as a +/- margin.
margins = (conf_int.iloc[:, 1] - conf_int.iloc[:, 0]) / 2
for i, (date, value, margin) in enumerate(zip(forecast_index, forecast_values, margins), start=1):
    month_label = date.strftime('%Y-%m')
    print(f"   Month {i}: {month_label} - {value:.0f} ±{margin:.0f} accidents")

print("\n2. Severity Distribution Forecast:")
rounded_severity = forecast_severity.round().astype(int)
print(rounded_severity)

# Determining which columns to encode

In [None]:
# Column dtypes before encoding, to see which columns are still non-numeric.
df.info()

In [None]:
# Show the distinct values (and how many there are) for every candidate column.
columns_to_inspect = [
    'Month', 'Day_of_Week', 'Junction_Control', 'Junction_Detail',
    'Accident_Severity', 'Light_Conditions',
    'Local_Authority_(District)',
    'Police_Force', 'Road_Surface_Conditions',
    'Road_Type', 'Time', 'Urban_or_Rural_Area',
    'Weather_Conditions', 'Vehicle_Type',
]
for col in columns_to_inspect:
    column_values = df[col]
    print(f"Unique values in {col}: {column_values.unique()}")
    print(f"number of unique values in {col}: {column_values.nunique()}")
    print("\n")

# Encoding

In [None]:
# Prepare data for classification
print("\nPreparing data for classification models...")
LE = LabelEncoder()
categorical_cols = [
    'Month', 'Day_of_Week', 'Junction_Control', 'Junction_Detail',
    'Light_Conditions', 'Local_Authority_(District)', 'Police_Force',
    'Road_Surface_Conditions', 'Road_Type', 'Urban_or_Rural_Area',
    'Weather_Conditions', 'Vehicle_Type',
]

# The single encoder is refit on every column, so after the loop it only
# remembers the mapping of the last column.
for col in categorical_cols:
    encoded_values = LE.fit_transform(df[col])
    df[col] = encoded_values

In [None]:
# Preview the frame after label encoding.
df.head()

In [None]:
# Column dtypes after label encoding.
df.info()

In [None]:
# putting target in the end

In [None]:
# Take the target column out and append it again so it becomes the last column.
Accident_Severity = df.pop('Accident_Severity')
df['Accident_Severity'] = Accident_Severity
df.head()

# Splitting

In [None]:
# Split data
# Features: everything except the target and the raw date column.
# NOTE(review): X and y are taken here, before the outlier-removal cell
# reassigns df, so that later filtering does not change X or y.
non_feature_columns = ['Accident_Severity', 'Accident Date']
X = df.drop(columns=non_feature_columns)
y = df['Accident_Severity']

In [None]:
# Shapes of the feature matrix and the target.
X.shape, y.shape

In [None]:
# Preview the features.
X.head()

In [None]:
# Preview the target.
y.head()

# Outlier

In [None]:
# Names of the int64/float64 columns of df (other numeric dtypes are not included).
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
print("Numeric columns:", numeric_cols)

In [None]:
def outliers(df, column):
    """Return the index labels of the IQR outliers in ``df[column]``.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``.

    Args:
        df: DataFrame holding the column.
        column: Label of the column to inspect.

    Returns:
        A pandas Index with the labels of the outlying rows. It is empty for
        non-numeric (and boolean) columns.
    """
    values = df[column]
    # Accept every real numeric dtype (int32, float32, nullable ints, ...),
    # not only int64/float64. Booleans stay excluded, as before.
    is_numeric = (
        pd.api.types.is_numeric_dtype(values)
        and not pd.api.types.is_bool_dtype(values)
    )
    if not is_numeric:
        return pd.Index([])  # Return empty index for non-numeric columns

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # NaN compares False on both sides, so missing values are never flagged.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier].index

# Now run only on numeric columns
# Report how many IQR outliers each int64/float64 column contains.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    outlier_idx = outliers(df, col)
    outlier_total = len(outlier_idx)
    print(f"{col} => outlier_idx : {outlier_total}")


In [None]:
# Count the IQR outliers per numeric column from the current data instead of
# pasting in numbers from an earlier run, so the chart cannot go stale.
numeric_feature_names = df.select_dtypes(include=['int64', 'float64']).columns
outlier_counts = {}
for feature_name in numeric_feature_names:
    n_outliers = len(outliers(df, feature_name))
    if n_outliers > 0:  # only chart the features that actually have outliers
        outlier_counts[feature_name] = n_outliers

outlier_df = pd.DataFrame(list(outlier_counts.items()), columns=['Feature', 'Outlier Count'])
outlier_df = outlier_df.sort_values(by='Outlier Count', ascending=False)

if outlier_df.empty:
    # Nothing to draw; say so instead of showing an empty chart.
    print("No IQR outliers found in the numeric columns.")
else:
    plt.figure(figsize=(12,6))
    sns.barplot(data=outlier_df, x='Feature', y='Outlier Count', palette='Reds_r')
    plt.title('Number of Outliers per Feature')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:

# Box plots of the integer category codes of Road_Type and Vehicle_Type, side by side.
df_numeric = df.copy()
for coded_column in ('Vehicle_Type', 'Road_Type'):
    df_numeric[coded_column] = df_numeric[coded_column].astype('category').cat.codes

fig, (ax_road, ax_vehicle) = plt.subplots(1, 2, figsize=(12, 5))
sns.boxplot(y=df_numeric['Road_Type'], color='skyblue', ax=ax_road)
ax_road.set_title('Boxplot of Road_Type')
sns.boxplot(y=df_numeric['Vehicle_Type'], color='salmon', ax=ax_vehicle)
ax_vehicle.set_title('Boxplot of Vehicle_Type')
fig.tight_layout()
plt.show()


In [None]:
def remove_outliers(df, column):
    """Remove outliers from a column, handling both numeric and categorical data.

    Numeric columns keep only the rows inside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. Any other column keeps only the rows
    whose category makes up at least 1% of the values.

    Args:
        df: DataFrame to filter.
        column: Label of the column that decides which rows are kept.

    Returns:
        A filtered DataFrame; the input is not modified.
    """
    values = df[column]
    # Accept every real numeric dtype (int32, float32, nullable ints, ...),
    # not only int64/float64. Booleans go through the category branch, as before.
    is_numeric = (
        pd.api.types.is_numeric_dtype(values)
        and not pd.api.types.is_bool_dtype(values)
    )
    if is_numeric:
        # For numeric columns: use IQR method
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        # NaN fails both comparisons, so rows with a missing value are dropped too.
        return df[(values >= lower_bound) & (values <= upper_bound)]

    # For categorical columns: remove rare categories (appearing less than 1% of time)
    category_share = values.value_counts(normalize=True)
    common_categories = category_share[category_share >= 0.01].index
    return df[values.isin(common_categories)]

# Columns to process (including both numeric and categorical)
cols_with_outliers = [
    'Junction_Control', 'Accident_Severity',
    'Light_Conditions', 'Road_Type', 'Weather_Conditions',
    'Vehicle_Type', 'Road_Surface_Conditions',
]

# Remove outliers from each column
# The filters are cumulative: each column is processed on what the previous ones left.
# NOTE(review): only df is reassigned here; X and y were extracted earlier and are not filtered.
for col in cols_with_outliers:
    print(f"\nProcessing column: {col}")
    shape_before = df.shape
    print(f"Original shape: {shape_before}")
    df = remove_outliers(df, col)
    shape_after = df.shape
    print(f"New shape: {shape_after}")

In [None]:
# Re-count the IQR outliers per column on the filtered frame.
# Columns that are not numeric simply report 0.
numeric_cols = [
    'Month', 'Day_of_Week', 'Junction_Control', 'Junction_Detail',
    'Accident_Severity', 'Light_Conditions',
    'Local_Authority_(District)',
    'Police_Force', 'Road_Surface_Conditions',
    'Road_Type', 'Time', 'Urban_or_Rural_Area',
    'Weather_Conditions', 'Vehicle_Type',
]
for col in numeric_cols:
    outlier_idx = outliers(df, col)
    remaining = len(outlier_idx)
    print(f"{col} => outlier_idx : {remaining}")

# Making Data balanced

# Scaling

In [None]:
# 1. First ensure all features are numeric
print("Checking data types before normalization:")
print(X.dtypes)  # If X is numpy array, check df.dtypes before splitting

# 2. Convert time strings to numerical features (if present)
#    X was split off from df earlier, so the conversion must be applied to X
#    itself to reach the scaler. df is converted as well to keep the two frames
#    in step.
for frame in (df, X):
    if 'Time' in frame.columns:
        parsed_time = pd.to_datetime(frame['Time'])
        frame['Hour'] = parsed_time.dt.hour
        frame['Minute'] = parsed_time.dt.minute
        frame.drop('Time', axis=1, inplace=True)

# 3. Ensure all categorical columns are label encoded
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
if len(categorical_cols) > 0:
    print("\nLabel encoding categorical columns:", list(categorical_cols))
    le = LabelEncoder()
    for col in categorical_cols:
        X[col] = le.fit_transform(X[col])

# 4. Now normalize
# X becomes a NumPy array from here on (column labels are lost).
# NOTE(review): the scaler is fit on all rows, before any train/test split, so
# the test rows influence the min/max used for scaling.
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

# Imbalanced data can cause overfitting


In [None]:
# Bar chart of the number of samples per severity class.
y.value_counts().plot.bar(title="Accident_Severity")

In [None]:
# Absolute and relative class frequencies of the target.
print("Accident_Severity        : \n",y.value_counts())
print("\n")
print(y.value_counts(normalize=True))

# Data balancing

In [None]:
# Handle class imbalance
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)

In [None]:
X.shape

In [None]:
y.shape

In [None]:
print("Accident_Severity        : \n",y.value_counts())
print("\n")
print(y.value_counts(normalize=True))

In [None]:
y.value_counts().plot.bar(title="Accident_Severity")

# Data Modeling

In [None]:
X_train, X_test, y_train, y_test =train_test_split(X,y, test_size= 0.20, random_state=100, stratify=y)

In [None]:
print("X_train shape: ", X_train.shape)
print("X_test shape: ", X_test.shape)   
print("y_train shape: ", y_train.shape)
print("y_test shape: ", y_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

# Registry of the classifiers to train, keyed by display name. All use their
# default hyperparameters; the commented-out entries are currently skipped.
Models = {
    'LogisticRegression': LogisticRegression(),
    'GaussianNB': GaussianNB(),
    #'KNeighborsClassifier': KNeighborsClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    #'SVC': SVC(),
    #'DecisionTreeClassifier': DecisionTreeClassifier()
}

In [None]:
# Fit every registered model and collect its test-set scores.
# `model` and `y_pred` keep the values of the last iteration after the loop.
ModelName = []
ModelAccuracy = []
score_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

for nameModel, model in tqdm(Models.items()):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # weighted because of multi classification
    scores = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, average='weighted'),
        recall_score(y_test, y_pred, average='weighted'),
        f1_score(y_test, y_pred, average='weighted'),
    ]
    ModelName.append(nameModel)
    ModelAccuracy.append(scores)

# One row per model, one column per metric.
Model_accuracy = pd.DataFrame(ModelAccuracy, index=ModelName, columns=score_columns)
Model_accuracy

In [None]:
from sklearn.metrics import confusion_matrix
# y_pred holds what the training loop assigned last, i.e. the predictions of
# the final model in Models.
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

In [None]:
# Heatmap of the confusion matrix for the predictions currently held in y_pred.
# The tick labels are taken from the data itself, in the sorted order that
# confusion_matrix uses for its rows and columns. Fixed names would be attached
# to the wrong classes (or mismatch in count) whenever the class set differs.
labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
cm = confusion_matrix(y_test, y_pred, labels=labels)
# A single figure: creating it twice leaves an empty extra figure behind.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()

In [None]:
# Grouped bar chart: one group per model, one bar per metric.
ax = Model_accuracy.plot(kind='bar', figsize=(10, 6))

ax.set_xlabel('Model')
ax.set_ylabel('Scores')
ax.set_title('Model Accuracy Scores')
plt.xticks(rotation=45)
ax.legend(loc='upper right')
plt.tight_layout()
plt.show()

In [None]:
import pickle

# Persist the estimator currently bound to `model`. After the training loop
# this is the last entry of Models, not necessarily the best-scoring one.
# NOTE(review): only the estimator is saved; the fitted scaler and encoders are
# not, so new data cannot be preprocessed from this file alone.
# The context manager closes the file handle even if pickling fails.
with open('road_accident_model.sav', 'wb') as model_file:
    pickle.dump(model, model_file)
