<a href="https://colab.research.google.com/github/KimathiMacey/roadaccidentseverity/blob/main/RoadAccidentSeverity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

In [None]:
# Loading the dataset
url = '/content/road_accidents.csv'
data = pd.read_csv(url)

# Displays the first few rows of the dataset
data.head()

Unnamed: 0,Accident_ID,Weather,Time_of_Day,Road_Type,Driver_Age,Vehicle_Type,Traffic_Volume,Severity
0,1,Clear,Morning,Urban,25,Car,50,2
1,2,Rainy,Evening,Highway,40,Truck,30,4
2,3,Foggy,Afternoon,Rural,22,Motorcycle,20,3
3,4,Clear,Night,Urban,30,Car,60,1
4,5,Rainy,Morning,Highway,55,SUV,40,5


In [None]:
# Check column names and data types
data.info()

# Check for missing values
data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Accident_ID     12 non-null     int64 
 1   Weather         12 non-null     object
 2   Time_of_Day     12 non-null     object
 3   Road_Type       12 non-null     object
 4   Driver_Age      12 non-null     int64 
 5   Vehicle_Type    12 non-null     object
 6   Traffic_Volume  12 non-null     int64 
 7   Severity        12 non-null     int64 
dtypes: int64(4), object(4)
memory usage: 896.0+ bytes


Unnamed: 0,0
Accident_ID,0
Weather,0
Time_of_Day,0
Road_Type,0
Driver_Age,0
Vehicle_Type,0
Traffic_Volume,0
Severity,0


In [None]:
# Check column names
print(data.columns)

Index(['Accident_ID', 'Weather', 'Time_of_Day', 'Road_Type', 'Driver_Age',
       'Vehicle_Type', 'Traffic_Volume', 'Severity'],
      dtype='object')


In [None]:
data = pd.read_csv('/content/road_accidents.csv', header=0)


In [None]:
# Check column names
print(data.columns)

Index(['Accident_ID', 'Weather', 'Time_of_Day', 'Road_Type', 'Driver_Age',
       'Vehicle_Type', 'Traffic_Volume', 'Severity'],
      dtype='object')


In [None]:
data = pd.get_dummies(data, columns=['Accident_ID', 'Weather', 'Time_of_Day', 'Road_Type', 'Driver_Age', 'Vehicle_Type', 'Traffic_Volume', 'Severity'], drop_first=True)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the data
data = pd.read_csv('/content/road_accidents.csv', header=0)

# Create dummy variables, but store them in a separate DataFrame
dummy_data = pd.get_dummies(data, columns=['Accident_ID', 'Weather', 'Time_of_Day', 'Road_Type', 'Driver_Age', 'Vehicle_Type', 'Traffic_Volume', 'Severity'], drop_first=True)


# Define dependent and independent variables using the new dummy variable names
X = dummy_data[[col for col in dummy_data.columns if col.startswith(('Accident_ID_', 'Weather_', 'Time_of_Day_', 'Road_Type_', 'Driver_Age_', 'Vehicle_Type_', 'Traffic_Volume_'))]]
y = dummy_data[[col for col in dummy_data.columns if col.startswith('Severity_')]]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Create a linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.19357896636746874


In [None]:
joblib.dump(model, 'road_accident_severity_model.pkl')

['road_accident_severity_model.pkl']

In [None]:
loaded_model = joblib.load('road_accident_severity_model.pkl')
predictions = loaded_model.predict(X_test)

# Example input as a dictionary, representing a single row
example_input_dict = {'Accident_ID': 4, 'Weather': 1, 'Time_of_Day': 0, 'Road_Type': 0, 'Driver_Age': 45, 'Vehicle_Type': 0, 'Traffic_Volume': 300}

# Create a DataFrame from the dictionary
example_input_df = pd.DataFrame([example_input_dict])

# One-hot encode the example input using the same columns as the training data
example_input_encoded = pd.get_dummies(example_input_df, columns=['Accident_ID', 'Weather', 'Time_of_Day', 'Road_Type', 'Driver_Age', 'Vehicle_Type', 'Traffic_Volume'], drop_first=True)

# Reindex the encoded example input to match the columns of X_train,
# filling missing columns with 0
example_input_final = example_input_encoded.reindex(columns=X_train.columns, fill_value=0)

# Predict severity
predicted_severity = model.predict(example_input_final)
print(f'Predicted Accident Severity: {predicted_severity[0]}')



Predicted Accident Severity: [0.29858604 0.12081867 0.24507292 0.07218341]


In [None]:
# Example input as a dictionary, representing a single row
example_input_dict = {'Accident_ID': 13, 'Weather': 1, 'Time_of_Day': 0, 'Road_Type': 0, 'Driver_Age': 22, 'Vehicle_Type': 1, 'Traffic_Volume': 120}

# Create a DataFrame from the dictionary
example_input_df = pd.DataFrame([example_input_dict])

# One-hot encode the example input using the same columns as the training data
example_input_encoded = pd.get_dummies(example_input_df, columns=['Accident_ID', 'Weather', 'Time_of_Day', 'Road_Type', 'Driver_Age', 'Vehicle_Type', 'Traffic_Volume'], drop_first=True)

# Reindex the encoded example input to match the columns of X_train,
# filling missing columns with 0
example_input_final = example_input_encoded.reindex(columns=X_train.columns, fill_value=0)

# Predict severity
predicted_severity = model.predict(example_input_final)
print(f'Predicted Accident Severity: {predicted_severity[0]}')

Predicted Accident Severity: [0.29858604 0.12081867 0.24507292 0.07218341]
