<a href="https://colab.research.google.com/github/MARKMIRUKA/Accident-Prediction/blob/main/accident_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the dataset
# NOTE(review): the path lives under /content/drive, which presumably requires
# Google Drive to be mounted in Colab first; no mount call is visible in this
# notebook -- confirm before a fresh "Run all".
file_path = '/content/drive/MyDrive/Datasets/accident data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
print(data.head())

# Check for missing values and basic information about the dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
data.info()

# Display basic statistics for numerical columns
print(data.describe())


           Index Accident_Severity Accident Date   Latitude  \
0  200701BS64157           Serious      5/6/2019  51.506187   
1  200701BS65737           Serious      2/7/2019  51.495029   
2  200701BS66127           Serious    26-08-2019  51.517715   
3  200701BS66128           Serious    16-08-2019  51.495478   
4  200701BS66837            Slight      3/9/2019  51.488576   

              Light_Conditions           District Area  Longitude  \
0        Darkness - lights lit  Kensington and Chelsea  -0.209082   
1                     Daylight  Kensington and Chelsea  -0.173647   
2  Darkness - lighting unknown  Kensington and Chelsea  -0.210215   
3                     Daylight  Kensington and Chelsea  -0.202731   
4        Darkness - lights lit  Kensington and Chelsea  -0.192487   

   Number_of_Casualties  Number_of_Vehicles Road_Surface_Conditions  \
0                     1                   2                     Dry   
1                     1                   2             Wet or d

In [None]:
# Handling missing values
#
# Each column is filled by assigning the result of Series.fillna back to the
# frame. The previous form, data[col].fillna(value, inplace=True), operates on
# an intermediate Series; pandas warns that this stops working in pandas 3.0
# and recommends df[col] = df[col].method(value) instead.

# Numerical columns are filled with their median.
numeric_fill_columns = ['Latitude', 'Longitude']

# Categorical columns are filled with their mode (most frequent value).
categorical_fill_columns = ['Road_Surface_Conditions', 'Road_Type',
                            'Urban_or_Rural_Area', 'Weather_Conditions']

# Compute every fill value up front so all of them are derived from the
# not-yet-filled data.
fill_values = {col: data[col].median() for col in numeric_fill_columns}
fill_values.update({col: data[col].mode()[0] for col in categorical_fill_columns})

for col, fill_value in fill_values.items():
    data[col] = data[col].fillna(fill_value)

# Confirm that there are no missing values left
print(data.isnull().sum())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Latitude'].fillna(data['Latitude'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Longitude'].fillna(data['Longitude'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate

Index                      0
Accident_Severity          0
Accident Date              0
Latitude                   0
Light_Conditions           0
District Area              0
Longitude                  0
Number_of_Casualties       0
Number_of_Vehicles         0
Road_Surface_Conditions    0
Road_Type                  0
Urban_or_Rural_Area        0
Weather_Conditions         0
Vehicle_Type               0
dtype: int64


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encoding Accident Severity as target variable
severity_mapping = {'Slight': 0, 'Serious': 1, 'Fatal': 2}
severity_encoded = data['Accident_Severity'].map(severity_mapping)

# Series.map turns every value that is not a key of the mapping into NaN.
# Fail loudly instead of silently writing NaN into the target: this catches
# unexpected labels in the data and also re-running this cell after the column
# has already been converted to integers.
unmapped_mask = severity_encoded.isna()
if unmapped_mask.any():
    unexpected_labels = data.loc[unmapped_mask, 'Accident_Severity'].unique().tolist()
    raise ValueError(
        "Accident_Severity contains values outside severity_mapping "
        f"{sorted(severity_mapping)}: {unexpected_labels}. "
        "If this cell was already run, reload the dataset before re-running it."
    )
data['Accident_Severity'] = severity_encoded

# Encode other categorical features using Label Encoding
categorical_columns = ['Light_Conditions', 'District Area', 'Road_Surface_Conditions',
                       'Road_Type', 'Urban_or_Rural_Area', 'Weather_Conditions', 'Vehicle_Type']

# One fitted encoder per column, keyed by column name, so that every encoding
# can later be inspected (classes_) or reversed (inverse_transform). Re-fitting
# a single shared encoder would keep only the last column's classes.
label_encoders = {}

for col in categorical_columns:
    # After the loop, label_encoder refers to the encoder of the last column.
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder


In [None]:
# Dependent variable (target): the encoded accident severity column.
y = data['Accident_Severity']

# Independent variables (features): the columns listed below, selected from
# the frame in this order.
feature_columns = [
    'Latitude',
    'Longitude',
    'Number_of_Casualties',
    'Number_of_Vehicles',
    'Light_Conditions',
    'Road_Surface_Conditions',
    'Road_Type',
    'Urban_or_Rural_Area',
    'Weather_Conditions',
    'Vehicle_Type',
]
X = data[feature_columns]


In [None]:
from sklearn.model_selection import train_test_split

# Split the data: 80% training, 20% testing.
# stratify=y keeps the proportion of each severity class the same in the
# training and test partitions; the target is heavily imbalanced, so this
# makes sure the rare classes are represented in both. random_state fixes the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Build and train the logistic regression model.
# class_weight='balanced' weights each class inversely to its frequency in
# y_train. Without it, the recorded run of this notebook shows recall 0.00 for
# classes 1 and 2: the model predicts the majority class almost exclusively,
# so its accuracy is roughly the share of class 0 in the test set.
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model.
# With imbalanced classes, read the per-class recall/f1 and the macro average
# in the report; overall accuracy alone is dominated by the majority class.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)


Accuracy: 0.853930798571169
Classification Report:
               precision    recall  f1-score   support

           0       0.85      1.00      0.92    112849
           1       0.38      0.00      0.00     17536
           2       0.15      0.00      0.00      1751

    accuracy                           0.85    132136
   macro avg       0.46      0.33      0.31    132136
weighted avg       0.78      0.85      0.79    132136

