## Import library files

In [39]:
import pandas as pd
import re
import spacy
from nltk.corpus import stopwords
import os
import joblib

## Load data

In [40]:
# Locate the raw dataset relative to the 'notebooks' folder: ../data/raw/
raw_dir = os.path.join("..", "data", "raw")
data_path = os.path.join(raw_dir, "supervised_accident_data.csv")

# Read the CSV into a DataFrame and preview it
df = pd.read_csv(data_path)
display(df.head())  # head() with no argument shows the first 5 rows
print(df.shape)     # (rows, columns)

Unnamed: 0,Accident Report,Primary Cause,Secondary Cause,Risk Factor
0,"On 16 May 2024, at 06:29 AM, a auto-rickshaw c...",Other,No Violation,Medium
1,"On 15 Sep 2023, at 11:06 AM, a car collided wi...",Over-speeding,Seatbelt Violation,High
2,"On 29 Oct 2023, at 01:18 PM, a Jeep collided w...",Other,Seatbelt Violation,High
3,"On 27 Dec 2024, at 08:43 AM, a two-wheeler col...",Other,Helmet Violation,Medium
4,"On 28 Apr 2024, at 09:09 PM, a bus collided wi...",Drunk driving,No Violation,High


(20000, 4)


## Check for duplicates and missing values

In [41]:
# Number of missing values in each column
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Number of rows that exactly repeat an earlier row
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

Accident Report    0
Primary Cause      0
Secondary Cause    0
Risk Factor        0
dtype: int64
0


# NLP Text cleaning

In [42]:
nlp = spacy.load("en_core_web_sm")

# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the token loop rebuilds the whole list for every single token and then scans
# it linearly; a frozenset gives the same membership answer in constant time.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# Preprocessing function
def preprocess_text(text):
    """Normalise one accident report for vectorization.

    Steps, in order: lowercase the text, delete every character that is
    neither a word character nor whitespace, tokenize with the spaCy
    pipeline ``nlp``, drop tokens whose lowercased text is an English stop
    word, and replace each remaining token by its lowercased lemma.

    Parameters
    ----------
    text : str
        Raw report text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    doc = nlp(text)
    words = [
        token.lemma_.lower()
        for token in doc
        if token.text.lower() not in ENGLISH_STOPWORDS
    ]
    return " ".join(words)


In [43]:
# Clean every accident report and store the result in a new column
processed_reports = df['Accident Report'].apply(preprocess_text)
df['Processed Report'] = processed_reports

# Preview the frame with the added column
display(df.head())


Unnamed: 0,Accident Report,Primary Cause,Secondary Cause,Risk Factor,Processed Report
0,"On 16 May 2024, at 06:29 AM, a auto-rickshaw c...",Other,No Violation,Medium,16 may 2024 0629 autorickshaw collide twowheel...
1,"On 15 Sep 2023, at 11:06 AM, a car collided wi...",Over-speeding,Seatbelt Violation,High,15 sep 2023 1106 car collide bus nh544 near tr...
2,"On 29 Oct 2023, at 01:18 PM, a Jeep collided w...",Other,Seatbelt Violation,High,29 oct 2023 0118 pm jeep collide bus nh183 nea...
3,"On 27 Dec 2024, at 08:43 AM, a two-wheeler col...",Other,Helmet Violation,Medium,27 dec 2024 0843 twowheeler collide twowheeler...
4,"On 28 Apr 2024, at 09:09 PM, a bus collided wi...",Drunk driving,No Violation,High,28 apr 2024 0909 pm bus collide autorickshaw n...


In [None]:
# Persist the cleaned frame, then reload it so the following cells work from
# the file on disk.
cleaned_data_path = os.path.join("..", "data", "raw", "cleaned_processed_data.csv")
df.to_csv(cleaned_data_path, index=False)
df = pd.read_csv(cleaned_data_path)

# A report that is empty after cleaning is written as an empty CSV field,
# which read_csv parses back as NaN. Restore the empty string so the text
# column contains only str values for the vectorizer.
df['Processed Report'] = df['Processed Report'].fillna('')
display(df.head())

Unnamed: 0,Accident Report,Primary Cause,Secondary Cause,Risk Factor,Processed Report
0,"On 16 May 2024, at 06:29 AM, a auto-rickshaw c...",Other,No Violation,Medium,16 may 2024 0629 autorickshaw collide twowheel...
1,"On 15 Sep 2023, at 11:06 AM, a car collided wi...",Over-speeding,Seatbelt Violation,High,15 sep 2023 1106 car collide bus nh544 near tr...
2,"On 29 Oct 2023, at 01:18 PM, a Jeep collided w...",Other,Seatbelt Violation,High,29 oct 2023 0118 pm jeep collide bus nh183 nea...
3,"On 27 Dec 2024, at 08:43 AM, a two-wheeler col...",Other,Helmet Violation,Medium,27 dec 2024 0843 twowheeler collide twowheeler...
4,"On 28 Apr 2024, at 09:09 PM, a bus collided wi...",Drunk driving,No Violation,High,28 apr 2024 0909 pm bus collide autorickshaw n...


# Convert Text into Numeric Features (Vectors)
#### TF-IDF vectorization

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# TF-IDF over unigrams and bigrams, capped at 5000 features for efficiency
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# Learn the vocabulary from the processed reports and build the feature matrix
processed_reports = df['Processed Report']
X = vectorizer.fit_transform(processed_reports)

# Persist the fitted vectorizer
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

# Label encoding targets

In [46]:
# Encode target labels
from sklearn.preprocessing import LabelEncoder


def _fit_label_encoder(labels):
    """Fit a fresh LabelEncoder on `labels`; return (encoder, encoded labels)."""
    encoder = LabelEncoder()
    return encoder, encoder.fit_transform(labels)


# One independent encoder per target column
label_encoder_primary, y_primary = _fit_label_encoder(df['Primary Cause'])
label_encoder_secondary, y_secondary = _fit_label_encoder(df['Secondary Cause'])
label_encoder_risk, y_risk = _fit_label_encoder(df['Risk Factor'])

# Save encoders
for encoder, filename in (
    (label_encoder_primary, 'label_encoder_primary.pkl'),
    (label_encoder_secondary, 'label_encoder_secondary.pkl'),
    (label_encoder_risk, 'label_encoder_risk.pkl'),
):
    joblib.dump(encoder, filename)

# Display the classes of each encoder; a label's position in classes_ is its
# integer code
print('Encoded primary cause classes are:', label_encoder_primary.classes_)
print('Encoded secondary cause classes are:', label_encoder_secondary.classes_)
print('Encoded risk factors are:', label_encoder_risk.classes_)

Encoded primary cause classes are: ['Distracted driving' 'Drunk driving' 'Other' 'Over-speeding'
 'Wrong side driving']
Encoded secondary cause classes are: ['Helmet Violation' 'No Violation' 'Seatbelt Violation']
Encoded risk factors are: ['High' 'Medium']


# Save 'X' and the encoded targets ('y_primary', 'y_secondary', 'y_risk')

In [47]:
import numpy as np

# Components of the sparse matrix X that go into the compressed archive
sparse_parts = {
    "data": X.data,
    "indices": X.indices,
    "indptr": X.indptr,
    "shape": X.shape,
}
np.savez_compressed("X_sparse.npz", **sparse_parts)

print("Saved X_sparse.npz successfully!")


Saved X_sparse.npz successfully!


In [48]:
# Inspect the encoded primary target: its container type, then its number of
# dimensions
for detail in (type(y_primary), y_primary.ndim):
    print(detail)

<class 'numpy.ndarray'>
1


In [49]:
# Write each encoded target vector to its own CSV file (no index column)
encoded_targets = {
    "y_primary.csv": y_primary,
    "y_secondary.csv": y_secondary,
    "y_risk.csv": y_risk,
}
for csv_name, encoded in encoded_targets.items():
    pd.Series(encoded).to_csv(csv_name, index=False)