## Import library files

In [9]:
import pandas as pd
import re
import spacy
from nltk.corpus import stopwords

## Load data

In [10]:
# Read the labelled accident reports into a DataFrame.
df = pd.read_csv(filepath_or_buffer='supervised_accident_data.csv')

# Preview the first five rows, then report the (rows, columns) shape.
display(df.head(n=5))
print(df.shape)


Unnamed: 0,Accident Report,Primary Cause,Secondary Cause,Risk Factor
0,"On 11 Sep 2024, at 10:51 AM, a bicycle collide...",Drunk driving,,High
1,"On 11 May 2024, at 10:09 AM, a auto-rickshaw c...",Over-speeding,,High
2,"On 22 Oct 2023, at 10:16 PM, a Jeep collided w...",Other,,High
3,"On 12 Aug 2024, at 01:40 PM, a auto-rickshaw c...",Other,,High
4,"On 29 May 2024, at 10:54 PM, a two-wheeler col...",Distracted driving,,High


(20000, 4)


## Check for duplicates and missing values

In [11]:
# Missing values per column, followed by the number of fully duplicated rows.
print(df.isna().sum(), df.duplicated().sum(), sep='\n')

Accident Report        0
Primary Cause          0
Secondary Cause    14410
Risk Factor            0
dtype: int64
0


## NLP text cleaning

In [15]:
# Load the small English spaCy pipeline once; it is reused for every report.
nlp = spacy.load("en_core_web_sm")

# Build the stop-word lookup once. Calling stopwords.words('english') inside the
# token filter rebuilds the list and does a linear membership scan per token.
STOP_WORDS = frozenset(stopwords.words('english'))


# Preprocessing function
def preprocess_text(text):
    """Normalize one accident report for downstream NLP.

    Steps: lowercase the text, strip every character that is neither a word
    character nor whitespace, tokenize with spaCy, drop tokens that appear in
    the NLTK English stop-word list, and join the lowercased lemmas of the
    remaining tokens with single spaces.

    Args:
        text: Raw report text (str).

    Returns:
        str: Space-separated lemmas of the tokens that were kept.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    doc = nlp(text)
    words = [token.lemma_.lower() for token in doc if token.text.lower() not in STOP_WORDS]
    return " ".join(words)


In [16]:
# Clean every accident report and store the result in a new column
df['Processed Report'] = df['Accident Report'].map(preprocess_text)

# Preview the first rows, including the new column
print(df.head())


                                     Accident Report       Primary Cause  \
0  On 11 Sep 2024, at 10:51 AM, a bicycle collide...       Drunk driving   
1  On 11 May 2024, at 10:09 AM, a auto-rickshaw c...       Over-speeding   
2  On 22 Oct 2023, at 10:16 PM, a Jeep collided w...               Other   
3  On 12 Aug 2024, at 01:40 PM, a auto-rickshaw c...               Other   
4  On 29 May 2024, at 10:54 PM, a two-wheeler col...  Distracted driving   

  Secondary Cause Risk Factor  \
0             NaN        High   
1             NaN        High   
2             NaN        High   
3             NaN        High   
4             NaN        High   

                                    Processed Report  
0  11 sep 2024 1051 bicycle collide pedestrian nh...  
1  11 may 2024 1009 autorickshaw collide pedestri...  
2  22 oct 2023 1016 pm jeep collide pedestrian nh...  
3  12 aug 2024 0140 pm autorickshaw collide pedes...  
4  29 may 2024 1054 pm twowheeler collide pedestr...  


In [14]:
# Persist the cleaned data. This cell must run after the text-preprocessing
# cell; the guard stops an out-of-order run from silently exporting a file
# without the cleaned-text column.
if 'Processed Report' not in df.columns:
    raise KeyError(
        "'Processed Report' column is missing; run the text-preprocessing cell before exporting."
    )
df.to_csv('cleaned_processed_data.csv', index=False)
