## Import library files

In [3]:
import pandas as pd
import re
import spacy
from nltk.corpus import stopwords

## Load data

In [4]:
# Read the labelled accident reports into a DataFrame
raw_data_file = 'supervised_accident_data.csv'
df = pd.read_csv(raw_data_file)

# Preview the first rows, then report the frame's (rows, columns)
display(df.head())
print(df.shape)


Unnamed: 0,Accident Report,Primary Cause,Secondary Cause,Risk Factor
0,"On 11 Sep 2024, at 10:51 AM, a bicycle collide...",Drunk driving,,High
1,"On 11 May 2024, at 10:09 AM, a auto-rickshaw c...",Over-speeding,,High
2,"On 22 Oct 2023, at 10:16 PM, a Jeep collided w...",Other,,High
3,"On 12 Aug 2024, at 01:40 PM, a auto-rickshaw c...",Other,,High
4,"On 29 May 2024, at 10:54 PM, a two-wheeler col...",Distracted driving,,High


(20000, 4)


## Check for duplicates and missing values

In [5]:
# Number of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# Number of rows that exactly repeat an earlier row
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

Accident Report        0
Primary Cause          0
Secondary Cause    14410
Risk Factor            0
dtype: int64
0


# NLP Text cleaning

In [6]:
nlp = spacy.load("en_core_web_sm")

# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the token loop rebuilt the word list and scanned it linearly for every
# single token; a frozenset gives the same membership answer in O(1).
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# Preprocessing function
def preprocess_text(text):
    """Normalise one accident report for vectorisation.

    The text is lowercased, every character that is neither a word character
    nor whitespace is deleted, the result is run through the spaCy pipeline,
    tokens found in the NLTK English stop-word list are dropped, and the
    lowercased lemmas of the remaining tokens are joined with single spaces.

    Parameters
    ----------
    text : str
        Raw report text.

    Returns
    -------
    str
        Space-separated lemmas of the non-stop-word tokens.
    """
    text = text.lower()
    # Punctuation is deleted, not replaced by a space, so "10:51" becomes
    # "1051" and "auto-rickshaw" becomes "autorickshaw".
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    doc = nlp(text)
    words = [
        token.lemma_.lower()
        for token in doc
        if token.text.lower() not in ENGLISH_STOPWORDS
    ]
    return " ".join(words)


In [18]:
# Clean every accident report and keep the result alongside the raw text
processed_reports = df['Accident Report'].apply(preprocess_text)
df['Processed Report'] = processed_reports

# Preview the frame, now including the 'Processed Report' column
display(df.head())


Unnamed: 0,Accident Report,Primary Cause,Secondary Cause,Risk Factor,Processed Report
0,"On 11 Sep 2024, at 10:51 AM, a bicycle collide...",Drunk driving,,High,11 sep 2024 1051 bicycle collide pedestrian nh...
1,"On 11 May 2024, at 10:09 AM, a auto-rickshaw c...",Over-speeding,,High,11 may 2024 1009 autorickshaw collide pedestri...
2,"On 22 Oct 2023, at 10:16 PM, a Jeep collided w...",Other,,High,22 oct 2023 1016 pm jeep collide pedestrian nh...
3,"On 12 Aug 2024, at 01:40 PM, a auto-rickshaw c...",Other,,High,12 aug 2024 0140 pm autorickshaw collide pedes...
4,"On 29 May 2024, at 10:54 PM, a two-wheeler col...",Distracted driving,,High,29 may 2024 1054 pm twowheeler collide pedestr...


In [None]:
# Persist the frame, including the 'Processed Report' column, so the slow
# preprocessing step does not have to be repeated.
df.to_csv('cleaned_processed_data.csv', index=False)

# Reload the file that was just written. Reading the raw
# 'supervised_accident_data.csv' here would drop 'Processed Report', and the
# TF-IDF step that indexes df['Processed Report'] would raise KeyError.
df = pd.read_csv('cleaned_processed_data.csv')

# An empty processed report is written as an empty CSV field and read back
# as NaN; restore it to '' so the column contains only strings.
df['Processed Report'] = df['Processed Report'].fillna('')
display(df.head())

# Convert Text into Numeric Features (Vectors)
#### TF-IDF vectorization

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Features: TF-IDF over unigrams and bigrams, vocabulary capped at 5000 terms.
# NOTE(review): the vectorizer is fitted on every row of df; if a held-out
# split is made downstream, the vocabulary and IDF weights have already seen
# those rows.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
report_texts = df['Processed Report']
X = vectorizer.fit_transform(report_texts)

# Target: integer-encode the 'Primary Cause' labels
label_encoder = LabelEncoder()
primary_causes = df['Primary Cause']
y_primary = label_encoder.fit_transform(primary_causes)

# classes_[i] is the original label that was encoded as integer i
print(label_encoder.classes_)


['Distracted driving' 'Drunk driving' 'Other' 'Over-speeding'
 'Wrong side driving']


# Training Supervised Learning Model

In [16]:
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, stratified on the encoded primary cause
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_primary,
    test_size=0.2,
    stratify=y_primary,
    random_state=42,
)

# Fit a 100-tree random forest whose trees are limited to depth 10
model = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100)
model.fit(X_train, y_train)

# Persist the classifier together with the fitted vectorizer and label encoder
artifacts = (
    (model, 'accident_model.pkl'),
    (vectorizer, 'vectorizer.pkl'),
    (label_encoder, 'label_encoder.pkl'),
)
for fitted_object, filename in artifacts:
    joblib.dump(fitted_object, filename)

print("Model trained and saved successfully!")


Model trained and saved successfully!


# Model evaluation

In [17]:
from sklearn.metrics import accuracy_score, classification_report

# Predict the encoded primary cause for the held-out rows
y_pred = model.predict(X_test)

# Overall fraction of correct predictions
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision / recall / F1, labelled with the original class names
per_class_report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print("Classification Report:\n", per_class_report)


Accuracy: 0.982
Classification Report:
                     precision    recall  f1-score   support

Distracted driving       1.00      1.00      1.00       448
     Drunk driving       1.00      1.00      1.00       438
             Other       0.97      1.00      0.98      2221
     Over-speeding       1.00      0.84      0.91       453
Wrong side driving       1.00      1.00      1.00       440

          accuracy                           0.98      4000
         macro avg       0.99      0.97      0.98      4000
      weighted avg       0.98      0.98      0.98      4000

