# Import libraries

In [26]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
import pandas as pd
import joblib
import os

# Load and preprocess data

### Load data

In [27]:
# Define the relative path from the 'notebooks' folder to 'data/raw/'
data_path = os.path.join("..", "data", "raw", "Unsupervised_accident_data.csv")

# Load the dataset
df = pd.read_csv(data_path)
display(df.head()) # Show first 3 rows
print(df.shape)

Unnamed: 0,Accident Report
0,"On 21 Aug 2023, at 09:09 PM, a two-wheeler col..."
1,"On 04 Jan 2025, at 02:06 PM, a auto-rickshaw c..."
2,"On 15 May 2023, at 05:20 PM, a auto-rickshaw c..."
3,"On 23 Feb 2023, at 11:17 PM, a bus collided wi..."
4,"On 30 Mar 2024, at 02:28 AM, a bicycle collide..."


(20000, 1)


### Check for duplicates and missing values

In [28]:
print(df.isnull().sum())
print(df.duplicated().sum())

Accident Report    0
dtype: int64
0


### preprocess text

In [29]:
import re

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'(?<!\d):|[^a-zA-Z0-9:\s]', ' ', text)  # Removes special characters except colons in time
    return text

df["cleaned_text"] = df["Accident Report"].apply(preprocess_text)


In [30]:
df

Unnamed: 0,Accident Report,cleaned_text
0,"On 21 Aug 2023, at 09:09 PM, a two-wheeler col...",on 21 aug 2023 at 09:09 pm a two wheeler col...
1,"On 04 Jan 2025, at 02:06 PM, a auto-rickshaw c...",on 04 jan 2025 at 02:06 pm a auto rickshaw c...
2,"On 15 May 2023, at 05:20 PM, a auto-rickshaw c...",on 15 may 2023 at 05:20 pm a auto rickshaw c...
3,"On 23 Feb 2023, at 11:17 PM, a bus collided wi...",on 23 feb 2023 at 11:17 pm a bus collided wi...
4,"On 30 Mar 2024, at 02:28 AM, a bicycle collide...",on 30 mar 2024 at 02:28 am a bicycle collide...
...,...,...
19995,"On 13 Jun 2023, at 12:13 AM, a truck collided ...",on 13 jun 2023 at 12:13 am a truck collided ...
19996,"On 21 Mar 2024, at 09:17 PM, a Jeep collided w...",on 21 mar 2024 at 09:17 pm a jeep collided w...
19997,"On 09 Jun 2023, at 05:27 PM, a bus collided wi...",on 09 jun 2023 at 05:27 pm a bus collided wi...
19998,"On 09 Mar 2023, at 08:56 PM, a car collided wi...",on 09 mar 2023 at 08:56 pm a car collided wi...


### Rule based-labelling of data

In [31]:
import spacy

# Load spaCy's English model
nlp = spacy.load("en_core_web_sm")

# Define keyword lists (Expand as needed)
primary_cause_keywords = ["overspeeding", "drunk driving", "reckless driving", "brake failure"]
secondary_cause_keywords = ["wet road", "oil spill", "poor visibility"]
risk_factor_keywords = ["no seat belt", "faulty brakes", "overloaded truck"]

# Function to label text automatically
def auto_label(text):
    doc = nlp(text.lower())
    labels = []
    
    for token in doc:
        word = token.text
        if word in primary_cause_keywords:
            labels.append((word, "PRIMARY_CAUSE"))
        elif word in secondary_cause_keywords:
            labels.append((word, "SECONDARY_CAUSE"))
        elif word in risk_factor_keywords:
            labels.append((word, "RISK_FACTOR"))
    
    return labels

# Apply function
df["labels"] = df["Accident Report"].apply(auto_label)

display(df.head())  # Check labeled data


Unnamed: 0,Accident Report,cleaned_text,labels
0,"On 21 Aug 2023, at 09:09 PM, a two-wheeler col...",on 21 aug 2023 at 09:09 pm a two wheeler col...,[]
1,"On 04 Jan 2025, at 02:06 PM, a auto-rickshaw c...",on 04 jan 2025 at 02:06 pm a auto rickshaw c...,[]
2,"On 15 May 2023, at 05:20 PM, a auto-rickshaw c...",on 15 may 2023 at 05:20 pm a auto rickshaw c...,[]
3,"On 23 Feb 2023, at 11:17 PM, a bus collided wi...",on 23 feb 2023 at 11:17 pm a bus collided wi...,[]
4,"On 30 Mar 2024, at 02:28 AM, a bicycle collide...",on 30 mar 2024 at 02:28 am a bicycle collide...,[]


In [32]:
path1 = os.path.join("..", "data", "processed", "labelled_report.csv")
df.to_csv('labelled_report.csv', index=False) 

# Convert data to BERT NER format: BERT needs tokenized input and entity labels.

In [33]:
def tokenize_and_label(row):
    tokens = tokenizer(row["Accident Report"], truncation=True, padding="max_length", max_length=512)
    word_ids = tokens.word_ids()  # Maps tokens to original words

    labels = [-100] * len(tokens["input_ids"])  # -100 for ignored subwords

    for word_index, (word, label) in enumerate(row["labels"]):
        for i, word_id in enumerate(word_ids):
            if word_id == word_index and word_id is not None:
                labels[i] = label2id[f"B-{label}" if labels[i] == -100 else f"I-{label}"]

    return {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"],
        "labels": labels
    }



In [34]:
from datasets import Dataset

tokenized_dataset = df.apply(tokenize_and_label, axis=1).tolist()
dataset = Dataset.from_list(tokenized_dataset)
print(dataset)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 20000
})


# Train BERT for NER

In [35]:
from datasets import Dataset
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# Convert dataframe to Hugging Face dataset
dataset = Dataset.from_pandas(df)

# Define entity labels
unique_labels = ["O", "B-PRIMARY_CAUSE", "I-PRIMARY_CAUSE",
                 "B-SECONDARY_CAUSE", "I-SECONDARY_CAUSE",
                 "B-RISK_FACTOR", "I-RISK_FACTOR"]

id2label = {i: label for i, label in enumerate(unique_labels)}
label2id = {label: i for i, label in enumerate(unique_labels)}

# Load BERT NER model
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save trained model
model.save_pretrained("bert-accident-ner")
tokenizer.save_pretrained("bert-accident-ner")


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['labels']