In [None]:
# Mount Google Drive; the fine-tuned model is saved to and reloaded from
# /content/drive/MyDrive later in this notebook.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


# Data Preprocessing

## Load Dataset

In [None]:
!pip install evaluate
!pip install gradio
import pandas as pd
from datasets import Dataset
from ast import literal_eval
from transformers import AutoTokenizer
import numpy as np
from evaluate import load
import gradio as gr

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.4


In [None]:
import pandas as pd

# Raw GitHub link to the NER dataset (CSV)
url = 'https://raw.githubusercontent.com/JordenBong/NLP-Assignment/main/dataset/ner.csv'

# Read the CSV straight from the URL. The POS and Tag columns arrive as
# stringified Python lists and are parsed into real lists in a later cell.
df = pd.read_csv(url)

# Preview the first rows to understand the structure
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"['NNS', 'IN', 'NNS', 'VBP', 'VBN', 'IN', 'NNP'...","['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', '..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"['NNS', 'IN', 'NNS', 'VBN', 'IN', 'DT', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"['PRP', 'VBD', 'IN', 'DT', 'NNS', 'IN', 'NN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","['NNS', 'VBD', 'DT', 'NN', 'IN', 'NNS', 'IN', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"['DT', 'NN', 'VBZ', 'IN', 'DT', 'NN', 'IN', 'D...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


## Change the Representation in Columns

In [None]:
# Convert the POS and Tag columns from stringified lists to real Python lists.
def _parse_list_cell(value):
    """Parse a stringified list; return already-parsed values unchanged.

    Passing non-string values through keeps this cell idempotent: re-running it
    after the columns were converted no longer makes literal_eval raise.
    """
    return literal_eval(value) if isinstance(value, str) else value

print("POS Type (Before): ", type(df["POS"].iloc[0]))
print("Tag Type (Before): ", type(df["Tag"].iloc[0]))

df['POS'] = df['POS'].apply(_parse_list_cell)
df['Tag'] = df['Tag'].apply(_parse_list_cell)

print("POS Type (After): ", type(df["POS"].iloc[0]))
print("Tag Type (After): ", type(df["Tag"].iloc[0]))

POS Type (Before):  <class 'str'>
Tag Type (Before):  <class 'str'>
POS Type (After):  <class 'list'>
Tag Type (After):  <class 'list'>


## Tokenize the Sentence

In [None]:
# Preprocess 1: tokenize each sentence and store the result in a new column
# called Token (the tokenization itself happens in the next cell).
from nltk.tokenize import word_tokenize
import nltk
# Fetch the 'punkt_tab' tokenizer data — presumably what word_tokenize needs
# in the installed NLTK version; TODO confirm.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Tokenize every sentence with NLTK. The resulting token count is not guaranteed
# to match the length of the POS/Tag lists shipped with the dataset; mismatching
# rows are detected and separated in a later section.
df['Token'] = df['Sentence'].map(word_tokenize)

In [None]:
# Preview the frame with the new Token column
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag,Token
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[Thousands, of, demonstrators, have, marched, ..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Families, of, soldiers, killed, in, the, conf..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo...","[They, marched, from, the, Houses, of, Parliam..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[Police, put, the, number, of, marchers, at, 1..."
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,...","[The, protest, comes, on, the, eve, of, the, a..."


## Map the BIO Tags to dslim's Tags (Pre-trained model)

In [None]:
# Preprocess 2: map the dataset's BIO entity types onto the four entity types
# used by the pre-trained dslim model (LOC, ORG, PER, MISC).
# - geo and gpe are both folded into LOC
# - art, eve, nat and tim have no direct counterpart and are folded into MISC
# The B-/I- prefix of every tag is preserved; 'O' stays 'O'.
tag_mapping = {
    'B-art': 'B-MISC', 'I-art': 'I-MISC',
    'B-eve': 'B-MISC', 'I-eve': 'I-MISC',
    'B-geo': 'B-LOC', 'I-geo': 'I-LOC',
    'B-gpe': 'B-LOC', 'I-gpe': 'I-LOC',
    'B-nat': 'B-MISC', 'I-nat': 'I-MISC',
    'B-org': 'B-ORG', 'I-org': 'I-ORG',
    'B-per': 'B-PER', 'I-per': 'I-PER',
    'B-tim': 'B-MISC', 'I-tim': 'I-MISC',
    'O': 'O'
}

In [None]:
def map_tags(ner_tags):
    """Translate a sequence of dataset NER tags into the pre-trained model's tag set.

    Every tag is looked up in the module-level ``tag_mapping``; a tag that is
    missing from that mapping raises ``KeyError``. A new list is returned.
    """
    converted = []
    for original_tag in ner_tags:
        converted.append(tag_mapping[original_tag])
    return converted

In [None]:
# Store the remapped tags in a new column; the original Tag column is kept.
df['New_Tag'] = df['Tag'].apply(map_tags)

In [None]:
# Preview the frame with the new New_Tag column
df.head()

Unnamed: 0,Sentence #,Sentence,POS,Tag,Token,New_Tag
0,Sentence: 1,Thousands of demonstrators have marched throug...,"[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ...","[O, O, O, O, O, O, B-geo, O, O, O, O, O, B-geo...","[Thousands, of, demonstrators, have, marched, ...","[O, O, O, O, O, O, B-LOC, O, O, O, O, O, B-LOC..."
1,Sentence: 2,Families of soldiers killed in the conflict jo...,"[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[Families, of, soldiers, killed, in, the, conf...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,Sentence: 3,They marched from the Houses of Parliament to ...,"[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo...","[They, marched, from, the, Houses, of, Parliam...","[O, O, O, O, O, O, O, O, O, O, O, B-LOC, I-LOC..."
3,Sentence: 4,"Police put the number of marchers at 10,000 wh...","[NNS, VBD, DT, NN, IN, NNS, IN, CD, IN, NNS, V...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]","[Police, put, the, number, of, marchers, at, 1...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
4,Sentence: 5,The protest comes on the eve of the annual con...,"[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,...","[The, protest, comes, on, the, eve, of, the, a...","[O, O, O, O, O, O, O, O, O, O, O, B-LOC, O, O,..."


## Check for Length Mismatches between Tokens and POS or NER Tags

In [None]:
# Checking Function to identify problematic rows
def is_valid_row(row):
    """Return True when the row's Token, POS and New_Tag sequences all have the same length."""
    tokens, pos_tags, ner_tags = row['Token'], row['POS'], row['New_Tag']

    # A POS mismatch already disqualifies the row; no need to look at the tags.
    if len(tokens) != len(pos_tags):
        return False
    return len(tokens) == len(ner_tags)

In [None]:
# Build a boolean mask (one entry per row) marking rows whose Token, POS and
# New_Tag lengths agree, then split the frame into the two complementary subsets.
valid_mask = df.apply(is_valid_row, axis=1)
problematic_df = df[~valid_mask]
clean_df = df[valid_mask]

In [None]:
# Report how many rows fall into each subset
print(f"Total rows: {len(df)}")
print(f"Problematic rows: {len(problematic_df)}")
print(f"Clean rows: {len(clean_df)}")

Total rows: 47959
Problematic rows: 223
Clean rows: 47736


## Save the Separate Dataset
- Use clean_df directly for training
- problematic_df is only about 0.46 % of all instances (223 of 47,959)
- Manually correct problematic_df if required

In [None]:
# Save both subsets to CSV in the working directory: the problematic rows for
# potential manual correction, and the clean rows used for training.
problematic_df.to_csv('problematic_instances.csv', index=False)
clean_df.to_csv('clean_instances.csv', index=False)


## Convert to HuggingFace Dataset

In [None]:
# Convert to HuggingFace Dataset.
# clean_df is a filtered view of df, so its index has gaps; without
# preserve_index=False that index would be carried along as an extra
# "__index_level_0__" column in the dataset.
dataset = Dataset.from_pandas(clean_df, preserve_index=False)

## Final Verification

In [None]:
# Final verification
def verify_dataset(dataset):
    """Count and report examples whose Token, POS and New_Tag lengths disagree.

    Iterates over ``dataset`` once and prints a one-line summary; nothing is
    returned.
    """
    def has_mismatch(example):
        n_tokens = len(example['Token'])
        return n_tokens != len(example['POS']) or n_tokens != len(example['New_Tag'])

    errors = sum(1 for example in dataset if has_mismatch(example))
    print(f"Clean dataset contains {errors} length mismatch errors out of {len(dataset)} examples")

# Sanity check: the converted dataset should report zero length mismatches.
verify_dataset(dataset)

Clean dataset contains 0 length mismatch errors out of 47736 examples


# Start Training

# Load Tokenizer

In [None]:
# Hub id of the pre-trained NER checkpoint; the same name is reused later to
# load the model weights.
model_name = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/829 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

# Define Label List

In [None]:
# Reuse the label <-> id mapping stored in the pre-trained checkpoint's config
# instead of numbering the tags alphabetically. The checkpoint's classification
# head was trained with its own id order; assigning ids from a sorted tag list
# would attach the head's outputs to different tags unless both orders happen
# to coincide.
# NOTE(review): confirm the checkpoint's id order against its model card.
from transformers import AutoConfig

label_to_id = {
    label: int(idx)
    for label, idx in AutoConfig.from_pretrained(model_name).label2id.items()
}

# Fail early if the dataset contains a tag the checkpoint does not know about.
dataset_tags = {tag for row in dataset["New_Tag"] for tag in row}
unknown_tags = dataset_tags - set(label_to_id)
if unknown_tags:
    raise ValueError(f"Tags missing from {model_name} label2id: {sorted(unknown_tags)}")

id_to_label = {idx: label for label, idx in label_to_id.items()}
# label_list[i] is the tag with id i
label_list = [id_to_label[idx] for idx in sorted(id_to_label)]

In [None]:
def tokenize_and_align_labels(example):
    """Tokenize one pre-split example and align its word-level tags to sub-tokens.

    The words in ``example["Token"]`` are encoded with the module-level
    ``tokenizer`` (truncated/padded to 128 positions). A ``labels`` list with
    one entry per position is added to the encoding:

    * the first sub-token of each word gets the id of that word's tag from
      ``example["New_Tag"]`` (via the module-level ``label_to_id``);
    * every other position (positions with no word id, and continuation
      sub-tokens) gets -100.

    Returns the tokenizer output with the extra ``labels`` entry.
    """
    encoding = tokenizer(
        example["Token"],
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_offsets_mapping=True,  # kept for debugging
    )

    aligned = []
    last_word = None
    # word_ids() yields, per position, the index of the source word (or None)
    for current_word in encoding.word_ids():
        starts_new_word = current_word is not None and current_word != last_word
        if starts_new_word:
            aligned.append(label_to_id[example["New_Tag"][current_word]])
        else:
            aligned.append(-100)
        last_word = current_word

    encoding["labels"] = aligned
    return encoding

# Apply to Dataset

In [None]:
# Apply tokenize_and_align_labels to every example; the returned fields
# (including "labels") are added as new columns of the dataset.
tokenized_dataset = dataset.map(tokenize_and_align_labels)

Map:   0%|          | 0/47736 [00:00<?, ? examples/s]

# Prepare Data for Training

# Split Data (~81% train, ~9% validation, 10% test)

In [None]:
# Fixed seed so that train/validation/test membership is identical on every run;
# without it the reported test metrics are computed on a different split each time.
SPLIT_SEED = 42

# First hold out 10% as the test set
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_val = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

# Then split train_val into train and validation (10% of the remainder)
split_train_val = train_val.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = split_train_val["train"]
val_dataset = split_train_val["test"]

# Define model & print summary

In [None]:
!pip install torchinfo
from transformers import BertForTokenClassification
from torchinfo import summary
import torch

# Load the pre-trained checkpoint with a token-classification head sized to our
# label set, and attach the label <-> id mappings to the model config.
model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label = id_to_label,
    label2id = label_to_id
)



# Build a single-example batch (batch dimension of 1) from the first training
# row, used only to drive the torchinfo summary below.
example = tokenized_dataset['train'][0]
input_ids = torch.tensor([example['input_ids']])
attention_mask = torch.tensor([example['attention_mask']])

inputs = {
    "input_ids": input_ids,
    "attention_mask": attention_mask
}

# Print a layer-by-layer summary with output shapes and parameter counts
summary(model, input_data=inputs)

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Layer (type:depth-idx)                                       Output Shape              Param #
BertForTokenClassification                                   [1, 128, 9]               --
├─BertModel: 1-1                                             [1, 128, 768]             --
│    └─BertEmbeddings: 2-1                                   [1, 128, 768]             --
│    │    └─Embedding: 3-1                                   [1, 128, 768]             22,268,928
│    │    └─Embedding: 3-2                                   [1, 128, 768]             1,536
│    │    └─Embedding: 3-3                                   [1, 128, 768]             393,216
│    │    └─LayerNorm: 3-4                                   [1, 128, 768]             1,536
│    │    └─Dropout: 3-5                                     [1, 128, 768]             --
│    └─BertEncoder: 2-2                                      [1, 128, 768]             --
│    │    └─ModuleList: 3-6                                  --             

# Setup Trainer

In [None]:
#!pip install --upgrade transformers
from transformers import TrainingArguments, Trainer
import torch

# Report the available accelerator and move the model onto it
print("CUDA available:", torch.cuda.is_available())
print("Device name:", torch.cuda.get_device_name() if torch.cuda.is_available() else "No GPU")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training hyper-parameters.
# NOTE(review): no evaluation schedule is configured here, so eval_dataset is
# presumably not evaluated during training with the library defaults — confirm.
# NOTE(review): fp16=True presumably requires a CUDA device — confirm.
args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    report_to=[],  # empty list: no reporting integrations requested
)

from transformers import DataCollatorForTokenClassification

# Collator that batches the tokenized examples together with their labels
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Trainer setup (no compute_metrics here; a second Trainer with metrics is
# built in the evaluation section)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator = data_collator
)

CUDA available: True
Device name: Tesla T4


  trainer = Trainer(


# Train the model

In [None]:
import os

# A `!export ...` line runs in a throw-away subshell, so the variable never
# reaches the Python kernel. Set it on the kernel's own environment instead.
# (Reporting is additionally switched off via report_to=[] in TrainingArguments.)
os.environ["WANDB_DISABLED"] = "true"
trainer.train()

Step,Training Loss
500,1.1907
1000,0.1069
1500,0.0977
2000,0.0896
2500,0.0849
3000,0.0678
3500,0.0654
4000,0.0654
4500,0.0633
5000,0.0569


TrainOutput(global_step=7251, training_loss=0.14232191291780674, metrics={'train_runtime': 1452.3854, 'train_samples_per_second': 79.865, 'train_steps_per_second': 4.992, 'total_flos': 7577758864431360.0, 'train_loss': 0.14232191291780674, 'epoch': 3.0})

# Save Model

In [None]:
# Persist the fine-tuned model to Google Drive (requires the Drive mount from
# the first cell); the same directory is loaded again in the "Load Model" section.
trainer.save_model("/content/drive/MyDrive/NLP/my-ner-model")

# Evaluate Using seqeval

In [None]:
from sklearn.metrics import classification_report
from datasets import load_metric
import numpy as np

!pip install seqeval
from seqeval.metrics import classification_report as seqeval_report
from seqeval.metrics import precision_score, recall_score, f1_score

def compute_metrics(p):
    """Compute seqeval precision, recall and F1 for a batch of predictions.

    ``p`` unpacks into ``(predictions, labels)``. Predictions are reduced with
    an argmax over axis 2; positions whose gold label is -100 are dropped from
    both the gold and the predicted sequences before ids are converted to tag
    strings through the module-level ``id_to_label``.

    Returns a dict with the keys "precision", "recall" and "f1".
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Gold tag sequences, skipping ignored (-100) positions
    gold_tags = []
    for gold_row in gold_ids:
        gold_tags.append([id_to_label[gold] for gold in gold_row if gold != -100])

    # Predicted tag sequences at exactly the positions kept above
    pred_tags = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [id_to_label[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        pred_tags.append(kept)

    precision = precision_score(gold_tags, pred_tags)
    recall = recall_score(gold_tags, pred_tags)
    f1 = f1_score(gold_tags, pred_tags)
    return {"precision": precision, "recall": recall, "f1": f1}


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=767169eccd30d5c14da3759bb249d19435716c8733baf25d8a02af5adfc8dc5c
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
import numpy as np
from sklearn.metrics import classification_report
from seqeval.metrics import classification_report as seqeval_report
from seqeval.metrics import precision_score, recall_score, f1_score
import pandas as pd

# Rebuild the Trainer so that compute_metrics is attached for evaluation.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ---- Evaluate on test set (final evaluation) ----
# A single predict() pass returns the raw scores, the gold label ids and the
# aggregate metrics, so the test set is not run through the model twice.
# metric_key_prefix="eval" keeps the "eval_*" metric names used below.
predictions, labels, test_results = trainer.predict(test_dataset, metric_key_prefix="eval")

print("\nFinal Test Set Metrics:")
print(f"Precision: {test_results['eval_precision']:.4f}")
print(f"Recall: {test_results['eval_recall']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

# Convert predictions to label IDs
preds = np.argmax(predictions, axis=2)

# Map predictions and labels to tag names (ignoring positions labelled -100)
true_labels = [
    [id_to_label[label] for label in sent if label != -100]
    for sent in labels
]

true_preds = [
    [id_to_label[pred] for pred, label in zip(sent_pred, sent_label) if label != -100]
    for sent_pred, sent_label in zip(preds, labels)
]

# ---- Print Pretty Table ----
report_dict = seqeval_report(true_labels, true_preds, output_dict=True)
# Use a dedicated name: assigning to `df` here would silently replace the
# dataset DataFrame that the preprocessing cells rely on.
report_df = pd.DataFrame(report_dict).transpose()

# Keep only precision, recall, and f1-score columns
pretty_df = report_df[['precision', 'recall', 'f1-score']]
print(pretty_df)


  trainer = Trainer(



Final Test Set Metrics:
Precision: 0.8533
Recall: 0.8594
F1 Score: 0.8564
              precision    recall  f1-score
LOC            0.900434  0.937535  0.918610
MISC           0.866795  0.854424  0.860565
ORG            0.758411  0.708763  0.732747
PER            0.786857  0.794576  0.790698
micro avg      0.853344  0.859423  0.856373
macro avg      0.828124  0.823825  0.825655
weighted avg   0.851465  0.859423  0.855106


# Load Model

In [None]:
import os

# torch is used below; import it here so this section also works when the
# notebook is started from this cell on a fresh kernel.
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Directory the fine-tuned model was saved to by trainer.save_model(...)
model_path = os.path.join("/content", "drive", "MyDrive", "NLP", "my-ner-model")

# Load model and tokenizer, then move the model to the GPU when one is available
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Debug check of tensor placement. Note: `inputs` here is the module-level
# example batch built with torch.tensor(...) for the torchinfo summary; it was
# never moved to the GPU, which is why the second line prints "cpu".
print(next(model.parameters()).device)  # Device of the reloaded model (cuda:0 when a GPU is available)
print(inputs['input_ids'].device)      # Prints cpu for the leftover summary batch

cuda:0
cpu


# Test on input

In [None]:
!pip install gradio
import torch
import spacy
from spacy import displacy
from spacy.tokens import Span
import re

def clean_text(text):
    """Normalise raw, social-media style text before NER inference.

    Steps, in order:
      1. remove @mentions and URLs,
      2. remove #hashtags (including the hashtag word),
      3. squeeze any letter repeated three or more times down to one,
      4. strip every character that is neither a word character nor whitespace,
      5. collapse whitespace runs to single spaces and trim the ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string (empty if nothing is left).
    """
    # Remove mentions and URLs
    text = re.sub(r'@\w+|http\S+', '', text)

    # Remove hashtags and the hashtag words (e.g., "#vacation")
    text = re.sub(r'#\w+', '', text)

    # Normalize repeated letters (e.g., "Sooo" -> "So").
    # [^\W\d_] matches letters only, so numbers such as "1000" or "2000" are no
    # longer corrupted into "10" / "20" by this step.
    text = re.sub(r'([^\W\d_])\1{2,}', r'\1', text)

    # Remove emojis and special punctuation (keep only words and spaces)
    text = re.sub(r'[^\w\s]', '', text, flags=re.UNICODE)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Blank English spaCy pipeline: used only to build Doc/Span objects for the
# displaCy entity visualisation, not to make predictions.
nlp = spacy.blank("en")

# Device used for inference inputs (GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def predict_ner_as_tagged_text(text):
    """Run NER on ``text`` and return displaCy HTML with the tagged words highlighted.

    The text is cleaned with ``clean_text``, split on whitespace and passed to
    the module-level ``model``/``tokenizer``. Each word takes the tag predicted
    for its first sub-token; words tagged "O" are left unmarked.

    Args:
        text: Raw user input.

    Returns:
        An HTML string produced by ``displacy.render`` (style "ent").
    """
    cleaned = clean_text(text)
    words = cleaned.split()

    # word index -> predicted tag, for words whose tag is not "O"
    word_labels = {}

    if words:  # nothing to predict for empty / fully stripped input
        encoded = tokenizer(words, is_split_into_words=True, return_tensors="pt",
                            truncation=True, padding="max_length", max_length=128)

        # Take word ids from the SAME encoding that is fed to the model, so every
        # position has a prediction. Re-tokenizing without truncation could yield
        # more positions than predictions and raise IndexError on long inputs.
        word_ids_list = encoded.word_ids()

        # Send the inputs to wherever the model actually lives
        model_device = next(model.parameters()).device
        inputs = {k: v.to(model_device) for k, v in encoded.items()}

        model.eval()  # make sure dropout is disabled for inference
        with torch.no_grad():
            outputs = model(**inputs)

        predictions = torch.argmax(outputs.logits, dim=2)[0].cpu().tolist()

        # Read the id -> tag mapping from the model config, so a model reloaded
        # from disk works without variables left over from the training cells.
        id2label = model.config.id2label

        previous_word_idx = None
        for idx, word_idx in enumerate(word_ids_list):
            if word_idx is None:
                continue  # special or padding position
            if word_idx != previous_word_idx:
                label = id2label[predictions[idx]]
                if label != "O":
                    word_labels[word_idx] = label
                previous_word_idx = word_idx

    doc = nlp(cleaned)
    ents = []
    cursor = 0

    # Walk through ALL words in order so each word's character offsets are exact.
    # Searching only from the previous labelled word could match the same text
    # inside an earlier, unlabelled word and lose the entity.
    for word_idx, word in enumerate(words):
        start = cleaned.find(word, cursor)
        end = start + len(word)
        cursor = end

        label = word_labels.get(word_idx)
        if label is None:
            continue

        span = doc.char_span(start, end)
        if span:
            ents.append(Span(doc, span.start, span.end, label))

    doc.ents = ents

    # Same colour for the B- and I- variant of each entity type
    colors = {
        "B-PER": "#ff4d4d",
        "B-LOC": "#3399ff",
        "B-ORG": "#33cc33",
        "B-MISC": "#ff9933",
        "I-PER": "#ff4d4d",
        "I-LOC": "#3399ff",
        "I-ORG": "#33cc33",
        "I-MISC": "#ff9933"
    }

    options = {"colors": colors}
    return displacy.render(doc, style="ent", jupyter=False, options=options)



# Gradio Interface

In [None]:
import gradio as gr
# Create the Gradio interface: a two-line text box as input, and the HTML
# returned by predict_ner_as_tagged_text as output.
iface = gr.Interface(
    fn=predict_ner_as_tagged_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs="html",
    title="Named Entity Recognition System",
    description="Enter a sentence to get word-level NER tags."
)

# share=True requests a public URL; debug=True keeps the cell running so that
# errors and logs stay visible (see the cell output).
if __name__ == "__main__":
    iface.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://4824e0512c9ff9b5fe.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
