# Tagging Cleaned IMDB Dataset using Baseline NER Model
This notebook loads your trained NER model from a checkpoint and uses it to tag entities in the cleaned IMDB dataset.


In [10]:
import sys

# Make modules under ../scripts importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# pile up duplicate entries on sys.path.
SCRIPTS_DIR = "../scripts"
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [11]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoModelForTokenClassification, RobertaTokenizerFast, AutoConfig, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
from span_f1 import readNlu, toSpans, getBegEnd, getLooseOverlap, getUnlabeled
import pickle


## 1. Load Cleaned IMDB Dataset

In [12]:
# Read the cleaned reviews, then derive two helper columns:
#   tokens       - the review split on whitespace
#   dummy_labels - one placeholder 'O' tag per token
df = pd.read_csv("../data/clean_imdb_dataset.csv")
df['tokens'] = df['review'].map(lambda review: review.split())
df['dummy_labels'] = df['tokens'].map(lambda words: ['O'] * len(words))

## 2. Create HuggingFace Dataset Object

In [13]:
# Both 'ner_tags' and 'ids' are filled with the same placeholder 'O' tags.
imdb_columns = {
    'sents': df['tokens'].tolist(),
    'ner_tags': df['dummy_labels'].tolist(),
    'ids': df['dummy_labels'].tolist(),
}
imdb_data = Dataset.from_dict(imdb_columns)

## 3. Load Tokenizer and Label Mappings

In [14]:
# Load label mappings used during training
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point these paths at files you trust.
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


idx2lab = _load_pickle('../project/baseline_model/idx2lab')
lab2idx = _load_pickle('../project/baseline_model/lab2idx')

label_list = list(lab2idx.keys())  # Needed for num_labels

# Tokenizer and config
model_link = "deepset/roberta-base-squad2"
tokenizer = RobertaTokenizerFast.from_pretrained(
    model_link,
    add_prefix_space=True,
    use_fast=True,
)
config = AutoConfig.from_pretrained(
    model_link,
    label2id=lab2idx,
    id2label=idx2lab,
    num_labels=len(label_list),
)


## 4. Tokenize IMDB Dataset Using the Same Format as Training

In [15]:
text_column_name = 'sents'
label_column_name = 'ids'

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to subwords.

    Reads the word lists from ``examples[text_column_name]`` and the per-word
    string labels from ``examples[label_column_name]``.

    For each sentence, the first subword of every word gets that word's label
    id (looked up in ``lab2idx``). Positions without a word id and the
    remaining subwords of a word get ``-100``.

    Inputs are truncated to ``max_length=128`` tokens, so words beyond that
    limit receive no label entry at all.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of label ids per sentence.
    """
    encoded = tokenizer(
        examples[text_column_name],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding=False
    )

    aligned = []
    for row, word_labels in enumerate(examples[label_column_name]):
        row_label_ids = []
        last_seen = None
        for current in encoded.word_ids(batch_index=row):
            # No word id, or same word as the previous position: mask it out.
            if current is None or current == last_seen:
                row_label_ids.append(-100)
            else:
                row_label_ids.append(lab2idx[word_labels[current]])
            last_seen = current
        aligned.append(row_label_ids)

    encoded["labels"] = aligned
    return encoded


In [16]:
# Tokenize in batches and drop the original columns, so only the tokenizer
# output (plus "labels") remains.
map_options = dict(
    batched=True,
    remove_columns=imdb_data.column_names,
    desc="Tokenizing IMDB reviews",
)
processed_imdb = imdb_data.map(tokenize_and_align_labels, **map_options)

Tokenizing IMDB reviews:   0%|          | 0/49581 [00:00<?, ? examples/s]

## 5. Load Trained Model from Checkpoint

In [17]:
# Load the token-classification model from the checkpoint directory, using the
# config built above, and move it to the GPU when one is available.
model_path = "../data/training_parameters/checkpoint-4704"
model = AutoModelForTokenClassification.from_pretrained(model_path, config=config)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

## 6. Run Predictions Using Trainer

In [18]:
def convert_int_to_labels(preds):
    """Turn raw logits into label strings, skipping masked positions.

    Args:
        preds: A ``(logits, labels)`` pair. The argmax over the last axis of
            ``logits`` gives the predicted class index per position.

    Returns:
        ``(None, true_predictions)``, where ``true_predictions`` holds, per
        sequence, the ``idx2lab`` label for every position whose entry in
        ``labels`` is not ``-100``. The first element is always ``None``
        because only predicted labels are needed here.
    """
    logits, labels = preds
    best_ids = np.argmax(logits, axis=-1)

    true_predictions = []
    for id_row, gold_row in zip(best_ids, labels):
        kept = []
        for class_id, gold in zip(id_row, gold_row):
            if gold != -100:
                kept.append(idx2lab[class_id])
        true_predictions.append(kept)

    return None, true_predictions


In [19]:
# The Trainer is used only for prediction here; "tmp" is its output directory.
training_args = TrainingArguments(output_dir="tmp")
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
)

# Unpack the three-element result of predict(); the third item is not needed.
prediction_output = trainer.predict(processed_imdb)
predictions, labels, _ = prediction_output
_, predicted_labels = convert_int_to_labels((predictions, labels))

## 7. Save Predictions in CoNLL Format

In [20]:
def write_conll_file(data, path, missing_label="O"):
    """Write tagged sentences to ``path`` in a tab-separated CoNLL-style layout.

    Every word becomes one line ``index<TAB>word<TAB>label<TAB>-<TAB>-`` with a
    1-based index, and each sentence is followed by a blank line.

    Args:
        data: Iterable of ``(words, labels)`` pairs.
        path: Destination file path; an existing file is overwritten.
        missing_label: Label written for words that have no matching entry in
            ``labels``, e.g. when the label sequence is shorter than the word
            list because the input was truncated before prediction.
            Defaults to ``"O"``.
    """
    with open(path, "w", encoding="utf-8") as f:
        for words, labels in data:
            labels = list(labels)
            n_labels = len(labels)
            # Iterate over the words, not zip(words, labels): zip stops at the
            # shorter sequence and would silently drop unlabeled words.
            for idx, word in enumerate(words, start=1):
                label = labels[idx - 1] if idx <= n_labels else missing_label
                f.write(f"{idx}\t{word}\t{label}\t-\t-\n")
            f.write("\n")


# Pair each review's whitespace tokens with its predicted labels and write
# the pairs to disk.
imdb_tagged = list(zip(df['tokens'].tolist(), predicted_labels))
write_conll_file(imdb_tagged, "../data/imdb_tagged_output.iob2")
            

In [21]:
# Print the tagged review at index 1 (the second review) as an example.
# Words without a predicted label are shown with the fallback tag "O".
tokens, labels = imdb_tagged[1]
for position, word in enumerate(tokens, start=1):
    if position <= len(labels):
        tag = labels[position - 1]
    else:
        tag = "O"  # fallback if mismatch
    print(f"{position}\t{word}\t{tag}")


1	A	O
2	wonderful	O
3	little	O
4	production	O
5	The	O
6	filming	O
7	technique	O
8	is	O
9	very	O
10	unassuming	O
11	very	O
12	oldtimeBBC	O
13	fashion	O
14	and	O
15	gives	O
16	a	O
17	comforting	O
18	and	O
19	sometimes	O
20	discomforting	O
21	sense	O
22	of	O
23	realism	O
24	to	O
25	the	O
26	entire	O
27	piece	O
28	The	O
29	actors	O
30	are	O
31	extremely	O
32	well	O
33	chosen	O
34	Michael	B-PER
35	Sheen	I-PER
36	not	O
37	only	O
38	has	O
39	got	O
40	all	O
41	the	O
42	polari	O
43	but	O
44	he	O
45	has	O
46	all	O
47	the	O
48	voices	O
49	down	O
50	pat	O
51	too	O
52	You	O
53	can	O
54	truly	O
55	see	O
56	the	O
57	seamless	O
58	editing	O
59	guided	O
60	by	O
61	the	O
62	references	O
63	to	O
64	Williams	B-PER
65	diary	O
66	entries	O
67	not	O
68	only	O
69	is	O
70	it	O
71	well	O
72	worth	O
73	the	O
74	watching	O
75	but	O
76	it	O
77	is	O
78	a	O
79	terrificly	O
80	written	O
81	and	O
82	performed	O
83	piece	O
84	A	O
85	masterful	O
86	production	O
87	about	O
88	one	O
89	of	O
90	the	O
91	great	O
92	masters	