# Tagging Test IMDB Dataset using Baseline NER Model
This notebook loads our trained NER model from a checkpoint and uses it to tag entities in the cleaned test splits of the IMDB dataset (the `test_9010`, `test_5050`, and `test_1090` files).


In [38]:
import sys
sys.path.append("../scripts")  

In [39]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoModelForTokenClassification, RobertaTokenizerFast, AutoConfig, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
from span_f1 import readNlu, toSpans, getBegEnd, getLooseOverlap, getUnlabeled
import pickle


## 1a. Load the 10% Test Split of the IMDB Dataset

In [40]:
# Read the test_9010 CSV and derive two per-review columns:
#   tokens       - the 'review' text split on whitespace
#   dummy_labels - one placeholder 'O' tag per token (no gold tags are read here)
df = (
    pd.read_csv("../data/test_9010.csv")
    .assign(tokens=lambda frame: frame['review'].apply(lambda text: text.split()))
    .assign(dummy_labels=lambda frame: frame['tokens'].apply(lambda words: ['O'] * len(words)))
)

## 1b. Load 50% test data IMDB Dataset

In [41]:
# Read the test_5050 CSV and derive two per-review columns:
#   tokens       - the 'review' text split on whitespace
#   dummy_labels - one placeholder 'O' tag per token (no gold tags are read here)
df5050 = (
    pd.read_csv("../data/test_5050.csv")
    .assign(tokens=lambda frame: frame['review'].apply(lambda text: text.split()))
    .assign(dummy_labels=lambda frame: frame['tokens'].apply(lambda words: ['O'] * len(words)))
)

## 1c. Load 90% test data IMDB Dataset

In [42]:
# Read the test_1090 CSV and derive two per-review columns:
#   tokens       - the 'review' text split on whitespace
#   dummy_labels - one placeholder 'O' tag per token (no gold tags are read here)
df1090 = (
    pd.read_csv("../data/test_1090.csv")
    .assign(tokens=lambda frame: frame['review'].apply(lambda text: text.split()))
    .assign(dummy_labels=lambda frame: frame['tokens'].apply(lambda words: ['O'] * len(words)))
)

## 2. Create HuggingFace Dataset Object

In [43]:
def _frame_to_dataset(frame):
    """Wrap a tokenised review frame in a HuggingFace ``Dataset``.

    The resulting dataset has three columns:
      * ``sents``    - the whitespace tokens of each review
      * ``ner_tags`` - the placeholder tag list for each review
      * ``ids``      - the same placeholder tag list again (this is the column
                       the tokenisation step reads as its label column)
    """
    return Dataset.from_dict({
        'sents': frame['tokens'].tolist(),
        'ner_tags': frame['dummy_labels'].tolist(),
        'ids': frame['dummy_labels'].tolist(),
    })


# One dataset per test split, all sharing the same three-column layout.
test_9010_data = _frame_to_dataset(df)
test_5050_data = _frame_to_dataset(df5050)
test_1090_data = _frame_to_dataset(df1090)

## 3. Load Tokenizer and Label Mappings

In [44]:
# Load label mappings used during training
# The two label mappings are stored as pickles next to the baseline model.
# NOTE(review): pickle.load executes arbitrary code from the file, so these
# paths must only ever point at artefacts produced by our own training run.
with open('../project/baseline_model/idx2lab', 'rb') as idx2lab_file:
    idx2lab = pickle.load(idx2lab_file)  # used below as id2label and to decode predictions

with open('../project/baseline_model/lab2idx', 'rb') as lab2idx_file:
    lab2idx = pickle.load(lab2idx_file)  # used below as label2id and to encode tag strings


In [45]:

# Label names in the mapping's own order; only their count is needed (num_labels).
label_list = list(lab2idx)

# Base checkpoint that supplies the tokenizer vocabulary and the model config.
model_link = "deepset/roberta-base-squad2"

# add_prefix_space=True because the tokenizer is later fed pre-split words
# (is_split_into_words=True in tokenize_and_align_labels).
tokenizer = RobertaTokenizerFast.from_pretrained(
    model_link,
    use_fast=True,
    add_prefix_space=True,
)

# Config carries the label inventory loaded from the pickled mappings.
config = AutoConfig.from_pretrained(
    model_link,
    num_labels=len(label_list),
    id2label=idx2lab,
    label2id=lab2idx,
)


## 4. Tokenize IMDB Dataset Using Trained Format

In [46]:
# Dataset columns read by tokenize_and_align_labels.
text_column_name = 'sents'
label_column_name = 'ids'


def tokenize_and_align_labels(examples):
    """Tokenise a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: Batch mapping that contains the ``text_column_name`` column
            (lists of words) and the ``label_column_name`` column (lists of
            tag strings, one per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For every
        sequence, the first sub-token of each word gets ``lab2idx[tag]``;
        special tokens (word id ``None``) and the remaining sub-tokens of a
        word get ``-100``. Sequences are truncated to 128 tokens and not padded.
    """
    encoded = tokenizer(
        examples[text_column_name],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding=False,
    )

    aligned = []
    for row, word_tags in enumerate(examples[label_column_name]):
        previous = None
        row_labels = []
        for current in encoded.word_ids(batch_index=row):
            # Only the first sub-token of a real word carries that word's tag id.
            starts_word = current is not None and current != previous
            row_labels.append(lab2idx[word_tags[current]] if starts_word else -100)
            previous = current
        aligned.append(row_labels)

    encoded["labels"] = aligned
    return encoded


In [47]:
# Tokenise every split with the shared alignment function.
# remove_columns drops the raw input columns so only the tokenizer outputs
# (plus "labels") remain. Each split passes its OWN column names: the previous
# reference to `imdb_data` pointed at a variable that is never defined in this
# notebook and raised NameError on a fresh kernel.
processed_9010 = test_9010_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=test_9010_data.column_names,
    desc="Tokenizing 9010 reviews"
)

processed_5050 = test_5050_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=test_5050_data.column_names,
    desc="Tokenizing 5050 reviews"
)

processed_1090 = test_1090_data.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=test_1090_data.column_names,
    desc="Tokenizing 1090 reviews"
)

Tokenizing 9010 reviews: 100%|██████████| 4959/4959 [00:03<00:00, 1479.99 examples/s]
Tokenizing 5050 reviews: 100%|██████████| 24791/24791 [00:14<00:00, 1669.63 examples/s]
Tokenizing 1090 reviews: 100%|██████████| 44623/44623 [00:28<00:00, 1555.21 examples/s]


## 5. Load Trained Model from Checkpoint

In [23]:
# Checkpoint directory written by the baseline training run.
model_path = "../project/baseline_model/output_trainer/checkpoint-4704"

# Use the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the fine-tuned weights with the label-aware config built above.
model = AutoModelForTokenClassification.from_pretrained(model_path, config=config)

# Last expression of the cell: moves the model and displays its module tree.
model.to(device)

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

## 6. Run Predictions Using Trainer

In [24]:
def convert_int_to_labels(preds):
    """Decode per-token logits into tag strings, skipping ignored positions.

    Args:
        preds: A ``(logits, labels)`` pair. ``logits`` is arg-maxed over its
            last axis; ``labels`` marks positions to skip with ``-100``.

    Returns:
        ``(None, decoded)`` where ``decoded`` is a list with one list of tag
        strings per sequence, containing ``idx2lab[class_id]`` for every
        position whose label is not ``-100``. The first element is always
        ``None`` because no gold labels are produced here.
    """
    logits, labels = preds
    best_ids = np.argmax(logits, axis=-1)

    decoded = []
    for id_row, label_row in zip(best_ids, labels):
        kept = []
        for class_id, label in zip(id_row, label_row):
            if label != -100:
                kept.append(idx2lab[class_id])
        decoded.append(kept)

    return None, decoded


In [None]:
# Trainer is used purely as an inference harness here; only output_dir is set,
# every other TrainingArguments field keeps its default.
training_args = TrainingArguments(output_dir="tmp")

# Sequences were tokenised with padding=False, so batching is left to the collator.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(model=model, args=training_args, data_collator=data_collator)


In [48]:

# Run inference on each tokenised split. The result of trainer.predict(...) is
# unpacked into (predictions, label ids, <ignored third value>); the first two
# are passed to convert_int_to_labels, which returns (None, tag strings), so
# the `_9010` / `_5050` / `_1090` names below are always bound to None.
# Only positions whose label id is not -100 are decoded, i.e. one tag per word
# that survived the max_length=128 truncation in tokenize_and_align_labels;
# longer reviews therefore get fewer tags than they have tokens.
predictions9010, labels9010, _ = trainer.predict(processed_9010)
_9010, predicted_labels9010 = convert_int_to_labels((predictions9010, labels9010))

predictions5050, labels5050, _ = trainer.predict(processed_5050)
_5050, predicted_labels5050 = convert_int_to_labels((predictions5050, labels5050))

predictions1090, labels1090, _ = trainer.predict(processed_1090)
_1090, predicted_labels1090 = convert_int_to_labels((predictions1090, labels1090))

## 7. Save Predictions in CoNLL Format

In [None]:
def write_conll_file(data, path, fill_label=None):
    """Write tagged sentences to ``path`` in a tab-separated CoNLL-style layout.

    Every token becomes one line with five tab-separated fields: a 1-based
    index that restarts for each sentence, the word, its label, and two ``-``
    placeholder columns. Each sentence is followed by one empty line.

    Args:
        data: Iterable of ``(words, labels)`` pairs.
        path: Destination file; it is created or overwritten, UTF-8 encoded.
        fill_label: Optional label used to pad ``labels`` when it is shorter
            than ``words`` (e.g. predictions cut off by tokenizer truncation).
            With the default ``None`` the original behaviour is kept: words
            without a matching label are silently left out of the file.
    """
    with open(path, "w", encoding="utf-8") as f:
        for words, labels in data:
            if fill_label is not None and len(labels) < len(words):
                # Pad instead of letting zip() drop the unlabelled tail.
                labels = list(labels) + [fill_label] * (len(words) - len(labels))
            f.writelines(
                f"{idx}\t{word}\t{label}\t-\t-\n"
                for idx, (word, label) in enumerate(zip(words, labels), start=1)
            )
            f.write("\n")


In [50]:
# Pair each review's whitespace tokens with its predicted tag sequence
# (one (tokens, labels) tuple per review) and write one .iob2 file per split.
tagged_9010 = list(zip(df['tokens'].tolist(), predicted_labels9010))
write_conll_file(tagged_9010, "../data/test_9010_tagged_output.iob2")

tagged_5050 = list(zip(df5050['tokens'].tolist(), predicted_labels5050))
write_conll_file(tagged_5050, "../data/test_5050_tagged_output.iob2")

tagged_1090 = list(zip(df1090['tokens'].tolist(), predicted_labels1090))
write_conll_file(tagged_1090, "../data/test_1090_tagged_output.iob2")


            

In [27]:
# Print one tagged review as an example: the review at index 1 (the second
# one) of the test_9010 split. The previous `imdb_tagged` name is not defined
# anywhere in this notebook and raised NameError on a fresh kernel.
# NOTE(review): the split that produced the saved output is not shown - confirm
# that tagged_9010 is the intended one.
tokens, labels = tagged_9010[1]
for idx, word in enumerate(tokens):
    # Predictions can be shorter than the token list (tokenizer truncation),
    # so fall back to "O" for tokens that received no prediction.
    tag = labels[idx] if idx < len(labels) else "O"
    print(f"{idx+1}\t{word}\t{tag}")


1	Not	O
2	good	O
3	Mostly	O
4	because	O
5	you	O
6	dont	O
7	give	O
8	a	O
9	damn	O
10	about	O
11	what	O
12	happens	O
13	to	O
14	all	O
15	these	O
16	people	O
17	Some	O
18	comments	O
19	1	O
20	I	O
21	am	O
22	tired	O
23	of	O
24	seeing	O
25	governesses	O
26	who	O
27	never	O
28	talk	O
29	to	O
30	their	O
31	pupils	O
32	never	O
33	teach	O
34	them	O
35	anything	O
36	and	O
37	take	O
38	a	O
39	tired	O
40	and	O
41	annoyed	O
42	look	O
43	whenever	O
44	the	O
45	said	O
46	pupil	O
47	who	O
48	of	O
49	course	O
50	has	O
51	been	O
52	won	O
53	over	O
54	in	O
55	the	O
56	space	O
57	of	O
58	4	O
59	seconds	O
60	says	O
61	something	O
62	2	O
63	Fine	O
64	so	O
65	Rosina	B-PER
66	has	O
67	a	O
68	father	O
69	complex	O
70	and	O
71	therefore	O
72	is	O
73	attracted	O
74	to	O
75	her	O
76	employer	O
77	But	O
78	Charles	B-PER
79	is	O
80	completely	O
81	different	O
82	in	O
83	all	O
84	aspects	O
85	from	O
86	her	O
87	father	O
88	if	O
89	anything	O
90	Henry	B-PER
91	is	O
92	much	O
93	closer	O
94	as	O
95	a	O
96	sensual	O
97