# Tagging Cleaned IMDB Dataset using Baseline NER Model
This notebook loads your trained NER model from a checkpoint and uses it to tag entities in the cleaned IMDB dataset.


In [None]:
import sys
# Put the sibling scripts directory on the import path so that local helper
# modules can be imported by later cells (presumably this is where `span_f1`
# lives -- TODO confirm).
sys.path.append("../scripts")

In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoModelForTokenClassification, RobertaTokenizerFast, AutoConfig, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
from span_f1 import readNlu, toSpans, getBegEnd, getLooseOverlap, getUnlabeled
import pickle


## 1. Load Cleaned IMDB Dataset

In [None]:
def _split_on_whitespace(review):
    """Return the whitespace-separated tokens of a single review string."""
    return review.split()


def _placeholder_tags(tokens):
    """Return one 'O' tag per token, used as a stand-in label sequence."""
    return ['O'] * len(tokens)


# Read the cleaned reviews, then derive a token list and a same-length list of
# dummy 'O' labels for every row.
df = pd.read_csv("../data/clean_imdb_dataset.csv")
df['tokens'] = df['review'].apply(_split_on_whitespace)
df['dummy_labels'] = df['tokens'].apply(_placeholder_tags)

## 2. Create HuggingFace Dataset Object

In [None]:
# Column payloads for the HuggingFace dataset. Both label-like columns
# ('ner_tags' and 'ids') carry the same placeholder 'O' sequences.
token_rows = df['tokens'].tolist()
placeholder_rows = df['dummy_labels'].tolist()

imdb_columns = {
    'sents': token_rows,
    'ner_tags': placeholder_rows,
    'ids': placeholder_rows,
}
imdb_data = Dataset.from_dict(imdb_columns)

## 3. Load Tokenizer and Label Mappings

In [None]:
def _read_pickle(path):
    """Unpickle and return the object stored at *path*."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Label mappings that were used during training (index -> label, label -> index).
idx2lab = _read_pickle('../project/baseline_model/idx2lab')
lab2idx = _read_pickle('../project/baseline_model/lab2idx')

# All known labels; the length of this list is what num_labels is set to.
label_list = list(lab2idx)

# Tokenizer and model configuration, both derived from the same base model.
model_link = "deepset/roberta-base-squad2"
tokenizer = RobertaTokenizerFast.from_pretrained(
    model_link,
    use_fast=True,
    add_prefix_space=True,
)
config = AutoConfig.from_pretrained(
    model_link,
    num_labels=len(label_list),
    id2label=idx2lab,
    label2id=lab2idx,
)


## 4. Tokenize IMDB Dataset Using the Same Format as Training

In [None]:
text_column_name = 'sents'
label_column_name = 'ids'


def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word labels to tokens.

    The batch is read from ``examples[text_column_name]`` (lists of words) and
    ``examples[label_column_name]`` (one label string per word). Sequences are
    truncated to 128 tokens and are not padded here.

    For every produced token the aligned label is:
      * ``-100`` when the token belongs to no word (``word_id`` is ``None``),
      * ``-100`` when the token continues the same word as the previous token,
      * ``lab2idx[label]`` of its word otherwise (first token of a word).

    Returns the tokenizer output with an added ``"labels"`` entry holding one
    aligned label-id list per example.
    """
    encoded = tokenizer(
        examples[text_column_name],
        is_split_into_words=True,
        truncation=True,
        max_length=128,
        padding=False,
    )

    def _aligned_ids(row, word_labels):
        # Walk the token -> word mapping of one example, labelling only the
        # first token of each word; every other position gets -100.
        aligned = []
        previous = None
        for current in encoded.word_ids(batch_index=row):
            starts_word = current is not None and current != previous
            aligned.append(lab2idx[word_labels[current]] if starts_word else -100)
            previous = current
        return aligned

    encoded["labels"] = [
        _aligned_ids(row, word_labels)
        for row, word_labels in enumerate(examples[label_column_name])
    ]
    return encoded


In [None]:
# Tokenize in batches and drop every original column, so that only the fields
# returned by tokenize_and_align_labels remain in the processed dataset.
raw_columns = imdb_data.column_names
processed_imdb = imdb_data.map(
    function=tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_columns,
    desc="Tokenizing IMDB reviews",
)

## 5. Load Trained Model from Checkpoint

In [None]:
# Checkpoint directory of the trained model; the label-aware config built
# earlier is passed along when loading it.
model_path = "../data/training_parameters/checkpoint-4704"
model = AutoModelForTokenClassification.from_pretrained(model_path, config=config)

# Use the GPU when torch can see one, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
model.to(device)

## 6. Run Predictions Using Trainer

In [None]:
def convert_int_to_labels(preds):
    """Turn raw model output into per-sequence lists of label strings.

    Args:
        preds: a ``(logits, labels)`` pair. The arg-max over the last axis of
            ``logits`` gives the predicted class id at each position;
            ``labels`` marks positions to skip with ``-100``.

    Returns:
        ``(None, predicted)`` where ``predicted`` holds, for every sequence,
        the ``idx2lab`` label of each position whose label id is not ``-100``.
        The first element is always ``None`` (no gold labels are produced).
    """
    logits, gold_ids = preds
    best_ids = np.argmax(logits, axis=-1)

    decoded = []
    for row_best, row_gold in zip(best_ids, gold_ids):
        kept = []
        for class_id, gold_id in zip(row_best, row_gold):
            if gold_id == -100:
                # Ignored position: skipped so that only positions with a
                # real label id contribute a prediction.
                continue
            kept.append(idx2lab[class_id])
        decoded.append(kept)

    return None, decoded


In [None]:
# A Trainer configured only with what prediction needs: the model, default
# arguments writing to "tmp", and a token-classification collator.
training_args = TrainingArguments(output_dir="tmp")
data_collator = DataCollatorForTokenClassification(tokenizer)
trainer = Trainer(model=model, args=training_args, data_collator=data_collator)

# Run inference over the tokenized reviews; the metrics part of the output is
# not used.
prediction_output = trainer.predict(processed_imdb)
predictions = prediction_output.predictions
labels = prediction_output.label_ids

# Decode class ids into label strings, skipping positions labelled -100.
_, predicted_labels = convert_int_to_labels((predictions, labels))

## 7. Save Predictions in CoNLL Format

In [None]:
def write_conll_file(data, path, fallback_label="O"):
    """Write tagged sentences to *path* in a tab-separated CoNLL-style layout.

    Every word becomes one line of the form
    ``index<TAB>word<TAB>label<TAB>-<TAB>-`` where ``index`` is 1-based and
    restarts for each sentence. Each sentence is followed by one blank line.

    Args:
        data: iterable of ``(words, labels)`` pairs, one pair per sentence.
        path: destination file; it is created or overwritten, UTF-8 encoded.
        fallback_label: label written for words that have no corresponding
            entry in ``labels`` (for example when the label sequence is
            shorter than the word sequence because tokenization truncated
            the input). Defaults to ``"O"``.

    Labels beyond the last word are ignored.
    """
    with open(path, "w", encoding="utf-8") as f:
        for words, labels in data:
            # Pad instead of letting zip() silently drop the unlabeled tail of
            # the sentence; when the lengths match, nothing is added.
            missing = max(len(words) - len(labels), 0)
            padded_labels = list(labels) + [fallback_label] * missing
            f.writelines(
                f"{idx}\t{word}\t{label}\t-\t-\n"
                for idx, (word, label) in enumerate(zip(words, padded_labels), start=1)
            )
            f.write("\n")


# Pair every review's word list with its predicted label list.
imdb_tagged = list(zip(df['tokens'].tolist(), predicted_labels))
# Saving is currently disabled; uncomment the call below to write the file.
#write_conll_file(imdb_tagged, "../data/imdb_tagged_output.iob2")
            

In [None]:
# Print one tagged review as an example: the entry at index 1, i.e. the second
# review in imdb_tagged.
tokens, labels = imdb_tagged[1]
# Words without a predicted tag are shown as "O" so every word gets a row.
missing_tags = max(len(tokens) - len(labels), 0)
padded_labels = list(labels) + ["O"] * missing_tags
for idx, (word, tag) in enumerate(zip(tokens, padded_labels)):
    print(f"{idx+1}\t{word}\t{tag}")
