Loading in dataset and visualisation notebook

In [1]:
import pandas as pd
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, AutoConfig, RobertaTokenizerFast, DataCollatorForTokenClassification
import numpy as np
import tqdm as notebook_tqdm
from datasets import Dataset, DatasetDict
import torch


#from bert.bert_topic import ClassModel

#all of the functions from the span_f1 file
from span_f1 import readNlu, toSpans, getBegEnd, getLooseOverlap, getUnlabeled

model_link = 'deepset/roberta-base-squad2'
train_data_source = 'en_ewt-ud-train.iob2'
dev_data_source = 'en_ewt-ud-dev.iob2'
test_data_source = 'en_ewt-ud-test-masked.iob2'

In [2]:
df = pd.read_csv("imdb_dataset.csv")
print(df.shape)
print(df.describe())
df.head(10)


(50000, 2)
                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
df["review"] = df["review"].str.replace("<br /><br />", " ", regex=False)
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming te...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
import pandas as pd

# Load and clean
df = pd.read_csv("imdb_dataset.csv")
df["review"] = df["review"].str.replace("<br /><br />", " ", regex=False)

# Convert to words + tags format
df["words"] = df["review"].apply(lambda x: x.split())
df["tags"] = df["sentiment"].apply(lambda s: [s] * len(s.split()))

# Visualize the transformation
print(df[["words", "tags"]].head(3))


                                               words        tags
0  [One, of, the, other, reviewers, has, mentione...  [positive]
1  [A, wonderful, little, production., The, filmi...  [positive]
2  [I, thought, this, was, a, wonderful, way, to,...  [positive]


In [5]:
df["sentiment"] = df["sentiment"].str.upper() 
df["sentiment"] = df["sentiment"].replace({"POSITIVE": "POS", "NEGATIVE": "NEG"})

label_set = sorted(df["sentiment"].unique())
lab2idx = {lab: idx for idx, lab in enumerate(label_set)}
idx2lab = {idx: lab for lab, idx in lab2idx.items()}

df["sents"] = df["review"].apply(lambda x: x.split())
df["ner_tags"] = df["sentiment"].apply(lambda tag: [tag] * len(tag.split()))
df["ids"] = df["sentiment"].apply(lambda tag: [lab2idx[tag]] * len(tag.split()))

print(df["sents"].iloc[0])
print(df["ner_tags"].iloc[0])
print(df["ids"].iloc[0])


['One', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', '1', 'Oz', 'episode', "you'll", 'be', 'hooked.', 'They', 'are', 'right,', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me.', 'The', 'first', 'thing', 'that', 'struck', 'me', 'about', 'Oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence,', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'GO.', 'Trust', 'me,', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid.', 'This', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs,', 'sex', 'or', 'violence.', 'Its', 'is', 'hardcore,', 'in', 'the', 'classic', 'use', 'of', 'the', 'word.', 'It', 'is', 'called', 'OZ', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'Oswald', 'Maximum', 'Security', 'State', 'Penitentary.', 'It', 'focuses', 'mainly', 'on', 'Emerald', 'City,', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'where', 'all', 'th

In [6]:
df.head()

Unnamed: 0,review,sentiment,words,tags,sents,ner_tags,ids
0,One of the other reviewers has mentioned that ...,POS,"[One, of, the, other, reviewers, has, mentione...",[positive],"[One, of, the, other, reviewers, has, mentione...",[POS],[1]
1,A wonderful little production. The filming te...,POS,"[A, wonderful, little, production., The, filmi...",[positive],"[A, wonderful, little, production., The, filmi...",[POS],[1]
2,I thought this was a wonderful way to spend ti...,POS,"[I, thought, this, was, a, wonderful, way, to,...",[positive],"[I, thought, this, was, a, wonderful, way, to,...",[POS],[1]
3,Basically there's a family where a little boy ...,NEG,"[Basically, there's, a, family, where, a, litt...",[negative],"[Basically, there's, a, family, where, a, litt...",[NEG],[0]
4,"Petter Mattei's ""Love in the Time of Money"" is...",POS,"[Petter, Mattei's, ""Love, in, the, Time, of, M...",[positive],"[Petter, Mattei's, ""Love, in, the, Time, of, M...",[POS],[1]


In [7]:
# Fix the tagging length based on the tokenized review
df["sents"] = df["review"].apply(lambda x: x.split())
df["ner_tags"] = df.apply(lambda row: [row["sentiment"]] * len(row["sents"]), axis=1)
df["ids"] = df.apply(lambda row: [lab2idx[row["sentiment"]]] * len(row["sents"]), axis=1)


In [8]:
from datasets import Dataset

# Convert to HF Dataset
dataset = Dataset.from_pandas(df[["sents", "ner_tags", "ids"]])


In [9]:
tokenizer = RobertaTokenizerFast.from_pretrained(model_link, use_fast=True, add_prefix_space=True)

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["sents"],
        max_length=128,             
        padding=False,              
        truncation=True, 
        is_split_into_words=True
    )

    all_labels = []
    for batch_index, labels in enumerate(examples["ids"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_index)

        label_ids = []
        prev_word_id = None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id == prev_word_id:
                label_ids.append(-100)
            else:
                label_ids.append(labels[word_id])
            prev_word_id = word_id
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels
    return tokenized_inputs

# Apply tokenizer
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset.column_names,
    desc="Running tokenizer on IMDb dataset"
)


Running tokenizer on IMDb dataset:   0%|          | 0/50000 [00:00<?, ? examples/s]