In [50]:
#!pip install transformers
#!pip install torch torchvision
#!pip install scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [72]:
# load libraries
import pandas as pd
import json
import random
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizerFast
from transformers import RobertaForTokenClassification
import torch, torchvision
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
from tqdm import tqdm

In [3]:
# load the tokenizer
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

# function that creates BIO-tags for text
def tokenize_and_align_labels(text, entities, label_to_id):
    # Tokenize and get offsets
    encoding = tokenizer(text, return_offsets_mapping=True, truncation=True)
    labels = ["O"] * len(encoding.offset_mapping)
    
    for ent in entities:
        start, end = ent["start"], ent["end"]
        ent_label = ent["labels"][0]  # assume one label per span
        
        for idx, (token_start, token_end) in enumerate(encoding.offset_mapping):
            if token_start >= end or token_end <= start:
                continue
            if token_start >= start and token_end <= end:
                labels[idx] = f"I-{ent_label}"
        
        # Change first matched token to B-
        for idx, label in enumerate(labels):
            if label == f"I-{ent_label}":
                labels[idx] = f"B-{ent_label}"
                break

    # Convert labels to IDs
    label_ids = [label_to_id.get(label, label_to_id["O"]) for label in labels]
    return encoding["input_ids"], encoding["attention_mask"], label_ids

In [4]:
# Load Label Studio JSON
with open("../data/annotations.json", "r") as f:
    data = json.load(f)

# Build label set dynamically
label_set = {"O"}
for task in data:
    for result in task["annotations"][0]["result"]:
        if result["type"] == "labels":
            for lbl in result["value"]["labels"]:
                label_set.add(f"B-{lbl}")
                label_set.add(f"I-{lbl}")
label_list = sorted(label_set)
label_to_id = {label: i for i, label in enumerate(label_list)}

In [5]:
dataset = []
for task in data:
    text = task["data"]["sentence"]
    results = task["annotations"][0]["result"]
    spans = [
        {
            "start": r["value"]["start"],
            "end": r["value"]["end"],
            "labels": r["value"]["labels"]
        }
        for r in results if r["type"] == "labels"
    ]
    input_ids, attention_mask, label_ids = tokenize_and_align_labels(text, spans, label_to_id)
    dataset.append({
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": label_ids,
        "text": text
    })

In [63]:
include = []
not_include = []
for sentence in dataset:
    labels = sentence["labels"]
    if 0 in labels:
        include.append(sentence)
    else:
        not_include.append(sentence)

split_idx = int(len(include) * 0.75)

train_include = include[:split_idx]
test_include = include[split_idx:]

train_dataset = train_include + random.sample(not_include, int(len(not_include) * 0.3))
test_dataset = test_include + random.sample(not_include, int(len(not_include) * 0.3))

In [65]:
# Step 1: Extract lists
input_ids_list = [item["input_ids"] for item in train_dataset]
attention_masks_list = [item["attention_mask"] for item in train_dataset]
label_ids_list = [item["labels"] for item in train_dataset]

# Step 2: Compute max length
max_len = max(len(seq) for seq in input_ids_list)

# Step 3: Pad all sequences
def pad(seq, pad_val, max_len):
    return seq + [pad_val] * (max_len - len(seq))

input_ids = torch.tensor([pad(seq, pad_val=1, max_len=max_len) for seq in input_ids_list])           # pad_val=1 for RoBERTa
attention_masks = torch.tensor([pad(seq, pad_val=0, max_len=max_len) for seq in attention_masks_list])
labels = torch.tensor([pad(seq, pad_val=-100, max_len=max_len) for seq in label_ids_list])           # -100 for ignored loss

# Step 4: Wrap into TensorDataset
train_dataset = TensorDataset(input_ids, attention_masks, labels)

In [66]:
# Step 1: Extract lists
input_ids_list = [item["input_ids"] for item in test_dataset]
attention_masks_list = [item["attention_mask"] for item in test_dataset]
label_ids_list = [item["labels"] for item in test_dataset]

# Step 2: Compute max length
max_len = max(len(seq) for seq in input_ids_list)

# Step 3: Pad all sequences
def pad(seq, pad_val, max_len):
    return seq + [pad_val] * (max_len - len(seq))

input_ids = torch.tensor([pad(seq, pad_val=1, max_len=max_len) for seq in input_ids_list])           # pad_val=1 for RoBERTa
attention_masks = torch.tensor([pad(seq, pad_val=0, max_len=max_len) for seq in attention_masks_list])
labels = torch.tensor([pad(seq, pad_val=-100, max_len=max_len) for seq in label_ids_list])           # -100 for ignored loss

# Step 4: Wrap into TensorDataset
test_dataset = TensorDataset(input_ids, attention_masks, labels)

In [67]:
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=8)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=8)

In [71]:
# model setup
model = RobertaForTokenClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label_to_id),
    id2label={v: k for k, v in label_to_id.items()},
    label2id=label_to_id
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=1e-5)
epochs = 10

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [73]:
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc="Training")

    for batch in progress_bar:
        input_ids, attention_masks, labels = [b.to(device) for b in batch]

        model.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss # batch loss
        total_loss += loss.item()

        loss.backward()
        clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        progress_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_loss:.4f}")


Epoch 1/10


Training: 100%|██████████| 13/13 [00:10<00:00,  1.29it/s, loss=0.242]


Average training loss: 0.5971
Epoch 2/10


Training: 100%|██████████| 13/13 [00:09<00:00,  1.39it/s, loss=0.137]


Average training loss: 0.3016
Epoch 3/10


Training: 100%|██████████| 13/13 [00:09<00:00,  1.36it/s, loss=0.079]


Average training loss: 0.2180
Epoch 4/10


Training: 100%|██████████| 13/13 [00:09<00:00,  1.38it/s, loss=0.0905]


Average training loss: 0.1595
Epoch 5/10


Training: 100%|██████████| 13/13 [00:09<00:00,  1.34it/s, loss=0.0834]


Average training loss: 0.1261
Epoch 6/10


Training: 100%|██████████| 13/13 [00:09<00:00,  1.38it/s, loss=0.0404]


Average training loss: 0.0898
Epoch 7/10


Training: 100%|██████████| 13/13 [00:09<00:00,  1.32it/s, loss=0.0572]


Average training loss: 0.0585
Epoch 8/10


Training: 100%|██████████| 13/13 [00:09<00:00,  1.39it/s, loss=0.0633] 


Average training loss: 0.0455
Epoch 9/10


Training: 100%|██████████| 13/13 [00:09<00:00,  1.39it/s, loss=0.0652]


Average training loss: 0.0346
Epoch 10/10


Training: 100%|██████████| 13/13 [00:09<00:00,  1.38it/s, loss=0.00716]

Average training loss: 0.0270





In [80]:
model.eval()

true_labels = []
pred_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, attention_masks, labels = [b.to(device) for b in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_masks)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=2)

        for i in range(len(labels)):
            true_seq = labels[i].cpu().numpy()
            pred_seq = predictions[i].cpu().numpy()

            for t, p in zip(true_seq, pred_seq):
                if t != -100:
                    true_labels.append(t)
                    pred_labels.append(p)


In [83]:
from sklearn.metrics import classification_report

print(classification_report(
    true_labels,
    pred_labels
))

              precision    recall  f1-score   support

           0       0.56      0.74      0.64        19
           1       0.48      0.43      0.45        23
           2       0.97      0.96      0.96       409

    accuracy                           0.92       451
   macro avg       0.67      0.71      0.68       451
weighted avg       0.93      0.92      0.92       451



In [87]:
id2label = {v: k for k, v in label_to_id.items()}

In [106]:
def predict_sentence(sentence, model, tokenizer, id2label, device='cpu'):
    model.eval()

    # Tokenize the input sentence, return PyTorch tensors and offset mappings
    encoding = tokenizer(
        sentence,
        return_tensors="pt",
        return_offsets_mapping=True,
        truncation=True
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    offset_mapping = encoding["offset_mapping"][0]

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits  # [1, seq_len, num_labels]
        predictions = torch.argmax(logits, dim=-1)[0]  # seq_len

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    labels = [id2label[pred.item()] for pred in predictions]

    # Combine tokens and labels, ignoring special tokens (like CLS, SEP)
    result = []
    for token, label, (start, end) in zip(tokens, labels, offset_mapping):
        if start == 0 and end == 0:
            # Skip special tokens with offset (0,0)
            continue
        token_text = sentence[start:end]
        result.append((token_text, label))

    return result

In [113]:
# try out a custom sentence
predict_sentence("Old sick people feel left behind which is also true for young mothers.", model=model, tokenizer=tokenizer, id2label=id2label)

[('Old', 'B-SocialGroup'),
 ('sick', 'I-SocialGroup'),
 ('people', 'I-SocialGroup'),
 ('feel', 'O'),
 ('left', 'O'),
 ('behind', 'O'),
 ('which', 'O'),
 ('is', 'O'),
 ('also', 'O'),
 ('true', 'O'),
 ('for', 'O'),
 ('young', 'B-SocialGroup'),
 ('mothers', 'I-SocialGroup'),
 ('.', 'O')]