# SICSS 2025: Active Learning Workshop

# The Preliminaries: Defining the Concept

Also done previously but not part of the lab: Manually labelling a training dataset (the gold data file)

In [20]:
SUPP_LABELS = [
"Academic",
"Moral",
"Tech",
"Data",
"Library",
"Finance",
"Religious",
"Unknown"
]

## Import modules

In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm # progress bars
from transformers import RobertaTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer # Huggingface models and utilities
import torch # PyTorch for model handling
from torch import nn
import evaluate
import random

  from .autonotebook import tqdm as notebook_tqdm


## Handling authentication with the Hugging Face Hub 

To use Hugging Face’s models (especially large ones or those requiring authentication), you need an access token. This token links your Hugging Face account to your code securely.

🔑 How to Create a Hugging Face Access Token:

* Create a [Hugging Face account](https://huggingface.co) (if you don't already have one).
* After logging in, go to your Access Tokens page and click on "new token".
* Choose a name (e.g., sicss-token), select the role, and click "create".
* Copy the token and past it in the cell below.
* **Never share your token publicly!**



In [None]:
from huggingface_hub import login
my_token = "PAST YOUR TOKEN HERE"
login(token=my_token)

## Train a RoBERTa model 

In [3]:
# Set seeds for reproducible results
seed = 1989
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.mps.manual_seed(seed)

In [4]:
# When running on CPU, this somehow makes sure training times do not degrade
# see: https://discuss.pytorch.org/t/training-time-gets-slower-and-slower-on-cpu/145483/3
torch.set_flush_denormal(True)

# Select the device
# If you have a MacBook (with a Silicon chip), you have "mps" available. On
# Windows or Linux, if you have an nVidia GPU, you have CUDA available.
# Otherwise, use the CPU.

device = torch.device("cpu") # Fallback: CPU

if torch.backends.mps.is_available():
  device = torch.device("mps")
elif torch.cuda.is_available():
  device = torch.device("cuda")

# Annotated Data Loading

Load the annotated gold standard data

In [5]:
sentences: list[str] = list()
labels: list[int] = list()

In [6]:
def read_samples ():
  """Reads in the gold data and yields tuples (sentence, labels)"""
  with open("AL_gold_data.tsv", "r", encoding="utf-8") as fp:
    next(fp) # Skip header
    for line in fp:
      cols = line.strip().split("\t")
      sentence = cols[0]
      label = np.argmax(np.asarray([int(x) for x in cols[1:9]]))
      yield (sentence, label)

In [7]:
for sentence, label in read_samples():
    sentences.append(sentence)
    labels.append(label)

In [8]:
# We create a random train/valid split
rand = np.random.default_rng()
train_idx: list[int] = rand.choice(len(sentences), size=round(len(sentences) * 0.8), replace = False)
valid_idx = set(range(len(sentences))).difference(set(train_idx))
print(f"Datasets prepared! We are training with {len(train_idx)} training and {len(valid_idx)} validation samples.")

Datasets prepared! We are training with 722 training and 181 validation samples.


In [9]:
class CustomDataset(torch.utils.data.Dataset):
  """Basically copied verbatim from https://huggingface.co/transformers/v3.5.1/custom_datasets.html"""
  def __init__(self, encodings, labels):
    self.encodings = encodings
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: val[idx] for key, val in self.encodings.items()}
    item['labels'] = self.labels[idx]
    return item

  def __len__(self):
    return len(self.labels)

In [10]:
# NOTE: This will download the roberta-base tokenizer model to your device.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [11]:
def sentences_to_data_loader (sentences: list[str], labels: list[int]):
  """Takes a list of sentences, a batch size and a list of integer labels and constructs a dataset from that."""
  tok = tokenizer(sentences, padding="max_length", truncation=True, return_tensors='pt', return_attention_mask=True)
  return CustomDataset(tok, torch.tensor(labels))


In [12]:
# Now create two datasets with this information:
support_train = sentences_to_data_loader([sentences[i] for i in train_idx], labels=[labels[i] for i in train_idx])
support_valid = sentences_to_data_loader([sentences[i] for i in valid_idx], labels=[labels[i] for i in valid_idx])

# Model Training Evaluation

Set up evaluation and the trainer

In [13]:
# How to determine the best model (ideally f1, otherwise loss works)
metric = 'f1'
is_greater_better = True

f1_metric = evaluate.load('f1')
acc_metric = evaluate.load('accuracy')

In [None]:
def compute_metrics_support(eval_pred):
    predictions, labels = eval_pred
    predictions = torch.from_numpy(predictions)
    predictions = nn.functional.softmax(predictions, dim=-1)
    predictions = np.argmax(predictions, axis=-1)

    # Calculates one F1 per label, so we should have an array with ... nine (?) elements
    f1 = f1_metric.compute(predictions=predictions, references=labels, average=None)['f1']
    acc = acc_metric.compute(predictions=predictions, references=labels)['accuracy']

    # NOTE: We define the F1 here as the average score of all categories
    avg_f1 = np.mean(f1)

    return { 'f1': avg_f1, 'accuracy': acc }

# Finetuning

In [15]:
args = TrainingArguments(
    output_dir="model",
    eval_strategy = "epoch", # Print results after each epoch
    save_strategy = "epoch", # If loading best model, save + eval need to match
    per_device_train_batch_size=8, # Default is 8
    per_device_eval_batch_size=8,
    num_train_epochs=15.0, # default 3
    learning_rate = 5e-05, # default: 5e-05
    adam_epsilon = 1e-8, # Taken from Rubing's script
    load_best_model_at_end = True, # Default: False
    metric_for_best_model = metric,
    greater_is_better = is_greater_better,
    # use_mps_device=True #  <-- UNCOMMENT this line if you are using a MacOS machine
  )

# NOTE: This will download the RoBERTa Base model to your machine
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(SUPP_LABELS), # How many labels should the model learn to assign?
    problem_type="single_label_classification"
  )

trainer = Trainer(
  model=model,
  args=args,
  train_dataset=support_train,
  eval_dataset=support_valid,
  compute_metrics=compute_metrics_support
)

print("Training support category model!")
trainer.train()
trainer.save_model("finetuned_model")
print("Model trained!")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training support category model!


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,1.129135,0.258045,0.640884
2,No log,1.11491,0.255467,0.635359
3,No log,1.3108,0.409537,0.674033
4,No log,1.204556,0.565022,0.701657
5,No log,1.345684,0.521445,0.696133
6,0.733200,1.503726,0.551395,0.696133
7,0.733200,1.87056,0.4331,0.674033
8,0.733200,1.931399,0.478047,0.651934
9,0.733200,1.829501,0.581153,0.723757
10,0.733200,1.93401,0.552193,0.712707



[EVALUATION|SUPPORT] F1(avg): 0.2580449967878826 | F1(individual): 0.6756756756756757, 0.6896551724137931, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6990291262135923 | Accuracy: 0.6408839779005525


[EVALUATION|SUPPORT] F1(avg): 0.2554673721340388 | F1(individual): 0.6857142857142857, 0.7160493827160493, 0.0, 0.0, 0.0, 0.0, 0.0, 0.6419753086419753 | Accuracy: 0.6353591160220995


[EVALUATION|SUPPORT] F1(avg): 0.4095365009343504 | F1(individual): 0.717948717948718, 0.7160493827160493, 0.0, 0.0, 0.4444444444444444, 0.0, 0.6666666666666666, 0.7311827956989247 | Accuracy: 0.6740331491712708


[EVALUATION|SUPPORT] F1(avg): 0.565022425954832 | F1(individual): 0.75, 0.7476635514018691, 0.18181818181818182, 0.13333333333333333, 0.5, 0.75, 0.6666666666666666, 0.7906976744186046 | Accuracy: 0.7016574585635359


[EVALUATION|SUPPORT] F1(avg): 0.5214448417462016 | F1(individual): 0.7659574468085106, 0.6966292134831461, 0.47058823529411764, 0.0, 0.4, 0.4444444444444444, 0.6666666666666666, 0.7272727272727273 | Ac

SafetensorError: Error while serializing: IoError(Os { code: 28, kind: StorageFull, message: "No space left on device" })

## Predictions

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("finetuned_model")
model.to(device)

# predicted = list()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

# Annotating

In [17]:
corpus = pd.read_csv("AL_corpus.tsv", sep="\t")
print(f"Corpus size: {len(corpus)} sentences.")

Corpus size: 20170 sentences.


In [18]:
corpus

Unnamed: 0,year,sentence
0,1927,"Thanks are due to Dr* W. S. Pord, Assistant Pr..."
1,1927,Mr.V.
2,1927,"C. Kersey, Assistant Superintendent of Schools..."
3,1927,School principals have been very kind to allow...
4,1927,"Records of attendance, enrollment, etc were wi..."
...,...,...
20165,2020,I also acknowledge my students and colleagues ...
20166,2020,I offer a special acknowledgment to my dear fr...
20167,2020,The friendship of so many people has helped ma...
20168,2020,"responded to the survey, interest in service e..."


In [21]:
with open("AL_predictions.tsv", "w") as fp:
    fp.write(f"filename\tsentence\tsupport_label\n")
    for row in tqdm(corpus.itertuples(), total=len(corpus), desc="Predicting", dynamic_ncols=True):
        tok = tokenizer(row.sentence, padding="max_length", truncation=True, return_tensors='pt')
        tok = tok.to(device)

        output = model(**tok)
        predictions = output.logits.detach().squeeze(0).cpu().numpy()
        supp_label = np.argmax(predictions)

        fp.write(f"{row.year}\t{row.sentence}\t{SUPP_LABELS[supp_label]}\n")
        fp.flush() # Make sure we can watch as the file fills

    print("Prediction done! You can find the predictions in the ''AL_predictions.tsv'' file.")

Predicting:   6%|▌         | 1229/20170 [00:46<11:50, 26.66it/s]


KeyboardInterrupt: 