In [None]:
# Runtime environment switch: set to either "COLAB" or "LOCAL".
# It selects the directories used for the datasets (data_path), the saved
# ensemble models (ensemble_path) and the prediction files (preds_path).
ENV="COLAB"

if ENV == "COLAB":
    # On Colab the three directories live on the mounted Google Drive.
    from google.colab import drive
    drive.mount('/content/drive')
    data_path = "/content/drive/MyDrive/pcl_datasets"
    ensemble_path = "/content/drive/MyDrive"
    preds_path = "/content/drive/MyDrive/pcl_preds"
elif ENV == "LOCAL":
    # Local runs use directories relative to the notebook.
    data_path = "../data"
    ensemble_path = "../models"
    preds_path = "../predictions"
else:
    # Fail fast: without this branch a typo in ENV leaves the three paths
    # undefined and the notebook only breaks later with a NameError.
    raise ValueError(f"Unknown ENV {ENV!r}: expected 'COLAB' or 'LOCAL'")

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import html

import torch
from torch.nn import CrossEntropyLoss, functional
from torch.utils.data import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments

)
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split

In [3]:
# check gpu: report CUDA availability and pick the matching torch device

cuda_available = torch.cuda.is_available()
print('Cuda available? ',cuda_available)

# reuse the flag computed above instead of querying CUDA a second time
device = torch.device("cuda") if cuda_available else torch.device("cpu")

Cuda available?  True


# Exercise 4

In [None]:
# load pcl dataset
# The file is tab-separated and read without a header row: the first 3 lines
# are skipped and the column names below are assigned positionally.
cols = ["id", "par_id", "keyword", "country", "text", "label"]

df = pd.read_csv(
    f"{data_path}/dontpatronizeme_pcl.tsv",
    sep="\t",
    header=None,
    names=cols,
    skiprows=3,
)
df.head(300)

Unnamed: 0,id,par_id,keyword,country,text,label
0,1,@@24942188,hopeless,ph,"We 're living in times of absolute insanity , ...",0
1,2,@@21968160,migrant,gh,"In Libya today , there are countless number of...",0
2,3,@@16584954,immigrant,ie,White House press secretary Sean Spicer said t...,0
3,4,@@7811231,disabled,nz,Council customers only signs would be displaye...,0
4,5,@@1494111,refugee,ca,""" Just like we received migrants fleeing El Sa...",0
...,...,...,...,...,...,...
295,296,@@19617986,immigrant,ie,They 'll hardly have that place cleaned in tim...,0
296,297,@@20497185,refugee,my,Then there are those who support German Chance...,0
297,298,@@1958113,homeless,tz,New Dar master plan to render millions homeless,0
298,299,@@10033865,disabled,ke,While speaking in Eldoret during an interview ...,0


In [8]:
# update labels: class 0-1 are non-PCL, class 2-4 are PCL

# boolean mask first, then cast to 0/1 so the column can be used as a class id
is_pcl = df["label"].ge(2)
df["binary_label"] = is_pcl.astype(int)
df["binary_label"].value_counts()


Unnamed: 0_level_0,count
binary_label,Unnamed: 1_level_1
0,9476
1,993


In [9]:
# fill na entries with blank strings (since present in dev set)

text_col = "text"

# report the number of missing texts before and after the fill
n_missing = df[text_col].isna().sum()
print(f"Before handling na: {n_missing}")

df[text_col] = df[text_col].fillna("")

n_missing = df[text_col].isna().sum()
print(f"After handling na: {n_missing}")

Before handling na: 1
After handling na: 0


In [10]:
# remove special characters

def clean_special_chars(text):
    """Decode HTML entities in ``text`` and collapse all whitespace to single spaces.

    Newlines and tabs count as whitespace, so splitting on whitespace and
    re-joining with single spaces removes them and also trims both ends.
    """
    decoded = html.unescape(text)
    tokens = decoded.split()
    return " ".join(tokens)

df['text'] = df['text'].map(clean_special_chars)

# sanity check: count the rows that still contain "&amp;", a newline or a tab
leftover_mask = df["text"].str.contains(r"&amp;|\n|\t", regex=True)
print(int(leftover_mask.sum()))

0


In [11]:
# load in 4 roberta models for ensemble

num_models = 4
model_name = "roberta-base"

# one tokenizer is shared by every ensemble member
tokenizer = AutoTokenizer.from_pretrained(model_name)

# every member starts from the same checkpoint with a 2-label classification head
# NOTE(review): the load report marks the classifier.* weights as newly
# initialized, and that happens here, before the per-model seeds are passed to
# TrainingArguments - confirm whether the head initialisation should be seeded.
models = []
for _ in range(num_models):
    member = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2,
    )
    models.append(member)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [None]:
# set up dataframes
# set aside dev, acts as a soft evaluation set
# split remaining dataset into train and validation sets

rand_state = 63
keep_cols = ["id", "text", "binary_label"]

# the split files carry a "par_id" column; rename it so it joins on df's "id"
train_ids = pd.read_csv(f"{data_path}/train_semeval_parids-labels.csv").rename(columns={"par_id": "id"})
dev_ids = pd.read_csv(f"{data_path}/dev_semeval_parids-labels.csv").rename(columns={"par_id": "id"})

# inner joins keep only the paragraphs listed in each split file
temp_df = df.merge(train_ids, on="id")[keep_cols]
dev_df = df.merge(dev_ids, on="id")[keep_cols]

# stratified 80/20 split of the training paragraphs into train and validation
train_df, val_df = train_test_split(
    temp_df,
    test_size=0.2,
    random_state=rand_state,
    stratify=temp_df["binary_label"],
)

In [13]:
# Label proportions check: shows the class imbalance and whether the stratified
# split kept the same label ratio in the train, validation and dev sets

splits = {"Train": train_df, "Validation": val_df, "Dev": dev_df}

for name, split_df in splits.items():
    print(f"{name}:")

    label_col = split_df["binary_label"]
    counts = label_col.value_counts().sort_index()
    proportions = label_col.value_counts(normalize=True).sort_index()

    # one row per class: absolute count next to its share of the split
    summary = counts.to_frame("count").assign(proportion=proportions)
    print(summary, '\n')

Train:
              count  proportion
binary_label                   
0              6065    0.905224
1               635    0.094776 

Validation:
              count  proportion
binary_label                   
0              1516    0.905075
1               159    0.094925 

Dev:
              count  proportion
binary_label                   
0              1895    0.904967
1               199    0.095033 



In [14]:
# dataset class for huggingface Trainer

class PCLDataset(Dataset):
    """Labelled text dataset that yields per-example tensor dicts.

    All texts are tokenized once in the constructor (with truncation, padding
    and ``max_length`` passed to the tokenizer); each item is converted to
    tensors on access and carries its label under the ``"labels"`` key.
    """

    def __init__(self, texts, labels, tokenizer, max_length=192):
        tokenizer_kwargs = {
            "truncation": True,
            "padding": True,
            "max_length": max_length,
        }
        self.encodings = tokenizer(texts, **tokenizer_kwargs)
        self.labels = labels

    def __len__(self):
        # dataset size is defined by the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample


In [15]:
# compute metrics function to display f1, precision, recall

def compute_metrics(eval_pred):
    """Return binary F1, precision and recall for a ``(logits, labels)`` pair.

    The predicted class of each example is the argmax over axis 1 of the logits.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)

    # same three metrics, in the same key order, computed through one lookup table
    scorers = {"f1": f1_score, "precision": precision_score, "recall": recall_score}
    return {
        metric_name: scorer(labels, predicted, average="binary")
        for metric_name, scorer in scorers.items()
    }

In [16]:
# set different max sequence lengths

max_lengths = [128, 128, 192, 256]

train_datasets, val_datasets, dev_datasets = [], [], []

# each split is tokenized once per ensemble member, using that member's max length
split_targets = [
    (train_df, train_datasets),
    (val_df, val_datasets),
    (dev_df, dev_datasets),
]

for i in range(num_models):
    for split_df, bucket in split_targets:
        bucket.append(
            PCLDataset(
                split_df["text"].tolist(),
                split_df["binary_label"].tolist(),
                tokenizer,
                max_length=max_lengths[i],
            )
        )



In [17]:
# give all models different seeds for further diversity

rand_seeds = [7, 39, 43, 65]

# hyper-parameters shared by every ensemble member; only the output directory
# and the seed differ from one model to the next
shared_training_kwargs = dict(
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,
    weight_decay=0.01,
    fp16=False,
    warmup_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_steps=50,
    report_to="none",
    max_grad_norm=1.0,
)

all_training_args = []
for i in range(num_models):
    training_args = TrainingArguments(
        output_dir=f"./outputs_roberta_{i}",
        seed=rand_seeds[i],
        **shared_training_kwargs,
    )
    all_training_args.append(training_args)


In [18]:
# trainer class that implements weighted CE loss

class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy on the model logits.

    Args:
        class_weights: optional per-class weights. When omitted, the fallback
            ``[1.0, 20.0]`` is used.
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)

        weights = [1.0, 20.0] if class_weights is None else class_weights
        self.class_weights = torch.tensor(weights, dtype=torch.float32)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the weighted loss, plus the model outputs if ``return_outputs`` is set."""
        # take the labels out of the batch; the loss is computed here from the logits
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        scores = outputs.logits

        # move/cast the weights so they match the logits' device and dtype
        weight = self.class_weights.to(device=scores.device, dtype=scores.dtype)
        loss = CrossEntropyLoss(weight=weight)(scores, targets)

        if return_outputs:
            return loss, outputs
        return loss


In [19]:
# trainer class that implements focal loss

class FocalTrainer(Trainer):
    """Trainer whose loss is the focal loss ``-(1 - p_t) ** gamma * log(p_t)``.

    ``p_t`` is the softmax probability assigned to the true class; the
    per-example losses are averaged over the batch.

    Args:
        gamma: focusing exponent applied to ``(1 - p_t)`` (default ``2.0``).
        *args, **kwargs: forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, gamma=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.gamma = gamma

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the focal loss, plus the model outputs if ``return_outputs`` is set."""
        # take the labels out of the batch; the loss is computed here from the logits
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Work in log space: log_softmax stays finite and differentiable even when the
        # true-class probability underflows, whereas log(softmax(x) + eps) clamps the
        # loss and loses the gradient for exactly those hardest examples.
        log_probs = functional.log_softmax(logits, dim=1)
        log_pt = log_probs.gather(1, labels.unsqueeze(1)).squeeze(1)
        pt = log_pt.exp()

        loss = -((1 - pt) ** self.gamma) * log_pt
        loss = loss.mean()

        return (loss, outputs) if return_outputs else loss


In [20]:
# uses sklearn compute class weight, uses inverse of class size

def get_class_weights():
    """Return sklearn's "balanced" class weights for labels 0 and 1.

    The weights are computed from the ``binary_label`` column of the
    module-level ``train_df``.
    """
    return compute_class_weight(
        class_weight="balanced",
        classes=np.array([0, 1]),
        y=train_df["binary_label"].values,
    )

In [None]:
# train ensemble loop
# model 0 uses the plain Trainer, models 1 and 2 the class-weighted trainer,
# and the remaining model the focal-loss trainer

for i in range(num_models):
    # arguments that are identical for every trainer flavour
    shared_kwargs = dict(
        model=models[i],
        args=all_training_args[i],
        train_dataset=train_datasets[i],
        eval_dataset=val_datasets[i],
        compute_metrics=compute_metrics,
    )

    if i == 0:
        trainer = Trainer(**shared_kwargs)
    elif i in (1, 2):
        class_weights = get_class_weights()
        trainer = WeightedTrainer(class_weights=class_weights, **shared_kwargs)
    else:
        trainer = FocalTrainer(**shared_kwargs)

    trainer.train()

    # persist the trained model together with the tokenizer files
    save_path = f"{ensemble_path}/roberta_pcl_model_a_{i}"
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model {i} saved to {save_path}")



Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.267534,0.201971,0.43318,0.810345,0.295597
2,0.207565,0.19059,0.531746,0.72043,0.421384
3,0.127915,0.204934,0.597403,0.61745,0.578616
4,0.046957,0.322204,0.598726,0.606452,0.591195


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model 0 saved to /content/drive/MyDrive/roberta_pcl_model_a_0


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.566044,0.495641,0.400631,0.267368,0.798742
2,0.4603,0.466679,0.371287,0.231125,0.943396
3,0.271769,0.48288,0.578824,0.462406,0.773585
4,0.13836,0.928215,0.593548,0.609272,0.578616


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model 1 saved to /content/drive/MyDrive/roberta_pcl_model_a_1


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.49669,0.486695,0.531792,0.491979,0.578616
2,0.404891,0.466478,0.57561,0.47012,0.742138
3,0.266335,0.501715,0.562648,0.450758,0.748428
4,0.141371,1.016333,0.584718,0.619718,0.553459


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model 2 saved to /content/drive/MyDrive/roberta_pcl_model_a_2


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.063876,0.083776,0.145251,0.65,0.081761
2,0.055586,0.058483,0.198895,0.818182,0.113208
3,0.028977,0.053513,0.589474,0.666667,0.528302
4,0.011082,0.099713,0.588652,0.674797,0.522013


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model 3 saved to /content/drive/MyDrive/roberta_pcl_model_a_3


In [None]:
# validation set to find best threshold

val_labels = np.array(val_df["binary_label"].tolist())

# inference-only Trainer configuration shared by all ensemble members
training_args = TrainingArguments(
    output_dir="./tmp_eval",
    per_device_eval_batch_size=32,
    do_train=False,
    do_eval=False,
    logging_strategy="no"
)

# positive-class probability of every validation example, one row per model
all_probs_val = []
for i in range(num_models):
    model_path = f"{ensemble_path}/roberta_pcl_model_a_{i}"
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    trainer = Trainer(model=model, args=training_args)

    preds = trainer.predict(val_datasets[i])
    logits = preds.predictions
    probs = torch.tensor(logits).softmax(dim=1)[:, 1].numpy()
    all_probs_val.append(probs)

# ensemble score = unweighted mean of the members' probabilities
all_probs_val = np.array(all_probs_val)
ensemble_probs_val = all_probs_val.mean(axis=0)

# score every candidate threshold; max() keeps the first best entry, so ties
# resolve to the lowest threshold
thresholds = np.linspace(0.0, 1.0, 201)
scored = [
    (f1_score(val_labels, (ensemble_probs_val >= t).astype(int)), t)
    for t in thresholds
]
best_f1, best_threshold = max(scored, key=lambda pair: pair[0])

val_preds = (ensemble_probs_val >= best_threshold).astype(int)

precision = precision_score(val_labels, val_preds)
recall = recall_score(val_labels, val_preds)

print(f"Best Threshold: {best_threshold:.4f}")
print(f"Validation F1: {best_f1:.4f}")
print(f"Validation Precision: {precision:.4f}")
print(f"Validation Recall: {recall:.4f}")


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Best Threshold: 0.3750
Validation F1: 0.6173
Validation Precision: 0.6061
Validation Recall: 0.6289


# Exercise 5

## 5.1 Global Evaluation

### Dev set

In [21]:
# Decision threshold hard-coded to the "Best Threshold" value printed by the
# validation search cell (0.3750).
# NOTE(review): presumably pinned so the evaluation cells below can run without
# redoing that search - confirm, and update it if the models are retrained.
best_threshold = 0.3750

In [None]:
# dev set evaluation

dev_labels = np.array(dev_df["binary_label"].tolist())

# inference-only Trainer configuration shared by all ensemble members
training_args = TrainingArguments(
    output_dir="./tmp_eval",
    per_device_eval_batch_size=32,
    do_train=False,
    do_eval=False,
    logging_strategy="no"
)

# positive-class probability of every dev example, one row per model
all_probs_dev = []
for i in range(num_models):
    model_path = f"{ensemble_path}/roberta_pcl_model_a_{i}"
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    trainer = Trainer(model=model, args=training_args)

    preds = trainer.predict(dev_datasets[i])
    logits = preds.predictions
    probs = torch.tensor(logits).softmax(dim=1)[:, 1].numpy()
    all_probs_dev.append(probs)

# ensemble score = unweighted mean of the members' probabilities
all_probs_dev = np.array(all_probs_dev)
ensemble_probs_dev = all_probs_dev.mean(axis=0)

# apply the previously chosen threshold to the averaged probabilities
threshold = best_threshold
dev_preds = (ensemble_probs_dev >= threshold).astype(int)

f1 = f1_score(dev_labels, dev_preds)
print(f"dev F1: {f1:.4f}")

print(classification_report(dev_labels, dev_preds, digits=4))


Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

dev F1: 0.6172
              precision    recall  f1-score   support

           0     0.9646    0.9478    0.9561      1895
           1     0.5733    0.6683    0.6172       199

    accuracy                         0.9212      2094
   macro avg     0.7689    0.8080    0.7866      2094
weighted avg     0.9274    0.9212    0.9239      2094



In [23]:
# Confusion matrix on the dev set: rows are the true labels and columns the
# predicted labels (each row sums to that class's support in the report above).
confusion_matrix(dev_labels, dev_preds)

array([[1796,   99],
       [  66,  133]])

In [None]:
# save dev set preds, one prediction per line (no trailing newline)

# the with-block closes the file on exit, so no explicit f.close() is needed
with open(f"{preds_path}/dev_preds.txt", "w") as f:
    f.write("\n".join(map(str, dev_preds)))

### Test set

In [None]:
# load test set

# Use a dedicated variable for the file path. Overwriting data_path made this
# cell non-idempotent (a second run appended the file name again) and broke
# re-running the earlier cells that read from the data directory.
test_path = f"{data_path}/task4_test.tsv"
cols = ["id", "par_id", "keyword", "country", "text"]

test_df = pd.read_csv(test_path, sep="\t", names=cols, header=None)

# Apply the same text preprocessing as the labelled data (missing text -> "",
# HTML entities decoded, whitespace collapsed) so the models receive test
# inputs in the format they were trained on and the tokenizer never sees NaN.
test_df["text"] = test_df["text"].fillna("").apply(clean_special_chars)
test_df.head()

Unnamed: 0,id,par_id,keyword,country,text
0,t_0,@@7258997,vulnerable,us,"In the meantime , conservatives are working to..."
1,t_1,@@16397324,women,pk,In most poor households with no education chil...
2,t_2,@@16257812,migrant,ca,The real question is not whether immigration i...
3,t_3,@@3509652,migrant,gb,"In total , the country 's immigrant population..."
4,t_4,@@477506,vulnerable,ca,"Members of the church , which is part of Ken C..."


In [25]:
# dataset class for test set since no labels

class PCLTestDataset(Dataset):
    """Unlabelled text dataset that yields per-example tensor dicts.

    All texts are tokenized once in the constructor (with truncation, padding
    and ``max_length`` passed to the tokenizer); items contain only the
    tokenizer's fields, with no ``"labels"`` entry.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        tokenizer_kwargs = {
            "truncation": True,
            "padding": True,
            "max_length": max_length,
        }
        self.encodings = tokenizer(texts, **tokenizer_kwargs)

    def __len__(self):
        # without labels, the number of encoded sequences defines the size
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

In [26]:
# create test datasets

# one tokenized copy of the test texts per ensemble member, using that
# member's maximum sequence length
test_texts = test_df["text"].tolist()

test_datasets = [
    PCLTestDataset(test_texts, tokenizer, max_length=max_lengths[i])
    for i in range(num_models)
]

In [None]:
# create test set preds

# inference-only Trainer configuration shared by all ensemble members
training_args = TrainingArguments(
    output_dir="./tmp_eval",
    per_device_eval_batch_size=32,
    do_train=False,
    do_eval=False,
    logging_strategy="no"
)

# positive-class probability of every test example, one row per model
all_probs_test = []
for i in range(num_models):
    model_path = f"{ensemble_path}/roberta_pcl_model_a_{i}"
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    trainer = Trainer(model=model, args=training_args)

    preds = trainer.predict(test_datasets[i])
    logits = preds.predictions
    probs = torch.tensor(logits).softmax(dim=1)[:, 1].numpy()
    all_probs_test.append(probs)

# ensemble score = unweighted mean of the members' probabilities
all_probs_test = np.array(all_probs_test)
ensemble_probs_test = all_probs_test.mean(axis=0)

# binarise the averaged probabilities with the chosen threshold
test_preds = (ensemble_probs_test >= best_threshold).astype(int)
test_preds

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/201 [00:00<?, ?it/s]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# save test preds, one prediction per line (no trailing newline)

# the with-block closes the file on exit, so no explicit f.close() is needed
with open(f"{preds_path}/test_preds.txt", "w") as f:
    f.write("\n".join(map(str, test_preds)))