In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
import pandas as pd
def load_data(file_path):
  data = pd.read_csv(file_path, sep="\t", header=None, names=["file_name", "entity", "start_offset", "end_offset", "label_0", "label_1", "label_2", "label_3", "label_4", "label_5", "label_6", "label_7", "label_8", "label_9", "label_10", "label_11", "label_12"])

  # Drop the first label column
  data = data.drop(columns=["label_0"])
  data = data.sort_values(by=["file_name", "start_offset"], ascending=[True, False])
  return data

In [2]:
# Load the file
file_path_train = "./Datasets/Train/EN/subtask-1-annotations.txt"
file_path_val = "./Datasets/Development/EN/subtask-1-annotations.txt"
train_data = load_data(file_path_train)
val_data = load_data(file_path_val)

In [3]:
def add_special_tokens(entity_info, folder_path):
    """
    Add special tokens to mark entities in the text based on their offsets.

    Parameters:
        file_path (str): Path to the .txt file containing the text.
        entity_info (list of dict): A list of dictionaries containing the entity offsets and labels.
            Example:
            [{"start_offset": 27, "end_offset": 40, "entity": "lab-grown meat"}]

    Returns:
        str: The modified text with special tokens.
    """
        # Open and read the file content

    with open(folder_path+"/"+entity_info["file_name"], "r", encoding="utf-8") as file:
        text = file.read()


    # Sort the entity_info by descending start_offset to avoid messing up offsets
    #entity_info = sorted(entity_info, key=lambda x: x["start_offset"], reverse=True)

    # Add special tokens to each entity

    start, end = entity_info["start_offset"], entity_info["end_offset"]
    text = text[:start] + "<T> " + text[start:end+1] + " </T>" + text[end+1:]

    return text

In [4]:
def preprocess_dataset(data, folder_path):
  dataset = []
  for index, row in data.iterrows():
    dict_entity = {"file_name": row[0], "start_offset": int(row[2]), "end_offset": int(row[3]), "entity": row[1]}
    label_list = []
    for i in range(4, 16):
      if str(row[i]) != 'nan':
        label_list.append(row[i])
    dataset.append({"text": add_special_tokens(dict_entity, folder_path), "textual_labels": label_list})
  return dataset


In [5]:
train_dataset = preprocess_dataset(train_data, "./Datasets/Train/EN/raw-documents")
val_dataset = preprocess_dataset(val_data, "./Datasets/Development/EN/subtask-1-documents")

In [6]:
print(train_dataset[54]["textual_labels"])
print(val_dataset[2]["textual_labels"])

['Saboteur', 'Conspirator']
['Bigot']


In [7]:
taxonomy = [
    "Guardian", "Martyr", "Peacemaker", "Rebel", "Underdog", "Virtuous",
    "Instigator", "Conspirator", "Tyrant", "Foreign Adversary", "Traitor",
    "Spy", "Saboteur", "Corrupt", "Incompetent", "Terrorist", "Deceiver",
    "Bigot", "Forgotten", "Exploited", "Victim", "Scapegoat"
]
# Map labels to binary vectors
def encode_labels(label_list, taxonomy):
    return [1.0 if label in label_list else 0.0 for label in taxonomy]

for sample in train_dataset:
    sample["labels"] = encode_labels(sample["textual_labels"], taxonomy)
for sample in val_dataset:
    sample["labels"] = encode_labels(sample["textual_labels"], taxonomy)

In [None]:
import shutil

# Path to the folder you want to remove
folder_path = "/content/results"

# Remove the folder and its contents
shutil.rmtree(folder_path)

print(f"Folder {folder_path} has been removed.")

folder_path = "/content/logs"

# Remove the folder and its contents
shutil.rmtree(folder_path)

print(f"Folder {folder_path} has been removed.")

In [None]:
!pip install datasets

In [8]:
from datasets import Dataset

train_dataset = Dataset.from_list(train_dataset)
val_dataset = Dataset.from_list(val_dataset)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
print(train_dataset)

Dataset({
    features: ['text', 'textual_labels', 'labels'],
    num_rows: 686
})


In [15]:
from transformers import AutoTokenizer
import torch
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Define the special tokens to add
special_tokens_dict = {
    "additional_special_tokens": ["<T>", "</T>"]
}


def preprocess_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length
    )
    # Convert labels to float for each example in the batch
    tokenized["labels"] = [[float(x) for x in label_list] for label_list in examples["labels"]]
    return tokenized

train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 686/686 [00:00<00:00, 2676.99 examples/s]
Map: 100%|██████████| 91/91 [00:00<00:00, 1021.43 examples/s]

{'labels': tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'input_ids': tensor([     0,  42673, 175107,      7, 101057,   9624,     14, 116071, 110196,
           271,     70, 140978,    214,    111,    581,    481,  15130,      7,
             6,  63588,  61692,    397,  61661, 208585,    294,  11507,  64163,
         61661,  35602, 113786,  24639,  48026,   8545,  23373,  75018,  35213,
         23186,  31766,  23373,  14688,    159,  56564,  83435,  23373,  29971,
        145688,    313, 187273,   8241,  14452,  42673, 160641,    621,  35971,
         26866,      7,   3934,   2363,  10002,  44540,    136, 163684,    214,
            70,  48800,  20288,  40059,    214, 125861,      7,      5,  42673,
            83,   7730,     47,    186,     70,  11698,    111,     70,  39746,
          7154,     47,     70, 153552,  15549,  17946,   7432,     23,     70,
         17274,      5,      6,      5,   9563,  78684,   2685,    509,     10,
          5155,    1




In [16]:
print(train_dataset[0]["labels"][0])  # Should output a list of floats

tensor(0)


In [17]:
print(train_dataset[0])

{'labels': tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'input_ids': tensor([     0,  42673, 175107,      7, 101057,   9624,     14, 116071, 110196,
           271,     70, 140978,    214,    111,    581,    481,  15130,      7,
             6,  63588,  61692,    397,  61661, 208585,    294,  11507,  64163,
         61661,  35602, 113786,  24639,  48026,   8545,  23373,  75018,  35213,
         23186,  31766,  23373,  14688,    159,  56564,  83435,  23373,  29971,
        145688,    313, 187273,   8241,  14452,  42673, 160641,    621,  35971,
         26866,      7,   3934,   2363,  10002,  44540,    136, 163684,    214,
            70,  48800,  20288,  40059,    214, 125861,      7,      5,  42673,
            83,   7730,     47,    186,     70,  11698,    111,     70,  39746,
          7154,     47,     70, 153552,  15549,  17946,   7432,     23,     70,
         17274,      5,      6,      5,   9563,  78684,   2685,    509,     10,
          5155,    1

In [18]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(taxonomy),
    problem_type="multi_label_classification"
)
model.resize_token_embeddings(len(tokenizer))

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(250002, 768, padding_idx=1)

In [19]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=100
)




In [21]:
from transformers import Trainer
from scipy.special import expit  # Sigmoid
from sklearn.metrics import precision_recall_curve
import numpy as np

def compute_metrics(pred):
    """
    Compute exact match ratio for multilabel classification.

    Parameters:
        pred: Tuple containing logits and labels.

    Returns:
        dict: A dictionary with the exact match ratio and chosen threshold.
    """
    logits, labels = pred

    # Apply sigmoid to convert logits to probabilities
    probabilities = expit(logits)

    # Compute optimal threshold using precision-recall curve
    precisions, recalls, thresholds = precision_recall_curve(labels.ravel(), probabilities.ravel())
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    best_threshold = thresholds[np.argmax(f1_scores)]

    # Convert probabilities to binary predictions using the chosen threshold
    predictions = (probabilities > best_threshold).astype(int)

    # Calculate exact match ratio
    exact_match = np.all(predictions == labels, axis=1).mean()

    return {"exact_match_ratio": exact_match, "best_threshold": best_threshold}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [22]:
trainer.train()

  0%|          | 0/129 [00:00<?, ?it/s]

RuntimeError: result type Float can't be cast to the desired output type Long