In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# MODEL TRAINING

In [None]:
import numpy as np
import torch

# set seed for reproducibility
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
LABEL2ID = {"VICTIM": 0, "FACILITATOR": 1, "AGENT": 2, "CAMPAIGNER": 3, "OBJECTIVE": 4, "NEGATIVE_EFFECT": 5}
ID2LABEL = {0: "VICTIM", 1: "FACILITATOR", 2: "AGENT", 3: "CAMPAIGNER", 4: "OBJECTIVE", 5: "NEGATIVE_EFFECT"}

In [None]:
import json

# Path to your JSON file
train_dataset_path = '/content/drive/Shareddrives/PanClef2024/dataset/dataset_en_train.json'
test_dataset_path = '/content/drive/Shareddrives/PanClef2024/dataset/dataset_en_official_test_nolabels.json'

# Load JSON data into a variable
with open(train_dataset_path, 'r') as file:
    train_data_raw = json.load(file)

# train_data_raw

In [None]:
train_spans = []
train_clas = []
valid_test_spans = []
valid_test_clas = []
valid_spans = []
valid_clas = []
test_spans = []
test_clas = []

In [None]:
for entry in train_data_raw[:int(len(train_data_raw)*0.7)]:
  for annot in entry['annotations']:
    train_spans.append(annot['span_text'])
    train_clas.append(annot['category'])
for entry in train_data_raw[int(len(train_data_raw)*0.7):]:
  for annot in entry['annotations']:
    valid_test_spans.append(annot['span_text'])
    valid_test_clas.append(annot['category'])

In [None]:
print(len(train_spans))
print(len(valid_test_spans))

15782
6871


In [None]:
for span in valid_test_spans[:int(len(valid_spans)*0.7)]:
  valid_spans.append(span)
for span in valid_test_spans[int(len(valid_spans)*0.7):]:
  test_spans.append(span)
for clas in valid_test_clas[:int(len(valid_spans)*0.7)]:
  valid_clas.append(clas)
for clas in valid_test_clas[int(len(valid_spans)*0.7):]:
  test_clas.append(clas)

In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m859.3 kB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1

In [None]:
from datasets import Dataset, DatasetDict


# train (80%), validation (10%), test (10%) split
# train_test_datasets = preprocessed_datasets['train'].train_test_split(test_size=0.2, seed=SEED, shuffle=True)
# validation_test_datasets = train_test_datasets['test'].train_test_split(test_size=0.5, seed=SEED, shuffle=True)

preprocessed_datasets = DatasetDict({
    'train': [[train_spans[i], train_clas[i]] for i in range(len(train_spans))],
    'validation': [[valid_spans[i], valid_clas[i]] for i in range(len(valid_spans))],
    'test':  [[test_spans[i], test_clas[i]] for i in range(len(test_spans))]
})
preprocessed_datasets

DatasetDict({
    validation: []
    test: [['to forcibly vaccinate people using the military and police', 'OBJECTIVE'], ['the military', 'FACILITATOR'], ['police', 'FACILITATOR'], ['Vaxxies are dying at such an unignorable rate that the completely corrupt , Pfizer funded CDC has to do " " an investigation " "', 'NEGATIVE_EFFECT'], ['Pfizer funded CDC', 'AGENT'], ['no Democrat in America', 'FACILITATOR'], ['Budget airline Ryanair', 'FACILITATOR'], ['summit', 'CAMPAIGNER'], ['all security forces', 'VICTIM'], ["FNC 's Carlson", 'CAMPAIGNER'], ['Teachers', 'FACILITATOR'], ['Tribalism', 'NEGATIVE_EFFECT'], ["We 've Reached Jonestown at This Point", 'NEGATIVE_EFFECT'], ['the CDC', 'AGENT'], ['DEATH only from the covid 19 Vaccine', 'NEGATIVE_EFFECT'], ['life threatening effects , disability or hospitalisation side effects', 'NEGATIVE_EFFECT'], ['THE NEW YELLOW VEST MOVEMENT', 'CAMPAIGNER'], ['the great populist anti - vaccine passport movement', 'CAMPAIGNER'], ['Nearly a million people', 'CA

In [None]:
train_dataset = Dataset.from_dict({'text': train_spans, 'labels': train_clas})
valid_dataset = Dataset.from_dict({'text': valid_spans, 'labels': valid_clas})
test_dataset = Dataset.from_dict({'text': test_spans, 'labels': test_clas})

# Create DatasetDict
preprocessed_datasets = DatasetDict({
    'train': train_dataset,
    'validation': valid_dataset,
    'test': test_dataset
})

In [None]:
from transformers import AutoTokenizer

CHECKPOINT = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
tokenized_datasets = preprocessed_datasets.map(lambda batch: tokenizer(batch['text'], truncation=True), batched=True, remove_columns=['text'])
tokenized_datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/15782 [00:00<?, ? examples/s]

Map:   0%|          | 0/6871 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 15782
    })
    validation: Dataset({
        features: ['labels'],
        num_rows: 0
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 6871
    })
})

In [None]:
#randomly inserted this

from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification
import torch

# Define the checkpoint and labels
CHECKPOINT = "bert-base-uncased"
# LABEL2ID = {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}  # Example label mapping
# ID2LABEL = {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}

# Load pre-trained model and tokenizer
tokenizer = BertTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT,
    problem_type='multi_label_classification',
    num_labels=len(LABEL2ID),
    label2id=LABEL2ID,
    id2label=ID2LABEL,
)

# Save the model and tokenizer
save_directory = "./fine_tuned_model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
print(f"Model and tokenizer saved in {save_directory}")

# Load the saved model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained(save_directory)
loaded_tokenizer = BertTokenizer.from_pretrained(save_directory)
print("Model and tokenizer loaded successfully")

# Example usage
def classify_span(span_text):
    inputs = loaded_tokenizer(span_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = loaded_model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax().item()
    return predicted_class_id

# Example classification
span_text = "Example text to classify"
predicted_class = classify_span(span_text)
print(f"Predicted class: {predicted_class}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer saved in ./fine_tuned_model
Model and tokenizer loaded successfully
Predicted class: 3


In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    CHECKPOINT,
    problem_type='multi_label_classification',
    num_labels=len(LABEL2ID),
    label2id=LABEL2ID,
    id2label=ID2LABEL,
)

save_directory = "./fine_tuned_model"

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved in {save_directory}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model and tokenizer saved in ./fine_tuned_model


In [None]:
from transformers import DataCollatorWithPadding
from torch.utils.data import DataLoader

# get data collator for data loader
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# setup dataloaders with tokenized dataset
# to shuffle only be train for each epoch
# in 64 batch sizes with dynamic padding

dataloaders = {}
for dataset_type in tokenized_datasets.keys():
    dataloaders[dataset_type] = DataLoader(
        dataset=tokenized_datasets[dataset_type],
        batch_size=64,
        shuffle=(dataset_type == 'train'),
        collate_fn=data_collator,
    )

# EDIT LATER


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text)
    sentences = list(doc.sents)
    spans = [(sent.start_char, sent.end_char, sent.text) for sent in sentences]
    return spans


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load pre-trained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model for sequence classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", torch_dtype=torch.float16)

# Define function to classify spans
def classify_span(span_text):
    inputs = tokenizer(span_text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax().item()
    return predicted_class_id

# Example usage
span_text = "Example text to classify"
predicted_class = classify_span(span_text)
print(f"Predicted class: {predicted_class}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted class: 1


In [None]:
def generate_output(text, spans, categories):
    annotations = []
    for start_char, end_char, span_text in spans:
        category_id = classify_span(span_text)
        category = categories[category_id]
        annotations.append({
            "start_char": start_char,
            "end_char": end_char,
            "category": category
        })
    return annotations


In [None]:
import json

# Path to your JSON file
file_path = '/content/drive/Shareddrives/BERTTask/dataset/dataset_en_official_test_nolabels.json'

# Load JSON data into a variable
with open(file_path, 'r') as file:
    data = json.load(file)

# Print the loaded data
print(data)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/Shareddrives/BERTTask/dataset/dataset_en_official_test_nolabels.json'

In [None]:
# Define categories
categories = ["VICTIM", "FACILITATOR", "AGENT", "CAMPAIGNER", "OBJECTIVE", "NEGATIVE_EFFECT"]

# Process each entry
outputs = []
for entry in data:
    text = entry["text"]
    spans = preprocess_text(text)
    # dic = {'id': entry['id'], 'annotations': []}
    annotations = generate_output(text, spans, categories)
    output_entry = {
        "id": entry["id"],
        "annotations": annotations
    }
    # print(output_entry)
    # for line in output_entry['annotations']:
    #   print(text[line['start_char']: line['end_char']])
    # print()
    print(output_entry)
    outputs.append(output_entry)

# Example output


{'id': '6742', 'annotations': [{'start_char': 0, 'end_char': 49, 'category': 'CAMPAIGNER'}, {'start_char': 50, 'end_char': 88, 'category': 'AGENT'}, {'start_char': 89, 'end_char': 133, 'category': 'AGENT'}, {'start_char': 134, 'end_char': 222, 'category': 'CAMPAIGNER'}, {'start_char': 223, 'end_char': 354, 'category': 'CAMPAIGNER'}, {'start_char': 355, 'end_char': 374, 'category': 'AGENT'}, {'start_char': 375, 'end_char': 388, 'category': 'AGENT'}, {'start_char': 389, 'end_char': 416, 'category': 'AGENT'}]}
{'id': '10596', 'annotations': [{'start_char': 0, 'end_char': 178, 'category': 'AGENT'}, {'start_char': 179, 'end_char': 184, 'category': 'NEGATIVE_EFFECT'}, {'start_char': 185, 'end_char': 234, 'category': 'CAMPAIGNER'}]}
{'id': '482', 'annotations': [{'start_char': 0, 'end_char': 52, 'category': 'AGENT'}, {'start_char': 53, 'end_char': 344, 'category': 'CAMPAIGNER'}, {'start_char': 345, 'end_char': 360, 'category': 'AGENT'}, {'start_char': 361, 'end_char': 376, 'category': 'CAMPAI

In [None]:
with open('drive/Shareddrives/BERTTask/kaprov_task2_en.json', 'w') as fp:
  fp.write(str(outputs))

In [None]:
json_file_path = 'drive/Shareddrives/BERTTask/kaprov_task2_en.json'
outputs.to_json(json_file_path, orient='records', lines=False)

print(f"DataFrame has been written to {json_file_path}")

AttributeError: 'list' object has no attribute 'to_json'