In [None]:
# Install the third-party packages that the import cell below relies on
# (`datasets`, `cleantext` from the clean-text distribution, `transformers`).
# NOTE(review): versions are not pinned, so a re-run may resolve to different releases.
!pip install datasets
!pip install clean-text
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/1

In [None]:
import random
import re

import matplotlib.pyplot as plt
import pandas as pd
import torch
from cleantext import clean
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments



# Dataset

In this part we read the dataset and we plot the distributions of the labels and the length of all the texts. 
We are dealing with a small dataset containing conspiratorial texts.

In [None]:
# Read the training CSV, discard the identifier column and switch to the
# generic `text` / `label` column names used by the rest of the notebook.
dataset = (
    pd.read_csv("../datasets/subtaskA_train.csv")
    .drop(columns='Id')
    .rename(columns={'comment_text': 'text', 'conspiratorial': 'label'})
)
dataset

Unnamed: 0,text,label
0,⚡Se non ci fossero soldati non ci sarebbero gu...,0
1,"21/08/21]( [PRE-PRINT]\n\n📄__ ""Shedding of Inf...",1
2,PAURA E DELIRIO ALLA CNN: IL MINISTERO DELLA V...,1
3,L'Aspirina non aumenta la sopravvivenza dei pa...,0
4,L'Italia non puo' dare armi lo vieta la Costit...,0
...,...,...
1837,avvolge la terra spesso 2000 km Oooh mi ricor...,1
1838,Comunque le recensioni erano negative ancora p...,0
1839,Intanto in Kazakistan la gente che ne ha le pa...,1
1840,Una profezia ad una conferenza del 2015:\nIl p...,1


# Text Cleaning

In this section, we first define the function that we will use to clean our text. As we can see above, some texts contain fragments between brackets that are not useful, as well as Unicode symbols such as emoji.
We remove all of that, producing text that is similar to the text used to train the pre-trained model. So, for instance, we do not remove punctuation or stopwords.

In [None]:
# We define the function for the cleaning of the text

def text_cleaning(text):
    """Normalise one raw comment before it is tokenized.

    Steps, in order: lower-case the text, pass it through ``cleantext.clean``
    with ``no_emoji=True``, drop ``[...]`` fragments, numeric dates, URLs,
    e-mail addresses and HTML left-overs, replace a fixed set of symbols with
    spaces, collapse repeated punctuation, and finally turn line breaks into
    single spaces. Punctuation and stopwords are deliberately kept.

    Args:
        text: the raw comment as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    # Convert words to lower case
    text = text.lower()
    text = clean(text, no_emoji=True)

    # Bracketed fragments such as "[PRE-PRINT]" and dd/mm/yy(yy) dates.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[0-9]{2}/[0-9]{2}/[0-9]{2,4}', ' ', text)

    # Remove only the URL itself; a greedy `.*` would also discard every word
    # that follows the link on the same line.
    text = re.sub(r'https?://\S+', ' ', text)

    # E-mail addresses and HTML left-overs must be handled BEFORE the symbol
    # strip below, which replaces '@', '/' and '=' and would otherwise prevent
    # these patterns from ever matching.
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', ' ', text)
    text = re.sub(r'<a href', ' ', text)
    text = re.sub(r'<br />', ' ', text)

    # Symbols that carry no meaning for the classifier.
    text = re.sub(r'[_"%()|+&=*#$@\[\]/]', ' ', text)

    # Collapse runs of the same punctuation mark ("!!!" -> "!").
    text = re.sub(r'\.+', '.', text)
    text = re.sub(r',+', ',', text)
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)

    # Line breaks become a space (not the empty string) so the last word of a
    # line is not glued to the first word of the next one.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r' +', ' ', text)

    return text

In [None]:
# Overwrite the `text` column with its cleaned version, one comment at a time.
dataset['text'] = [text_cleaning(raw_text) for raw_text in dataset.text]
dataset

Unnamed: 0,text,label
0,se non ci fossero soldati non ci sarebbero gue...,0
1,shedding of infectious sars-cov-2 despite vac...,1
2,paura e delirio alla cnn: il ministero della v...,1
3,l'aspirina non aumenta la sopravvivenza dei pa...,0
4,l'italia non puo' dare armi lo vieta la costit...,0
...,...,...
1837,avvolge la terra spesso 2000 km oooh mi ricord...,1
1838,comunque le recensioni erano negative ancora p...,0
1839,intanto in kazakistan la gente che ne ha le pa...,1
1840,una profezia ad una conferenza del 2015:il pro...,1


# Model download

In this part we download the BERT model and its tokenizer from the Hugging Face Hub.
In this specific case we use the `dbmdz/bert-base-italian-xxl-uncased` model.

In [None]:
# Hub identifier of the pre-trained checkpoint; the same name is reused further
# down to load the sequence-classification model.
model_name = 'dbmdz/bert-base-italian-xxl-uncased'
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/243k [00:00<?, ?B/s]

In [None]:
def tokenize(batch):
    """Tokenize the ``'text'`` entries of a batch with the notebook tokenizer.

    Padding and truncation are both enabled, so every sequence returned for
    this batch has the same length.
    """
    encoded = tokenizer(batch['text'], truncation=True, padding=True)
    return encoded

In [None]:
# We controll if the device support the computation on the GPU, otherwise we will compute the model on the CPU
# Use the GPU when one is available, otherwise fall back to the CPU.
device = ('cuda' if torch.cuda.is_available() else 'cpu')

num_labels = 2  # Number of classes
# Load the pre-trained checkpoint with a classification head of `num_labels`
# outputs and move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification

## Metrics
We define the metric that we want to calculate during the fine-tuning of the model

In [None]:
def compute_metrics(pred):
    """Return the micro-averaged F1 score for a prediction object.

    Args:
        pred: object exposing ``predictions`` (per-class scores; the arg-max
            over the last axis is taken as the predicted class) and
            ``label_ids`` (the reference labels).

    Returns:
        ``{'f1_score': <micro F1>}``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    return {'f1_score': f1_score(pred.label_ids, predicted_classes, average='micro')}

In [None]:
def createDataset(train, val):
  """Wrap a train and a validation DataFrame in a ``DatasetDict``.

  The pandas index is not exported (``preserve_index=False``), so no
  ``__index_level_0__`` column is created. This works both for the shuffled
  index produced by ``train_test_split`` and for a frame with a default
  ``RangeIndex``; removing ``__index_level_0__`` by name afterwards fails in
  the latter case because that column does not exist.

  Args:
    train: DataFrame holding the training rows.
    val: DataFrame holding the validation rows.

  Returns:
    A ``DatasetDict`` with the splits ``'train'`` and ``'validation'``.
  """
  return DatasetDict({
      'train': Dataset.from_pandas(train, preserve_index=False),
      'validation': Dataset.from_pandas(val, preserve_index=False),
  })

In [None]:
# Define a custom collator function with dynamic masking
def collate_fn(batch, mask_prob=0.15):
    """Collate tokenized examples into one batch, randomly masking tokens.

    Every ordinary token is replaced by ``tokenizer.mask_token_id`` with
    probability ``mask_prob``; a new mask is drawn on each call (dynamic
    masking). Ids listed in ``tokenizer.all_special_ids`` are never replaced,
    and the attention mask is computed from the ids *before* masking, so a
    padding position can neither be turned into a mask token nor become
    attended.

    Args:
        batch: list of examples, each a mapping with an ``'input_ids'`` list
            of token ids and an integer ``'label'``.
        mask_prob: probability of masking each non-special token. ``0.0``
            disables masking and makes the output deterministic.

    Returns:
        dict with ``'input_ids'`` (ids padded to the longest sequence of the
        batch with ``tokenizer.pad_token_id``), ``'attention_mask'`` (boolean
        tensor, ``False`` on padding) and ``'labels'``.

    Note:
        Randomness comes from the global ``random`` module. NOTE(review): a
        collator handed to ``Trainer`` is presumably applied to evaluation
        batches as well; use ``mask_prob=0.0`` there if validation scores
        should be computed on unmasked text.
    """
    pad_id = tokenizer.pad_token_id
    mask_id = tokenizer.mask_token_id
    # Special tokens must survive masking untouched.
    protected_ids = set(tokenizer.all_special_ids)

    original_rows = []
    masked_rows = []
    for example in batch:
        token_ids = list(example['input_ids'])
        original_rows.append(torch.tensor(token_ids))
        masked_rows.append(torch.tensor([
            mask_id if token not in protected_ids and random.random() < mask_prob else token
            for token in token_ids
        ]))

    # Pad both versions identically; the unmasked one is only used to locate
    # the padding positions.
    padded_original = torch.nn.utils.rnn.pad_sequence(
        original_rows, batch_first=True, padding_value=pad_id
    )
    input_ids = torch.nn.utils.rnn.pad_sequence(
        masked_rows, batch_first=True, padding_value=pad_id
    )
    labels = torch.tensor([example['label'] for example in batch])

    return {
        'input_ids': input_ids,
        'attention_mask': padded_original != pad_id,
        'labels': labels,
    }

## Cross-Validation

Now we define the cross-validation procedure used to find the best hyperparameters of the model. Because the dataset is very small, we evaluate each configuration on several splits, so that the selected hyperparameters generalize as well as possible.

In [None]:
# Log in to the Hugging Face Hub from the notebook; the final training cell
# creates its TrainingArguments with push_to_hub=True and calls trainer.push_to_hub().
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Hyperparameter grid explored by the search.
lr_rates = [2e-5, 3e-5]
epochs = [2,3,4]
# NOTE(review): the scheduler name is only written to the log; `lr_scheduler_type`
# is commented out below, so the Trainer default scheduler is what actually runs.
lr_scheduler = ['constant']

n_splits = 5     # random 80/20 hold-out splits evaluated per configuration
batch_size = 8

# Snapshot of the weights `model` holds when this cell starts. Every run is
# restored from it: otherwise the single `model` instance keeps being
# fine-tuned across all splits and configurations, so later runs start from a
# model that has already been trained on their validation examples and the
# configurations cannot be compared.
initial_state = {name: tensor.detach().cpu().clone()
                 for name, tensor in model.state_dict().items()}

cv_records = []

for lr in lr_rates:
  for epoch in epochs:
    for scheduler in lr_scheduler:
      results = []

      for i in range(n_splits):
        # Start every run from the same initial weights
        model.load_state_dict(initial_state)

        # We create the split of the dataset, shuffling the data first. The
        # seed depends only on the split number, so every configuration is
        # evaluated on the same five splits.
        train, val = train_test_split(dataset, train_size=0.8, shuffle=True, random_state=i)
        df = createDataset(train, val)
        texts_encoded = df.map(tokenize, batched=True, batch_size=None)

        # We define the arguments that the model has to use.
        # One logging event per epoch: number of optimisation steps in an
        # epoch, not number of examples (which exceeds the total step count
        # and leaves the training loss as "No log").
        logging_steps = max(1, len(texts_encoded['train']) // batch_size)
        training_args = TrainingArguments(output_dir=model_name,
                                    num_train_epochs=epoch,
                                    learning_rate=lr,
                                    per_device_train_batch_size=batch_size,
                                    per_device_eval_batch_size=batch_size,
                                    weight_decay=0.01,
                                    evaluation_strategy='epoch',
                                    disable_tqdm=False,
                                    logging_steps=logging_steps,
                                    #lr_scheduler_type=scheduler,
                                    fp16=True,
                                    push_to_hub=False,
                                    log_level='error')
        # Train of the model
        trainer = Trainer(model=model,
                    args=training_args,
                    compute_metrics=compute_metrics,
                    train_dataset=texts_encoded['train'],
                    eval_dataset=texts_encoded['validation'],
                    data_collator=collate_fn,
                    tokenizer=tokenizer)
        trainer.train()
        results.append(trainer.predict(texts_encoded['validation']).metrics['test_f1_score'])

      # We calculate the mean of the f1_scores over the splits and record one
      # row per (learning rate, epochs, scheduler) configuration.
      mean_f1_score = sum(results) / len(results)
      print(mean_f1_score)

      cv_records.append({
          'learning_rate': lr,
          'epochs': epoch,
          'f1_score': mean_f1_score,
          'lr_scheduler': scheduler
      })

# Build the log once from the collected rows and save it.
log_results_cross = pd.DataFrame(
    cv_records,
    columns=['learning_rate', 'epochs', 'f1_score', 'lr_scheduler']
)
log_results_cross.to_csv('/log/log_dynamic_3.csv')

Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.481992,0.761518
2,No log,0.425828,0.821138


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.343411,0.840108
2,No log,0.424494,0.842818


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.790936,0.831978
2,No log,0.855572,0.826558


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,1.197837,0.831978
2,No log,1.099246,0.826558


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,1.452355,0.834688
2,No log,1.32771,0.834688


0.8341463414634147


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,1.823508,0.840108
2,No log,1.211566,0.840108
3,No log,1.024266,0.845528


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,1.893318,0.834688
2,No log,1.762294,0.818428
3,No log,1.370301,0.845528


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,1.956771,0.848238
2,No log,1.911794,0.823848
3,No log,1.398912,0.864499


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.83956,0.739837
2,No log,1.733734,0.821138
3,No log,1.622223,0.840108


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,1.810448,0.818428
2,No log,1.701096,0.826558
3,No log,1.395854,0.856369


0.8303523035230352


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,1.784914,0.810298
2,No log,1.957709,0.794038
3,No log,1.413805,0.856369
4,No log,1.603117,0.810298


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.261001,0.783198
2,No log,1.85549,0.815718
3,No log,1.555124,0.842818
4,No log,1.752202,0.815718


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,1.611083,0.840108
2,No log,2.131054,0.818428
3,No log,1.64542,0.829268
4,No log,1.81619,0.823848


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,2.310758,0.829268
2,No log,1.710504,0.810298
3,No log,1.598757,0.845528
4,No log,1.876259,0.823848


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,2.508403,0.818428
2,No log,2.610637,0.818428
3,No log,2.221761,0.840108
4,No log,2.470044,0.818428


0.8233062330623305


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,2.671244,0.829268
2,No log,2.744933,0.804878


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,2.777238,0.818428
2,No log,2.96223,0.810298


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,2.721318,0.829268
2,No log,3.034823,0.813008


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,2.797923,0.831978
2,No log,3.066439,0.815718


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,2.856323,0.829268
2,No log,3.097586,0.818428


0.8281842818428184


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,2.899365,0.829268
2,No log,3.1361,0.815718
3,No log,2.754971,0.829268


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.010695,0.821138
2,No log,3.294974,0.810298
3,No log,2.84036,0.829268


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.031869,0.818428
2,No log,3.317983,0.810298
3,No log,2.859922,0.829268


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.055376,0.818428
2,No log,3.339798,0.810298
3,No log,2.879025,0.831978


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.075596,0.818428
2,No log,3.358818,0.810298
3,No log,2.895755,0.831978


0.8119241192411921


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.092799,0.821138
2,No log,3.377627,0.810298
3,No log,2.918539,0.831978
4,No log,3.309898,0.807588


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.253274,0.818428
2,No log,3.410737,0.810298
3,No log,2.971358,0.837398
4,No log,3.338859,0.807588


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.244778,0.818428
2,No log,3.435331,0.807588
3,No log,3.002047,0.837398
4,No log,3.357059,0.804878


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.247102,0.818428
2,No log,3.452946,0.810298
3,No log,3.024216,0.837398
4,No log,3.372454,0.807588


Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.252744,0.823848
2,No log,3.469038,0.810298
3,No log,3.04195,0.834688
4,No log,3.387067,0.810298


0.835230352303523


In [None]:
# Draw a new shuffled 80/20 split for the final training run and tokenize it.
# NOTE(review): no random_state is passed, so this split changes on every run
# and is unrelated to the splits drawn during the hyperparameter search.
train, val = train_test_split(dataset, train_size=0.8, shuffle=True)
dataset_final_train = createDataset(train, val)
dataset_final_encoded = dataset_final_train.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/1473 [00:00<?, ? examples/s]

Map:   0%|          | 0/369 [00:00<?, ? examples/s]

In [None]:
# We take the best parameters from the log: the row with the highest mean F1
idx_best_model = log_results_cross['f1_score'].idxmax()
learning_rate_best = log_results_cross['learning_rate'][idx_best_model]
# Cast to a plain Python float before handing it to TrainingArguments
epoch_best = float(log_results_cross['epochs'][idx_best_model])
#lr_scheduler_best = log_results_cross['lr_scheduler'][idx_best_model]

# We define the arguments for the training
batch_size = 8
# NOTE(review): this is the number of training examples, not the number of
# steps per epoch; the training-loss column of the run below shows "No log".
logging_steps = len(dataset_final_encoded['train'])
# Output directory; with push_to_hub=True the run below is uploaded to a Hub
# repository of the same name.
model_name_hub = 'prova-xxl-single-2'

# We train the final model
training_args = TrainingArguments(output_dir=model_name_hub,
                                  num_train_epochs=epoch_best,
                                  learning_rate=learning_rate_best,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  #lr_scheduler_type=lr_scheduler_best,
                                  fp16=True,
                                  push_to_hub=True,
                                  log_level='error')

# NOTE(review): `model` is the same module-level instance that was passed to
# the Trainers of the hyperparameter search, so it is presumably already
# fine-tuned when this cell runs — confirm whether it should be re-initialised
# from the pre-trained checkpoint first.
trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_final_encoded['train'],
                  eval_dataset=dataset_final_encoded['validation'],
                  data_collator=collate_fn,
                  tokenizer=tokenizer)
trainer.train()
# Upload the trained model to the Hugging Face Hub
trainer.push_to_hub()

Cloning https://huggingface.co/Mike00vito/prova-xxl-single-2 into local empty directory.


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,3.259457,0.823848
2,No log,3.484294,0.810298
3,No log,3.057198,0.834688
4,No log,3.400702,0.810298


Upload file pytorch_model.bin:   0%|          | 1.00/422M [00:00<?, ?B/s]

Upload file runs/May06_19-10-03_a76815eac119/events.out.tfevents.1683400212.a76815eac119.175.60:   0%|        …

To https://huggingface.co/Mike00vito/prova-xxl-single-2
   d4b2a9e..f2fa86a  main -> main

   d4b2a9e..f2fa86a  main -> main

To https://huggingface.co/Mike00vito/prova-xxl-single-2
   f2fa86a..20b58a8  main -> main

   f2fa86a..20b58a8  main -> main



'https://huggingface.co/Mike00vito/prova-xxl-single-2/commit/f2fa86a4d3d16614d6039d6f230a0cc9828dcce0'