In [None]:
!pip install datasets
!pip install clean-text
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/1

In [None]:
import random
import re

import matplotlib.pyplot as plt
import pandas as pd
import torch
from cleantext import clean
from datasets import Dataset, DatasetDict
from huggingface_hub import notebook_login
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import Trainer, TrainingArguments



# Dataset

In this part we read the dataset and we plot the distributions of the labels and the length of all the texts. 
We are dealing with a small dataset containing conspiratorial texts.

In [None]:
# Read the augmented training set, discard the identifier column and rename
# the remaining columns to the `text` / `label` names used further down.
dataset = (
    pd.read_csv("../datasets/subtaskA_train_aug.csv")
    .drop('Id', axis=1)
    .rename(columns={'comment_text': 'text', 'conspiratorial': 'label'})
)
dataset

Unnamed: 0,text,label
0,se non ci fossero soldati non ci sarebbero gue...,0
1,shedding of infectious sars-cov-2 despite vac...,1
2,paura e delirio alla cnn: il ministero della v...,1
3,l'aspirina non aumenta la sopravvivenza dei pa...,0
4,l'italia non puo' dare armi lo vieta la costit...,0
...,...,...
3679,Abraccia la terra spesso 2000 km oooh mi ricor...,1
3680,Tuttavia le recensioni erano negative anche pr...,0
3681,"Nel frattempo, in Kazakistan, le persone che h...",1
3682,una profezia ad una conferenza nel 2015: il pr...,1


# Text Cleaning

In this section, first of all, we define the function that we will use to clean our text. As we can see above, some texts contain parts between brackets that are not useful, as well as unicode characters.
We remove all of that so that the text is similar to the text used to train the pre-trained model. So, for instance, we won't remove punctuation and stopwords.

In [None]:
# We define the function for the cleaning of the text

def text_cleaning(text):
    """Normalise a raw comment before it is handed to the tokenizer.

    The input is converted to ``str``, lower-cased and passed through
    ``cleantext.clean`` (emoji removal enabled). After that, bracketed
    fragments, dates, URLs, e-mail addresses and HTML remnants are removed,
    a fixed set of symbols is blanked, repeated punctuation is collapsed and
    whitespace is normalised. Punctuation and stopwords are kept on purpose.

    Args:
        text: The raw text; any object is accepted and converted with ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    text = str(text).lower()
    text = clean(text, no_emoji=True)

    # Patterns that contain '[', ']', '/' or '@' have to run BEFORE the symbol
    # stripping below, otherwise those characters are already gone and the
    # patterns can never match (this was the case for e-mails and '<br />').
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[0-9]{2}/[0-9]{2}/[0-9]{2,4}', ' ', text)
    # Only the URL itself is removed; a greedy '.*' would also delete every
    # word that follows the URL on the same line.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', '', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'<a href', ' ', text)

    # Blank out symbols that carry no meaning for the classifier.
    text = re.sub(r'[_"\%()|+&=*%#$@\[\]/]', ' ', text)

    # Collapse runs of the same punctuation mark into a single one.
    text = re.sub(r'\.+', '.', text)
    text = re.sub(r',+', ',', text)
    text = re.sub(r'!+', '!', text)
    text = re.sub(r'\?+', '?', text)

    # Line breaks become a space so the words around them are not glued
    # together; runs of spaces are then squeezed into one.
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r' +', ' ', text)

    return text

In [None]:
# Run the cleaning function over every comment and store the result back in
# the `text` column, then display the frame.
dataset['text'] = [text_cleaning(raw_text) for raw_text in dataset.text]
dataset

Unnamed: 0,text,label
0,se non ci fossero soldati non ci sarebbero gue...,0
1,shedding of infectious sars-cov-2 despite vacc...,1
2,paura e delirio alla cnn: il ministero della v...,1
3,l'aspirina non aumenta la sopravvivenza dei pa...,0
4,l'italia non puo' dare armi lo vieta la costit...,0
...,...,...
3679,abraccia la terra spesso 2000 km oooh mi ricor...,1
3680,tuttavia le recensioni erano negative anche pr...,0
3681,"nel frattempo, in kazakistan, le persone che h...",1
3682,una profezia ad una conferenza nel 2015: il pr...,1


# Model download

In this part we download the BERT model and its tokenizer from the Hugging Face Hub.
In this specific case we use the bert-base-italian-xxl model.

In [None]:
# Identifier of the pre-trained checkpoint; the tokenizer is loaded from the
# same identifier so that it matches the model loaded later on.
model_name = 'dbmdz/bert-base-italian-xxl-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/243k [00:00<?, ?B/s]

In [None]:
def tokenize(batch):
    """Tokenize the `text` entries of a batch.

    Args:
        batch: Mapping with a `text` key holding the texts to encode.

    Returns:
        Whatever the module-level `tokenizer` returns when called with
        padding and truncation enabled.
    """
    encoded = tokenizer(batch['text'], padding=True, truncation=True)
    return encoded

In [None]:
# Binary classification head: two output classes.
num_labels = 2

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load the checkpoint with a sequence-classification head and move it to the
# selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model = model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-base-italian-xxl-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification

## Metrics
We define the metric that we want to calculate during the fine-tuning of the model

In [None]:
def compute_metrics(pred):
    """Compute the micro-averaged F1 score for a prediction object.

    Args:
        pred: Object exposing `label_ids` (gold labels) and `predictions`
            (per-class scores); the predicted class is the argmax over the
            last axis of `predictions`.

    Returns:
        A dict with the single key `f1_score`.
    """
    predicted_classes = pred.predictions.argmax(-1)
    score = f1_score(pred.label_ids, predicted_classes, average='micro')
    return {'f1_score': score}

In [None]:
def createDataset(train, val):
  """Wrap a train and a validation DataFrame into a `DatasetDict`.

  The pandas index is not carried over into the datasets. Dropping it while
  converting (instead of removing an `__index_level_0__` column afterwards)
  also works for frames that still have their default index, for which that
  column is never created.

  Args:
    train: DataFrame with the training rows.
    val: DataFrame with the validation rows.

  Returns:
    A `DatasetDict` with the splits `train` and `validation`.
  """
  dataset_hf = DatasetDict()
  dataset_hf['train'] = Dataset.from_pandas(train, preserve_index=False)
  dataset_hf['validation'] = Dataset.from_pandas(val, preserve_index=False)

  return dataset_hf

## Cross-Validation

Now we define the cross-validation used to find the best hyperparameters of the model. We do this because the dataset is very small, so we try to make the model generalize as much as possible.

In [None]:
# Define a custom collator function with dynamic masking
def collate_fn(batch, mask_prob=0.15):
    """Collate tokenized examples into a padded batch with random token masking.

    Every non-special token is independently replaced by the tokenizer's mask
    token with probability `mask_prob`. Special tokens (as listed in
    `tokenizer.all_special_ids`, which includes the padding token) are never
    replaced. The attention mask is derived from the ORIGINAL ids, so a
    position that was padding before masking is never attended to.

    Args:
        batch: List of examples, each a mapping with `input_ids` (list of
            token ids) and `label`.
        mask_prob: Probability of masking each non-special token.

    Returns:
        Dict with `input_ids` (padded id tensor), `attention_mask` (boolean
        tensor, True on non-padding positions) and `labels`.

    NOTE(review): the collator cannot tell training batches from evaluation
    batches; if the same collator is also used for evaluation, those batches
    are masked as well — confirm this is intended.
    """
    pad_id = tokenizer.pad_token_id
    mask_id = tokenizer.mask_token_id
    special_ids = set(tokenizer.all_special_ids)

    original_sequences = []
    masked_sequences = []
    for example in batch:
        token_ids = example['input_ids']
        original_sequences.append(torch.tensor(token_ids, dtype=torch.long))
        # Dynamic masking: drawn again on every call.
        masked_sequences.append(torch.tensor(
            [
                mask_id if token not in special_ids and random.random() < mask_prob else token
                for token in token_ids
            ],
            dtype=torch.long,
        ))

    # Pad both versions to the longest sequence of the batch.
    original_ids = torch.nn.utils.rnn.pad_sequence(
        original_sequences, batch_first=True, padding_value=pad_id
    )
    input_ids = torch.nn.utils.rnn.pad_sequence(
        masked_sequences, batch_first=True, padding_value=pad_id
    )

    labels = torch.tensor([example['label'] for example in batch])

    # Create the input dictionary
    inputs = {
        'input_ids': input_ids,
        'attention_mask': original_ids != pad_id,
        'labels': labels,
    }

    return inputs

In [None]:
# Hyperparameter grid explored by the cross-validation.
lr_rates = [2e-5, 3e-5]
epochs = [2, 3, 4]
# NOTE(review): the scheduler name is written to the log but is not passed to
# TrainingArguments (`lr_scheduler_type` is commented out below) — confirm
# whether it should be applied.
lr_scheduler = ['constant']

n_splits = 5
batch_size = 16

# Snapshot of the weights as they are before any fine-tuning in this cell.
# Every run restarts from this snapshot: training the same `model` object run
# after run would let each run start from weights that have already seen the
# validation rows of the current split, which inflates the scores and makes
# the configurations incomparable.
pretrained_state = {
    name: tensor.detach().cpu().clone()
    for name, tensor in model.state_dict().items()
}

# The splits are built and tokenized once, with fixed seeds, so that every
# configuration is evaluated on exactly the same folds and the experiment can
# be reproduced.
encoded_splits = []
for split_seed in range(n_splits):
    train, val = train_test_split(
        dataset, train_size=0.8, shuffle=True, random_state=split_seed
    )
    encoded_splits.append(
        createDataset(train, val).map(tokenize, batched=True, batch_size=None)
    )

cv_records = []
try:
    for lr in lr_rates:
        for epoch in epochs:
            for scheduler in lr_scheduler:
                results = []

                for texts_encoded in encoded_splits:
                    # Start this run from the snapshot taken above.
                    model.load_state_dict(pretrained_state)

                    # Log once per epoch: steps per epoch = examples // batch size.
                    logging_steps = max(1, len(texts_encoded['train']) // batch_size)
                    training_args = TrainingArguments(output_dir=model_name,
                                                      num_train_epochs=epoch,
                                                      learning_rate=lr,
                                                      per_device_train_batch_size=batch_size,
                                                      per_device_eval_batch_size=batch_size,
                                                      weight_decay=0.01,
                                                      evaluation_strategy='epoch',
                                                      disable_tqdm=False,
                                                      logging_steps=logging_steps,
                                                      #lr_scheduler_type=scheduler,
                                                      fp16=True,
                                                      push_to_hub=False,
                                                      log_level='error')
                    # Train of the model
                    trainer = Trainer(model=model,
                                      args=training_args,
                                      compute_metrics=compute_metrics,
                                      train_dataset=texts_encoded['train'],
                                      eval_dataset=texts_encoded['validation'],
                                      data_collator=collate_fn,
                                      tokenizer=tokenizer)
                    trainer.train()
                    results.append(
                        trainer.predict(texts_encoded['validation']).metrics['test_f1_score']
                    )

                # Mean F1 over the splits for this configuration.
                mean_f1_score = sum(results) / len(results)
                print(mean_f1_score)

                cv_records.append({
                    'learning_rate': lr,
                    'epochs': epoch,
                    'f1_score': mean_f1_score,
                    'lr_scheduler': scheduler,
                })
finally:
    # Leave `model` in its initial state (also when the search is interrupted)
    # so that later cells do not continue from the last cross-validation run.
    model.load_state_dict(pretrained_state)

log_results_cross = pd.DataFrame(
    cv_records, columns=['learning_rate', 'epochs', 'f1_score', 'lr_scheduler']
)
log_results_cross.to_csv('/log/log_dynamic_xxl.csv')

Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.248299,0.92673
2,No log,0.274974,0.925373


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.458315,0.919946
2,No log,0.377661,0.929444


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.455068,0.922659
2,No log,0.423981,0.933514


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.382062,0.932157
2,No log,0.444505,0.934871


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.474774,0.932157
2,No log,0.489935,0.940299


0.9294436906377204


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.605172,0.937585
2,No log,0.537771,0.933514
3,No log,0.34583,0.937585


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.497187,0.936228
2,No log,0.629345,0.92673
3,No log,0.436997,0.945726


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.561617,0.932157
2,No log,0.673481,0.915875
3,No log,0.434554,0.940299


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.607162,0.915875
2,No log,0.593622,0.922659
3,No log,0.516636,0.933514


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.624151,0.929444
2,No log,0.696005,0.924016
3,No log,0.60444,0.930801


0.9381275440976934


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.790117,0.925373
2,No log,0.555641,0.925373
3,No log,0.526242,0.930801
4,No log,0.415176,0.936228


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.572845,0.944369
2,No log,0.686848,0.934871
3,No log,0.576786,0.933514
4,No log,0.438188,0.943012


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,2.50522,0.788331
2,No log,0.49716,0.947083
3,No log,0.554605,0.930801
4,No log,0.464847,0.944369


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.78257,0.925373
2,No log,0.622364,0.932157
3,No log,0.497146,0.944369
4,No log,0.480635,0.944369


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.562864,0.95251
2,No log,0.656276,0.947083
3,No log,0.697971,0.940299
4,No log,0.679222,0.940299


0.9476255088195387


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.621263,0.953867
2,No log,0.741123,0.94844


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.678818,0.953867
2,No log,0.80698,0.947083


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.698797,0.953867
2,No log,0.851444,0.945726


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.723897,0.95251
2,No log,0.885388,0.945726


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.763928,0.95251
2,No log,0.906135,0.945726


0.9419267299864316


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.780573,0.95251
2,No log,0.927002,0.945726
3,No log,0.938746,0.945726


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.83102,0.951153
2,No log,0.674147,0.937585
3,No log,0.627608,0.943012


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.895355,0.941655
2,No log,0.79485,0.934871
3,No log,0.550746,0.943012


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.66485,0.938942
2,No log,0.634239,0.944369
3,No log,0.549937,0.949796


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.874261,0.943012
2,No log,0.930009,0.940299
3,No log,0.689104,0.949796


0.9389416553595658


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.846207,0.944369
2,No log,0.982952,0.937585
3,No log,0.63253,0.934871
4,No log,0.52193,0.944369


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.761138,0.933514
2,No log,0.573862,0.929444
3,No log,0.473011,0.940299
4,No log,0.44678,0.945726


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,1.259499,0.902307
2,No log,0.460351,0.936228
3,No log,0.46431,0.945726
4,No log,0.425675,0.95251


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.752953,0.941655
2,No log,0.931108,0.917232
3,No log,0.605136,0.937585
4,No log,0.572255,0.940299


Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.823091,0.928087
2,No log,1.266407,0.892809
3,No log,0.65637,0.941655
4,No log,0.65825,0.928087


0.9473541383989146


# Train final Model

Now that we have finished the cross-validation, we take the best hyperparameters found and we build the final model.
Then, we push the model to the Hugging Face Hub.

In [None]:
# Shuffled 80/20 split of the cleaned data for the final training run.
train, val = train_test_split(dataset, train_size=0.8, shuffle=True)
# Wrap both frames into a DatasetDict and tokenize each split.
dataset_final_train = createDataset(train, val)
dataset_final_encoded = dataset_final_train.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/2947 [00:00<?, ? examples/s]

Map:   0%|          | 0/737 [00:00<?, ? examples/s]

In [None]:
# Authenticate with the Hugging Face Hub; needed because the final training
# run below pushes the model to the Hub.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# We take the best parameters from the cross-validation log: the row with the
# highest mean F1 score.
idx_best_model = log_results_cross['f1_score'].idxmax()
learning_rate_best = log_results_cross.loc[idx_best_model, 'learning_rate']
epoch_best = float(log_results_cross.loc[idx_best_model, 'epochs'])
#lr_scheduler_best = log_results_cross['lr_scheduler'][idx_best_model]

# We define the arguments for the training
# NOTE(review): the cross-validation used a batch size of 16; the selected
# learning rate / epochs are applied here with a batch size of 8 — confirm.
batch_size = 8
# Log once per epoch: steps per epoch = examples // batch size. Using the raw
# number of examples would exceed the total number of steps, so the training
# loss would never be reported.
logging_steps = max(1, len(dataset_final_encoded['train']) // batch_size)
model_name_hub = 'prova-xxl-single'

# We train the final model
# NOTE(review): `model` is the module-level object that the cross-validation
# cell also trains; make sure it holds the freshly loaded pre-trained weights
# at this point, otherwise this run continues from an already fine-tuned model.
training_args = TrainingArguments(output_dir=model_name_hub,
                                  num_train_epochs=epoch_best,
                                  learning_rate=learning_rate_best,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  #lr_scheduler_type=lr_scheduler_best,
                                  fp16=True,
                                  push_to_hub=True,
                                  log_level='error')

trainer = Trainer(model=model,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_final_encoded['train'],
                  eval_dataset=dataset_final_encoded['validation'],
                  data_collator=collate_fn,
                  tokenizer=tokenizer)
trainer.train()
# Upload the trained model to the Hub repository named by `model_name_hub`.
trainer.push_to_hub()

/content/prova-xxl-single is already a clone of https://huggingface.co/Mike00vito/prova-xxl-single. Make sure you pull the latest changes with `repo.git_pull()`.


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.866497,0.914518
2,No log,0.430153,0.951153
3,No log,0.530868,0.938942
4,No log,0.495983,0.94844


Upload file pytorch_model.bin:   0%|          | 1.00/422M [00:00<?, ?B/s]

Upload file runs/May06_11-33-33_682109be0111/events.out.tfevents.1683372816.682109be0111.5011.66:   0%|       …

To https://huggingface.co/Mike00vito/prova-xxl-single
   8e6896d..27add41  main -> main

   8e6896d..27add41  main -> main

To https://huggingface.co/Mike00vito/prova-xxl-single
   27add41..c4e24ae  main -> main

   27add41..c4e24ae  main -> main



'https://huggingface.co/Mike00vito/prova-xxl-single/commit/27add41a7a4bf106f928d8f8dab22570fe041430'