In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, parent_dir)

seeds = [42, 69, 420, 1, 3]

# Getting the data

As always, let's import our last updated data.

In [2]:
from utils import load_datasets

gru_data_folder = os.path.join(parent_dir, "3_preprocessing", "output", "gru")
gru_train_set, gru_val_set, gru_test_set = load_datasets(folder=gru_data_folder)

grubert_data_folder = os.path.join(parent_dir, "3_preprocessing", "output", "grubert")
grubert_train_set, grubert_val_set, grubert_test_set = load_datasets(folder=grubert_data_folder)

# Baseline evaluation

In [3]:
from metrics import compute_baseline_metrics

f1_not_ironic_random, f1_ironic_random, zero_one_f1_random = compute_baseline_metrics(gru_val_set['iro'], len(gru_val_set), "random", seeds=seeds)

print("The scores are the mean values across all seeds:")
print()
print("f1_not_ironic_random: ", f1_not_ironic_random)
print("f1_ironic_random: ", f1_ironic_random)
print("zero_one_f1_random: ", zero_one_f1_random)

Importato package text_enrichment.
The scores are the mean values across all seeds:

f1_not_ironic_random:  0.6458375794869037
f1_ironic_random:  0.17243933610161782
zero_one_f1_random:  0.4091384577942607


In [4]:
f1_not_ironic_majority, f1_1_ironic_majority, zero_one_f1_majority = compute_baseline_metrics(gru_val_set['iro'], len(gru_val_set), "majority", seeds=seeds)

print("f1_not_ironic_majority:", f1_not_ironic_majority)
print("f1_ironic_majority: ", f1_1_ironic_majority)
print("zero_one_f1_majority: ", zero_one_f1_majority)

f1_not_ironic_majority: 0.9428571428571428
f1_ironic_majority:  0.0
zero_one_f1_majority:  0.4714285714285714


# Gru Evaluation

## Validation Set

In [5]:
from typing import Dict
import torch
import pandas as pd
from tqdm import tqdm
from metrics import DataFactory, Evaluator

results_val_Gru: Dict[int, pd.DataFrame] = {}

for seed in seeds:
    print("Seed", seed)
    parent_dir_weights = os.path.join(parent_dir, f'6_training/weights/{seed}/')
    weights_file_names = [
        "Gru_pos_tags_enrichment.pth",
        "Gru_hashtag_enrichment.pth",
        "Gru_base_case.pth",
        "Gru_hashtag_segmentation.pth",
    ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Create a DataFactory
    data_factory = DataFactory(
        embedding_model_path=os.path.join(parent_dir, "embedding_models/italian_word2vec_100.bin"),
        bert_model_name="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
    )

    # 2) Create an Evaluator
    evaluator = Evaluator(
        data_factory=data_factory,
        device=device,
        gru_hidden_size=32,
        num_gru_layers=2,
        gru_dropout=0.2,
        embedding_dim_gru=100,
        batch_size=8,
        max_len=50
    )

    # 3) Run evaluation
    results_df = evaluator.compute_metrics_over_files(
        file_list=weights_file_names,
        threshold_df=gru_val_set,
        eval_df=gru_val_set,
        parent_dir=parent_dir_weights
    )

    results_val_Gru[seed] = results_df


Seed 42


Evaluating models: 100%|██████████| 4/4 [01:47<00:00, 26.76s/it]


Seed 69


Evaluating models: 100%|██████████| 4/4 [01:55<00:00, 28.93s/it]


Seed 420


Evaluating models: 100%|██████████| 4/4 [02:04<00:00, 31.03s/it]


Seed 1


Evaluating models: 100%|██████████| 4/4 [02:01<00:00, 30.32s/it]


Seed 3


Evaluating models: 100%|██████████| 4/4 [01:56<00:00, 29.12s/it]


In [6]:
results_val_Gru[42]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,pos_tags_enrichment,0.904268,0.371212,0.63774,0.22
1,gru,hashtag_enrichment,0.928492,0.384615,0.656554,0.31
2,gru,base_case,0.924634,0.390909,0.657772,0.28
3,gru,hashtag_segmentation,0.923858,0.4,0.661929,0.21


In [7]:
results_val_Gru[69]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,pos_tags_enrichment,0.918919,0.351351,0.635135,0.28
1,gru,hashtag_enrichment,0.911392,0.407692,0.659542,0.21
2,gru,base_case,0.916048,0.404858,0.660453,0.23
3,gru,hashtag_segmentation,0.935912,0.382979,0.659445,0.28


In [8]:
results_val_Gru[420]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,pos_tags_enrichment,0.885496,0.338983,0.61224,0.21
1,gru,hashtag_enrichment,0.925323,0.387097,0.65621,0.28
2,gru,base_case,0.925323,0.387097,0.65621,0.28
3,gru,hashtag_segmentation,0.924719,0.385321,0.65502,0.21


In [9]:
results_val_Gru[1]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,pos_tags_enrichment,0.908987,0.333333,0.62116,0.25
1,gru,hashtag_enrichment,0.924634,0.390909,0.657772,0.3
2,gru,base_case,0.916239,0.395062,0.655651,0.25
3,gru,hashtag_segmentation,0.927455,0.368932,0.648194,0.23


In [10]:
results_val_Gru[3]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,pos_tags_enrichment,0.897153,0.361011,0.629082,0.2
1,gru,hashtag_enrichment,0.917095,0.417671,0.667383,0.22
2,gru,base_case,0.918332,0.421053,0.669693,0.22
3,gru,hashtag_segmentation,0.915718,0.38843,0.652074,0.18


## Test Set

In [11]:
from typing import Dict
import torch
import pandas as pd
from tqdm import tqdm
from metrics import DataFactory, Evaluator

results_test_Gru: Dict[int, pd.DataFrame] = {}
seeds = [42, 69, 420, 1, 3]

for seed in seeds:
    print("Seed", seed)
    parent_dir_weights = os.path.join(parent_dir, f'6_training/weights/{seed}/')
    weights_file_names = [
        "Gru_base_case.pth",
        "Gru_hashtag_segmentation.pth",
        "Gru_pos_tags_enrichment.pth",
        "Gru_hashtag_enrichment.pth",
    ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Create a DataFactory
    data_factory = DataFactory(
        embedding_model_path=os.path.join(parent_dir, "embedding_models/italian_word2vec_100.bin"),
        bert_model_name="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
    )

    # 2) Create an Evaluator
    evaluator = Evaluator(
        data_factory=data_factory,
        device=device,
        gru_hidden_size=32,
        num_gru_layers=2,
        gru_dropout=0.2,
        embedding_dim_gru=100,
        batch_size=8,
        max_len=50
    )

    # 3) Run evaluation
    results_df = evaluator.compute_metrics_over_files(
        file_list=weights_file_names,
        threshold_df=gru_val_set,
        eval_df=gru_test_set,
        parent_dir=parent_dir_weights
    )

    results_test_Gru[seed] = results_df


Seed 42


Evaluating models: 100%|██████████| 4/4 [01:54<00:00, 28.66s/it]


Seed 69


Evaluating models: 100%|██████████| 4/4 [01:51<00:00, 27.95s/it]


Seed 420


Evaluating models: 100%|██████████| 4/4 [01:57<00:00, 29.31s/it]


Seed 1


Evaluating models: 100%|██████████| 4/4 [01:51<00:00, 27.80s/it]


Seed 3


Evaluating models: 100%|██████████| 4/4 [01:57<00:00, 29.25s/it]


In [12]:
results_test_Gru[42]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.921946,0.4,0.660973,0.28
1,gru,hashtag_segmentation,0.920545,0.40678,0.663662,0.21
2,gru,pos_tags_enrichment,0.899594,0.370909,0.635251,0.22
3,gru,hashtag_enrichment,0.927487,0.410959,0.669223,0.31


In [13]:
results_test_Gru[69]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.909196,0.416357,0.662776,0.23
1,gru,hashtag_segmentation,0.923937,0.352381,0.638159,0.28
2,gru,pos_tags_enrichment,0.920814,0.391304,0.656059,0.28
3,gru,hashtag_enrichment,0.903151,0.415493,0.659322,0.21


In [14]:
results_test_Gru[420]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.923772,0.405286,0.664529,0.28
1,gru,hashtag_segmentation,0.91898,0.386266,0.652623,0.21
2,gru,pos_tags_enrichment,0.902053,0.430034,0.666043,0.21
3,gru,hashtag_enrichment,0.923772,0.405286,0.664529,0.28


In [15]:
results_test_Gru[1]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.91422,0.429119,0.671669,0.25
1,gru,hashtag_segmentation,0.919618,0.347032,0.633325,0.23
2,gru,pos_tags_enrichment,0.915759,0.418972,0.667366,0.25
3,gru,hashtag_enrichment,0.924464,0.401786,0.663125,0.3


In [16]:
results_test_Gru[3]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.909936,0.440433,0.675185,0.22
1,gru,hashtag_segmentation,0.911392,0.407692,0.659542,0.18
2,gru,pos_tags_enrichment,0.902098,0.404255,0.653177,0.2
3,gru,hashtag_enrichment,0.911201,0.443636,0.677419,0.22


## Display

In [17]:
from metrics import average_out_scores_for_seeds

average_results_val = average_out_scores_for_seeds(results_val_Gru)

average_results_val

Unnamed: 0_level_0,Unnamed: 1_level_0,F1_not_ironic,F1_not_ironic,F1_ironic,F1_ironic,F1_avg_0_1,F1_avg_0_1,BestThreshold,BestThreshold
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
ModelClass,Configuration,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gru,base_case,0.920115,0.004536,0.399796,0.013603,0.659956,0.005753,0.252,0.027749
gru,hashtag_enrichment,0.921387,0.006979,0.397597,0.014391,0.659492,0.0046,0.264,0.046152
gru,hashtag_segmentation,0.925532,0.007267,0.385132,0.01117,0.655332,0.005525,0.222,0.037014
gru,pos_tags_enrichment,0.902965,0.012564,0.351178,0.015534,0.627071,0.010462,0.232,0.032711


In [18]:
average_results_test = average_out_scores_for_seeds(results_test_Gru)

average_results_test

Unnamed: 0_level_0,Unnamed: 1_level_0,F1_not_ironic,F1_not_ironic,F1_ironic,F1_ironic,F1_avg_0_1,F1_avg_0_1,BestThreshold,BestThreshold
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
ModelClass,Configuration,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gru,base_case,0.915814,0.006742,0.418239,0.016695,0.667026,0.006104,0.252,0.027749
gru,hashtag_enrichment,0.918015,0.01039,0.415432,0.016618,0.666724,0.00695,0.264,0.046152
gru,hashtag_segmentation,0.918895,0.004609,0.38003,0.029039,0.649462,0.013242,0.222,0.037014
gru,pos_tags_enrichment,0.908064,0.009556,0.403095,0.023206,0.655579,0.012919,0.232,0.032711


# GruBERT Evaluation

## Validation Set

In [19]:
from typing import Dict
import torch
import pandas as pd
from tqdm import tqdm
from metrics import DataFactory, Evaluator

results_val_GruBERT: Dict[int, pd.DataFrame] = {}
seeds = [42, 69, 420, 1, 3]

for seed in seeds:
    print("Seed", seed)
    parent_dir_weights = os.path.join(parent_dir, f'6_training/weights/{seed}/')
    weights_file_names = [
        "GruBERT_base_case.pth",
        "GruBERT_pos_tags_enrichment.pth",
        "GruBERT_hashtag_enrichment.pth",
    ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Create a DataFactory
    data_factory = DataFactory(
        embedding_model_path=os.path.join(parent_dir, "embedding_models/italian_word2vec_100.bin"),
        bert_model_name="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
    )

    # 2) Create an Evaluator
    evaluator = Evaluator(
        data_factory=data_factory,
        device=device,
        gru_hidden_size=32,
        num_gru_layers=2,
        gru_dropout=0.2,
        embedding_dim_gru=100,
        batch_size=8,
        max_len=50
    )

    # 3) Run evaluation
    results_df = evaluator.compute_metrics_over_files(
        file_list=weights_file_names,
        threshold_df=gru_val_set,
        eval_df=gru_val_set,
        parent_dir=parent_dir_weights
    )

    results_val_GruBERT[seed] = results_df


Seed 42


Evaluating models:   0%|          | 0/3 [00:00<?, ?it/s]

Evaluating models: 100%|██████████| 3/3 [00:31<00:00, 10.48s/it]


Seed 69


Evaluating models: 100%|██████████| 3/3 [00:28<00:00,  9.63s/it]


Seed 420


Evaluating models: 100%|██████████| 3/3 [00:29<00:00,  9.75s/it]


Seed 1


Evaluating models: 100%|██████████| 3/3 [00:28<00:00,  9.60s/it]


Seed 3


Evaluating models: 100%|██████████| 3/3 [00:29<00:00,  9.89s/it]


In [20]:
results_val_GruBERT[42]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.917473,0.39834,0.657907,0.2
1,grubert,pos_tags_enrichment,0.904378,0.366412,0.635395,0.19
2,grubert,hashtag_enrichment,0.920113,0.39485,0.657482,0.21


In [21]:
results_val_GruBERT[69]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.909925,0.384314,0.64712,0.18
1,grubert,pos_tags_enrichment,0.918402,0.343891,0.631147,0.25
2,grubert,hashtag_enrichment,0.916195,0.362069,0.639132,0.19


In [22]:
results_val_GruBERT[420]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.930901,0.338624,0.634763,0.22
1,grubert,pos_tags_enrichment,0.931111,0.373737,0.652424,0.26
2,grubert,hashtag_enrichment,0.914351,0.357447,0.635899,0.18


In [23]:
results_val_GruBERT[1]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.927536,0.362745,0.645141,0.26
1,grubert,pos_tags_enrichment,0.911798,0.388889,0.650344,0.19
2,grubert,hashtag_enrichment,0.904046,0.380597,0.642322,0.18


In [24]:
results_val_GruBERT[3]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.918919,0.351351,0.635135,0.2
1,grubert,pos_tags_enrichment,0.932157,0.335135,0.633646,0.25
2,grubert,hashtag_enrichment,0.915676,0.354978,0.635327,0.19


## Test Set

In [25]:
from typing import Dict
import torch
import pandas as pd
from tqdm import tqdm
from metrics import DataFactory, Evaluator

results_test_GruBERT: Dict[int, pd.DataFrame] = {}
seeds = [42, 69, 420, 1, 3]

for seed in seeds:
    print("Seed", seed)
    parent_dir_weights = os.path.join(parent_dir, f'6_training/weights/{seed}/')
    weights_file_names = [
        "GruBERT_base_case.pth",
        "GruBERT_pos_tags_enrichment.pth",
        "GruBERT_hashtag_enrichment.pth",
    ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Create a DataFactory
    data_factory = DataFactory(
        embedding_model_path=os.path.join(parent_dir, "embedding_models/italian_word2vec_100.bin"),
        bert_model_name="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
    )

    # 2) Create an Evaluator
    evaluator = Evaluator(
        data_factory=data_factory,
        device=device,
        gru_hidden_size=32,
        num_gru_layers=2,
        gru_dropout=0.2,
        embedding_dim_gru=100,
        batch_size=8,
        max_len=50
    )

    # 3) Run evaluation
    results_df = evaluator.compute_metrics_over_files(
        file_list=weights_file_names,
        threshold_df=gru_val_set,
        eval_df=gru_test_set,
        parent_dir=parent_dir_weights
    )

    results_test_GruBERT[seed] = results_df


Seed 42


Evaluating models: 100%|██████████| 3/3 [00:29<00:00,  9.80s/it]


Seed 69


Evaluating models: 100%|██████████| 3/3 [00:29<00:00,  9.86s/it]


Seed 420


Evaluating models: 100%|██████████| 3/3 [00:29<00:00,  9.68s/it]


Seed 1


Evaluating models: 100%|██████████| 3/3 [00:30<00:00, 10.03s/it]


Seed 3


Evaluating models: 100%|██████████| 3/3 [00:30<00:00, 10.27s/it]


In [26]:
results_test_GruBERT[42]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.913422,0.462094,0.687758,0.2
1,grubert,pos_tags_enrichment,0.906854,0.453608,0.680231,0.19
2,grubert,hashtag_enrichment,0.917389,0.464419,0.690904,0.21


In [27]:
results_test_GruBERT[69]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.904061,0.454849,0.679455,0.18
1,grubert,pos_tags_enrichment,0.924915,0.45,0.687457,0.25
2,grubert,hashtag_enrichment,0.909621,0.452297,0.680959,0.19


In [28]:
results_test_GruBERT[420]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.926009,0.383178,0.654593,0.22
1,grubert,pos_tags_enrichment,0.926136,0.453782,0.689959,0.26
2,grubert,hashtag_enrichment,0.908353,0.423358,0.665855,0.18


In [29]:
results_test_GruBERT[1]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.925,0.445378,0.685189,0.26
1,grubert,pos_tags_enrichment,0.897076,0.388889,0.642982,0.19
2,grubert,hashtag_enrichment,0.903835,0.462046,0.682941,0.18


In [30]:
results_test_GruBERT[3]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.91056,0.415094,0.662827,0.2
1,grubert,pos_tags_enrichment,0.927731,0.394366,0.661049,0.25
2,grubert,hashtag_enrichment,0.907826,0.417582,0.662704,0.19


## Display

In [31]:
from metrics import average_out_scores_for_seeds

average_results_val = average_out_scores_for_seeds(results_val_GruBERT)

average_results_val

Unnamed: 0_level_0,Unnamed: 1_level_0,F1_not_ironic,F1_not_ironic,F1_ironic,F1_ironic,F1_avg_0_1,F1_avg_0_1,BestThreshold,BestThreshold
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
ModelClass,Configuration,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
grubert,base_case,0.920951,0.008369,0.367075,0.024239,0.644013,0.009596,0.212,0.030332
grubert,hashtag_enrichment,0.914076,0.006004,0.369988,0.017146,0.642032,0.00908,0.19,0.012247
grubert,pos_tags_enrichment,0.919569,0.012085,0.361613,0.021961,0.640591,0.009995,0.228,0.034928


In [32]:
average_results_test = average_out_scores_for_seeds(results_test_GruBERT)

average_results_test

Unnamed: 0_level_0,Unnamed: 1_level_0,F1_not_ironic,F1_not_ironic,F1_ironic,F1_ironic,F1_avg_0_1,F1_avg_0_1,BestThreshold,BestThreshold
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
ModelClass,Configuration,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
grubert,base_case,0.91581,0.009484,0.432119,0.032696,0.673965,0.01454,0.212,0.030332
grubert,hashtag_enrichment,0.909405,0.00496,0.443941,0.021997,0.676673,0.011962,0.19,0.012247
grubert,pos_tags_enrichment,0.916542,0.013785,0.428129,0.033411,0.672336,0.019941,0.228,0.034928
