In [9]:
%load_ext autoreload
%autoreload 2

import sys
import os

parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.insert(0, parent_dir)

seeds = [42, 69, 420, 1, 3]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Getting the data

As always, let's import our last updated data.

In [3]:
from utils import load_datasets

gru_data_folder = os.path.join(parent_dir, "3_preprocessing", "output", "gru")
gru_train_set, gru_val_set, gru_test_set = load_datasets(folder=gru_data_folder)

grubert_data_folder = os.path.join(parent_dir, "3_preprocessing", "output", "grubert")
grubert_train_set, grubert_val_set, grubert_test_set = load_datasets(folder=grubert_data_folder)

# Baseline evaluation

In [15]:
from metrics import compute_baseline_metrics

f1_not_ironic_random, f1_ironic_random, zero_one_f1_random = compute_baseline_metrics(gru_val_set['iro'], len(gru_val_set), "random", seeds=seeds)

print("The scores are the mean values across all seeds:")
print()
print("f1_not_ironic_random: ", f1_not_ironic_random)
print("f1_ironic_random: ", f1_ironic_random)
print("zero_one_f1_random: ", zero_one_f1_random)

The scores are the mean values across all seeds:

f1_not_ironic_random:  0.6458375794869037
f1_ironic_random:  0.17243933610161782
zero_one_f1_random:  0.4091384577942607


In [13]:
f1_not_ironic_majority, f1_1_ironic_majority, zero_one_f1_majority = compute_baseline_metrics(gru_val_set['iro'], len(gru_val_set), "majority", seeds=seeds)

print("f1_not_ironic_majority:", f1_not_ironic_majority)
print("f1_ironic_majority: ", f1_1_ironic_majority)
print("zero_one_f1_majority: ", zero_one_f1_majority)

f1_not_ironic_majority: 0.9428571428571428
f1_ironic_majority:  0.0
zero_one_f1_majority:  0.4714285714285714


# Gru Evaluation

## Validation Set

In [17]:
from typing import Dict
import torch
import pandas as pd
from tqdm import tqdm
from metrics import DataFactory, Evaluator

results_val_Gru: Dict[int, pd.DataFrame] = {}

for seed in seeds:
    print("Seed", seed)
    parent_dir_weights = os.path.join(parent_dir, f'6_training/weights/{seed}/')
    weights_file_names = [
        "Gru_base_case.pth",
        "Gru_hashtag_segmentation.pth",
        "Gru_pos_tags_enrichment.pth",
        "Gru_hashtag_enrichment.pth",
    ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Create a DataFactory
    data_factory = DataFactory(
        embedding_model_path=os.path.join(parent_dir, "embedding_models/italian_word2vec_100.bin"),
        bert_model_name="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
    )

    # 2) Create an Evaluator
    evaluator = Evaluator(
        data_factory=data_factory,
        device=device,
        gru_hidden_size=32,
        num_gru_layers=2,
        gru_dropout=0.2,
        embedding_dim_gru=100,
        batch_size=8,
        max_len=50
    )

    # 3) Run evaluation
    results_df = evaluator.compute_metrics_over_files(
        file_list=weights_file_names,
        threshold_df=gru_val_set,
        eval_df=gru_val_set,
        parent_dir=parent_dir_weights
    )

    results_val_Gru[seed] = results_df


Seed 42


Evaluating models: 100%|██████████| 4/4 [01:57<00:00, 29.26s/it]


Seed 69


Evaluating models: 100%|██████████| 4/4 [01:50<00:00, 27.68s/it]


Seed 420


Evaluating models: 100%|██████████| 4/4 [02:02<00:00, 30.58s/it]


Seed 1


Evaluating models: 100%|██████████| 4/4 [02:10<00:00, 32.69s/it]


Seed 3


Evaluating models: 100%|██████████| 4/4 [02:07<00:00, 31.81s/it]


In [18]:
results_val_Gru[42]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.92713,0.392523,0.659827,0.3
1,gru,hashtag_segmentation,0.924986,0.408889,0.666937,0.21
2,gru,pos_tags_enrichment,0.900173,0.34717,0.623671,0.17
3,gru,hashtag_enrichment,0.90971,0.329114,0.619412,0.19


In [19]:
results_val_Gru[69]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.913368,0.407843,0.660605,0.21
1,gru,hashtag_segmentation,0.933407,0.387755,0.660581,0.26
2,gru,pos_tags_enrichment,0.914124,0.333333,0.623729,0.2
3,gru,hashtag_enrichment,0.917369,0.328767,0.623068,0.21


In [20]:
results_val_Gru[420]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.924719,0.385321,0.65502,0.28
1,gru,hashtag_segmentation,0.924888,0.373832,0.64936,0.21
2,gru,pos_tags_enrichment,0.90971,0.329114,0.619412,0.22
3,gru,hashtag_enrichment,0.91238,0.323144,0.617762,0.23


In [21]:
results_val_Gru[1]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.915621,0.393443,0.654532,0.25
1,gru,hashtag_segmentation,0.927374,0.375,0.651187,0.23
2,gru,pos_tags_enrichment,0.90971,0.329114,0.619412,0.22
3,gru,hashtag_enrichment,0.908884,0.338843,0.623863,0.2


In [22]:
results_val_Gru[3]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.91895,0.422764,0.670857,0.22
1,gru,hashtag_segmentation,0.914481,0.385246,0.649864,0.18
2,gru,pos_tags_enrichment,0.906178,0.344,0.625089,0.18
3,gru,hashtag_enrichment,0.913314,0.343348,0.628331,0.19


## Test Set

In [23]:
from typing import Dict
import torch
import pandas as pd
from tqdm import tqdm
from metrics import DataFactory, Evaluator

results_test_Gru: Dict[int, pd.DataFrame] = {}
seeds = [42, 69, 420, 1, 3]

for seed in seeds:
    print("Seed", seed)
    parent_dir_weights = os.path.join(parent_dir, f'6_training/weights/{seed}/')
    weights_file_names = [
        "Gru_base_case.pth",
        "Gru_hashtag_segmentation.pth",
        "Gru_pos_tags_enrichment.pth",
        "Gru_hashtag_enrichment.pth",
    ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Create a DataFactory
    data_factory = DataFactory(
        embedding_model_path=os.path.join(parent_dir, "embedding_models/italian_word2vec_100.bin"),
        bert_model_name="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
    )

    # 2) Create an Evaluator
    evaluator = Evaluator(
        data_factory=data_factory,
        device=device,
        gru_hidden_size=32,
        num_gru_layers=2,
        gru_dropout=0.2,
        embedding_dim_gru=100,
        batch_size=8,
        max_len=50
    )

    # 3) Run evaluation
    results_df = evaluator.compute_metrics_over_files(
        file_list=weights_file_names,
        threshold_df=gru_val_set,
        eval_df=gru_test_set,
        parent_dir=parent_dir_weights
    )

    results_test_Gru[seed] = results_df


Seed 42


Evaluating models: 100%|██████████| 4/4 [01:50<00:00, 27.53s/it]


Seed 69


Evaluating models: 100%|██████████| 4/4 [01:44<00:00, 26.23s/it]


Seed 420


Evaluating models: 100%|██████████| 4/4 [01:45<00:00, 26.32s/it]


Seed 1


Evaluating models: 100%|██████████| 4/4 [01:43<00:00, 25.88s/it]


Seed 3


Evaluating models: 100%|██████████| 4/4 [01:40<00:00, 25.24s/it]


In [24]:
results_test_Gru[42]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.924464,0.401786,0.663125,0.3
1,gru,hashtag_segmentation,0.919318,0.403361,0.66134,0.21
2,gru,pos_tags_enrichment,0.895505,0.37193,0.633717,0.17
3,gru,hashtag_enrichment,0.905227,0.357977,0.631602,0.19


In [25]:
results_test_Gru[69]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.904429,0.41844,0.661434,0.21
1,gru,hashtag_segmentation,0.923767,0.364486,0.644126,0.26
2,gru,pos_tags_enrichment,0.915909,0.378151,0.64703,0.2
3,gru,hashtag_enrichment,0.918644,0.368421,0.643533,0.21


In [26]:
results_test_Gru[420]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.923251,0.39823,0.66074,0.28
1,gru,hashtag_segmentation,0.91898,0.386266,0.652623,0.21
2,gru,pos_tags_enrichment,0.913242,0.382114,0.647678,0.22
3,gru,hashtag_enrichment,0.917847,0.377682,0.647765,0.23


In [27]:
results_test_Gru[1]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.914845,0.430769,0.672807,0.25
1,gru,hashtag_segmentation,0.919618,0.347032,0.633325,0.23
2,gru,pos_tags_enrichment,0.913242,0.382114,0.647678,0.22
3,gru,hashtag_enrichment,0.91929,0.438247,0.678769,0.2


In [28]:
results_test_Gru[3]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,gru,base_case,0.909302,0.438849,0.674076,0.22
1,gru,hashtag_segmentation,0.911392,0.407692,0.659542,0.18
2,gru,pos_tags_enrichment,0.907514,0.402985,0.65525,0.18
3,gru,hashtag_enrichment,0.911899,0.384,0.64795,0.19


## Display

In [29]:
from metrics import average_out_scores_for_seeds

average_results_val = average_out_scores_for_seeds(results_val_Gru)

average_results_val

Unnamed: 0_level_0,Unnamed: 1_level_0,F1_not_ironic,F1_not_ironic,F1_ironic,F1_ironic,F1_avg_0_1,F1_avg_0_1,BestThreshold,BestThreshold
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
ModelClass,Configuration,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gru,base_case,0.919958,0.00586,0.400379,0.014945,0.660168,0.006573,0.252,0.038341
gru,hashtag_enrichment,0.912332,0.003358,0.332643,0.008227,0.622487,0.004128,0.204,0.016733
gru,hashtag_segmentation,0.925027,0.006837,0.386144,0.014111,0.655586,0.007821,0.218,0.029496
gru,pos_tags_enrichment,0.907979,0.005195,0.336546,0.008503,0.622263,0.002663,0.198,0.022804


In [30]:
average_results_test = average_out_scores_for_seeds(results_test_Gru)

average_results_test

Unnamed: 0_level_0,Unnamed: 1_level_0,F1_not_ironic,F1_not_ironic,F1_ironic,F1_ironic,F1_avg_0_1,F1_avg_0_1,BestThreshold,BestThreshold
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
ModelClass,Configuration,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
gru,base_case,0.915258,0.008683,0.417615,0.017684,0.666436,0.006469,0.252,0.038341
gru,hashtag_enrichment,0.914581,0.006001,0.385265,0.031203,0.649923,0.017444,0.204,0.016733
gru,hashtag_segmentation,0.918615,0.004482,0.381768,0.025806,0.650191,0.011607,0.218,0.029496
gru,pos_tags_enrichment,0.909083,0.008186,0.383459,0.011683,0.646271,0.00779,0.198,0.022804


# GruBERT Evaluation

## Validation Set

In [31]:
from typing import Dict
import torch
import pandas as pd
from tqdm import tqdm
from metrics import DataFactory, Evaluator

results_val_GruBERT: Dict[int, pd.DataFrame] = {}
seeds = [42, 69, 420, 1, 3]

for seed in seeds:
    print("Seed", seed)
    parent_dir_weights = os.path.join(parent_dir, f'6_training/weights/{seed}/')
    weights_file_names = [
        "GruBERT_base_case.pth",
        "GruBERT_pos_tags_enrichment.pth",
        "GruBERT_hashtag_enrichment.pth",
    ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Create a DataFactory
    data_factory = DataFactory(
        embedding_model_path=os.path.join(parent_dir, "embedding_models/italian_word2vec_100.bin"),
        bert_model_name="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
    )

    # 2) Create an Evaluator
    evaluator = Evaluator(
        data_factory=data_factory,
        device=device,
        gru_hidden_size=32,
        num_gru_layers=2,
        gru_dropout=0.2,
        embedding_dim_gru=100,
        batch_size=8,
        max_len=50
    )

    # 3) Run evaluation
    results_df = evaluator.compute_metrics_over_files(
        file_list=weights_file_names,
        threshold_df=gru_val_set,
        eval_df=gru_val_set,
        parent_dir=parent_dir_weights
    )

    results_val_GruBERT[seed] = results_df


Seed 42


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Evaluating models: 100%|██████████| 3/3 [00:40<00:00, 13.66s/it]


Seed 69


Evaluating models: 100%|██████████| 3/3 [00:37<00:00, 12.65s/it]


Seed 420


Evaluating models: 100%|██████████| 3/3 [00:40<00:00, 13.45s/it]


Seed 1


Evaluating models: 100%|██████████| 3/3 [00:41<00:00, 13.73s/it]


Seed 3


Evaluating models: 100%|██████████| 3/3 [00:40<00:00, 13.39s/it]


In [32]:
results_val_GruBERT[42]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.917473,0.39834,0.657907,0.2
1,grubert,pos_tags_enrichment,0.904378,0.366412,0.635395,0.19
2,grubert,hashtag_enrichment,0.896793,0.374558,0.635676,0.17


In [33]:
results_val_GruBERT[69]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.909925,0.384314,0.64712,0.18
1,grubert,pos_tags_enrichment,0.918402,0.343891,0.631147,0.25
2,grubert,hashtag_enrichment,0.910857,0.370968,0.640912,0.19


In [34]:
results_val_GruBERT[420]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.930901,0.338624,0.634763,0.22
1,grubert,pos_tags_enrichment,0.911798,0.388889,0.650344,0.19
2,grubert,hashtag_enrichment,0.928412,0.390476,0.659444,0.26


In [35]:
results_val_GruBERT[1]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.927536,0.362745,0.645141,0.26
1,grubert,pos_tags_enrichment,0.911798,0.388889,0.650344,0.19
2,grubert,hashtag_enrichment,0.922292,0.417021,0.669656,0.23


In [36]:
results_val_GruBERT[3]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.918919,0.351351,0.635135,0.2
1,grubert,pos_tags_enrichment,0.932157,0.335135,0.633646,0.25
2,grubert,hashtag_enrichment,0.910745,0.343096,0.62692,0.19


## Test Set

In [37]:
from typing import Dict
import torch
import pandas as pd
from tqdm import tqdm
from metrics import DataFactory, Evaluator

results_test_GruBERT: Dict[int, pd.DataFrame] = {}
seeds = [42, 69, 420, 1, 3]

for seed in seeds:
    print("Seed", seed)
    parent_dir_weights = os.path.join(parent_dir, f'6_training/weights/{seed}/')
    weights_file_names = [
        "GruBERT_base_case.pth",
        "GruBERT_pos_tags_enrichment.pth",
        "GruBERT_hashtag_enrichment.pth",
    ]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # 1) Create a DataFactory
    data_factory = DataFactory(
        embedding_model_path=os.path.join(parent_dir, "embedding_models/italian_word2vec_100.bin"),
        bert_model_name="m-polignano-uniba/bert_uncased_L-12_H-768_A-12_italian_alb3rt0"
    )

    # 2) Create an Evaluator
    evaluator = Evaluator(
        data_factory=data_factory,
        device=device,
        gru_hidden_size=32,
        num_gru_layers=2,
        gru_dropout=0.2,
        embedding_dim_gru=100,
        batch_size=8,
        max_len=50
    )

    # 3) Run evaluation
    results_df = evaluator.compute_metrics_over_files(
        file_list=weights_file_names,
        threshold_df=gru_val_set,
        eval_df=gru_test_set,
        parent_dir=parent_dir_weights
    )

    results_test_GruBERT[seed] = results_df


Seed 42


Evaluating models: 100%|██████████| 3/3 [00:41<00:00, 13.74s/it]


Seed 69


Evaluating models: 100%|██████████| 3/3 [00:38<00:00, 12.97s/it]


Seed 420


Evaluating models: 100%|██████████| 3/3 [00:39<00:00, 13.09s/it]


Seed 1


Evaluating models: 100%|██████████| 3/3 [00:37<00:00, 12.64s/it]


Seed 3


Evaluating models: 100%|██████████| 3/3 [00:38<00:00, 12.87s/it]


In [38]:
results_test_GruBERT[42]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.913422,0.462094,0.687758,0.2
1,grubert,pos_tags_enrichment,0.906854,0.453608,0.680231,0.19
2,grubert,hashtag_enrichment,0.901125,0.459547,0.680336,0.17


In [39]:
results_test_GruBERT[69]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.904061,0.454849,0.679455,0.18
1,grubert,pos_tags_enrichment,0.924915,0.45,0.687457,0.25
2,grubert,hashtag_enrichment,0.906141,0.404412,0.655277,0.19


In [40]:
results_test_GruBERT[420]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.926009,0.383178,0.654593,0.22
1,grubert,pos_tags_enrichment,0.897076,0.388889,0.642982,0.19
2,grubert,hashtag_enrichment,0.921759,0.445344,0.683552,0.26


In [41]:
results_test_GruBERT[1]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.925,0.445378,0.685189,0.26
1,grubert,pos_tags_enrichment,0.897076,0.388889,0.642982,0.19
2,grubert,hashtag_enrichment,0.915371,0.436782,0.676076,0.23


In [42]:
results_test_GruBERT[3]

Unnamed: 0,ModelClass,Configuration,F1_not_ironic,F1_ironic,F1_avg_0_1,BestThreshold
0,grubert,base_case,0.91056,0.415094,0.662827,0.2
1,grubert,pos_tags_enrichment,0.927731,0.394366,0.661049,0.25
2,grubert,hashtag_enrichment,0.914022,0.437736,0.675879,0.19


## Display

In [43]:
from metrics import average_out_scores_for_seeds

average_results_val = average_out_scores_for_seeds(results_val_GruBERT)

average_results_val

Unnamed: 0_level_0,Unnamed: 1_level_0,F1_not_ironic,F1_not_ironic,F1_ironic,F1_ironic,F1_avg_0_1,F1_avg_0_1,BestThreshold,BestThreshold
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
ModelClass,Configuration,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
grubert,base_case,0.920951,0.008369,0.367075,0.024239,0.644013,0.009596,0.212,0.030332
grubert,hashtag_enrichment,0.91382,0.012174,0.379224,0.027159,0.646522,0.017575,0.208,0.036332
grubert,pos_tags_enrichment,0.915707,0.010449,0.364643,0.024901,0.640175,0.009405,0.214,0.032863


In [45]:
average_results_test = average_out_scores_for_seeds(results_test_GruBERT)

average_results_test

Unnamed: 0_level_0,Unnamed: 1_level_0,F1_not_ironic,F1_not_ironic,F1_ironic,F1_ironic,F1_avg_0_1,F1_avg_0_1,BestThreshold,BestThreshold
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
ModelClass,Configuration,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
grubert,base_case,0.91581,0.009484,0.432119,0.032696,0.673965,0.01454,0.212,0.030332
grubert,hashtag_enrichment,0.911684,0.008105,0.436764,0.02025,0.674224,0.011063,0.208,0.036332
grubert,pos_tags_enrichment,0.91073,0.014817,0.41515,0.033559,0.66294,0.020617,0.214,0.032863
