In [153]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from pathlib import Path
import torch
from transformers import (
    BertTokenizer,
    BertModel,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from torch.utils.data import DataLoader
import tqdm.notebook as tq
from datasets import Dataset

In [50]:
from torch import cuda

device = "cuda" if cuda.is_available() else "cpu"

## Corpus

In [51]:
### Load corpus

data_path = Path("data/MELD_train_efr.json")
assert data_path.exists(), "Data file is not present"
df = pd.read_json(data_path, dtype={"speakers": np.array, "triggers": np.array})
EPISODE, SPEAKERS, EMOTIONS, UTTERANCES, TRIGGERS = df.columns

#### Data exploration

In [52]:
df.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"


In [53]:
df.describe()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
count,4000,4000,4000,4000,4000
unique,4000,3350,3427,3998,523
top,utterance_0,"[Monica, Chandler, Monica]","[neutral, neutral, joy]",[Happy?! Is that what I'm supposed to be Vic? ...,"[0.0, 1.0, 0.0]"
freq,1,15,30,2,191


In [54]:
### Look for how many groups of episodes with the same first utterance there are and their lenghts

df.sort_values(by=UTTERANCES, inplace=True)
groups = np.zeros((850,), dtype=int)

index = 0
count = 1
for i in range(1, len(df[UTTERANCES])):
    if df[UTTERANCES][i][0] == df[UTTERANCES][i - 1][0]:
        ### still in the same group
        count += 1
    else:
        ### found new group
        groups[index] = count
        index += 1
        count = 1

groups = groups[groups != 0]

print(f"Number of groups: {len(groups)}")
print(f"Avg group len: {np.average(groups):.1f}")
print(f"Longest group: {np.max(groups)}")
print(f"Episodes not in a group: {groups[groups == 1].shape[0]}")

Number of groups: 832
Avg group len: 4.8
Longest group: 16
Episodes not in a group: 128


In [55]:
### Count how many speakers there are in each episode

speakers_count = df[SPEAKERS].apply(lambda arr: np.unique(arr).shape[0]).to_numpy()
min_sp = np.min(speakers_count)
max_sp = np.max(speakers_count)
print("Distribution of number of speakers:")
for count in range(min_sp, max_sp + 1):
    print(f"{count} speakers:  {np.sum(speakers_count == count)}")

Distribution of number of speakers:
1 speakers:  214
2 speakers:  2105
3 speakers:  1030
4 speakers:  405
5 speakers:  161
6 speakers:  74
7 speakers:  10
8 speakers:  1


#### Data cleanup

In [56]:
### Drop not useful column

df.drop(columns=[EPISODE])

Unnamed: 0,speakers,emotions,utterances,triggers
1061,"[Joey, Gunther, Joey]","[joy, neutral, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 1.0, 0.0]"
1062,"[Joey, Gunther, Joey, Gunther]","[joy, neutral, surprise, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 1.0, 0.0]"
1063,"[Joey, Gunther, Joey, Gunther, Joey]","[joy, neutral, surprise, surprise, neutral]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 1.0, 0.0]"
1064,"[Joey, Gunther, Joey, Gunther, Joey, Gunther]","[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
1065,"[Joey, Gunther, Joey, Gunther, Joey, Gunther, ...","[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
...,...,...,...,...
597,"[Singer, Joey, Phoebe, Chandler, Phoebe, Chand...","[joy, surprise, anger, neutral, neutral, neutr...","[Cause every time I see your face, I can't he...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
598,"[Singer, Joey, Phoebe, Chandler, Phoebe, Chand...","[joy, surprise, anger, neutral, neutral, neutr...","[Cause every time I see your face, I can't he...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
599,"[Singer, Joey, Phoebe, Chandler, Phoebe, Chand...","[joy, surprise, anger, neutral, neutral, neutr...","[Cause every time I see your face, I can't he...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
600,"[Singer, Joey, Phoebe, Chandler, Phoebe, Chand...","[joy, surprise, anger, neutral, neutral, neutr...","[Cause every time I see your face, I can't he...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [57]:
### Remove Nones from the triggers

df[TRIGGERS] = df[TRIGGERS].apply(
    lambda trig_seq: np.array([0.0 if t is None else t for t in trig_seq])
)

In [58]:
df.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
1061,utterance_1061,"[Joey, Gunther, Joey]","[joy, neutral, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 1.0, 0.0]"
1062,utterance_1062,"[Joey, Gunther, Joey, Gunther]","[joy, neutral, surprise, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 1.0, 0.0]"
1063,utterance_1063,"[Joey, Gunther, Joey, Gunther, Joey]","[joy, neutral, surprise, surprise, neutral]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 1.0, 0.0]"
1064,utterance_1064,"[Joey, Gunther, Joey, Gunther, Joey, Gunther]","[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
1065,utterance_1065,"[Joey, Gunther, Joey, Gunther, Joey, Gunther, ...","[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"


### Data preprocessing:
If an episode contains the same utterances of the previous and a few more then the triggers from the previous episode are replicated in the current episode

In [59]:
for i in range(10):
    print(f"{df[TRIGGERS][i]}")

[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 1. 0.]
[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 1. 1. 0.]
[0. 1. 0.]
[0. 0. 0. 1.]
[0. 0. 0. 1. 0.]


In [60]:
count = 0
for i in range(1, len(df)):
    is_continuation = np.all([u in df[UTTERANCES][i] for u in df[UTTERANCES][i - 1]])
    if is_continuation:
        count += 1
        for k, t in enumerate(df[TRIGGERS][i - 1]):
            df[TRIGGERS][i][k] = t

In [61]:
for i in range(10):
    print(f"{df[TRIGGERS][i]}")

[0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 1. 0.]
[0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0.]
[0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0.]
[0. 0. 1. 0.]
[0. 0. 1. 0. 0.]
[0. 0. 1. 0. 0. 1. 0.]
[0. 1. 0.]
[0. 1. 0. 1.]
[0. 1. 0. 1. 0.]


#### Train/Validation/Test split

In [62]:
### Train Val Test split: 80/10/10


def split_data(df: pd.DataFrame, seed: int = 42):
    df_train, df_test = train_test_split(
        df, test_size=0.2, train_size=0.8, random_state=seed
    )

    df_val, df_test = train_test_split(
        df_test, test_size=0.5, train_size=0.5, random_state=seed
    )

    return df_train, df_val, df_test

In [63]:
### Check
df_train, df_val, df_test = split_data(df)
print(f"df_train len: {len(df_train)}")
print(f"df_val len: {len(df_val)}")
print(f"df_test len: {len(df_test)}")

df_train len: 3200
df_val len: 400
df_test len: 400


In [64]:
### Class imbalance check
classes_count = {}
for emotions in df_train["emotions"]:
    for emotion in emotions:
        if emotion in classes_count:
            classes_count[emotion] += 1
        else:
            classes_count[emotion] = 1

# then we sort the dictionary by occurences
emotions_dict = {
    k: v
    for k, v in sorted(classes_count.items(), key=lambda item: item[1], reverse=True)
}
print("Classes values:")
print(emotions_dict)

# Class imbalance abbastanza alto, potremmo usare dei weights

Classes values:
{'neutral': 12287, 'joy': 4948, 'surprise': 3700, 'anger': 3184, 'sadness': 2159, 'fear': 903, 'disgust': 853}


### Metrics

In [65]:
### given 2 series of sequences of triggers/emotions compute F1 inside each sequence and return avg, flatten out and compute F1
def sequence_f1(y_true, y_pred, avg: bool = True):
    res = [
        f1_score(y_true=y_t, y_pred=y_p, average="micro")
        for y_t, y_p in zip(y_true, y_pred)
    ]
    return np.average(res) if avg else res


def unrolled_f1(y_true, y_pred):
    y_t_flat = []
    for l in y_true:
        for e in l:
            y_t_flat.append(e)

    y_p_flat = []
    for l in y_pred:
        for e in l:
            y_p_flat.append(e)

    return f1_score(y_true=y_t_flat, y_pred=y_p_flat, average="micro")

## Baseline Models: 

In [66]:
### Create baseline models


class SequenceDummyClassifier(DummyClassifier):
    def __init__(self, strategy: str, seed: int = 42) -> None:
        self.seed = seed
        if not strategy.lower() in ("random", "majority"):
            raise ValueError("strategy must be in [random, majority]")
        sklearn_strategy = "uniform" if strategy == "random" else "most_frequent"
        super().__init__(strategy=sklearn_strategy, random_state=seed)

    ### TODO discuss: problem = flattening sequences of != len from the df

    def _flatten_seq(self, df: pd.Series):
        res = []
        for l in df:
            for e in l:
                res.append(e)
        return res

    def _flatten_seq_(self, df: pd.Series):
        max_len = np.max(df.apply(lambda s: len(s)).to_numpy())
        dtype = type(df[0][0])
        pad_element = dtype(999999)
        ### Pad utterances with 0, flatten array
        df = np.array(
            df.apply(
                lambda s: np.hstack(
                    (
                        s,
                        np.repeat(
                            [pad_element],
                            repeats=(max_len - len(s)),
                        ),
                    )
                )
            ).to_list()
        ).flatten()
        ### Remove padding
        return (
            df[np.char.not_equal(df, pad_element)]
            if dtype == str
            else df[df != dtype(pad_element)]
        )

    def _deflatten_seq(self, seq, shape_like: pd.Series):
        data = iter(seq)
        result = [[next(data) for _ in s] for s in shape_like]
        return result

    def fit(self, X: pd.Series, y: pd.Series):
        X_flat = self._flatten_seq(X)
        y_flat = self._flatten_seq(y)
        super().fit(X=X_flat, y=y_flat)

    def predict(self, X: pd.Series, return_flat: bool = False):
        X_flat = self._flatten_seq(X)
        y_flat = super().predict(X_flat)
        return y_flat if return_flat else self._deflatten_seq(seq=y_flat, shape_like=X)

In [67]:
def experiment_baseline(df_train: pd.DataFrame, df_test: pd.DataFrame, seed: int = 42):

    baseline_f1s = {}
    baseline_results = {}

    for strategy in ("Random", "Majority"):
        for target in (EMOTIONS, TRIGGERS):
            clf = SequenceDummyClassifier(strategy=strategy, seed=seed)
            clf.fit(X=df_train[UTTERANCES], y=df_train[target])

            res = clf.predict(X=df_test[UTTERANCES], return_flat=False)
            baseline_results.update({f"{target}_{strategy}": res})

            seq_f1 = sequence_f1(y_true=df_test[target], y_pred=res)
            baseline_f1s.update({f"sequence_f1({target}_{strategy})": seq_f1})

            unr_f1 = unrolled_f1(y_true=df_test[target], y_pred=res)
            baseline_f1s.update({f"unrolled_f1({target}_{strategy})": unr_f1})

    return baseline_f1s, baseline_results


f1s, results = experiment_baseline(df_train, df_test)
for k, v in f1s.items():
    print(f"{k} : {v}")

sequence_f1(emotions_Random) : 0.4233349753853817
unrolled_f1(emotions_Random) : 0.4330935251798561
sequence_f1(triggers_Random) : 0.6519219611872475
unrolled_f1(triggers_Random) : 0.6515107913669065
sequence_f1(emotions_Majority) : 0.4233349753853817
unrolled_f1(emotions_Majority) : 0.4330935251798561
sequence_f1(triggers_Majority) : 0.6519219611872475
unrolled_f1(triggers_Majority) : 0.6515107913669065


# Tokenization

In [68]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [133]:
def tokenize(ds_row, tokenizer=tokenizer):

    encoded_ds_row = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        "emotions": ds_row["emotions"],
        "triggers": ds_row["triggers"],
    }

    for sentence in ds_row["utterances"]:

        tokenized_sentence = tokenizer(
            sentence,
            truncation=True,
            padding="max_length",
            max_length=tokenizer.model_max_length,
            return_tensors="pt",
        )
        # TODO instead of appending to a list try to concatenate the tokens, take care of the sizes (n*512 is wrong)
        encoded_ds_row["input_ids"].append(tokenized_sentence["input_ids"])
        encoded_ds_row["token_type_ids"].append(tokenized_sentence["token_type_ids"])
        encoded_ds_row["attention_mask"].append(tokenized_sentence["attention_mask"])

    return encoded_ds_row

In [134]:
ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)
ds_test = Dataset.from_pandas(df_test)

In [135]:
ds_train_tokenized = ds_train.map(
    function=tokenize,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    remove_columns=[EPISODE, SPEAKERS, UTTERANCES],
)
ds_val_tokenized = ds_val.map(
    function=tokenize,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    remove_columns=[EPISODE, SPEAKERS, UTTERANCES],
)
ds_test_tokenized = ds_test.map(
    function=tokenize,
    fn_kwargs={"tokenizer": tokenizer},
    batched=True,
    remove_columns=[EPISODE, SPEAKERS, UTTERANCES],
)

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [129]:
ds_train_tokenized[0]

{'emotions': ['joy', 'surprise', 'anger', 'neutral', 'neutral'],
 'utterances': ["\x91Cause every time I see your face, I can't help but fall from grace. I know.....",
  'Wow! This girl is good.',
  'Oh-ho yeah! A song with rhyming words. Oo, I never thought of that before.',
  'I like her.',
  'Why? Because she can sing and play guitar and do both at the same time?'],
 'triggers': [0.0, 0.0, 0.0, 1.0, 0.0],
 '__index_level_0__': 596,
 'input_ids': [[101,
   3426,
   2296,
   2051,
   1045,
   2156,
   2115,
   2227,
   1010,
   1045,
   2064,
   1005,
   1056,
   2393,
   2021,
   2991,
   2013,
   4519,
   1012,
   1045,
   2113,
   1012,
   1012,
   1012,
   1012,
   1012,
   102,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
 

In [110]:
# Test della funzione
tokens = tokenize(df_train["utterances"][0], tokenizer)
print(tokens)

TypeError: list indices must be integers or slices, not str

# Bert Models

In [136]:
class BERTClass(torch.nn.Module):
    def __init__(self, num_emotions=7):
        super(BERTClass, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased")

        self.dropout = torch.nn.Dropout(0.3)
        # self.lstm = torch.nn.LSTM()
        # classifiers
        self.l_emotions = torch.nn.Linear(self.bert.config.hidden_size, num_emotions)
        self.l_triggers = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)

        output_emotions = self.dropout(output)
        output_triggers = self.dropout(output)

        # output_triggers = self.lstm(output_triggers)

        output_emotions = self.l_emotions(output_emotions)
        output_triggers = self.l_triggers(output_triggers)

        return output_emotions, output_triggers

    def freeze_params(self):
        for param in self.bert.parameters():
            param.requires_grad = False

# Training Utils

In [137]:
num_emotions = df["emotions"].explode().nunique()

model_frozen = BERTClass(num_emotions)
model_full = BERTClass(num_emotions)

model_frozen.freeze_params()

# Verifying that the params are actually frozen
for name, param in model_frozen.named_parameters():
    print(name, param.requires_grad)

for name, param in model_full.named_parameters():
    print(name, param.requires_grad)

bert.embeddings.word_embeddings.weight False
bert.embeddings.position_embeddings.weight False
bert.embeddings.token_type_embeddings.weight False
bert.embeddings.LayerNorm.weight False
bert.embeddings.LayerNorm.bias False
bert.encoder.layer.0.attention.self.query.weight False
bert.encoder.layer.0.attention.self.query.bias False
bert.encoder.layer.0.attention.self.key.weight False
bert.encoder.layer.0.attention.self.key.bias False
bert.encoder.layer.0.attention.self.value.weight False
bert.encoder.layer.0.attention.self.value.bias False
bert.encoder.layer.0.attention.output.dense.weight False
bert.encoder.layer.0.attention.output.dense.bias False
bert.encoder.layer.0.attention.output.LayerNorm.weight False
bert.encoder.layer.0.attention.output.LayerNorm.bias False
bert.encoder.layer.0.intermediate.dense.weight False
bert.encoder.layer.0.intermediate.dense.bias False
bert.encoder.layer.0.output.dense.weight False
bert.encoder.layer.0.output.dense.bias False
bert.encoder.layer.0.output.Lay

In [138]:
model_list = [model_full, model_frozen]
num_epochs = 5

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

for model in model_list:
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()
    model.train()

    # Tokenizer initiation
    # TODO Check tokenizer parameters
    # TODO A Dataloader seems to be commonly used for this use cases
    # encoding = tokenizer(df_train, truncation=False, padding='max_length', return_tensors='pt')
    # input_ids = encoding['input_ids']
    # attention_mask = encoding['attention_mask']

    # Training loop
    # for epoch in range(num_epochs):
    #
    #    for idx in range(len(df_test)):

    #        text = df_train[idx].drop(TRIGGERS)
    #        label = df_train[idx][TRIGGERS]

    #        optimizer.zero_grad()

    #        logits = model(batch_data)
    #        loss = loss_fn(logits, batch_labels)
    #        loss.backward()
    #        optimizer.step()

In [139]:
def loss_fn(outputs_emotions, outputs_triggers, emotions_labels, triggers_labels):
    return torch.nn.CrossEntropyLoss(
        outputs_emotions, emotions_labels
    ) + torch.nn.BCELoss(outputs_triggers, triggers_labels)

In [140]:
### Training of the model
def train_model(train_dl, model, optimizer):
    losses = []
    correct_predictions_emotions = 0
    correct_predictions_triggers = 0
    num_samples_emotions = 0
    num_samples_triggers = 0

    ### activate dropout, batch norm
    model.train()

    ### initialize progress bar
    batches = tq.tqdm(
        enumerate(train_dl), total=len(train_dl), leave=True, colour="steelblue"
    )

    for batch_idx, data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        emotions_labels = data["emotions"].to(device, dtype=torch.float)
        triggers_labels = data["triggers"].to(device, dtype=torch.float)
        outputs_emotions, outputs_triggers = model(ids, mask)  ### Forward

        loss = loss_fn(
            outputs_emotions, outputs_triggers, emotions_labels, triggers_labels
        )
        losses.append(loss.cpu().detach().numpy())

        ### apply thresh 0.5
        outputs_emotions = (
            torch.sigmoid(outputs_emotions).cpu().detach().numpy().round()
        )
        outputs_triggers = (
            torch.sigmoid(outputs_triggers).cpu().detach().numpy().round()
        )

        emotions_labels = emotions_labels.cpu().detach().numpy()
        triggers_labels = triggers_labels.cpu().detach().numpy()

        correct_predictions_emotions += np.sum(outputs_emotions == emotions_labels)
        correct_predictions_triggers += np.sum(outputs_triggers == triggers_labels)

        num_samples_emotions += emotions_labels.size
        num_samples_triggers += triggers_labels.size

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar
        batches.set_description(f"")
        batches.set_postfix(batch_loss=loss)

    # Si potrebbe fare una singola accuracy come media delle due, magari fuori dal training
    accuracy_emotions = float(correct_predictions_emotions) / num_samples_emotions
    accuracy_triggers = float(correct_predictions_triggers) / num_samples_triggers

    return model, accuracy_emotions, accuracy_triggers, losses

In [141]:
# eval model, setup e train_eval da definire

In [142]:
def create_data_loaders(tokenized_datasets, batch_size):
    train_dl = torch.utils.data.DataLoader(
        tokenized_datasets["train"],
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
    )

    validation_dl = torch.utils.data.DataLoader(
        tokenized_datasets["validation"],
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

    test_dl = torch.utils.data.DataLoader(
        tokenized_datasets["test"],
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )
    return train_dl, validation_dl, test_dl

In [143]:
class BertTrainer(Trainer):
    def __init__(self, model, training_args, train_ds, eval_ds, metrics):
        super().__init__(model, training_args, train_ds, eval_ds, metrics)

    def compute_loss(self, model, inputs, return_outputs=False):

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        emotions_labels = inputs["emotions"]
        triggers_labels = inputs["triggers"]
        output_emotions, output_triggers = model(ids, mask)

        custom_loss = self.loss_fn(
            output_emotions, output_triggers, emotions_labels, triggers_labels
        )

        return (
            (custom_loss, output_emotions, output_triggers)
            if return_outputs
            else custom_loss
        )

    def loss_fn(outputs_emotions, outputs_triggers, emotions_labels, triggers_labels):
        return torch.nn.CrossEntropyLoss(
            outputs_emotions, emotions_labels
        ) + torch.nn.BCELoss(outputs_triggers, triggers_labels)

In [144]:
training_args = TrainingArguments(
    output_dir="./test",
    do_train=True,
    do_eval=True,
    # evaluate_during_training=True,
    learning_rate=5e-5,
    num_train_epochs=8,
    seed=42,
)

In [159]:
trainer = Trainer(
    model=model_full,
    args=training_args,
    # data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=ds_train,
    eval_dataset=ds_val,
    compute_metrics=sequence_f1,
    # tokenizer=tokenizer,
)

In [160]:
trainer.train()

  0%|          | 0/3200 [00:00<?, ?it/s]

IndexError: Invalid key: 2969 is out of bounds for size 0