In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from pathlib import Path
import torch
from transformers import (
    BertTokenizer,
    BertModel,
    Trainer,
    TrainingArguments,
    # DataCollatorWithPadding,
)

# from torch.utils.data import DataLoader
import tqdm.notebook as tq
from datasets import Dataset
from typing import Tuple

# from sklearn.preprocessing import OneHotEncoder

  return torch._C._cuda_getDeviceCount() if nvml_count < 0 else nvml_count


In [2]:
from torch import cuda

device = "cuda" if cuda.is_available() else "cpu"

## Corpus

In [3]:
### Load corpus

data_path = Path("data/MELD_train_efr.json")
assert data_path.exists(), "Data file is not present"
raw_df = pd.read_json(
    data_path, dtype={"speakers": np.array}
)  # , "triggers": np.array})
EPISODE, SPEAKERS, EMOTIONS, UTTERANCES, TRIGGERS = raw_df.columns

#### Data exploration

In [4]:
raw_df.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"


In [5]:
raw_df.describe()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
count,4000,4000,4000,4000,4000
unique,4000,3350,3427,3998,523
top,utterance_0,"[Monica, Chandler, Monica]","[neutral, neutral, joy]",[Happy?! Is that what I'm supposed to be Vic? ...,"[0.0, 1.0, 0.0]"
freq,1,15,30,2,191


In [6]:
### Look for how many groups of episodes with the same first utterance there are and their lenghts

raw_df.sort_values(by=UTTERANCES, inplace=True)
groups = np.zeros((850,), dtype=int)

index = 0
count = 1
for i in range(1, len(raw_df[UTTERANCES])):
    if raw_df[UTTERANCES][i][0] == raw_df[UTTERANCES][i - 1][0]:
        ### still in the same group
        count += 1
    else:
        ### found new group
        groups[index] = count
        index += 1
        count = 1

groups = groups[groups != 0]

print(f"Number of groups: {len(groups)}")
print(f"Avg group len: {np.average(groups):.1f}")
print(f"Longest group: {np.max(groups)}")
print(f"Episodes not in a group: {groups[groups == 1].shape[0]}")

Number of groups: 832
Avg group len: 4.8
Longest group: 16
Episodes not in a group: 128


In [7]:
### Count how many speakers there are in each episode

speakers_count = raw_df[SPEAKERS].apply(lambda arr: np.unique(arr).shape[0]).to_numpy()
min_sp = np.min(speakers_count)
max_sp = np.max(speakers_count)
print("Distribution of number of speakers:")
for count in range(min_sp, max_sp + 1):
    print(f"{count} speakers:  {np.sum(speakers_count == count)}")

Distribution of number of speakers:
1 speakers:  214
2 speakers:  2105
3 speakers:  1030
4 speakers:  405
5 speakers:  161
6 speakers:  74
7 speakers:  10
8 speakers:  1


In [8]:
### Class imbalance check
classes_count = {}
for emotions in raw_df["emotions"]:
    for emotion in emotions:
        if emotion in classes_count:
            classes_count[emotion] += 1
        else:
            classes_count[emotion] = 1

### then we sort the dictionary by occurences
emotions_dict = {
    k: v
    for k, v in sorted(classes_count.items(), key=lambda item: item[1], reverse=True)
}
print("Classes values:")
print(emotions_dict)

### Classes counts are not balanced: the use of weights is recommended

Classes values:
{'neutral': 15263, 'joy': 6317, 'surprise': 4645, 'anger': 3964, 'sadness': 2648, 'fear': 1114, 'disgust': 1049}


#### Data cleanup

In [9]:
### Drop not useful column

raw_df.drop(columns=[SPEAKERS], inplace=True)

In [10]:
### Remove Nones from the triggers

raw_df[TRIGGERS] = raw_df[TRIGGERS].apply(
    lambda trig_seq: np.array([0.0 if t is None else t for t in trig_seq])
)

In [11]:
### Change column "episode" from utterance_xyz to episode_xyz
for i in range(len(raw_df)):
    raw_df[EPISODE][i] = f"episode_{raw_df[EPISODE][i][10:]}"

clean_df = raw_df
clean_df.head()

Unnamed: 0,episode,emotions,utterances,triggers
1061,episode_1061,"[joy, neutral, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 1.0, 0.0]"
1062,episode_1062,"[joy, neutral, surprise, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 1.0, 0.0]"
1063,episode_1063,"[joy, neutral, surprise, surprise, neutral]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 1.0, 0.0]"
1064,episode_1064,"[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
1065,episode_1065,"[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"


### Data preprocessing:


If an episode contains the same utterances of the previous and a few more then the triggers from the previous episode are replicated in the current episode

In [12]:
for i in range(10):
    print(f"{raw_df[TRIGGERS][i]}")

[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 1. 0.]
[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 1. 1. 0.]
[0. 1. 0.]
[0. 0. 0. 1.]
[0. 0. 0. 1. 0.]


In [13]:
### Replicate triggers

count = 0
for i in range(1, len(clean_df)):
    is_continuation = np.all(
        [u in clean_df[UTTERANCES][i] for u in clean_df[UTTERANCES][i - 1]]
    )
    if is_continuation:
        count += 1
        for k, t in enumerate(clean_df[TRIGGERS][i - 1]):
            clean_df[TRIGGERS][i][k] = t

In [14]:
for i in range(10):
    print(f"{raw_df[TRIGGERS][i]}")

[0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 1. 0.]
[0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0.]
[0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0.]
[0. 0. 1. 0.]
[0. 0. 1. 0. 0.]
[0. 0. 1. 0. 0. 1. 0.]
[0. 1. 0.]
[0. 1. 0. 1.]
[0. 1. 0. 1. 0.]


In [15]:
### Train Val Test split: 80/10/10


def split_data(
    df: pd.DataFrame, seed: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    df_train, df_test = train_test_split(
        df, test_size=0.2, train_size=0.8, random_state=seed
    )

    df_val, df_test = train_test_split(
        df_test, test_size=0.5, train_size=0.5, random_state=seed
    )

    return df_train, df_val, df_test

In [16]:
### Check
df_train_t, df_val_t, df_test_t = split_data(clean_df)
print(f"df_train len: {len(df_train_t)}")
print(f"df_val len: {len(df_val_t)}")
print(f"df_test len: {len(df_test_t)}")

df_train len: 3200
df_val len: 400
df_test len: 400


explode the dataframe: <br>
each rows contains: previous utterance, target utterance, next utterance (for context). <br>
Except for the first and lat utterance of each episode that have no previous and next utterance, respectively

In [17]:
def explode_add_context(df: pd.DataFrame, ctxt_win_len: int = 1) -> pd.DataFrame:
    # TODO ok che context_window sia il numero di utt future == numero utt passate invece che la len di tutta la window?

    ### Flatten the lists of utterances,triggers,emotions into new rows of the dataframe
    exploded_df = df.explode([UTTERANCES, TRIGGERS, EMOTIONS], ignore_index=True)
    exploded_df.rename(columns={UTTERANCES: "current"}, inplace=True)
    exploded_df.head(10)

    ### Pair shifted columns of utterances to the exploded df to make previous and next
    for i in range(1, ctxt_win_len + 1):
        padding_cells = pd.Series([" " for _ in range(i)])

        next_col = pd.concat(
            (exploded_df["current"][i:], padding_cells), copy=False
        ).to_list()
        exploded_df.insert(loc=2, column=f"next_{i}", value=next_col)

        previous_col = pd.concat(
            (padding_cells, exploded_df["current"][:-i]), copy=False
        ).to_list()
        exploded_df.insert(loc=2 + 2 * i, column=f"previous_{i}", value=previous_col)

    ### Remove the previous of the first utterance and the next of the last utterance of each episode
    for i in range(1, len(exploded_df) - 1):
        for j in range(1, ctxt_win_len + 1):
            if exploded_df[EPISODE][i] != exploded_df[EPISODE][i - 1]:
                exploded_df[f"next_{j}"][i - j] = " "
                exploded_df[f"previous_{j}"][i] = " "

    exploded_df.sort_values(by=EPISODE, inplace=True)
    return exploded_df

In [18]:
df_train = explode_add_context(df_train_t, 1)
df_val = explode_add_context(df_val_t, 1)
df_test = explode_add_context(df_test_t, 1)

df_train.head(10)

Unnamed: 0,episode,emotions,next_1,current,previous_1,triggers
864,episode_0,neutral,You must've had your hands full.,also I was the point person on my company's tr...,,0.0
868,episode_0,surprise,,My duties? All right.,So let's talk a little bit about your duties.,0.0
867,episode_0,neutral,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,1.0
866,episode_0,neutral,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,0.0
865,episode_0,neutral,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,0.0
24390,episode_1,neutral,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,0.0
24389,episode_1,neutral,You must've had your hands full.,also I was the point person on my company's tr...,,0.0
24391,episode_1,neutral,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,0.0
24395,episode_1,neutral,,I see.,"Now you'll be heading a whole division, so you...",0.0
24392,episode_1,neutral,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,1.0


In [20]:
dff = explode_add_context(df_train_t, 2)
dff.head(10)

Unnamed: 0,episode,emotions,next_2,next_1,current,previous_1,previous_2,triggers
864,episode_0,neutral,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,,,0.0
868,episode_0,surprise,"Yeah, it kinda grows on you. Actually, I want...",,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,0.0
867,episode_0,neutral,,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,1.0
866,episode_0,neutral,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,0.0
865,episode_0,neutral,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"You-you-you didn't know that. Well, I guess m...",0.0
24390,episode_1,neutral,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"Hello, Joey.",0.0
24389,episode_1,neutral,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,,,0.0
24391,episode_1,neutral,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,0.0
24395,episode_1,neutral,Listen to the plinky-plunky music.,,I see.,"Now you'll be heading a whole division, so you...",My duties? All right.,0.0
24392,episode_1,neutral,"Now you'll be heading a whole division, so you...",My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,1.0


In [21]:
dff = explode_add_context(df_train_t, 3)
dff.head(6)

Unnamed: 0,episode,emotions,next_3,next_2,next_1,current,previous_1,previous_2,previous_3,triggers
864,episode_0,neutral,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,,,,0.0
868,episode_0,surprise,Oh good.,"Yeah, it kinda grows on you. Actually, I want...",,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,0.0
867,episode_0,neutral,"Yeah, it kinda grows on you. Actually, I want...",,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,1.0
866,episode_0,neutral,,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"You-you-you didn't know that. Well, I guess m...",0.0
865,episode_0,neutral,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"You-you-you didn't know that. Well, I guess m...",That's why you broke up with me?,0.0
24390,episode_1,neutral,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"Hello, Joey.",Sorry. Wrong boobies.,0.0


In [22]:
dff = explode_add_context(df_train_t, 5)
dff.head(6)

Unnamed: 0,episode,emotions,next_5,next_4,next_3,next_2,next_1,current,previous_1,previous_2,previous_3,previous_4,previous_5,triggers
864,episode_0,neutral,,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,,,,,,0.0
868,episode_0,surprise,"So, I'm sorry I just don't think we should go ...","Look, I Look, I'm having a great time with y...",Oh good.,"Yeah, it kinda grows on you. Actually, I want...",,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"You-you-you didn't know that. Well, I guess m...",0.0
867,episode_0,neutral,"Look, I Look, I'm having a great time with y...",Oh good.,"Yeah, it kinda grows on you. Actually, I want...",,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"You-you-you didn't know that. Well, I guess m...",That's why you broke up with me?,1.0
866,episode_0,neutral,Oh good.,"Yeah, it kinda grows on you. Actually, I want...",,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"You-you-you didn't know that. Well, I guess m...",That's why you broke up with me?,"Ah, uh, I owe you a long overdue apology. I ne...",0.0
865,episode_0,neutral,"Yeah, it kinda grows on you. Actually, I want...",,My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"You-you-you didn't know that. Well, I guess m...",That's why you broke up with me?,"Ah, uh, I owe you a long overdue apology. I ne...",It's pretty clear.,0.0
24390,episode_1,neutral,I see.,"Now you'll be heading a whole division, so you...",My duties? All right.,So let's talk a little bit about your duties.,That I did. That I did.,You must've had your hands full.,also I was the point person on my company's tr...,"Hello, Joey.",Sorry. Wrong boobies.,Joey!! What the hell were you doing?!,Clear the tracks for the boobie payback expres...,0.0


### Metrics

In [None]:
### Given 2 series of sequences of triggers/emotions compute F1 inside each sequence and return avg,

# TODO remove
# def sequence_f1(y_true, y_pred, avg: bool = True):
#     res = [
#         f1_score(y_true=y_t, y_pred=y_p, average="micro")
#         for y_t, y_p in zip(y_true, y_pred)
#     ]
#     return np.average(res) if avg else res


### Compute F1 score for each flattened dialogue and return avg over dialogues
def sequence_f1(
    y_true: pd.DataFrame,
    y_pred: np.ndarray,
    target_column: str,
    avg: bool = True,
):
    assert len(y_pred) == len(y_true), "y_pred and y_true must be of the same lenght"
    assert (
        y_true[EPISODE].is_monotonic_increasing
        or y_true[EPISODE].is_monotonic_decreasing
    ), "utterances must be sorted over the episodes"

    res = {}
    start = 0
    stop_incl = 0
    for i in range(1, len(y_pred)):
        if y_true[EPISODE][i - 1] != y_true[EPISODE][i]:
            stop_incl = i - 1
            f1 = f1_score(
                y_true=y_true[target_column][start : stop_incl + 1].to_list(),
                y_pred=y_pred[start : stop_incl + 1],
                average="micro",
            )
            res.update({y_true[EPISODE][start]: f1})
            start = i

    # np.std(list(res.values))
    return res if not avg else np.average(list(res.values()))


### Compute F1 score for the unrolled sequence
def unrolled_f1(
    y_true: pd.DataFrame,
    y_pred: np.ndarray,
    target_column: str,
):
    return f1_score(y_true[target_column].to_list(), y_pred, average="micro")


# TODO remove
# def unrolled_f1(y_true, y_pred):
#     y_t_flat = []
#     for l in y_true:
#         for e in l:
#             y_t_flat.append(e)
#
#     y_p_flat = []
#     for l in y_pred:
#         for e in l:
#             y_p_flat.append(e)
#
#     return f1_score(y_true=y_t_flat, y_pred=y_p_flat, average="micro")

## Baseline Models: 

In [None]:
### Create baseline models


# TODO do we still need the class?
class SequenceDummyClassifier(DummyClassifier):
    def __init__(self, strategy: str, seed: int = 42) -> None:
        self.seed = seed
        if not strategy.lower() in ("random", "majority"):
            raise ValueError("strategy must be in [random, majority]")
        sklearn_strategy = "uniform" if strategy == "random" else "most_frequent"
        super().__init__(strategy=sklearn_strategy, random_state=seed)

    # TODO remove
    # def _flatten_seq(self, df: pd.Series):
    #     res = []
    #     for l in df:
    #         for e in l:
    #             res.append(e)
    #     return res

    # TODO remove
    # def _deflatten_seq(self, seq, shape_like: pd.Series):
    #     data = iter(seq)
    #     result = [[next(data) for _ in s] for s in shape_like]
    #     return result

    # TODO remove
    # def fit(self, X: pd.Series, y: pd.Series):
    #     X_flat = self._flatten_seq(X)
    #     y_flat = self._flatten_seq(y)
    #     super().fit(X=X_flat, y=y_flat)

    # TODO remove
    # def predict(self, X: pd.Series, return_flat: bool = False):
    #     X_flat = self._flatten_seq(X)
    #     y_flat = super().predict(X_flat)
    #     return y_flat if return_flat else self._deflatten_seq(seq=y_flat, shape_like=X)

In [26]:
def experiment_baseline(df_train: pd.DataFrame, df_test: pd.DataFrame, seed: int = 42):

    baseline_f1s = {}
    baseline_results = {}

    for strategy in ("Random", "Majority"):
        for target in (EMOTIONS, TRIGGERS):
            clf = SequenceDummyClassifier(strategy=strategy, seed=seed)
            clf.fit(X=df_train["current"], y=df_train[target])

            res = clf.predict(X=df_test["current"])
            baseline_results.update({f"{target}_{strategy}": res})

            seq_f1 = sequence_f1(y_true=df_test, y_pred=res, target_column=target)
            baseline_f1s.update({f"sequence_f1({target}_{strategy})": seq_f1})

            unr_f1 = unrolled_f1(y_true=df_test, y_pred=res, target_column=target)
            baseline_f1s.update({f"unrolled_f1({target}_{strategy})": unr_f1})

    return baseline_f1s, baseline_results


f1s, results = experiment_baseline(df_train, df_test)
for k, v in f1s.items():
    print(f"{k} : {v}")

sequence_f1(emotions_Random) : 0.44657433160540777
unrolled_f1(emotions_Random) : 0.4330935251798561
sequence_f1(triggers_Random) : 0.6522845341422358
unrolled_f1(triggers_Random) : 0.6515107913669065
sequence_f1(emotions_Majority) : 0.44657433160540777
unrolled_f1(emotions_Majority) : 0.4330935251798561
sequence_f1(triggers_Majority) : 0.6522845341422358
unrolled_f1(triggers_Majority) : 0.6515107913669065


# Tokenization

In [27]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

all_emotions = []
for seq in df_train[EMOTIONS]:
    for e in seq:
        all_emotions.append(e)
uniq_emotions = np.sort(np.unique(all_emotions))
one_hot = np.identity(len(uniq_emotions))
emotion_mapping = {e: one_hot[i] for i, e in enumerate(uniq_emotions)}
print(emotion_mapping)

{'a': array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'd': array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'e': array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'f': array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'g': array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'i': array([0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'j': array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), 'l': array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]), 'n': array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]), 'o': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]), 'p': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), 'r': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]), 's': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,

In [28]:
def tokenize(ds_row, tokenizer=tokenizer):

    if type(ds_row["emotions"][0]) != str:  ### batchsize > 1
        emotion_encoding = []
        emotion_encoding = [
            [emotion_mapping[e] for e in emotions_of_one_utterance]
            for emotions_of_one_utterance in ds_row["emotions"]
        ]
    else:  ### batchsize == 1
        emotion_encoding = [emotion_mapping[e] for e in ds_row["emotions"]]

    encoded_ds_row = {
        "input_ids": [],
        "token_type_ids": [],
        "attention_mask": [],
        # "emotions": ds_row["emotions"],
        "emotions": emotion_encoding,
        "triggers": ds_row["triggers"],
    }

    for sentence in ds_row["utterances"]:

        tokenized_sentence = tokenizer(
            sentence,
            truncation=True,
            padding="max_length",
            max_length=tokenizer.model_max_length // 4,
            return_tensors="pt",
            # is_split_into_words=True,
        )
        encoded_ds_row["input_ids"].append(tokenized_sentence["input_ids"])
        encoded_ds_row["token_type_ids"].append(tokenized_sentence["token_type_ids"])
        encoded_ds_row["attention_mask"].append(tokenized_sentence["attention_mask"])

    return encoded_ds_row

In [29]:
ds_train = Dataset.from_pandas(df_train)
ds_val = Dataset.from_pandas(df_val)
ds_test = Dataset.from_pandas(df_test)

In [30]:
### Apply tokenization
batched = True
ds_train_tokenized = ds_train.map(
    function=tokenize,
    fn_kwargs={"tokenizer": tokenizer},
    batched=batched,
    remove_columns=[UTTERANCES],
)
ds_train_tokenized.set_format(type="torch")
ds_val_tokenized = ds_val.map(
    function=tokenize,
    fn_kwargs={"tokenizer": tokenizer},
    batched=batched,
    remove_columns=[UTTERANCES],
)
ds_val_tokenized.set_format(type="torch")
ds_test_tokenized = ds_test.map(
    function=tokenize,
    fn_kwargs={"tokenizer": tokenizer},
    batched=batched,
    remove_columns=[UTTERANCES],
)
ds_test_tokenized.set_format(type="torch")

ValueError: Column to remove ['utterances'] not in the dataset. Current columns in the dataset: ['episode', 'emotions', 'next_1', 'current', 'previous_1', 'triggers', '__index_level_0__']

In [None]:
print(ds_train_tokenized[0]["input_ids"])

In [None]:
print(ds_train_tokenized[0])

In [None]:
print(ds_train[UTTERANCES][0])

In [None]:
tokenized_row = tokenize(ds_train[0])
# print(tokenized_row["input_ids"][0])
print(tokenizer.batch_decode(tokenized_row["input_ids"][0]))
print(tokenizer.batch_decode(tokenized_row["input_ids"][1]))

In [None]:
# Test della funzione
tokens = tokenize(ds_train[0], tokenizer)
print(tokens)

# Bert Models

In [None]:
class BERTClass(torch.nn.Module):
    def __init__(self, num_emotions=7):
        super(BERTClass, self).__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased")

        self.dropout = torch.nn.Dropout(0.3)
        # self.lstm = torch.nn.LSTM()
        # classifiers
        self.l_emotions = torch.nn.Linear(self.bert.config.hidden_size, num_emotions)
        self.l_triggers = torch.nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        print(ids.shape)
        print(ids.reshape(-1, ids.size(-1)))
        output = self.bert(ids, attention_mask=mask)  # , token_type_ids=token_type_ids)

        output_emotions = self.dropout(output)
        output_triggers = self.dropout(output)

        # output_triggers = self.lstm(output_triggers)

        output_emotions = self.l_emotions(output_emotions)
        output_triggers = self.l_triggers(output_triggers)

        return output_emotions, output_triggers

    def freeze_params(self):
        for param in self.bert.parameters():
            param.requires_grad = False

# Training Utils

In [None]:
num_emotions = df["emotions"].explode().nunique()

model_frozen = BERTClass(num_emotions)
model_full = BERTClass(num_emotions)

model_frozen.freeze_params()

# Verifying that the params are actually frozen
for name, param in model_frozen.named_parameters():
    print(name, param.requires_grad)

for name, param in model_full.named_parameters():
    print(name, param.requires_grad)

In [None]:
model_list = [model_full, model_frozen]
num_epochs = 5

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

for model in model_list:
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()
    model.train()

    # Tokenizer initiation
    # TODO Check tokenizer parameters
    # TODO A Dataloader seems to be commonly used for this use cases
    # encoding = tokenizer(df_train, truncation=False, padding='max_length', return_tensors='pt')
    # input_ids = encoding['input_ids']
    # attention_mask = encoding['attention_mask']

    # Training loop
    # for epoch in range(num_epochs):
    #
    #    for idx in range(len(df_test)):

    #        text = df_train[idx].drop(TRIGGERS)
    #        label = df_train[idx][TRIGGERS]

    #        optimizer.zero_grad()

    #        logits = model(batch_data)
    #        loss = loss_fn(logits, batch_labels)
    #        loss.backward()
    #        optimizer.step()

In [None]:
def loss_fn(outputs_emotions, outputs_triggers, emotions_labels, triggers_labels):
    return torch.nn.CrossEntropyLoss(
        outputs_emotions, emotions_labels
    ) + torch.nn.BCELoss(outputs_triggers, triggers_labels)

In [None]:
### Training of the model
def train_model(train_dl, model, optimizer):
    losses = []
    correct_predictions_emotions = 0
    correct_predictions_triggers = 0
    num_samples_emotions = 0
    num_samples_triggers = 0

    ### activate dropout, batch norm
    model.train()

    ### initialize progress bar
    batches = tq.tqdm(
        enumerate(train_dl), total=len(train_dl), leave=True, colour="steelblue"
    )

    for batch_idx, data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
        emotions_labels = data["emotions"].to(device, dtype=torch.float)
        triggers_labels = data["triggers"].to(device, dtype=torch.float)
        outputs_emotions, outputs_triggers = model(
            ids, mask, token_type_ids
        )  ### Forward

        loss = loss_fn(
            outputs_emotions, outputs_triggers, emotions_labels, triggers_labels
        )
        losses.append(loss.cpu().detach().numpy())

        ### apply thresh 0.5
        outputs_emotions = (
            torch.sigmoid(outputs_emotions).cpu().detach().numpy().round()
        )
        outputs_triggers = (
            torch.sigmoid(outputs_triggers).cpu().detach().numpy().round()
        )

        emotions_labels = emotions_labels.cpu().detach().numpy()
        triggers_labels = triggers_labels.cpu().detach().numpy()

        correct_predictions_emotions += np.sum(outputs_emotions == emotions_labels)
        correct_predictions_triggers += np.sum(outputs_triggers == triggers_labels)

        num_samples_emotions += emotions_labels.size
        num_samples_triggers += triggers_labels.size

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar
        batches.set_description(f"")
        batches.set_postfix(batch_loss=loss)

    # Si potrebbe fare una singola accuracy come media delle due, magari fuori dal training
    accuracy_emotions = float(correct_predictions_emotions) / num_samples_emotions
    accuracy_triggers = float(correct_predictions_triggers) / num_samples_triggers

    return model, accuracy_emotions, accuracy_triggers, losses

In [None]:
# eval model, setup e train_eval da definire

In [None]:
def eval_model(validation_dl, model):
    losses = []
    correct_predictions = 0
    num_samples = 0
    num_categories = next(iter(validation_dl))["labels"].shape[1]

    ### accumulate data over each batch to compute the f1
    true_positives = np.array([0 for _ in range(num_categories)])
    false_positives = np.array([0 for _ in range(num_categories)])
    false_negatives = np.array([0 for _ in range(num_categories)])

    ### turn off dropout, fix batch norm
    model.eval()

    ### show progress bar
    batches = tq.tqdm(
        enumerate(validation_dl),
        total=len(validation_dl),
        leave=True,
        colour="steelblue",
    )
    with torch.no_grad():
        for batch_idx, data in batches:
            ids = data["input_ids"].to(device, dtype=torch.long)
            mask = data["attention_mask"].to(device, dtype=torch.long)
            token_type_ids = data["token_type_ids"].to(device, dtype=torch.long)
            emotions_labels = data["emotions"].to(device, dtype=torch.float)
            triggers_labels = data["triggers"].to(device, dtype=torch.float)
            outputs_emotions, outputs_triggers = model(ids, mask, token_type_ids)

            loss = loss_fn(
                outputs_emotions, outputs_triggers, emotions_labels, triggers_labels
            )
            losses.append(loss.cpu().detach().numpy())

            ### validation accuracy
            ### training sigmoid is in BCEWithLogitsLoss
            outputs_emotions = (
                torch.sigmoid(outputs_emotions).cpu().detach().numpy().round()
            )
            outputs_triggers = (
                torch.sigmoid(outputs_triggers).cpu().detach().numpy().round()
            )

            emotions_labels = emotions_labels.cpu().detach().numpy()
            triggers_labels = triggers_labels.cpu().detach().numpy()
            correct_predictions_emotions += np.sum(outputs_emotions == emotions_labels)
            correct_predictions_triggers += np.sum(outputs_triggers == triggers_labels)

            num_samples_emotions += emotions_labels.size
            num_samples_triggers += triggers_labels.size

        accuracy_emotions = float(correct_predictions_emotions) / num_samples_emotions
        accuracy_triggers = float(correct_predictions_triggers) / num_samples_triggers
        # precision = true_positives / (true_positives + false_positives)
        # recall = true_positives / (true_positives + false_negatives)
        # f1_per_cat = 2 * (precision * recall) / (precision + recall)
        # f1_overall = np.mean(f1_per_cat)

    return accuracy_emotions, accuracy_triggers, losses  # , f1_overall, f1_per_cat

In [None]:
def train_eval(
    train_dl,
    validation_dl,
    model,
    optimizer,
    n_epochs=1,
    save_name="0",
    train_model_f=train_model,
    eval_model_f=eval_model,
):
    model_folder = Path.cwd().joinpath("models")
    if not model_folder.exists():
        model_folder.mkdir(parents=True)

    history = {}
    best_f1 = 0

    for epoch in range(1, n_epochs + 1):
        print(f"Epoch {epoch}/{n_epochs}")
        model, accuracy_emotions, accuracy_triggers, train_loss = train_model_f(
            train_dl, model, optimizer
        )
        val_accuracy_emotions, val_accuracy_triggers, val_loss = eval_model_f(
            validation_dl, model
        )

        print(
            f"train_loss={np.mean(train_loss):.4f}, val_loss={np.mean(val_loss):.4f}, "
            f"train_acc_emo={accuracy_emotions:.4f}, train_acc_emo={accuracy_triggers:.4f},"
            f"val_acc_emo={val_accuracy_emotions:.4f}, val_acc_tri={val_accuracy_triggers:.4f}, "
        )

        history.update({"train_acc_emo": accuracy_emotions})
        history.update({"train_acc_tri": accuracy_triggers})
        history.update({"train_losses": train_loss})
        history.update({"val_acc_emo": val_accuracy_emotions})
        history.update({"val_acc_emo": val_accuracy_triggers})
        history.update({"val_losses": val_loss})
        # history.update({"f1_overall": f1_overall})
        # history.update({"f1_per_cat": f1_per_cat})

        ### save the best model
        # if f1_overall > best_f1:
        #   torch.save(
        #      model.state_dict(),
        #      Path.joinpath(model_folder, f"model_{save_name}.bin"),
        # )
        # best_f1 = f1_overall

    return history  # (history["f1_overall"], history["f1_per_cat"], history["train_losses"])

In [None]:
def create_data_loaders(tokenized_datasets, batch_size):
    train_dl = torch.utils.data.DataLoader(
        tokenized_datasets["train"],
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
    )

    validation_dl = torch.utils.data.DataLoader(
        tokenized_datasets["validation"],
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )

    test_dl = torch.utils.data.DataLoader(
        tokenized_datasets["test"],
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
    )
    return train_dl, validation_dl, test_dl

In [None]:
batch_size = 1

train_dl = torch.utils.data.DataLoader(
    ds_train_tokenized,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)
validation_dl = torch.utils.data.DataLoader(
    ds_val_tokenized,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)

## Training Custom loop

In [None]:
N_EPOCHS = 3

In [None]:
history = train_eval(
    train_dl,
    validation_dl,
    model=model,
    optimizer=optimizer,
    n_epochs=N_EPOCHS,
    save_name=f"test_model",
)
# history_list_c_lr3.append(history)

tensor [[utt + pad], [utt + pad], [utt + pad]]
tensor [CLS + utt + SEP + utt + SEP + utt + SEP + pad]
[ tensor[cls utt padd] tensor [cls utt padd]]


## Training con Classe Trainer

In [None]:
class BertTrainer(Trainer):
    def __init__(self, model, training_args, train_ds, eval_ds, metrics):
        super().__init__(model, training_args, train_ds, eval_ds, metrics)

    def compute_loss(self, model, inputs, return_outputs=False):

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        emotions_labels = inputs["emotions"]
        triggers_labels = inputs["triggers"]
        output_emotions, output_triggers = model(ids, mask)

        custom_loss = self.loss_fn(
            output_emotions, output_triggers, emotions_labels, triggers_labels
        )

        return (
            (custom_loss, output_emotions, output_triggers)
            if return_outputs
            else custom_loss
        )

    def loss_fn(outputs_emotions, outputs_triggers, emotions_labels, triggers_labels):
        return torch.nn.CrossEntropyLoss(
            outputs_emotions, emotions_labels
        ) + torch.nn.BCELoss(outputs_triggers, triggers_labels)

In [None]:
training_args = TrainingArguments(
    output_dir="\\test",
    do_train=True,
    do_eval=True,
    # evaluate_during_training=True,
    learning_rate=5e-5,
    num_train_epochs=8,
    seed=42,
)

In [None]:
trainer = Trainer(
    model=model_full,
    args=training_args,
    # data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=ds_train,
    eval_dataset=ds_val,
    compute_metrics=sequence_f1,
    # tokenizer=tokenizer,
)

In [None]:
trainer.train()