In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from pathlib import Path
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader
import tqdm.notebook as tq


In [None]:
from torch import cuda

# Run on the GPU when CUDA reports one, otherwise fall back to the CPU.
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## Corpus

In [None]:
### Load corpus

# The path is relative, so it is resolved against the current working directory.
data_path = Path("data/MELD_train_efr.json")
assert data_path.exists(), "Data file is not present"
# TODO download from GDrive?
# NOTE(review): `np.array` is a function, not a dtype; confirm that pandas actually
# honours this mapping rather than ignoring it.
df = pd.read_json(data_path, dtype={"speakers": np.array, "triggers": np.array})
# Column-name constants are taken positionally: this requires the frame to have exactly
# five columns, and assumes they come in this order — TODO confirm against the data file.
EPISODE, SPEAKERS, EMOTIONS, UTTERANCES, TRIGGERS = df.columns

#### Data exploration

In [None]:
# Preview the first rows of the loaded corpus.
df.head()

In [None]:
# Summary statistics of the loaded corpus.
df.describe()

In [None]:
### Look for how many groups of episodes with the same first utterance there are and their lengths

# NOTE(review): this in-place sort reorders `df` for every later cell (e.g. the
# train/val/test split). It is not needed for the counting below and is kept only so
# that downstream cells keep seeing the same frame as before.
df.sort_values(by=UTTERANCES, inplace=True)

# One group per distinct first utterance; each value is the number of episodes in that
# group. Counting values directly avoids the previous pitfalls: label-based indexing
# that ignored the sort order, a fixed-size (850) buffer, and a last group that was
# never recorded.
first_utterances = df[UTTERANCES].apply(lambda utterances: utterances[0])
groups = first_utterances.value_counts().to_numpy()

print(f"Number of groups: {len(groups)}")
print(f"Avg group len: {np.average(groups):.1f}")
print(f"Longest group: {np.max(groups)}")
print(f"Episodes not in a group: {int(np.sum(groups == 1))}")

In [None]:
### Count how many speakers there are in each episode

# Number of distinct speakers per episode, as a flat numpy array.
speakers_count = df[SPEAKERS].apply(lambda arr: len(np.unique(arr))).to_numpy()
min_sp = speakers_count.min()
max_sp = speakers_count.max()
print("Distribution of number of speakers:")
# Walk every value between the observed extremes, including counts that never occur.
for count in range(min_sp, max_sp + 1):
    episodes_with_count = (speakers_count == count).sum()
    print(f"{count} speakers:  {episodes_with_count}")

#### Data cleanup

In [None]:
### Drop not useful column

# `DataFrame.drop` returns a new frame, so the result must be assigned back; otherwise
# the column is never removed. `errors="ignore"` keeps the cell safe to re-run.
df = df.drop(columns=[EPISODE], errors="ignore")

In [None]:
### Remove Nones from the triggers


def _none_to_zero(trig_seq):
    """Return the trigger sequence as an array with every None replaced by 0.0."""
    return np.array([t if t is not None else 0.0 for t in trig_seq])


# TODO no conversion to int, we may need float later for label smoothing or so
df[TRIGGERS] = df[TRIGGERS].apply(_none_to_zero)

In [None]:
# Preview the frame again to inspect the result of the trigger cleanup.
df.head()

### Data preprocessing:
If an episode contains all the utterances of the previous episode plus a few more, the triggers of the previous episode are copied into the corresponding positions of the current episode.

In [None]:
# Show the triggers of episodes 0..9 before the propagation step.
for episode_idx in range(10):
    print(str(df[TRIGGERS][episode_idx]))

In [None]:
# Number of episodes detected as a continuation of the previous one.
count = 0
for i in range(1, len(df)):
    # TODO discuss: the version that does all the checks is faster
    # Episode i counts as a continuation when every utterance of episode i - 1 also
    # appears somewhere in episode i (pure membership test: order and position are
    # not checked).
    # NOTE(review): `df[col][i]` presumably resolves `i` as an index label, not a
    # position — confirm the index is still the default one at this point.
    is_continuation = np.all([u in df[UTTERANCES][i] for u in df[UTTERANCES][i - 1]])
    # Early-exit alternative, kept for the discussion above:
    # is_continuation = True
    # j = 0
    # while is_continuation and j < len(df[UTTERANCES][i - 1]):
    #     is_continuation = df[UTTERANCES][i - 1][j] in df[UTTERANCES][i]
    #     j += 1
    if is_continuation:
        count += 1
        # Overwrite the leading triggers of episode i with those of episode i - 1.
        # NOTE(review): presumably this writes into the array object stored in the
        # frame (no copy is made) — confirm. It would also fail if episode i - 1 had
        # more triggers than episode i.
        for k, t in enumerate(df[TRIGGERS][i - 1]):
            df[TRIGGERS][i][k] = t

In [None]:
# Show the triggers of episodes 0..9 again, after the propagation step.
for episode_idx in range(10):
    print(str(df[TRIGGERS][episode_idx]))

#### Train/Validation/Test split

In [None]:
### Train Val Test split: 80/10/10


def split_data(df: pd.DataFrame, seed: int = 42):
    """Split a frame into train / validation / test parts (80% / 10% / 10%).

    Args:
        df: frame to split.
        seed: random state forwarded to both `train_test_split` calls.

    Returns:
        Tuple `(df_train, df_val, df_test)`.
    """
    # First cut: 80% for training, the remaining 20% is held out.
    df_train, df_holdout = train_test_split(
        df, train_size=0.8, test_size=0.2, random_state=seed
    )

    # Second cut: the held-out part is halved into validation and test.
    df_val, df_test = train_test_split(
        df_holdout, train_size=0.5, test_size=0.5, random_state=seed
    )

    return df_train, df_val, df_test

In [None]:
### Check
df_train, df_val, df_test = split_data(df)
# Report the size of every split.
for split_name, split_df in (
    ("df_train", df_train),
    ("df_val", df_val),
    ("df_test", df_test),
):
    print(f"{split_name} len: {len(split_df)}")

In [None]:
### Class imbalance check
# Count how many times each emotion label occurs in the training split.
classes_count = {}
for emotions in df_train["emotions"]:
    for emotion in emotions:
        classes_count[emotion] = classes_count.get(emotion, 0) + 1

# Order the classes from the most to the least frequent.
emotions_dict = dict(
    sorted(classes_count.items(), key=lambda item: item[1], reverse=True)
)
print("Classes values:")
print(emotions_dict)

# The class imbalance is fairly high; we could use class weights.

### Metrics

In [None]:
# TODO: give 2 series of sequences of triggers/emotions compute F1 inside each sequence and return avg, flatten out and compute F1
###
def sequence_f1(y_true, y_pred, avg: bool = True):
    """Compute the micro-averaged F1 inside each sequence.

    Args:
        y_true: iterable of ground-truth label sequences.
        y_pred: iterable of predicted label sequences, paired with `y_true` by position.
        avg: when True return the average of the per-sequence scores, otherwise the
            list of per-sequence scores.

    Returns:
        Average score, or the list of per-sequence scores.
    """
    per_sequence = []
    for true_seq, pred_seq in zip(y_true, y_pred):
        per_sequence.append(f1_score(y_true=true_seq, y_pred=pred_seq, average="micro"))

    if avg:
        return np.average(per_sequence)
    return per_sequence


def unrolled_f1(y_true, y_pred):
    """Compute one micro-averaged F1 over all sequences flattened together.

    Args:
        y_true: iterable of ground-truth label sequences.
        y_pred: iterable of predicted label sequences.

    Returns:
        Micro F1 computed over the concatenation of all the sequences.
    """
    true_flat = [label for seq in y_true for label in seq]
    pred_flat = [label for seq in y_pred for label in seq]
    return f1_score(y_true=true_flat, y_pred=pred_flat, average="micro")

## Baseline Models

In [None]:
### Create baseline models


class SequenceDummyClassifier(DummyClassifier):
    """Dummy baseline operating on sequences of items instead of single samples.

    Sequences are flattened before being handed to `DummyClassifier`, and the
    predictions are folded back into the shape of the input sequences.

    Args:
        strategy: "random" or "majority" (case-insensitive), mapped to the
            `DummyClassifier` strategies "uniform" and "most_frequent".
        seed: random state forwarded to `DummyClassifier`.

    Raises:
        ValueError: if `strategy` is not one of the two supported values.
    """

    def __init__(self, strategy: str, seed: int = 42) -> None:
        self.seed = seed
        # Normalise once: the value was previously validated lower-cased but compared
        # as-is, so "Random" passed validation and was silently mapped to the
        # majority strategy.
        normalized_strategy = strategy.lower()
        if normalized_strategy not in ("random", "majority"):
            raise ValueError("strategy must be in [random, majority]")
        sklearn_strategy = (
            "uniform" if normalized_strategy == "random" else "most_frequent"
        )
        super().__init__(strategy=sklearn_strategy, random_state=seed)

    ### TODO discuss: problem = flattening sequences of != len from the df
    ### sol1 = iterate over df and collect 1by1: bad for memory allocation
    ### sol2 = pad the sequences in the df, create array, remove padding: can it be more efficient?
    # np.array(df[UTTERANCES].tolist()).flatten() does not work because of the != len of the sequences

    def _flatten_seq(self, df: pd.Series):
        """Concatenate all the sequences of the series into a single flat list (sol1)."""
        return [element for sequence in df for element in sequence]

    def _flatten_seq_(self, df: pd.Series):
        """Alternative flattening through padding (sol2); not called within this class.

        Every sequence is padded to the longest length with a sentinel, the result is
        flattened and the sentinel entries are removed again. Elements equal to the
        sentinel would therefore be dropped as well.
        """
        max_len = np.max(df.apply(lambda s: len(s)).to_numpy())
        # Positional access: label 0 is not guaranteed to exist (e.g. on a shuffled split).
        dtype = type(df.iloc[0][0])
        pad_element = dtype(999999)
        ### Pad the sequences with the sentinel, flatten array
        df = np.array(
            df.apply(
                lambda s: np.hstack(
                    (
                        s,
                        np.repeat(
                            [pad_element],
                            repeats=(max_len - len(s)),
                        ),
                    )
                )
            ).to_list()
        ).flatten()
        ### Remove padding
        return (
            df[np.char.not_equal(df, pad_element)]
            if dtype == str
            else df[df != dtype(pad_element)]
        )

    def _deflatten_seq(self, seq, shape_like: pd.Series):
        """Fold a flat sequence back into nested lists shaped like `shape_like`."""
        ### TODO discuss: we may think to use np.reshape but again the row len is not homogeneous!
        data = iter(seq)
        result = [[next(data) for _ in s] for s in shape_like]
        return result

    def fit(self, X: pd.Series, y: pd.Series):
        """Fit the dummy classifier on the flattened sequences.

        Returns:
            self, following the scikit-learn estimator convention.
        """
        X_flat = self._flatten_seq(X)
        y_flat = self._flatten_seq(y)
        super().fit(X=X_flat, y=y_flat)
        return self

    def predict(self, X: pd.Series, return_flat: bool = False):
        """Predict one label per element of every sequence in `X`.

        Args:
            X: series of sequences.
            return_flat: when True return the flat prediction array, otherwise nested
                lists with the same shape as `X`.
        """
        X_flat = self._flatten_seq(X)
        y_flat = super().predict(X_flat)
        return y_flat if return_flat else self._deflatten_seq(seq=y_flat, shape_like=X)

In [None]:
def experiment_baseline(df_train: pd.DataFrame, df_test: pd.DataFrame, seed: int = 42):
    """Fit and score the dummy baselines for every strategy / target pair.

    Args:
        df_train: frame used to fit the classifiers.
        df_test: frame used to predict and score.
        seed: seed forwarded to every classifier.

    Returns:
        Tuple `(baseline_f1s, baseline_results)`: the F1 scores and the raw
        predictions, both keyed by a name built from the target and the strategy.
    """
    baseline_f1s = {}
    baseline_results = {}

    for strategy in ("Random", "Majority"):
        for target in (EMOTIONS, TRIGGERS):
            run_name = f"{target}_{strategy}"

            clf = SequenceDummyClassifier(strategy=strategy, seed=seed)
            clf.fit(X=df_train[UTTERANCES], y=df_train[target])

            predictions = clf.predict(X=df_test[UTTERANCES], return_flat=False)
            baseline_results[run_name] = predictions

            # Score both per sequence and over the flattened labels.
            baseline_f1s[f"sequence_f1({run_name})"] = sequence_f1(
                y_true=df_test[target], y_pred=predictions
            )
            baseline_f1s[f"unrolled_f1({run_name})"] = unrolled_f1(
                y_true=df_test[target], y_pred=predictions
            )

    return baseline_f1s, baseline_results


f1s, results = experiment_baseline(df_train, df_test)
# One line per score, in insertion order.
for score_name, score_value in f1s.items():
    print(f"{score_name} : {score_value}")

# Tokenization

In [None]:
# Load the tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def tokenize(utterances, tokenizer):
    """Tokenize every utterance of an episode.

    Args:
        utterances: iterable of sentences.
        tokenizer: callable tokenizer exposing `model_max_length`; it is called once
            per sentence with truncation and padding to that maximum length.

    Returns:
        List with one dict per utterance, each holding the tokenizer's "input_ids",
        "token_type_ids" and "attention_mask". Empty input yields an empty list.
    """
    text_tokens = []

    for sentence in utterances:
        tokenized_sentence = tokenizer(
            sentence,
            truncation=True,
            padding="max_length",
            max_length=tokenizer.model_max_length,
            return_tensors="pt",
        )
        encoded_ds_row = {
            "input_ids": tokenized_sentence["input_ids"],
            # https://huggingface.co/transformers/v3.2.0/glossary.html#token-type-ids
            "token_type_ids": tokenized_sentence["token_type_ids"],
            "attention_mask": tokenized_sentence["attention_mask"],
        }
        # TODO: the labels should probably be added here (?)
        # encoded_ds_row["labels"] = add_labels(sentence,emotions,triggers)

        text_tokens.append(encoded_ds_row)

    # Return every encoded row: previously only the last one was returned, and an
    # empty input raised NameError.
    return text_tokens

In [None]:
# Quick test of the tokenize function
# Positional access: after the shuffled split, label 0 is not guaranteed to be part of
# the training frame, so a label lookup with `[0]` could raise KeyError.
tokens = tokenize(df_train["utterances"].iloc[0], tokenizer)
print(tokens)

# BERT Models

In [None]:
class BERTClassifier(torch.nn.Module):
    """BERT encoder followed by dropout and a single linear classification layer.

    Args:
        num_emotions: number of output units of the linear layer.
    """

    def __init__(self, num_emotions):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased", return_dict=True)
        self.dropout = torch.nn.Dropout(p=0.3)
        self.classifier = torch.nn.Linear(
            in_features=self.bert.config.hidden_size,
            out_features=num_emotions,
        )

    def forward(self, input_ids, attention_mask):
        """Return the classifier logits computed from BERT's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.classifier(self.dropout(encoded.pooler_output))

    def freeze_params(self):
        """Disable gradients for every parameter of the BERT encoder."""
        self.bert.requires_grad_(False)

In [None]:
class BERTClass(torch.nn.Module):
    """BERT encoder with two heads: emotion classification and trigger detection.

    The emotion head is a linear layer on BERT's pooled output. The trigger head runs
    an LSTM over the token states and feeds its final hidden state to a linear layer
    with a single output.

    NOTE(review): the original block could not be instantiated (`LSTM()` without its
    required sizes, an invalid `num_emotions=` keyword on `Linear`) and its forward
    pass applied dropout to the whole BERT output object. Running the LSTM over the
    token states of each input is an assumption about the intended architecture —
    confirm whether the LSTM should instead run over the utterances of a dialogue.

    Args:
        num_emotions: number of emotion classes (defaults to 7, the value that was
            hard-coded before).
    """

    def __init__(self, num_emotions: int = 7):
        super().__init__()

        self.bert = BertModel.from_pretrained("bert-base-uncased", return_dict=True)
        hidden_size = self.bert.config.hidden_size

        self.dropout = torch.nn.Dropout(0.3)
        # The LSTM keeps BERT's hidden size so that `l_triggers` keeps the input
        # dimension it was declared with.
        self.lstm = torch.nn.LSTM(
            input_size=hidden_size, hidden_size=hidden_size, batch_first=True
        )
        # classifiers
        self.l_emotions = torch.nn.Linear(hidden_size, num_emotions)
        self.l_triggers = torch.nn.Linear(hidden_size, 1)

    def forward(self, ids, mask, token_type_ids=None):
        """Compute the raw (pre-activation) outputs of both heads.

        Args:
            ids: token ids, assumed (batch, seq_len) — TODO confirm.
            mask: attention mask with the same shape as `ids`.
            token_type_ids: optional segment ids; optional because the training loop
                calls the model with `ids` and `mask` only.

        Returns:
            Tuple `(output_emotions, output_triggers)` of logits, one row per input.
        """
        output = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids)

        # Emotion head: pooled sentence representation.
        output_emotions = self.l_emotions(self.dropout(output.pooler_output))

        # Trigger head: pack the sequences so that padding does not reach the final
        # LSTM state. Assumes right-side padding — TODO confirm tokenizer settings.
        token_states = self.dropout(output.last_hidden_state)
        lengths = mask.sum(dim=1).clamp(min=1).cpu()
        packed_states = torch.nn.utils.rnn.pack_padded_sequence(
            token_states, lengths, batch_first=True, enforce_sorted=False
        )
        # nn.LSTM returns (outputs, (h_n, c_n)); the last layer's final hidden state
        # summarises each sequence.
        _, (final_hidden, _) = self.lstm(packed_states)
        output_triggers = self.l_triggers(final_hidden[-1])

        return output_emotions, output_triggers

# Training Utils

In [None]:
# TODO Emotions and triggers tuning of class definition
model_frozen = BERTClassifier(3)
model_full = BERTClassifier(3)

model_frozen.freeze_params()

# Verify that the parameters are actually frozen: list the trainable flag of every
# parameter, first for the frozen model and then for the full one.
for inspected_model in (model_frozen, model_full):
    for name, param in inspected_model.named_parameters():
        print(name, param.requires_grad)

In [None]:
# Models to train and number of epochs for the (still disabled) training loop below.
model_list = [model_full, model_frozen]
num_epochs = 5

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

for model in model_list:
    # A fresh optimizer and loss are created for every model.
    # NOTE(review): the name `loss_fn` is re-bound by a later cell that defines a
    # function with the same name.
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()
    model.train()

    # Tokenizer initialisation
    # TODO Check tokenizer parameters
    # TODO A DataLoader seems to be commonly used for these use cases
    #encoding = tokenizer(df_train, truncation=False, padding='max_length', return_tensors='pt')
    #input_ids = encoding['input_ids']
    #attention_mask = encoding['attention_mask']

    # Training loop (disabled draft)
    # NOTE(review): the draft iterates over len(df_test) while indexing df_train, and
    # `batch_data` / `batch_labels` are never defined — fix before enabling it.
    #for epoch in range(num_epochs):
    #    
    #    for idx in range(len(df_test)):

    #        text = df_train[idx].drop(TRIGGERS)
    #        label = df_train[idx][TRIGGERS]

    #        optimizer.zero_grad() 

    #        logits = model(batch_data)
    #        loss = loss_fn(logits, batch_labels)
    #        loss.backward()
    #        optimizer.step()

In [None]:
def loss_fn(outputs_emotions, outputs_triggers, emotions_labels, triggers_labels):
    """Joint loss: cross-entropy on the emotions plus binary cross-entropy on the triggers.

    The losses are computed through the functional API. Constructing
    `CrossEntropyLoss(...)` / `BCELoss(...)` with the tensors as arguments only builds
    loss modules (the tensors are taken as constructor options) and never evaluates
    anything.

    Args:
        outputs_emotions: emotion logits, (batch, num_classes).
        outputs_triggers: trigger logits (pre-sigmoid); any shape with as many
            elements as `triggers_labels`, e.g. (batch, 1).
        emotions_labels: either class indices, (batch,), or one-hot / class
            probabilities with the same shape as `outputs_emotions`.
        triggers_labels: binary trigger targets.

    Returns:
        Scalar tensor with the sum of the two mean losses.
    """
    functional = torch.nn.functional

    if emotions_labels.dim() == outputs_emotions.dim():
        # One-hot / probability targets must share the dtype of the logits.
        emotions_target = emotions_labels.to(outputs_emotions.dtype)
    else:
        # Class indices must be integers, even when the caller cast them to float.
        emotions_target = emotions_labels.long()
    emotions_loss = functional.cross_entropy(outputs_emotions, emotions_target)

    # The trigger head emits raw logits (the training loop applies the sigmoid itself),
    # so use the with-logits variant; align a trailing singleton dim with the labels.
    triggers_logits = outputs_triggers.reshape(triggers_labels.shape)
    triggers_loss = functional.binary_cross_entropy_with_logits(
        triggers_logits, triggers_labels.to(triggers_logits.dtype)
    )

    return emotions_loss + triggers_loss

In [None]:
### Training of the model
def train_model(train_dl, model, optimizer):
    """Train `model` for one pass over `train_dl`.

    Args:
        train_dl: iterable of batches with `len()`; each batch is indexed with
            "input_ids", "attention_mask", "emotions" and "triggers".
        model: called as `model(ids, mask)`; must return
            `(emotion_logits, trigger_logits)`.
        optimizer: optimizer stepped once per batch.

    Returns:
        Tuple `(model, accuracy_emotions, accuracy_triggers, losses)`, where `losses`
        holds one value per batch. Both accuracies are 0.0 when no sample was seen.
    """
    losses = []
    correct_predictions_emotions = 0
    correct_predictions_triggers = 0
    num_samples_emotions = 0
    num_samples_triggers = 0

    ### activate dropout, batch norm
    model.train()

    ### initialize progress bar
    batches = tq.tqdm(
        enumerate(train_dl), total=len(train_dl), leave=True, colour="steelblue"
    )

    for batch_idx, data in batches:
        ids = data["input_ids"].to(device, dtype=torch.long)
        mask = data["attention_mask"].to(device, dtype=torch.long)
        emotions_labels = data["emotions"].to(device, dtype=torch.float)
        triggers_labels = data["triggers"].to(device, dtype=torch.float)
        outputs_emotions, outputs_triggers = model(ids, mask)  ### Forward

        loss = loss_fn(outputs_emotions, outputs_triggers, emotions_labels, triggers_labels)
        losses.append(loss.detach().cpu().numpy())

        with torch.no_grad():
            ### Emotions are mutually exclusive classes trained with cross-entropy, so
            ### the prediction is the arg-max class, not a per-class 0.5 threshold.
            emotions_pred = outputs_emotions.argmax(dim=-1)
            if emotions_labels.dim() == outputs_emotions.dim():
                ### one-hot labels
                emotions_true = emotions_labels.argmax(dim=-1)
            else:
                ### class-index labels
                emotions_true = emotions_labels.long()

            ### apply thresh 0.5 to the trigger probabilities; reshape so that a
            ### (batch, 1) output is not broadcast against (batch,) labels
            triggers_pred = (torch.sigmoid(outputs_triggers) > 0.5).to(
                triggers_labels.dtype
            )
            triggers_pred = triggers_pred.reshape(triggers_labels.shape)

            correct_predictions_emotions += int((emotions_pred == emotions_true).sum().item())
            correct_predictions_triggers += int((triggers_pred == triggers_labels).sum().item())

            num_samples_emotions += emotions_true.numel()
            num_samples_triggers += triggers_labels.numel()

        ### Backward
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        ### Grad descent step
        optimizer.step()

        ### Update progress bar (plain float, not a tensor attached to the graph)
        batches.set_description("")
        batches.set_postfix(batch_loss=loss.item())

    # A single accuracy could be computed as the mean of the two, possibly outside
    # the training function.
    accuracy_emotions = (
        correct_predictions_emotions / num_samples_emotions if num_samples_emotions else 0.0
    )
    accuracy_triggers = (
        correct_predictions_triggers / num_samples_triggers if num_samples_triggers else 0.0
    )

    return model, accuracy_emotions, accuracy_triggers, losses

In [None]:
# TODO: eval_model, setup and train_eval still have to be defined

In [None]:
def create_data_loaders(tokenized_datasets, batch_size):
    """Build the train / validation / test data loaders.

    Args:
        tokenized_datasets: mapping with the keys "train", "validation" and "test".
        batch_size: batch size shared by the three loaders.

    Returns:
        Tuple `(train_dl, validation_dl, test_dl)`; only the training loader shuffles.
    """

    def _make_loader(split, shuffle):
        # All loaders share the same settings apart from the shuffling flag.
        return torch.utils.data.DataLoader(
            tokenized_datasets[split],
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=0,
        )

    train_dl = _make_loader("train", True)
    validation_dl = _make_loader("validation", False)
    test_dl = _make_loader("test", False)
    return train_dl, validation_dl, test_dl