In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score
from pathlib import Path

## Corpus

In [2]:
### Load corpus

data_path = Path("data/MELD_train_efr.json")
assert data_path.exists(), "Data file is not present"
# TODO download from GDrive?
df = pd.read_json(data_path, dtype={"speakers": np.array, "triggers": np.array})
EPISODE, SPEAKERS, EMOTIONS, UTTERANCES, TRIGGERS = df.columns

#### Data exploration

In [3]:
df.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
0,utterance_0,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise]",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 1.0, 0.0]"
1,utterance_1,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
2,utterance_2,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,utterance_3,"[Chandler, The Interviewer, Chandler, The Inte...","[neutral, neutral, neutral, neutral, surprise,...",[also I was the point person on my company's t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,utterance_4,"[Joey, Rachel, Joey, Rachel]","[surprise, sadness, surprise, fear]",[But then who? The waitress I went out with la...,"[0.0, 0.0, 1.0, 0.0]"


In [4]:
df.describe()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
count,4000,4000,4000,4000,4000
unique,4000,3350,3427,3998,523
top,utterance_0,"[Monica, Chandler, Monica]","[neutral, neutral, joy]",[Happy?! Is that what I'm supposed to be Vic? ...,"[0.0, 1.0, 0.0]"
freq,1,15,30,2,191


In [5]:
### Look for how many groups of episodes with the same first utterance there are and their lenghts

df.sort_values(by=UTTERANCES, inplace=True)
groups = np.zeros((850,), dtype=int)

index = 0
count = 1
for i in range(1, len(df[UTTERANCES])):
    if df[UTTERANCES][i][0] == df[UTTERANCES][i - 1][0]:
        ### still in the same group
        count += 1
    else:
        ### found new group
        groups[index] = count
        index += 1
        count = 1

groups = groups[groups != 0]

print(f"Number of groups: {len(groups)}")
print(f"Avg group len: {np.average(groups):.1f}")
print(f"Longest group: {np.max(groups)}")
print(f"Episodes not in a group: {groups[groups == 1].shape[0]}")

Number of groups: 832
Avg group len: 4.8
Longest group: 16
Episodes not in a group: 128


In [6]:
### Count how many speakers there are in each episode

speakers_count = df[SPEAKERS].apply(lambda arr: np.unique(arr).shape[0]).to_numpy()
min_sp = np.min(speakers_count)
max_sp = np.max(speakers_count)
print("Distribution of number of speakers:")
for count in range(min_sp, max_sp + 1):
    print(f"{count} speakers:  {np.sum(speakers_count == count)}")

Distribution of number of speakers:
1 speakers:  214
2 speakers:  2105
3 speakers:  1030
4 speakers:  405
5 speakers:  161
6 speakers:  74
7 speakers:  10
8 speakers:  1


#### Data cleanup

In [7]:
### Drop not useful column

df.drop(columns=[EPISODE])

Unnamed: 0,speakers,emotions,utterances,triggers
1061,"[Joey, Gunther, Joey]","[joy, neutral, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 1.0, 0.0]"
1062,"[Joey, Gunther, Joey, Gunther]","[joy, neutral, surprise, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 1.0, 0.0]"
1063,"[Joey, Gunther, Joey, Gunther, Joey]","[joy, neutral, surprise, surprise, neutral]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 1.0, 0.0]"
1064,"[Joey, Gunther, Joey, Gunther, Joey, Gunther]","[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
1065,"[Joey, Gunther, Joey, Gunther, Joey, Gunther, ...","[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
...,...,...,...,...
597,"[Singer, Joey, Phoebe, Chandler, Phoebe, Chand...","[joy, surprise, anger, neutral, neutral, neutr...","[Cause every time I see your face, I can't he...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]"
598,"[Singer, Joey, Phoebe, Chandler, Phoebe, Chand...","[joy, surprise, anger, neutral, neutral, neutr...","[Cause every time I see your face, I can't he...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
599,"[Singer, Joey, Phoebe, Chandler, Phoebe, Chand...","[joy, surprise, anger, neutral, neutral, neutr...","[Cause every time I see your face, I can't he...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
600,"[Singer, Joey, Phoebe, Chandler, Phoebe, Chand...","[joy, surprise, anger, neutral, neutral, neutr...","[Cause every time I see your face, I can't he...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [8]:
### Remove Nones from the triggers

# TODO no conversion to int, we may need float laters for label smoothing or so
df[TRIGGERS] = df[TRIGGERS].apply(
    lambda trig_seq: np.array([0.0 if t is None else t for t in trig_seq])
)

In [9]:
df.head()

Unnamed: 0,episode,speakers,emotions,utterances,triggers
1061,utterance_1061,"[Joey, Gunther, Joey]","[joy, neutral, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 1.0, 0.0]"
1062,utterance_1062,"[Joey, Gunther, Joey, Gunther]","[joy, neutral, surprise, surprise]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 1.0, 0.0]"
1063,utterance_1063,"[Joey, Gunther, Joey, Gunther, Joey]","[joy, neutral, surprise, surprise, neutral]","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 1.0, 0.0]"
1064,utterance_1064,"[Joey, Gunther, Joey, Gunther, Joey, Gunther]","[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
1065,utterance_1065,"[Joey, Gunther, Joey, Gunther, Joey, Gunther, ...","[joy, neutral, surprise, surprise, neutral, ne...","[""Happy birthday to you!"", You're paying for t...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"


### Data preprocessing:
If an episode contains the same utterances of the previous and a few more then the triggers from the previous episode are replicated in the current episode

In [10]:
for i in range(10):
    print(f"{df[TRIGGERS][i]}")

[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
[0. 0. 1. 0.]
[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 1. 1. 0.]
[0. 1. 0.]
[0. 0. 0. 1.]
[0. 0. 0. 1. 0.]


In [11]:
count = 0
for i in range(1, len(df)):
    # TODO discuss: the version that does all the checks is faster
    is_continuation = np.all([u in df[UTTERANCES][i] for u in df[UTTERANCES][i - 1]])
    # is_continuation = True
    # j = 0
    # while is_continuation and j < len(df[UTTERANCES][i - 1]):
    #     is_continuation = df[UTTERANCES][i - 1][j] in df[UTTERANCES][i]
    #     j += 1
    if is_continuation:
        count += 1
        for k, t in enumerate(df[TRIGGERS][i - 1]):
            df[TRIGGERS][i][k] = t

In [12]:
for i in range(10):
    print(f"{df[TRIGGERS][i]}")

[0. 0. 0. 1. 0.]
[0. 0. 0. 1. 0. 1. 0.]
[0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0.]
[0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 1. 0.]
[0. 0. 1. 0.]
[0. 0. 1. 0. 0.]
[0. 0. 1. 0. 0. 1. 0.]
[0. 1. 0.]
[0. 1. 0. 1.]
[0. 1. 0. 1. 0.]


#### Train/Validation/Test split

In [13]:
### Train Val Test split: 80/10/10


def split_data(df: pd.DataFrame, seed: int = 42):
    df_train, df_test = train_test_split(
        df, test_size=0.2, train_size=0.8, random_state=seed
    )

    df_val, df_test = train_test_split(
        df_test, test_size=0.5, train_size=0.5, random_state=seed
    )

    return df_train, df_val, df_test

In [14]:
### Check
df_train, df_val, df_test = split_data(df)
print(f"df_train len: {len(df_train)}")
print(f"df_val len: {len(df_val)}")
print(f"df_test len: {len(df_test)}")

df_train len: 3200
df_val len: 400
df_test len: 400


### Metrics

In [15]:
# TODO: give 2 series of sequences of triggers/emotions compute F1 inside each sequence and return avg, flatten out and compute F1
###
def sequence_f1(y_true, y_pred, avg: bool = True):
    res = [
        f1_score(y_true=y_t, y_pred=y_p, average="micro")
        for y_t, y_p in zip(y_true, y_pred)
    ]
    return np.average(res) if avg else res


def unrolled_f1(y_true, y_pred):
    y_t_flat = []
    for l in y_true:
        for e in l:
            y_t_flat.append(e)

    y_p_flat = []
    for l in y_pred:
        for e in l:
            y_p_flat.append(e)

    return f1_score(y_true=y_t_flat, y_pred=y_p_flat, average="micro")

## Baseline Models: 

In [16]:
### Create baseline models


class SequenceDummyClassifier(DummyClassifier):
    def __init__(self, strategy: str, seed: int = 42) -> None:
        self.seed = seed
        # TODO proper exception
        if not strategy.lower() in ("random", "majority"):
            raise ValueError("strategy must be in [random, majority]")
        sklearn_strategy = "uniform" if strategy == "random" else "most_frequent"
        super().__init__(strategy=sklearn_strategy, random_state=seed)

    ### TODO discuss: problem = flattening sequences of != len from the df
    ### sol1 = iterate over df and collect 1by1: bad for memory allocation
    ### sol2 = pad the sequences in the df, create array, remove padding: can it be more efficient?
    # np.array(df[UTTERANCES].tolist()).flatten() does not work because of the != len of the sequences

    def _flatten_seq(self, df: pd.Series):
        res = []
        for l in df:
            for e in l:
                res.append(e)
        return res

    def _flatten_seq_(self, df: pd.Series):
        max_len = np.max(df.apply(lambda s: len(s)).to_numpy())
        dtype = type(df[0][0])
        pad_element = dtype(999999)
        ### Pad utterances with 0, flatten array
        df = np.array(
            df.apply(
                lambda s: np.hstack(
                    (
                        s,
                        np.repeat(
                            [pad_element],
                            repeats=(max_len - len(s)),
                        ),
                    )
                )
            ).to_list()
        ).flatten()
        ### Remove padding
        return (
            df[np.char.not_equal(df, pad_element)]
            if dtype == str
            else df[df != dtype(pad_element)]
        )

    def _deflatten_seq(self, seq, shape_like: pd.Series):
        ### TODO discuss: we may think to use np.reshape but again the row len is not homogeneous!
        data = iter(seq)
        result = [[next(data) for _ in s] for s in shape_like]
        return result

    def fit(self, X: pd.Series, y: pd.Series):
        X_flat = self._flatten_seq(X)
        y_flat = self._flatten_seq(y)
        super().fit(X=X_flat, y=y_flat)

    def predict(self, X: pd.Series, return_flat: bool = False):
        X_flat = self._flatten_seq(X)
        y_flat = super().predict(X_flat)
        return y_flat if return_flat else self._deflatten_seq(seq=y_flat, shape_like=X)

In [20]:
def experiment_baseline(df_train: pd.DataFrame, df_test: pd.DataFrame, seed: int = 42):

    baseline_f1s = {}
    baseline_results = {}

    for strategy in ("Random", "Majority"):
        for target in (EMOTIONS, TRIGGERS):
            clf = SequenceDummyClassifier(strategy=strategy, seed=seed)
            clf.fit(X=df_train[UTTERANCES], y=df_train[target])

            res = clf.predict(X=df_test[UTTERANCES], return_flat=False)
            baseline_results.update({f"{target}_{strategy}": res})

            seq_f1 = sequence_f1(y_true=df_test[target], y_pred=res)
            baseline_f1s.update({f"sequence_f1({target}_{strategy})": seq_f1})

            unr_f1 = unrolled_f1(y_true=df_test[target], y_pred=res)
            baseline_f1s.update({f"unrolled_f1({target}_{strategy})": unr_f1})

    return baseline_f1s, baseline_results


f1s, results = experiment_baseline(df_train, df_test)
for k, v in f1s.items():
    print(f"{k} : {v}")

sequence_f1(emotions_Random) : 0.4233349753853817
unrolled_f1(emotions_Random) : 0.4330935251798561
sequence_f1(triggers_Random) : 0.6519219611872475
unrolled_f1(triggers_Random) : 0.6515107913669065
sequence_f1(emotions_Majority) : 0.4233349753853817
unrolled_f1(emotions_Majority) : 0.4330935251798561
sequence_f1(triggers_Majority) : 0.6519219611872475
unrolled_f1(triggers_Majority) : 0.6515107913669065
