In [137]:
import json
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from typing import *
from pprint import pprint

# Ergebnisse laden

In [138]:
def get_df_from_results(path_to_results:str) -> pd.DataFrame:
    with open(path_to_results, "r") as f:
        eval_results = f.readlines()
    eval_results = "\n".join(eval_results[17:])
    eval_results = json.loads(eval_results)
    eval_results = pd.DataFrame(eval_results).set_index("index")
    return eval_results

eval_results_path = r"./results/llm/preds_for_eval_unbalanced.text"
eval_results = get_df_from_results(eval_results_path)
eval_results.head()

Unnamed: 0_level_0,input,output,target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Now you are expert of sentiment and emotional ...,neutral,neutral
1,Now you are expert of sentiment and emotional ...,neutral,neutral
2,Now you are expert of sentiment and emotional ...,angry,angry
3,Now you are expert of sentiment and emotional ...,neutral,angry
4,Now you are expert of sentiment and emotional ...,angry,surprise


# Untersuchen der Ergebnisse

In [139]:
def get_classification_report(eval_results:pd.DataFrame) -> pd.DataFrame:
    res = classification_report(eval_results["target"], eval_results["output"], zero_division=0.0, output_dict=True, digits=2)
    res = pd.DataFrame(res)
    res = res.drop(["accuracy", "macro avg", "weighted avg"], axis=1).swapaxes(0,1).reset_index()
    res.rename(columns={"index": "label"}, inplace=True)
    res = res.round(2)
    res["support"] = res["support"].astype(int)
    res.sort_values("f1-score", ascending=False, inplace=True)

    return res

get_classification_report(eval_results)

Unnamed: 0,label,precision,recall,f1-score,support
10,neutral,0.76,0.84,0.8,1256
9,joyful,0.68,0.53,0.6,402
13,surprise,0.57,0.62,0.59,281
0,angry,0.56,0.53,0.54,345
2,disgust,0.38,0.44,0.41,68
12,sad,0.48,0.34,0.4,208
6,fear,0.35,0.28,0.31,50
1,curious,0.0,0.0,0.0,0
3,embarrassed,0.0,0.0,0.0,0
4,excited,0.0,0.0,0.0,0


## Falsche Output-Labels
In der Tabelle sind zu viele Emotionen enthalten.
Dafür suche ich jetzt die Ursache

In [140]:
target_labels = eval_results["target"].unique()
predicted_labels = eval_results["output"].unique()

print(f"Gewünschte Labels: {', '.join(target_labels)}.")
print(f"Erhaltene Labels: {', '.join(predicted_labels)}.")

Gewünschte Labels: neutral, angry, surprise, sad, fear, joyful, disgust.
Erhaltene Labels: neutral, angry, surprise, joyful, fear, sad, disgust, hopeful, relieved, curious, embarrassed, excited, greedy, fascinated.


Das Modell scheint auch Emotionen zu benennen, die nicht gefordert sind. Welchen Anteil machen diese aus?

In [141]:
wrong_targets = eval_results[~eval_results["output"].isin(target_labels)]
print(f"Es gibt {len(wrong_targets)} falsche Labels")
print(f"Das entspricht {round((len(wrong_targets) / len(eval_results)) * 100, 2)}% des Datensatzes")

Es gibt 8 falsche Labels
Das entspricht 0.31% des Datensatzes


Diese Anzahl ist vernachlässigbar. Zur weiteren Analyse werden diese Zeilen ignoriert.

In [142]:
eval_results = eval_results[eval_results["output"].isin(target_labels)]

## Ergebnisse

In [143]:
from lets_plot import *
from lets_plot.mapping import as_discrete
LetsPlot.setup_html()

### Erfolg in Abhängigkeit der Anzahl Samples

In [144]:
cls_tabel = get_classification_report(eval_results)
cls_tabel["prop"] = cls_tabel["support"] / cls_tabel["support"].sum()
display(cls_tabel.sort_values("f1-score", ascending=False))

ggplot(cls_tabel, aes(y="f1-score", x="support", label="label")) + \
    geom_point() + \
    geom_text(vjust="bottom", nudge_y=0.02) + \
    xlim(0, 1300) + \
    ylim(0, 1)

Unnamed: 0,label,precision,recall,f1-score,support,prop
4,neutral,0.76,0.84,0.8,1253,0.481553
3,joyful,0.68,0.54,0.6,398,0.152959
6,surprise,0.57,0.62,0.59,281,0.107994
0,angry,0.56,0.53,0.54,345,0.13259
1,disgust,0.38,0.44,0.41,68,0.026134
5,sad,0.48,0.34,0.4,208,0.079939
2,fear,0.35,0.29,0.31,49,0.018832


Im Plot lässt sich eine Klare Korrelation zwischen der Anzahl Beispielen und dem F1-Score festmachen. Eine neutale Stimmung ist im (Test-) Datensatz mehr als 3x so oft vertreten, wie die zweithäufigste Emotion (joyful). Aktuell gibt es keine Skallierung des Fehlers beim Training, hier könnte man nochmal rein gucken

### Confusion Matrix

*By definition a confusion matrix $C$ is such that $C_{ij}$ is equal to the number of observations known to be in group $i$ and predicted to be in group $j$.
Thus in binary classification, the count of true negatives is $C_{0,0}$, false negatives is $C_{1,0}$, true positives is $C_{1,1}$ and false positives is $C_{0,1}$. [Ref](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)*

In [145]:
import numpy as np

def print_confusion_matrix(targets:pd.Series, predictions:pd.Series, target_labels:Union[List[str],None] = None) -> None:
    target_labels = targets.unique() if target_labels is None else target_labels
    cm = confusion_matrix(targets, predictions, labels = target_labels)
    cm_percent:np.ndarray = (np.sqrt(cm.astype('float')) / np.sqrt(cm.sum(axis=1))[:, np.newaxis])

    img_coordinates = np.arange(len(target_labels)) + 0.5
    x, y = np.meshgrid(img_coordinates, img_coordinates[::-1])
    pairs = np.column_stack((x.ravel(), y.ravel()))

    cm_labels = pd.DataFrame({"labels": cm.ravel(), "x": pairs[:,0], "y": pairs[:,1]})

    p = ggplot() + \
            geom_imshow(cm_percent, vmin=0, vmax=1, cmap="magma",show_legend=False) + \
            geom_text(aes(label = "labels", x = "x", y = "y"), data = cm_labels, size = 8, color="white") + \
            scale_x_discrete(labels=target_labels, breaks = img_coordinates, position="top") + \
            scale_y_discrete(labels=target_labels[::-1], breaks = img_coordinates) + \
            xlab("Predicted") + \
            ylab("True") + \
            theme(axis_line_x='blank', legend_title={"title": "Prozentualer Anteil pro Zeile"})
    
    display(p)

print_confusion_matrix(eval_results["target"], eval_results["output"], target_labels)

#### Auffälligkeiten
- In den meisten Falschklassifizierungen wird "neutral" vorhergesagt
- "suprise" und "joyful" werden häufiger miteinander verwechselt
    - Ist mMn auch recht schwer auseinanderzuhalten bzw. eine Emotion, die sich oft auch überschneiden kann (z.B. freudiges überrascht werden)
- "angry" und "supprise" werden häufiger miteinander verwechselt
    - Hier könnten akkustische Feature hilfreich sein
- Die schlechteste Erkennung hat "fear". Sowofl der F1-Score, als auch die Anzahl Beispiele sind hier am geringsten

**Wo könnten akustische Feature helfen?**
- Ich denke besonders in der Abgrenzung zu "neutral", da eine neutrale Tonlage besonders von *stärkeren* Emotionen unterscheidbar ist. Bei "angry" oder "joyful" stelleich mir vor, dass sich die Tonlage stark verändert und wahrscheinlich auch das Sprechtempo.

## Leistung in Abhängigkeit vom Text
Welchen Einfluss haben Textlänge und Fortschritt, Anzahl der Sprecher ... im Gespräch auf die Leistung?

In [146]:
import re
# from nltk.tokenize import word_tokenize
def get_dialog(prompt:str) -> Tuple[List[str], Set[str]]:
    base_prompt = "Now you are expert of sentiment and emotional analysis. The following conversation noted between '### ###' involves several speakers."
    prompt = prompt.replace(base_prompt, "")
    dialog = re.findall(r"###(.*)###", prompt)[0]
    dialog_text = re.findall(r"Speaker_\d+:\"(.*?)(?=\"\t|\"\s?$)", dialog, re.MULTILINE)
    involved_speakers = set(re.findall(r"(Speaker_\d+)", dialog))
    return dialog_text, involved_speakers

def get_dialog_features(results:pd.DataFrame) -> pd.DataFrame:
    feature = results.copy()
    feature["num_speakers"] = feature["input"].apply(lambda x: len(get_dialog(x)[1]))
    feature["dialog_length"] = feature["input"].apply(lambda x: len(get_dialog(x)[0]))
    feature["target_utterance"] = feature["input"].apply(lambda x: get_dialog(x)[0][-1])
    feature["target_utterance_length"] = feature["target_utterance"].apply(lambda x: len(x.split()))
    feature["correct"] = feature["output"] == feature["target"]
    feature["correct"] = feature["correct"].astype(str)
    return feature

eval_results = get_dialog_features(eval_results)
display(eval_results.sort_values("target_utterance_length").head())
display(eval_results.sort_values("target_utterance_length").head().iloc[0,0])


Unnamed: 0_level_0,input,output,target,num_speakers,dialog_length,target_utterance,target_utterance_length,correct
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
629,Now you are expert of sentiment and emotional ...,neutral,joyful,2,14,What?,1,False
1850,Now you are expert of sentiment and emotional ...,angry,disgust,3,6,Arghh!!,1,False
1233,Now you are expert of sentiment and emotional ...,joyful,neutral,3,3,Hi!,1,False
1235,Now you are expert of sentiment and emotional ...,neutral,neutral,3,5,Bye!,1,True
2085,Now you are expert of sentiment and emotional ...,joyful,joyful,1,1,Hi!,1,True


'Now you are expert of sentiment and emotional analysis. The following conversation noted between \'### ###\' involves several speakers. ### \t Speaker_0:"Come on Joey!!!"\t Speaker_1:"Rach, I told you everything I knew last night!"\t Speaker_1:"Look, it\'s not that big of a deal, so Monica and Chandler are doing it."\t Speaker_0:"I can\'t believe you would say that!"\t Speaker_1:"Sorry. Monica and Chandler are making love."\t Speaker_0:"No! I mean come on! This is a"\t Speaker_1:"I don\'t know."\t Speaker_0:"Is he romantic with her?"\t Speaker_1:"I don\'t know."\t Speaker_0:"Are they in love?"\t Speaker_1:"I don\'t know."\t Speaker_0:"You don\'t know anything."\t Speaker_1:"Ohh, I know one thing!"\t Speaker_0:"What?" ### Please select the emotional label of < Speaker_0:"What?"> from <neutral, surprise, fear, sad, joyful, disgust, angry>:'

### Einfluss der Sprechermenge

In [151]:
from numpy.lib.stride_tricks import sliding_window_view

def normalized_results(results:pd.DataFrame, column:str, correct_column:str = "correct") -> pd.DataFrame:
    num_frequencys = np.bincount(results[column])
    grouped_freqs = results.value_counts([correct_column, column], normalize=False).unstack().swapaxes(0,1).reset_index()
    num_frequencys = num_frequencys[grouped_freqs[column]]
    grouped_freqs[["True", "False"]] = grouped_freqs[["True", "False"]].apply(lambda x: x / num_frequencys, axis=0)
    num_proportion = grouped_freqs.melt(id_vars=[column], value_vars=["True", "False"])
    return num_proportion

def normalized_bar_plot(results:pd.DataFrame, column:str, title:str = None, xlabel:str = None, ylabel:str = "Proportion", show_data_dist:bool = True, bin_width:int = 1, correct_column:str = "correct") -> None:
    num_proportion = normalized_results(results, column, correct_column)
    p = ggplot(num_proportion, aes(x=column, y="value")) + \
            geom_bar(aes(fill=correct_column), position="stack", stat="identity", tooltips=None) + \
            geom_hline(yintercept=0.5) + \
            ylab(ylabel) + \
            xlab(xlabel) + \
            ggtitle(title) + \
            scale_fill_manual(values={"True": "#4ec9b0", "False": "#e41a1c"})
    
    if show_data_dist:
        value_distribution = np.bincount(results[column])
        value_range = value_distribution.size - 1
        value_distribution = np.sum(sliding_window_view(value_distribution, bin_width)[::bin_width], axis=1)
        positions = np.linspace(bin_width//2, value_range - bin_width//2, num = value_distribution.size)
        value_distribution_norm = value_distribution / len(results)
        data = {"x": positions, "prop": value_distribution_norm, "Total": value_distribution}
        p = p + geom_area(
                    aes(y="prop", x="x"), 
                    data = data,
                    tooltips=layer_tooltips(["prop", "Total"]).format("@prop", ".3f")
                ) + \
                geom_point(aes(y="prop", x="x"), data = data,tooltips=None) + \
                ggtitle(title, subtitle="Datenverteilung wird als Linie dargestellt")
    
    return p

normalized_bar_plot(eval_results, "num_speakers", "Anteile (in-)korrekter Vorhersagen je Anzahl der Sprecher", "Anzahl der Sprecher")

Die Vorhersagequalität scheint unabhängig von der Anzahl am Gespräch beteiligter Personen zu sein. <br>
Dennoch ist ein leichter abfallender Trend zu erkennen, wo mit der Anzahl Sprecher die Qualität der Vorhersage abnimmt.<br>
Zu berücksichtigen ist aber auch, dass die Anzahl der Beispiele für viele Sprecher weit aus kleiner ist, als für wenige.

### Einfluss der Dialoglänge
*Note: Die Dialoghistorie wurde auf 20 limitiert*

In [12]:
normalized_bar_plot(eval_results, "dialog_length", "Anteile (in-)korrekter Vorhersagen pro Dialoglänge", "Dialoglänge")

Es ist kein Trend zu erkennen. Da die Peak-Performance bei 3 Schritten (Historie von 2) liegt, könnte man ausprobieren, wie sich ein kleineres Fenster auf die Qualität auswirkt. Allerdings kann dies auch reiner Zufall sein.

### Einfluss der zu klassifizierenden Textlänge 

In [13]:
normalized_bar_plot(eval_results, "target_utterance_length", "Anteile (in-)korrekter Vorhersagen pro Äußerungslänge", "Äußerungslänge", bin_width=3)

Es besteht ein Abwärtstrend bezüglich der Vorhersagequalität und der Länge der zu klassifizierenden Äußerung.
Die Ergebnisse bei besonders langen Ausdrücke weisen Schwankungen auf. Allerdings ist auch die Samplegröße in diesem Bereich verschwindend Gering, wodurch es hier die Verteilung auch Zufall sein kann.

# Ergebnis vorhersagen

In [14]:
from sklearn.linear_model import LogisticRegression
x = eval_results[['target','num_speakers', 'dialog_length', 'target_utterance_length']]
y = eval_results[["correct"]]
def transform_data(x:pd.DataFrame) -> pd.DataFrame:
    to_scale = x.select_dtypes(include=["number"])
    to_one_hot = x.select_dtypes(include=["object"])
    out = to_scale.apply(lambda col: col / col.max(), axis=1)
    if len(to_one_hot.columns) > 0:
        one_hot = pd.get_dummies(to_one_hot).astype(np.int8)
        out = pd.concat([one_hot, out], axis=1)
    return out

x_t = transform_data(x)
y_t = transform_data(y)
# display(x_t.head())

split_point = int(len(x_t)*0.1)
x_e = x_t.iloc[:split_point]
x_t = x_t.iloc[split_point:]
y_e = y_t.iloc[:split_point]
y_t = y_t.iloc[split_point:]
print(len(x_e), len(x_t))

260 2342


In [15]:
from sklearn.metrics import accuracy_score
logRe = LogisticRegression(class_weight='balanced')
logRe:LogisticRegression = logRe.fit(x_t.to_numpy(), y_t.to_numpy()[:,0])
log_eval = logRe.predict(x_e.to_numpy())

In [16]:
accuracy_score(y_e.to_numpy()[:,0],log_eval)

0.6730769230769231

**Das Modell kann nur Raten**

# EDA

In [17]:
from glob import glob
import os

all_data_paths = glob(r"./results/llm/*.json")
def load_raw_data(path:str) -> pd.DataFrame:
    with open(path, "r") as f:
        json_lines = f.readlines()
    json_lines = list(map(json.loads, json_lines))
    df = pd.DataFrame(json_lines)
    df["source"] = os.path.basename(path).split(".")[0]
    df["output"] = df["target"]
    return df

all_data = pd.concat(list(map(load_raw_data, all_data_paths))).reset_index(drop=True)
all_data = get_dialog_features(all_data).drop(["output", "correct"], axis=1)
all_data.head()

Unnamed: 0,input,target,source,num_speakers,dialog_length,target_utterance,target_utterance_length
0,Now you are expert of sentiment and emotional ...,surprise,test,1,1,Why do all youre coffee mugs have numbers on ...,11
1,Now you are expert of sentiment and emotional ...,angry,test,2,2,Oh. Thats so Monica can keep track. That way ...,22
2,Now you are expert of sentiment and emotional ...,neutral,test,2,3,Y'know what?,2
3,Now you are expert of sentiment and emotional ...,neutral,test,1,1,"Come on, Lydia, you can do it.",7
4,Now you are expert of sentiment and emotional ...,joyful,test,1,2,Push!,1


Gibt es einen Zusammenhang zwischen der Ausdruckslänge und der Emotion?

In [18]:
ggplot(all_data, aes(y="target_utterance_length", x="target")) + geom_boxplot()

In [19]:
ggplot(all_data, aes(slice="..count..", fill="target")) + \
    geom_pie(size=15, hole=0.5) + \
    facet_wrap("source", nrow=1) + \
    theme_void()
    

# Klassengewichte

Die Datenverteilung und Confusionmatrix legt nah, dass ein Problem bei der Klassifizierung die Klassenbalance ist. Besonders "fear" wird häufiger als "neutral" klassifiziert als "fear" selbst. <br>
Daher nutze ich in einem weiteren Versuch Klassengewichte, um den Fehlereinfluss der Klassen zu verändern

Berechnung der Klassengewichte mittels `class_weights = n_samples / (n_classes * np.bincount(y))`. class_weights ist ein Vektor, der die Gewichte pro Klasse enthält. <br>
Da ein so starkes Ungleichgewicht besteht, müssen die Gewichte etwas abgeschwächt werden, um keinen übermäßigen Fehler zu erzeugen. Dafür wird folgende Funktion verwendet:
$$
f(x) = 2 - \frac{1}{0.5 + 0.5\cdot \exp(-\alpha x)}
$$
Für $\alpha = 1$ ergibt sich die Kurve im folgenden Plot

In [20]:
def inv_sigmoid(x, alpha = 1) -> np.ndarray:
    return 2 - (1 / (0.5 + 0.5*np.exp(-alpha*x)))

x = np.linspace(0, 10, 100)
y = inv_sigmoid(x, 1)
ggplot({'x': x, 'y': y}, aes(x='x', y='y')) + geom_line()

In [21]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced', classes=target_labels, y=all_data[all_data["source"] == "train"]["target"])
label_weight_mapping = {l:w for l, w in zip(target_labels, class_weights*0.5)}
label_weight_mapping = pd.DataFrame(zip(target_labels, class_weights), columns=["label", "weight"])
label_weight_mapping["new_weight"] = label_weight_mapping["weight"] * inv_sigmoid(label_weight_mapping["weight"], alpha=0)
ggplot(label_weight_mapping, aes(x="label", y="weight")) + geom_bar(stat="identity") + geom_bar(aes(y="new_weight"), stat="identity", fill="red")


## Ergebnis

In [126]:
balanced_results = get_df_from_results(r"./results/llm/preds_for_eval_balanced.text")
balanced_results = balanced_results[balanced_results["output"].isin(balanced_results["target"].unique())]
display(cls_tabel)
display(get_classification_report(balanced_results))
print_confusion_matrix(balanced_results["target"], balanced_results["output"])

Unnamed: 0,label,precision,recall,f1-score,support,prop
4,neutral,0.76,0.84,0.8,1253,0.481553
3,joyful,0.68,0.54,0.6,398,0.152959
6,surprise,0.57,0.62,0.59,281,0.107994
0,angry,0.56,0.53,0.54,345,0.13259
1,disgust,0.38,0.44,0.41,68,0.026134
5,sad,0.48,0.34,0.4,208,0.079939
2,fear,0.35,0.29,0.31,49,0.018832


Unnamed: 0,label,precision,recall,f1-score,support
4,neutral,0.79,0.8,0.79,1253
6,surprise,0.53,0.68,0.59,281
3,joyful,0.67,0.51,0.58,402
0,angry,0.5,0.51,0.5,345
1,disgust,0.4,0.44,0.42,68
5,sad,0.46,0.39,0.42,208
2,fear,0.24,0.35,0.28,49


**Gewichte haben leider in allen, bis auf zwei Kategorien, schlechtere Ergebnisse erzielt**

# Vergleich zwischen Akustik und Text

In [23]:
ac_results = pd.read_csv("./results/audio/test_results.csv", index_col=0)
ac_results.head()

Unnamed: 0,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,predicted_emotion
0,Why do all youre coffee mugs have numbers on t...,Speaker_0,surprise,positive,0,0,3,19,"00:14:38,127","00:14:40,378",neutral
1,Oh. Thats so Monica can keep track. That way i...,Speaker_1,anger,negative,0,1,3,19,"00:14:40,629","00:14:47,385",neutral
2,Y'know what?,Speaker_1,neutral,neutral,0,2,3,19,"00:14:56,353","00:14:57,520",neutral
3,"Come on, Lydia, you can do it.",Speaker_0,neutral,neutral,1,0,1,23,"0:10:44,769","0:10:46,146",neutral
4,Push!,Speaker_0,joy,positive,1,1,1,23,"0:10:46,146","0:10:46,833",neutral


In [24]:
eval_results.head()

Unnamed: 0_level_0,input,output,target,num_speakers,dialog_length,target_utterance,target_utterance_length,correct
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Now you are expert of sentiment and emotional ...,neutral,neutral,3,6,"Okay, okay. Umm, well ah, maybe he, maybe he f...",16,True
1,Now you are expert of sentiment and emotional ...,neutral,neutral,2,4,"It's okay. Chandler, are you afraid of me or s...",10,True
2,Now you are expert of sentiment and emotional ...,angry,angry,2,11,If you give up every time you'd have a fight w...,19,True
3,Now you are expert of sentiment and emotional ...,neutral,angry,3,16,Is not.,2,False
4,Now you are expert of sentiment and emotional ...,angry,surprise,3,8,"Oh, wait a second! I didnt say I wasnt free!",10,False


## Datensätze mittels fuzzy join verbinden

In [39]:
from thefuzz import fuzz

eval_results["key"] = 1
eval_results["row_id"] = range(len(eval_results))
ac_results["key"] = 1

cross_results = pd.merge(eval_results, ac_results, on ='key').drop("key", axis=1)
def calc_fuzzy_score(df:pd.DataFrame):
    wer_ut = df.apply(lambda row: fuzz.ratio(row["target_utterance"], row["Utterance"])/100, axis=1).to_numpy()
    wer_target = df.apply(lambda row: fuzz.ratio(row["target"], row["Emotion"])/100, axis=1).to_numpy()
    return {"ut_fuzz_score": wer_ut, "target_fuzz_score": wer_target}
    
cross_results = cross_results.assign(**calc_fuzzy_score(cross_results))

In [81]:
joined_results = cross_results[(cross_results["ut_fuzz_score"] > 0.8) & (cross_results["target_fuzz_score"] > 0.5)]
best_utt_match_ids = joined_results.groupby("row_id")["ut_fuzz_score"].idxmax()
joined_results = joined_results.loc[best_utt_match_ids]
joined_results = joined_results.drop("row_id", axis=1).sort_values(["Dialogue_ID",	"Utterance_ID"]).reset_index(drop=True)

print(f"Länge LLM ergebnisse {len(eval_results)}")
print(f"Länge Akustische ergebnisse {len(ac_results)}")
print(f"Länge gemeinsamer ergebnisse {len(joined_results)}")
display(joined_results[["target", "Emotion", "target_utterance", "Utterance", "ut_fuzz_score", "target_fuzz_score"]].sort_values("ut_fuzz_score").head())

joined_results = joined_results.rename(columns={
    "output": "llm_output", 
    "predicted_emotion": "ac_output", 
    "correct": "llm_correct", 
    "target": "llm_target"
}).drop([
    "input",
    "Sentiment",
    "Season",
    "Episode",
    "StartTime",
    "EndTime",
    "dialog_length",
    "num_speakers",
    "target_utterance",
    "ut_fuzz_score",
    "target_fuzz_score"
], axis=1)

Länge LLM ergebnisse 2602
Länge Akustische ergebnisse 2608
Länge gemeinsamer ergebnisse 2600


Unnamed: 0,target,Emotion,target_utterance,Utterance,ut_fuzz_score,target_fuzz_score
754,surprise,surprise,OH.MY.GAWD!!!,OH….MY….GAWD!!!,0.87,1.0
831,angry,anger,"Ow, no woo-hooing, no woo-hooing.","Ow, no 'woo-hooing, no 'woo-hooing.",0.92,0.8
2201,neutral,neutral,Its-its hot,Its-its hot,0.92,1.0
2029,neutral,neutral,Yknow?,Yknow?,0.92,1.0
1878,neutral,neutral,Where Where you will seea uha bunch of uh...,Where… Where you will see…a uh…a bunch of uh…...,0.93,1.0


In [82]:
joined_results.head()

Unnamed: 0,llm_output,llm_target,target_utterance_length,llm_correct,Utterance,Speaker,Emotion,Dialogue_ID,Utterance_ID,ac_output
0,neutral,surprise,11,False,Why do all youre coffee mugs have numbers on t...,Speaker_0,surprise,0,0,neutral
1,neutral,angry,22,False,Oh. Thats so Monica can keep track. That way i...,Speaker_1,anger,0,1,neutral
2,neutral,neutral,2,True,Y'know what?,Speaker_1,neutral,0,2,neutral
3,neutral,neutral,7,True,"Come on, Lydia, you can do it.",Speaker_0,neutral,1,0,neutral
4,angry,joyful,1,False,Push!,Speaker_0,joy,1,1,neutral


Die Ergebnisse wurden mittels fuzzy matching über die Utterance verknüpft. Hier können gerade bei kleinen Ausdrücken Fehler entstanden sein. Es wurde aber darauf geachtet, dass die Targets gleich sein müssen, damit ein match entsteht

## Analyse der Unterschiede

In [104]:
joined_results["ac_correct"] = joined_results["Emotion"] == joined_results["ac_output"]
joined_results["only_ac_correct"] = ~joined_results["llm_correct"].replace({'True': True, 'False': False}) & joined_results["ac_correct"]
joined_results["only_llm_correct"] = joined_results["llm_correct"].replace({'True': True, 'False': False}) & ~joined_results["ac_correct"]
joined_results["both_correct"] = joined_results["llm_correct"].replace({'True': True, 'False': False}) & joined_results["ac_correct"]
joined_results["any_correct"] = joined_results["llm_correct"].replace({'True': True, 'False': False}) | joined_results["ac_correct"]

print("Insgesamt korrekt:", len(joined_results[joined_results["any_correct"]]))
print("davon llm:", len(joined_results[joined_results["llm_correct"].replace({'True': True, 'False': False})]))
print("davon akustik:", len(joined_results[joined_results["ac_correct"]]))
print("davon nur akustik:", len(joined_results[joined_results["only_ac_correct"]]))
print("davon nur llm:", len(joined_results[joined_results["only_llm_correct"]]))
print("Prozentualer Anteil korrekt:", round(len(joined_results[joined_results["any_correct"]]) / len(joined_results) * 100, 2), "%")

Insgesamt korrekt: 1979
davon llm: 1739
davon akustik: 1307
davon nur akustik: 240
davon nur llm: 672
Prozentualer Anteil korrekt: 76.12 %


Frage: In welchen Kategorien hat die Akustik Vorteile?

In [114]:
joined_results["exclusive_correct"] = pd.from_dummies(joined_results[["only_ac_correct", "only_llm_correct"]], default_category="none_or_both")
ggplot(joined_results[joined_results["exclusive_correct"] != "none_or_both"], aes(slice="..count..", fill="Emotion")) + \
    geom_pie(size=15, hole=0.5) + \
    facet_wrap("exclusive_correct", nrow=1) + \
    theme_void()
    

In [116]:
ggplot(joined_results[joined_results["both_correct"]], aes(fill="Emotion")) + geom_pie(size=15, hole=0.5) + theme_void()

Wie sieht die Confusionsmatrix der Akustik aus

In [134]:
target_labels = list(joined_results["Emotion"].value_counts().index)
print_confusion_matrix(joined_results["Emotion"], joined_results["ac_output"], target_labels=target_labels)

### Zusammenhänge beim Erkennungserfolg

In [160]:

normalized_bar_plot(joined_results.astype({"only_ac_correct": str}), "target_utterance_length", correct_column="only_ac_correct")