# Gedichte, Dramen, Prosa 1850-1920


In [2]:
import re
from collections import Counter
from pathlib import Path
import time
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

from tqdm.notebook import tqdm_notebook

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_community.chat_models import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
import sklearn.metrics as sm
from sklearn.preprocessing import MultiLabelBinarizer

#import keys  #sets keys for accessing the LLMs

In [6]:
#path to data files
data_dir = Path('../resources')

In [7]:
#some functions we need later on
def date_string():
    t = time.ctime().split(' ')
    h,m,s = t[3].split(':')
    return f"{t[4]}_{t[1]}_{t[2]}_{h}_{m}"


def get_poem(path, filename):
    try:
        with open(path / filename) as fin:
            return fin.read()
    except FileNotFoundError:
        print(f"ERROR: Couldn't find file {filename}")


def vote(series):
    data = Counter(series)
    return data.most_common(1)[0][0]

def compute_ensemble(df):
    """
    computes the voting results for a dataframe
    assumes columns for gpt4o, gemini, claude 
    """
    ensemble = []
    for i, row in df.iterrows():
        ensemble.append(vote([row.gpt4o, row.gemini, row.claude]))
    return ensemble


def add_labels(df):
    #add the emotion labels to the data. makes eval easier
    labels = []
    for lnr, row in df.iterrows():
        label = []
        if row.Agitation > 0.0: 
            label.append('AGITATION')
        if row.Anger > 0.0: 
            label.append('ANGER')
        if row.Fear > 0.0: 
            label.append('FEAR')
        if row.Joy > 0.0: 
            label.append('JOY')
        if row.Love > 0.0: 
            label.append('LOVE')
        if row.Sadness > 0.0: 
            label.append('SADNESS')
        if (row.Agitation==0.0) & (row.Anger==0.0) & (row.Fear==0.0) & (row.Joy==0.0) & (row.Love==0.0) & (row.Sadness==0.0):
            label.append('NONE')
        labels.append("#".join(label))
    
    df['label'] = labels
    return df


# how many poems do we have with no emotions at all?
def no_emotions(df, sample_size=None):
    values = 6*[0]
    df_tmp = df.query(f"Agitation=={values[0]} and \
                         Anger=={values[1]} and \
                         Fear=={values[2]} and \
                         Joy=={values[3]} and \
                         Love=={values[4]} and \
                         Sadness=={values[5]}")
    if sample_size == 'max':    
        return df_tmp.sample(n=len(df_tmp), random_state=42)
    else:
        return df_tmp.sample(n=sample_size, random_state=42)

#cleanup
def extract_terms(series):
    buffer = series.str.findall(f"ANNOTATION: ([A-Z#]+)")
    buffer_ = pd.Series([e[0] if len(e) > 0 else e for e in buffer.values])
    return buffer_.str.split('#').values



# LLMs as annotators
## Data preparation 

In [8]:
df = pd.read_csv(data_dir / 'lyrik_shaver_stanza.csv', sep='\t', index_col=0)
df.head()

Unnamed: 0,DocName,Text,Freude,Glück,Hoffnung,Geborgenheit,Begeisterung,Stolz,Trost,Ausgeglichenheit,...,Agitation,Sadness,Love,Joy,Anger,Fear,Gattung,Ungewissheit,Context,ID
0,"Am Morgen Lingg, Hermann l0080333","Am Morgen\nLingg, Hermann",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Lyrik,0.0,"Am Morgen\nLingg, Hermann",l0080333
1,"Am Morgen Lingg, Hermann l0080333","Ich sah dich im azurnen Schleier,\nIn deinen R...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Lyrik,0.0,"Am Morgen\nLingg, Hermann Ich sah dich im azur...",l0080333
2,"Am Morgen Lingg, Hermann l0080333",Im Lichte deiner Sterne wähnen\nDie treuen Bli...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,Lyrik,0.0,"Am Morgen\nLingg, Hermann Ich sah dich im azur...",l0080333
3,"Am Morgen Lingg, Hermann l0080333",Und eine Hand im Schatten gleitet\nHerüber aus...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,Lyrik,0.0,"Am Morgen\nLingg, Hermann Ich sah dich im azur...",l0080333
7,PHANTASIE WILHELM KLEMM l00300187,PHANTASIE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Lyrik,0.0,PHANTASIE,l00300187


In [9]:
#look at the distributions of the emotions
results = {}
emotions = ['Agitation', 'Anger', 'Fear', 'Joy', 'Love', 'Sadness']
for emotion in emotions:
    results[emotion] = df[emotion].sum()
results    

{'Agitation': 332.0,
 'Anger': 344.0,
 'Fear': 285.0,
 'Joy': 1772.0,
 'Love': 1577.0,
 'Sadness': 1670.0}

In [10]:
#what kind of combinations do we have (limit to 2)

for e in combinations(emotions, 2):
    print(f"{e} - {df.DocName[(df[e[0]] == 1.0) & (df[e[1]]==1.0)].count()}")

('Agitation', 'Anger') - 17
('Agitation', 'Fear') - 19
('Agitation', 'Joy') - 93
('Agitation', 'Love') - 91
('Agitation', 'Sadness') - 78
('Anger', 'Fear') - 36
('Anger', 'Joy') - 95
('Anger', 'Love') - 108
('Anger', 'Sadness') - 115
('Fear', 'Joy') - 90
('Fear', 'Love') - 72
('Fear', 'Sadness') - 113
('Joy', 'Love') - 537
('Joy', 'Sadness') - 597
('Love', 'Sadness') - 490


### Stratified Sampling

In [15]:
SAMPLE_SIZE = 50  #sample size per emotion/None
samples = []
#sample n from each emotion
for e in emotions:
    samples.append(df[df[e] == 1.0].sample(n=SAMPLE_SIZE, random_state=45))

#sample n text segments without any emotions
samples.append(df[(df.Agitation==0.0) & (df.Anger==0.0) & (df.Fear==0.0) & (df.Joy==0.0) & (df.Love==0.0) & (df.Sadness==0.0)].sample(n=SAMPLE_SIZE, random_state=42))

df_strat = pd.concat(samples)   

#randomize sequence of rows
df_strat = df_strat.sample(n=len(df_strat), random_state=42)  


In [16]:
df_strat = add_labels(df_strat)
df_strat.to_csv(data_dir / 'data_stratified_sampling.tsv', sep='\t')
df_strat.head()

Unnamed: 0,DocName,Text,Agitation,Anger,Fear,Joy,Love,Sadness,Context,ID,label
1269,DAS BAD AUF DEM LANDE MAX BROD l0029012,"Ihr Glücklichen! — Ich sah das Liebespaar,\nDa...",0.0,0.0,0.0,1.0,1.0,0.0,DAS BAD AUF DEM LANDE Ihr Glücklichen! — Ich s...,l0029012,JOY#LOVE
4189,"Papa Opitz: Schlaf, Johannes","Papa Opitz:\nSchlaf, Johannes",0.0,0.0,0.0,0.0,0.0,0.0,"Papa Opitz:\nSchlaf, Johannes",l00100092,NONE
1990,"Nachtigallenlied Leander, Richard","Nachtigallenlied\nLeander, Richard",0.0,0.0,0.0,0.0,0.0,0.0,"Nachtigallenlied\nLeander, Richard",l0010130,NONE
2252,"Erst seit Du mein geworden Berthold, Gustav","Erst seit Du mein geworden,\nDäucht mir das Le...",0.0,0.0,0.0,1.0,1.0,0.0,"Erst seit Du mein geworden\nBerthold, Gustav E...",l0080056,JOY#LOVE
6571,"Sehnsucht Weiß, Emil Rudolf","Schmerz und Trost der Schmerzen,\nbist in eine...",0.0,0.0,0.0,1.0,0.0,1.0,"Sehnsucht\nWeiß, Emil Rudolf Oft am langen Tag...",l0030432,JOY#SADNESS


### General Sampling

In [14]:
df_gen = df.sample(n=350, random_state=45)
df_gen = add_labels(df_gen)
df_gen.to_csv(data_dir / 'data_gen_sampling.tsv', sep='\t')
df_gen.head()

Unnamed: 0,DocName,Text,Agitation,Anger,Fear,Joy,Love,Sadness,Context,ID,label
5308,"Acherontisches Frösteln Liliencron, Detlev von...","Schon nascht der Staar die rote Vogelbeere,\nZ...",0.0,0.0,0.0,1.0,0.0,1.0,"Acherontisches Frösteln\nLiliencron, Detlev vo...",l00180014,JOY#SADNESS
4261,"Hand in Hand Grosse, Julius [alt. Große, Julius]","Hand in Hand\nGrosse, Julius [alt. Große, Julius]",0.0,0.0,0.0,0.0,0.0,0.0,"Hand in Hand\nGrosse, Julius [alt. Große, Julius]",l0080170,NONE
6542,"Ballade Des Äusseren Lebens Hofmannsthal, Hugo...",Und süße Früchte werden aus den herben\nUnd fa...,0.0,0.0,0.0,0.0,0.0,0.0,"Ballade Des Äusseren Lebens\nHofmannsthal, Hug...",l00180152,NONE
5798,OPHELIA GEORG HEYM l00300075,"Vorbei, vorbei. Da sich dem Dunkel weiht\nDer ...",0.0,0.0,0.0,0.0,0.0,1.0,Die blauen Lider schatten sanft herab.\nUnd be...,l00300075,SADNESS
5528,"Ein glücklicher Schreibfehler Beyer, Karl Frie...","Hans dankt dem Doktor Braß,\nDaß er am Leben b...",0.0,0.0,0.0,0.0,1.0,0.0,"Ein glücklicher Schreibfehler\nBeyer, Karl Fri...",l00130270,LOVE


## Annotation guide lines

In [15]:
annotation_guidelines = """
A text segment represents an emotion only if it contains at least one clear emotion marker. Possible emotion markers are 
- single words that lexically indicate an emotion ('joy', 'laughter', 'happy', etc.)
- Linguistic images (especially metaphors, such as 'the broken heart')
- Situations typically associated with certain emotions (e.g., the death of a loved one)
- Descriptions of behaviors that typically indicate emotionality ("She pounded her fist on the table...")
- Formal features of direct speech that indicate the speaker's emotionality (e.g., omissions and/or frequent exclamation marks, as in 'This ... is ... just ... madness!!!')

If there are no clear emotion markers, do not be afraid to annotate NONE.

EXAMPLE 1:

Ich lauscht' entzückt dem jubelnden Gesang,
Ich sog erquickt den süßen Waldduft ein,
Der mir in vollem Strom entgegendrang,
Und leise sprach ich in mich selbst hinein:

The text segment contains several clear emotion markers indicating JOY, such as the words „entzücket“ and „jubelndem“.

ANSWER: JOY

EXAMPLE 2:

Was quäl ich mich! Hier trieb vielleicht
schon manches Paar sein loses Spiel,
und sind erglüht und sind erbleicht
und sprachen dann vom Tode viel.
Die See rauscht.

The text segment contains several clear emotion markers indicating SADNESS, such as the word „quäl“ and the mentioning of the situation „death“.  
The text segment also contains several clear emotion markers indicating LOVE, such as the behavior of „playing a loose game“ and the metaphor of „erglüht“. 

ANSWER: SADNESS, LOVE

EXAMPLE 3:

Hier klafft ein Maul, das zahnlos auf sich reißt.
Hier hebt sich zweier Arme schwarzer Stumpf.
Ein Irrer lallt die hohlen Lieder dumpf,
Wo hockt ein Greis, des Schädel Aussatz weißt.

It could be argued that the text segment describes something "negative" in a general sense, but there are not clear enough emotion markers according to the Annotation Guidelines.

ANSWER: NONE
"""


## Preparing the Prompts

### Short Prompt

In [5]:
#short prompt template used in Exp. 1, 3. The placeholders 'stanza' and 'context' are filled when the template is called.
short_prompt_text = """Your task is to annotate the representation of emotion in a stanza of a poem.

There are six possible emotions: ANGER, FEAR, JOY, LOVE, SADNESS, AGITATION. Never use other emotions.

A stanza can represent exactly one emotion, but it can also represent several or NONE of the emotions listed above.

{stanza}

Take into account that the stanza is part of a poem that reads like this before the stanza just quoted:

{context}

In your answer, don't repeat the stanza or the text in your answer! If the stanza does not represent one of the six emotions, answer NONE. Otherwise, answer with the name(s) of the emotion(s) represented in the stanza (ANGER, FEAR, JOY, LOVE, SADNESS, SURPRISE). 
If there is more than one emotion in the stanza, separate their names with a #, e.g. ANGER#FEAR.

"""

short_prompt = ChatPromptTemplate.from_template(short_prompt_text)

### Long Prompt

In [6]:
examples = """
EXAMPLE:

Was quäl ich mich! Hier trieb vielleicht
schon manches Paar sein loses Spiel,
und sind erglüht und sind erbleicht
und sprachen dann vom Tode viel.
Die See rauscht.

ANNOTATION: SADNESS#LOVE
REASONING: The stanza contains several clear emotion markers indicating SADNESS, such as the word „quäl“ and the mentioning of the situation „death“.  The stanza also contains several clear emotion markers indicating LOVE, such as the behavior of „playing a loose game“ and the metaphor of „erglüht“. 


EXAMPLE:

Hier klafft ein Maul, das zahnlos auf sich reißt.
Hier hebt sich zweier Arme schwarzer Stumpf.
Ein Irrer lallt die hohlen Lieder dumpf,
Wo hockt ein Greis, des Schädel Aussatz weißt.

ANNOTATION: NONE
REASONING: It could be argued that the stanza describes something "negative" in a general sense, but there are not clear enough emotion markers according to the Annotation Guidelines.


EXAMPLE:

Der Mond liegt müd am Himmelssaum,
kein Ton, kein Hauch im weiten Raum.
Sein Herz nur schlägt so ungestüm
und lautlos spricht die Zeit mit ihm.
Und zeigt ihm auf den Bergeshöhn
Sein eignes Bildnis mächtig gehn.

ANNOTATION: AGITATION
REASONING: The stanza contains clear emotion markers indicating AGITATION, especially the behavior “Sein Herz nur schlägt so ungestüm”.

EXAMPLE:

Ob auch die Drosseln ihn erstaunt geschaut?
Denn ihr Geschmetter, eben noch so laut,
Verstummte plötzlich. Uns zur Seite trat
Der wackre Arzt, der leisen Schritts genaht:

ANNOTATION: AGITATION
REASONING: The stanza contains clear emotion markers indicating AGITATION, especially the word “erstaunt”, which means surprise. Surprise is part of AGITATION in the sense of the annotation.

EXAMPLE: 

Erzürnt erhob ein Waldbrand seine Flügel,
Die ganze Insel ward zum Aschenhügel,
Und aus der Asche wieder sproßten Reben.

ANNOTATION: ANGER
REASONING: The stanza contains clear emotion markers indicating ANGER, especially the word “erzürnt”.

EXAMPLE:

Denn hassen, hassen muß der Wicht,
Der ein beschränkter Tropf ist,
Das überlegene Gedicht
Deß, der ein freier Kopf ist,

ANNOTATION: ANGER
REASONING: The stanza contains clear emotion markers indicating ANGER, especially the word “hassen”, which means hate. Hate is part of ANGER in the sense of the annotation.

EXAMPLE: 

Und rings kein Laut, als nur ein bang Gewimmer
von einem Zügenglöcklein, das im Dorfe zog
der greise Küster, immer betend, immer.

ANNOTATION: FEAR
REASONING: The stanza contains clear emotion markers indicating FEAR, especially the words “bang” and “Gewimmer”.

EXAMPLE:

Wer mein trübes Geheimnis ahnt,
Wird erblassen,
Er ist gemahnt
Und verlassen.

ANNOTATION: FEAR
REASONING: The stanza contains clear emotion markers indicating FEAR, especially the behavior “erblassen”, which indicates that someone is scared or frightened.

EXAMPLE:

Ja reimt euch Seelen —
bis jauchzend schallt,
eine Riesenorgel,
der Weltenwald!“

ANNOTATION: JOY
REASONING: The stanza contains clear emotion markers indicating JOY, especially the word “jauchzend”.

EXAMPLE:

Alles Frohe will mit mir gehn,
Jünglinge, selige Frauen,
Jauchzend den Frühling zu schauen,
Glückumschlungen am Tor zu stehn.

ANNOTATION: JOY
REASONING: The stanza contains many clear emotion markers indicating JOY, such as the words “Frohe”, “selige”, “Jauchzend” and “Glückumschlungen”.

EXAMPLE:

Ich glaube an der Schöpfung ungemessne Not,
Die gellend auf zum fernen Himmel schreit;
Ich glaube an die Gräber und den Tod,
An alles Herrlichen Vergänglichkeit.

ANNOTATION: SADNESS
REASONING: The stanza contains clear emotion markers indicating SADNESS, especially the words “Not” and “schreit”, but also the situation “Tod”.

EXAMPLE:

Hat mit Ähren sich das Mieder
Unschuldig geschmückt,
Sich den Hut verlegen nieder
In die Stirn gedrückt.

ANNOTATION: SADNESS
REASONING: The stanza contains clear emotion markers indicating SADNESS, especially the word “verlegen”, which indicates shame, which is part of SADNESS in the sense of the annotation.

EXAMPLE:

Es hat die Nacht die bleiche Hand erhoben
Und tausend Sterne hingesät.
Durch mondeshelle Lüfte weht
Zitternd die Sehnsucht von dort oben; —

ANNOTATION: LOVE
REASONING: The stanza contains clear emotion markers indicating LOVE, especially the word “Sehnsucht”. Sehnsucht means longing, which is part of LOVE in the sense of the annotation.

EXAMPLE:

Es hängen meine Lippen reif wie Beeren
Aus heißem Blut und betteln um die Güte,
Küssen zu dürfen. Ach, ich will verhehren
Mit meinen Küssen der Geliebten Blüte.

ANNOTATION: LOVE
REASONING: The stanza contains clear emotion markers indicating LOVE, e.g., the word “Küssen”, which indicates sexual desire, which is part of LOVE in the sense of the annotation.

EXAMPLE:

Die Tanne ragt so hoch und stolz —
Die Sträucher kichern und schwätzen,
Sie haben am stolzen Tannenbaum
Gar vieles auszusetzen.

ANNOTATION: ANGER#JOY
REASONING: The stanza contains clear emotion markers indicating JOY, especially the word “stolz”, which means pride, which is part of JOY in the sense of the annotation. The stanza also contains clear emotion markers indicating ANGER, since the bushes make fun of the fir, which is a behavior indicating dislike.

EXAMPLE:

Schied auch die Muschel lange schon
Vom Meer, das ihre Heimath war —
In ihrer Tiefe rauscht ein Ton
Wie Meeresheimath immerdar.

ANNOTATION: NONE
REASONING: The stanza contains no clear emotion markers.

EXAMPLE:

Auf Moos und Wurzeln klang hohl der Tritt,
Und hinter uns gingen bei jedem Schritt
Waldbäume in schweren Scharen mit.

ANNOTATION: NONE
REASONING: The stanza contains some phrases that could be described as "negative" in a very broad sense ("hohl der Tritt", "schweren Scharen"), but this is certainly not clear enough for an annotation.

"""

### Binary classification

In [7]:
long_prompt_text = """Your task is to annotate the representation of emotion in a stanza of a poem.

There are six possible emotions: AGITATION, ANGER, FEAR, JOY, LOVE, SADNESS. Never use other emotions.

Understand the 6 emotions as technical terms that designate a specific range of sub-emotions each:
ANGER includes dislike, disgust, hatred, envy, contempt and anger.
FEAR includes fear and fright.
JOY includes balance, enthusiasm, joy, solace, happiness, hope, pride and comfort.
LOVE includes admiration, gratitude, love, lust (non-sexual), lust (sexual), longing and affection.
SADNESS includes loneliness, disappointment, sorrow, pity, powerlessness, remorse, shame, grief, discomfort, impatience, insecurity, despair and melancholy.
AGITATION includes unspecified emotionality, surprise, agitation and excitement.

A stanza can represent exactly one emotion, but it can also represent several or NONE of the emotions listed above.

Take these additional guidelines into account:

A stanza represents an emotion only if it contains at least one clear emotion marker. Possible emotion markers are 
- single words that lexically indicate an emotion ('joy', 'laughter', 'happy', etc.)
- Linguistic images (especially metaphors, such as 'the broken heart')
- Situations typically associated with certain emotions (e.g., the death of a loved one)
- Descriptions of behaviors that typically indicate emotionality ("She pounded her fist on the table...")
- Formal features of direct speech that indicate the speaker's emotionality (e.g., omissions and/or frequent exclamation marks, as in 'This ... is ... just ... madness!!!')

If there are no clear emotion markers, the annotation is NONE. Only annotate an emotion, if there are enough explicit emotion markers!

Your answer consists of two parts, an annotation and a reasoning part. Each part is on a separate line. See the following examples for details.

EXAMPLE:

Was quäl ich mich! Hier trieb vielleicht
schon manches Paar sein loses Spiel,
und sind erglüht und sind erbleicht
und sprachen dann vom Tode viel.
Die See rauscht.

ANNOTATION: SADNESS#LOVE
REASONING: The stanza contains several clear emotion markers indicating SADNESS, such as the word „quäl“ and the mentioning of the situation „death“.  The stanza also contains several clear emotion markers indicating LOVE, such as the behavior of „playing a loose game“ and the metaphor of „erglüht“. 


EXAMPLE:

Hier klafft ein Maul, das zahnlos auf sich reißt.
Hier hebt sich zweier Arme schwarzer Stumpf.
Ein Irrer lallt die hohlen Lieder dumpf,
Wo hockt ein Greis, des Schädel Aussatz weißt.

ANNOTATION: NONE
REASONING: It could be argued that the stanza describes something "negative" in a general sense, but there are not clear enough emotion markers according to the Annotation Guidelines.


EXAMPLE:

Der Mond liegt müd am Himmelssaum,
kein Ton, kein Hauch im weiten Raum.
Sein Herz nur schlägt so ungestüm
und lautlos spricht die Zeit mit ihm.
Und zeigt ihm auf den Bergeshöhn
Sein eignes Bildnis mächtig gehn.

ANNOTATION: AGITATION
REASONING: The stanza contains clear emotion markers indicating AGITATION, especially the behavior “Sein Herz nur schlägt so ungestüm”.

EXAMPLE:

Ob auch die Drosseln ihn erstaunt geschaut?
Denn ihr Geschmetter, eben noch so laut,
Verstummte plötzlich. Uns zur Seite trat
Der wackre Arzt, der leisen Schritts genaht:

ANNOTATION: AGITATION
REASONING: The stanza contains clear emotion markers indicating AGITATION, especially the word “erstaunt”, which means surprise. Surprise is part of AGITATION in the sense of the annotation.

EXAMPLE: 

Erzürnt erhob ein Waldbrand seine Flügel,
Die ganze Insel ward zum Aschenhügel,
Und aus der Asche wieder sproßten Reben.

ANNOTATION: ANGER
REASONING: The stanza contains clear emotion markers indicating ANGER, especially the word “erzürnt”.

EXAMPLE:

Denn hassen, hassen muß der Wicht,
Der ein beschränkter Tropf ist,
Das überlegene Gedicht
Deß, der ein freier Kopf ist,

ANNOTATION: ANGER
REASONING: The stanza contains clear emotion markers indicating ANGER, especially the word “hassen”, which means hate. Hate is part of ANGER in the sense of the annotation.

EXAMPLE: 

Und rings kein Laut, als nur ein bang Gewimmer
von einem Zügenglöcklein, das im Dorfe zog
der greise Küster, immer betend, immer.

ANNOTATION: FEAR
REASONING: The stanza contains clear emotion markers indicating FEAR, especially the words “bang” and “Gewimmer”.

EXAMPLE:

Wer mein trübes Geheimnis ahnt,
Wird erblassen,
Er ist gemahnt
Und verlassen.

ANNOTATION: FEAR
REASONING: The stanza contains clear emotion markers indicating FEAR, especially the behavior “erblassen”, which indicates that someone is scared or frightened.

EXAMPLE:

Ja reimt euch Seelen —
bis jauchzend schallt,
eine Riesenorgel,
der Weltenwald!“

ANNOTATION: JOY
REASONING: The stanza contains clear emotion markers indicating JOY, especially the word “jauchzend”.

EXAMPLE:

Alles Frohe will mit mir gehn,
Jünglinge, selige Frauen,
Jauchzend den Frühling zu schauen,
Glückumschlungen am Tor zu stehn.

ANNOTATION: JOY
REASONING: The stanza contains many clear emotion markers indicating JOY, such as the words “Frohe”, “selige”, “Jauchzend” and “Glückumschlungen”.

EXAMPLE:

Ich glaube an der Schöpfung ungemessne Not,
Die gellend auf zum fernen Himmel schreit;
Ich glaube an die Gräber und den Tod,
An alles Herrlichen Vergänglichkeit.

ANNOTATION: SADNESS
REASONING: The stanza contains clear emotion markers indicating SADNESS, especially the words “Not” and “schreit”, but also the situation “Tod”.

EXAMPLE:

Hat mit Ähren sich das Mieder
Unschuldig geschmückt,
Sich den Hut verlegen nieder
In die Stirn gedrückt.

ANNOTATION: SADNESS
REASONING: The stanza contains clear emotion markers indicating SADNESS, especially the word “verlegen”, which indicates shame, which is part of SADNESS in the sense of the annotation.

EXAMPLE:

Es hat die Nacht die bleiche Hand erhoben
Und tausend Sterne hingesät.
Durch mondeshelle Lüfte weht
Zitternd die Sehnsucht von dort oben; —

ANNOTATION: LOVE
REASONING: The stanza contains clear emotion markers indicating LOVE, especially the word “Sehnsucht”. Sehnsucht means longing, which is part of LOVE in the sense of the annotation.

EXAMPLE:

Es hängen meine Lippen reif wie Beeren
Aus heißem Blut und betteln um die Güte,
Küssen zu dürfen. Ach, ich will verhehren
Mit meinen Küssen der Geliebten Blüte.

ANNOTATION: LOVE
REASONING: The stanza contains clear emotion markers indicating LOVE, e.g., the word “Küssen”, which indicates sexual desire, which is part of LOVE in the sense of the annotation.

EXAMPLE:

Die Tanne ragt so hoch und stolz —
Die Sträucher kichern und schwätzen,
Sie haben am stolzen Tannenbaum
Gar vieles auszusetzen.

ANNOTATION: ANGER#JOY
REASONING: The stanza contains clear emotion markers indicating JOY, especially the word “stolz”, which means pride, which is part of JOY in the sense of the annotation. The stanza also contains clear emotion markers indicating ANGER, since the bushes make fun of the fir, which is a behavior indicating dislike.

EXAMPLE:

Schied auch die Muschel lange schon
Vom Meer, das ihre Heimath war —
In ihrer Tiefe rauscht ein Ton
Wie Meeresheimath immerdar.

ANNOTATION: NONE
REASONING: The stanza contains no clear emotion markers.

EXAMPLE:

Auf Moos und Wurzeln klang hohl der Tritt,
Und hinter uns gingen bei jedem Schritt
Waldbäume in schweren Scharen mit.

ANNOTATION: NONE
REASONING: The stanza contains some phrases that could be described as "negative" in a very broad sense ("hohl der Tritt", "schweren Scharen"), but this is certainly not clear enough for an annotation.

Now annotate this stanza:

{stanza}

Take into account that the stanza is part of a poem that reads like this before the stanza just quoted:

{context}

Always use the formating of the examples for your answer. In your answer, don't repeat the stanza or the text in your answer! 
If the stanza does not represent one of the six emotions, answer NONE. Otherwise, answer with the name(s) of the emotion(s) represented in the stanza (AGITATION, ANGER, FEAR, JOY, LOVE, SADNESS). 
If there is more than one emotion in the stanza, separate their names with a #, e.g. AGITATION#ANGER#FEAR.

"""

long_prompt = ChatPromptTemplate.from_template(long_prompt_text)

### Binary classification short prompt

see experiment 6

## Defining the models

In [272]:
models = {}
models['gpt4o'] = ChatOpenAI(model_name = 'gpt-4o')
models['gemini'] = ChatGoogleGenerativeAI(model="gemini-1.5-pro")  #gemini-1.5-pro
models['claude'] = ChatAnthropic(model='claude-3-5-sonnet-20240620')


## 1. Experiment: General sampling. Short prompt.

In [81]:
short_prompt = ChatPromptTemplate.from_template("""Your task is to annotate the representation of emotion in a stanza of a poem.

There are six possible emotions: ANGER, FEAR, JOY, LOVE, SADNESS, AGITATION. Never use other emotions.

A stanza can represent exactly one emotion, but it can also represent several or NONE of the emotions listed above.

{stanza}

Take into account that the stanza is part of a poem that reads like this before the stanza just quoted:

{context}

In your answer, don't repeat the stanza or the text in your answer! If the stanza does not represent one of the six emotions, answer NONE. Otherwise, answer with the name(s) of the emotion(s) represented in the stanza (ANGER, FEAR, JOY, LOVE, SADNESS, SURPRISE). If there is more than one emotion in the stanza, separate their names with a #, e.g. ANGER#FEAR.

""")

### Running the experiment

In [224]:
start = time.time()
all_responses = {}            #collect all raw answers from llms

for model_label, model in tqdm_notebook(models.items()):  
    responses = []            #list of llm responses
    chain = short_prompt | model | StrOutputParser()   #create langchain pipeline
    
    for lnr, row in tqdm_notebook(df_gen.iterrows()):     
        
        try:
            response = chain.invoke({
                                "stanza": {row.Text},
                                "context": {row.Context}, 
                            }) #get answer from llm with the 
        except InternalServerError:
            print(f"Internal Server Error at line {lnr}")

        responses.append(response)

    all_responses[model_label] = responses

end = time.time()
duration = end - start
print(f"\n\ntime: {duration} (secs)")

  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]



time: 1602.8906149864197 (secs)


### Evaluation

In [226]:
dr = pd.DataFrame(all_responses)
dr['gold'] = df_gen.label.values


In [227]:
#for some reason we have to remove the white space in the answers from gemini
dr.gemini = dr.gemini.str.rstrip(' \n')
dr.head()

Unnamed: 0,gpt4o,gemini,claude,gold
0,"SADNESS,Fear","SADNESS, FEAR","SADNESS,FEAR\n\nThe stanza evokes a sense of m...",JOY#SADNESS
1,LOVE,NONE,NONE\n\nThis response is based on the fact tha...,NONE
2,SADNESS,SADNESS,SADNESS\n\nThe stanza evokes a sense of melanc...,NONE
3,SADNESS,SADNESS,SADNESS\n\nThis stanza evokes a sense of melan...,SADNESS
4,JOY,"JOY,SURPRISE","JOY,SURPRISE\n\nThe stanza represents a mix of...",LOVE


In [228]:
dr.to_csv("results_exp_1_raw.tsv", sep='\t')

In [229]:
#clean up the responses
emo_str = 'AGITATION|ANGER|FEAR|JOY|LOVE|SADNESS|NONE'
dr.gpt4o = dr.gpt4o.str.findall(emo_str)
dr.gemini = dr.gemini.str.findall(emo_str)

#claude answers contains the same category repeatedly, so we need to create a set of values
dr.claude = dr.claude.str.findall(emo_str)
buffer = []
for entry in dr.claude.values:
    try:
        buffer.append(set(entry))
    except TypeError:
        buffer.append(entry)

dr.claude =  buffer

In [230]:
dr.gold = dr.gold.str.split('#')

In [231]:
dr

Unnamed: 0,gpt4o,gemini,claude,gold
0,[SADNESS],"[SADNESS, FEAR]","{FEAR, SADNESS}","[JOY, SADNESS]"
1,[LOVE],[NONE],"{LOVE, SADNESS, FEAR, JOY, ANGER, NONE}",[NONE]
2,[SADNESS],[SADNESS],{SADNESS},[NONE]
3,[SADNESS],[SADNESS],{SADNESS},[SADNESS]
4,[JOY],[JOY],{JOY},[LOVE]
...,...,...,...,...
345,[NONE],[SADNESS],{NONE},[NONE]
346,[LOVE],[LOVE],"{LOVE, SADNESS}",[LOVE]
347,[SADNESS],[SADNESS],{SADNESS},[JOY]
348,"[SADNESS, FEAR]","[SADNESS, FEAR]","{FEAR, SADNESS}",[JOY]


In [232]:
emotions = list(map(str.upper, emotions))
emotions.append('NONE')

In [233]:
mlb = MultiLabelBinarizer()

mlb.fit([emotions])
gold = mlb.transform(dr.gold)
gpt4o = mlb.transform(dr.gpt4o)
claude = mlb.transform(dr.claude)
gemini = mlb.transform(dr.gemini)

In [234]:
# compute hamming loss. hamm_loss has 0 if the prediction is perfect and goes to 1 
r_gp = sm.hamming_loss(gold, gpt4o)
r_cl = sm.hamming_loss(gold, claude)
r_go = sm.hamming_loss(gold, gemini)

print(f"Results: \nChatGPT: \t{r_gp}\nClaude: \t{r_cl}\nGemini: \t{r_go}\n")

Results: 
ChatGPT: 	0.17510204081632652
Claude: 	0.2016326530612245
Gemini: 	0.18244897959183673



In [235]:
gold = pd.DataFrame(gold, columns=mlb.classes_)
gpt4o = pd.DataFrame(gpt4o, columns=mlb.classes_)
gemini = pd.DataFrame(gemini, columns=mlb.classes_)
claude = pd.DataFrame(claude, columns=mlb.classes_)

In [236]:
print("*** ACCURACY ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(accuracy_score(gold[emotion], modelresults[emotion]), 2)}")
    print()


*** ACCURACY ***

**gpt4o**
AGITATION: 0.93
ANGER: 0.95
FEAR: 0.91
JOY: 0.75
LOVE: 0.72
SADNESS: 0.75
NONE: 0.76

**claude**
AGITATION: 0.93
ANGER: 0.93
FEAR: 0.84
JOY: 0.69
LOVE: 0.75
SADNESS: 0.69
NONE: 0.75

**gemini**
AGITATION: 0.93
ANGER: 0.93
FEAR: 0.91
JOY: 0.76
LOVE: 0.74
SADNESS: 0.75
NONE: 0.7



In [237]:

print("*** PRECISION ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(sm.precision_score(gold[emotion], modelresults[emotion], zero_division=np.nan), 2)}")
    print()


*** PRECISION ***

**gpt4o**
AGITATION: nan
ANGER: 0.43
FEAR: 0.17
JOY: 0.5
LOVE: 0.48
SADNESS: 0.52
NONE: 0.86

**claude**
AGITATION: nan
ANGER: 0.27
FEAR: 0.11
JOY: 0.42
LOVE: 0.53
SADNESS: 0.45
NONE: 0.93

**gemini**
AGITATION: nan
ANGER: 0.26
FEAR: 0.12
JOY: 0.53
LOVE: 0.51
SADNESS: 0.51
NONE: 0.73



In [238]:
print("*** RECALL ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(sm.recall_score(gold[emotion], modelresults[emotion]), 2)}")
    print()

*** RECALL ***

**gpt4o**
AGITATION: 0.0
ANGER: 0.43
FEAR: 0.42
JOY: 0.51
LOVE: 0.68
SADNESS: 0.77
NONE: 0.44

**claude**
AGITATION: 0.0
ANGER: 0.5
FEAR: 0.5
JOY: 0.58
LOVE: 0.76
SADNESS: 0.77
NONE: 0.38

**gemini**
AGITATION: 0.0
ANGER: 0.36
FEAR: 0.25
JOY: 0.53
LOVE: 0.72
SADNESS: 0.68
NONE: 0.33



In [239]:
print("*** F1-SCORE ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(f1_score(gold[emotion], modelresults[emotion]), 2)}")
    print()


*** F1-SCORE ***

**gpt4o**
AGITATION: 0.0
ANGER: 0.43
FEAR: 0.24
JOY: 0.5
LOVE: 0.57
SADNESS: 0.62
NONE: 0.58

**claude**
AGITATION: 0.0
ANGER: 0.35
FEAR: 0.18
JOY: 0.49
LOVE: 0.62
SADNESS: 0.57
NONE: 0.53

**gemini**
AGITATION: 0.0
ANGER: 0.3
FEAR: 0.16
JOY: 0.53
LOVE: 0.6
SADNESS: 0.59
NONE: 0.46



In [241]:
#how good is an ensemble approach here?
#ensemble = compute_ensemble(dr)
#accuracy_score(dr.gold, ensemble)

In [242]:
dr.to_csv(data_dir / "exp_1_gen_sample_short_prompt.csv", sep='\t')

## 2. Experiment: General Sampling. Long prompt.

In [19]:
models = {}
#models['gpt4o'] = ChatOpenAI(model_name = 'gpt-4o')
#models['gemini'] = ChatGoogleGenerativeAI(model="gemini-1.5-pro")  #gemini-1.5-pro
models['claude'] = ChatAnthropic(model='claude-3-5-sonnet-20240620')


In [None]:
start = time.time()
all_responses = {}            #collect all raw answers from llms
start_nr = 228     #allows to start at specific point, if the process was interrupted before
cnt = 0

with open(f"logging_2_exp_{date_string()}.txt", "w") as fout:

    for model_label, model in tqdm_notebook(models.items()):  
        responses = []            #list of llm responses
        chain = long_prompt | model | StrOutputParser()   #create langchain pipeline
        
        for lnr, row in tqdm_notebook(df_gen.iterrows()):     
            if start_nr <= cnt:            
                try:
                    response = chain.invoke({
                                        "stanza": {row.Text},
                                        "context": {row.Context}, 
                                    }) #get answer from llm with the 
                except InternalServerError:
                    print(f"Internal Server Error at line {lnr}")
        
                responses.append(response)
                fout.write(f"{lnr}: {response}\n\n")
            cnt += 1
    
        all_responses[model_label] = responses

end = time.time()
duration = end - start
print(f"\n\ntime: {duration} (secs)")

## 3. Experiment: Stratified Sampling. Short Prompt.

### Running the experiment

In [243]:
start = time.time()
all_responses = {}            #collect all raw answers from llms

for model_label, model in tqdm_notebook(models.items()):  
    responses = []            #list of llm responses
    chain = short_prompt | model | StrOutputParser()   #create langchain pipeline
    
    for lnr, row in tqdm_notebook(df_strat.iterrows()):     
        
        try:
            response = chain.invoke({
                                "stanza": {row.Text},
                                "context": {row.Context}, 
                            }) #get answer from llm with the 
        except InternalServerError:
            print(f"Internal Server Error at line {lnr}")

        responses.append(response)

    all_responses[model_label] = responses

end = time.time()
duration = end - start
print(f"\n\ntime: {duration} (secs)")

  0%|          | 0/3 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]



time: 1659.6629469394684 (secs)


### Evaluation

In [258]:
dr = pd.DataFrame(all_responses)
dr['gold'] = df_strat.label.values
#for some reason we have to remove the white space in the answers from gemini
dr.gemini = dr.gemini.str.rstrip(' \n')
dr.to_csv("results_exp_3_raw.tsv", sep='\t')

#clean up the responses
emo_str = 'AGITATION|ANGER|FEAR|JOY|LOVE|SADNESS|NONE'
dr.gpt4o = dr.gpt4o.str.findall(emo_str)
dr.gemini = dr.gemini.str.findall(emo_str)

#claude answers contains the same category repeatedly, so we need to create a set of values
dr.claude = dr.claude.str.findall(emo_str)
buffer = []
for entry in dr.claude.values:
    try:
        buffer.append(list(set(entry)))
    except TypeError:
        buffer.append(entry)

dr.claude =  buffer

dr.gold = dr.gold.str.split('#')

dr

Unnamed: 0,gpt4o,gemini,claude,gold
0,"[LOVE, JOY]","[LOVE, SADNESS]","[JOY, LOVE]","[JOY, LOVE]"
1,[LOVE],"[LOVE, SADNESS]",[NONE],[NONE]
2,[NONE],[NONE],[NONE],[NONE]
3,"[LOVE, JOY]","[JOY, LOVE]","[JOY, LOVE]","[JOY, LOVE]"
4,"[SADNESS, LOVE]","[LOVE, SADNESS]","[LOVE, SADNESS]","[JOY, SADNESS]"
...,...,...,...,...
345,[SADNESS],"[ANGER, SADNESS]","[ANGER, SADNESS]","[AGITATION, ANGER]"
346,"[SADNESS, LOVE]","[SADNESS, LOVE]","[FEAR, SADNESS]",[FEAR]
347,"[FEAR, SADNESS]","[SADNESS, FEAR]",[SADNESS],[SADNESS]
348,"[LOVE, SADNESS]","[LOVE, SADNESS]","[LOVE, SADNESS]",[NONE]


In [259]:
mlb = MultiLabelBinarizer()

mlb.fit([emotions])
gold = mlb.transform(dr.gold)
gpt4o = mlb.transform(dr.gpt4o)
claude = mlb.transform(dr.claude)
gemini = mlb.transform(dr.gemini)

In [260]:
gold = pd.DataFrame(gold, columns=mlb.classes_)
gpt4o = pd.DataFrame(gpt4o, columns=mlb.classes_)
gemini = pd.DataFrame(gemini, columns=mlb.classes_)
claude = pd.DataFrame(claude, columns=mlb.classes_)

In [261]:
gold.sum()

AGITATION     64
ANGER         68
FEAR          64
JOY          124
LOVE         124
NONE          50
SADNESS      125
dtype: int64

In [262]:
gpt4o

Unnamed: 0,AGITATION,ANGER,FEAR,JOY,LOVE,NONE,SADNESS
0,0,0,0,1,1,0,0
1,0,0,0,0,1,0,0
2,0,0,0,0,0,1,0
3,0,0,0,1,1,0,0
4,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...
345,0,0,0,0,0,0,1
346,0,0,0,0,1,0,1
347,0,0,1,0,0,0,1
348,0,0,0,0,1,0,1


In [263]:
print("*** ACCURACY ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(accuracy_score(gold[emotion], modelresults[emotion]), 2)}")
    print()


*** ACCURACY ***

**gpt4o**
AGITATION: 0.82
ANGER: 0.85
FEAR: 0.87
JOY: 0.69
LOVE: 0.73
SADNESS: 0.73
NONE: 0.87

**claude**
AGITATION: 0.82
ANGER: 0.84
FEAR: 0.83
JOY: 0.68
LOVE: 0.75
SADNESS: 0.67
NONE: 0.89

**gemini**
AGITATION: 0.82
ANGER: 0.86
FEAR: 0.87
JOY: 0.7
LOVE: 0.74
SADNESS: 0.73
NONE: 0.87



In [264]:
print("*** PRECISION ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(sm.precision_score(gold[emotion], modelresults[emotion], zero_division=np.nan), 2)}")
    print()


*** PRECISION ***

**gpt4o**
AGITATION: nan
ANGER: 0.79
FEAR: 0.67
JOY: 0.58
LOVE: 0.62
SADNESS: 0.59
NONE: 0.59

**claude**
AGITATION: nan
ANGER: 0.7
FEAR: 0.53
JOY: 0.56
LOVE: 0.64
SADNESS: 0.53
NONE: 0.78

**gemini**
AGITATION: nan
ANGER: 0.77
FEAR: 0.68
JOY: 0.62
LOVE: 0.62
SADNESS: 0.6
NONE: 0.6



In [265]:
print("*** RECALL ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(sm.recall_score(gold[emotion], modelresults[emotion]), 2)}")
    print()

*** RECALL ***

**gpt4o**
AGITATION: 0.0
ANGER: 0.32
FEAR: 0.62
JOY: 0.4
LOVE: 0.65
SADNESS: 0.77
NONE: 0.34

**claude**
AGITATION: 0.0
ANGER: 0.34
FEAR: 0.56
JOY: 0.51
LOVE: 0.69
SADNESS: 0.79
NONE: 0.36

**gemini**
AGITATION: 0.0
ANGER: 0.4
FEAR: 0.56
JOY: 0.42
LOVE: 0.68
SADNESS: 0.71
NONE: 0.3



In [266]:
print("*** F1-SCORE ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(f1_score(gold[emotion], modelresults[emotion]), 2)}")
    print()


*** F1-SCORE ***

**gpt4o**
AGITATION: 0.0
ANGER: 0.46
FEAR: 0.65
JOY: 0.48
LOVE: 0.63
SADNESS: 0.67
NONE: 0.43

**claude**
AGITATION: 0.0
ANGER: 0.46
FEAR: 0.55
JOY: 0.53
LOVE: 0.66
SADNESS: 0.63
NONE: 0.49

**gemini**
AGITATION: 0.0
ANGER: 0.52
FEAR: 0.62
JOY: 0.5
LOVE: 0.65
SADNESS: 0.65
NONE: 0.4



In [267]:
print("*** Cohen Kappa ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(sm.cohen_kappa_score(gold[emotion], modelresults[emotion]), 2)}")
    print()


*** Cohen Kappa ***

**gpt4o**
AGITATION: 0.0
ANGER: 0.39
FEAR: 0.57
JOY: 0.26
LOVE: 0.42
SADNESS: 0.44
NONE: 0.36

**claude**
AGITATION: 0.0
ANGER: 0.38
FEAR: 0.44
JOY: 0.29
LOVE: 0.47
SADNESS: 0.36
NONE: 0.44

**gemini**
AGITATION: 0.0
ANGER: 0.45
FEAR: 0.54
JOY: 0.3
LOVE: 0.44
SADNESS: 0.43
NONE: 0.34



## 4. Experiment: Stratified Sampling. Long Prompt.

In [415]:
models = {}
#models['gpt4o'] = ChatOpenAI(model_name = 'gpt-4o')
#models['gemini'] = ChatGoogleGenerativeAI(model="gemini-1.5-pro")  #gemini-1.5-pro
models['claude'] = ChatAnthropic(model='claude-3-5-sonnet-20240620')


In [None]:
start = time.time()
all_responses = {}            #collect all raw answers from llms
start_nr = 64                  #allows to start later at a specific point, if the process was interrupted before
cnt = 0

with open(f"logging_2_exp_{date_string()}.txt", "w") as fout:

    for model_label, model in tqdm_notebook(models.items()):  
        print(model_label)
        responses = []            #list of llm responses
        chain = long_prompt | model | StrOutputParser()   #create langchain pipeline
        
        for lnr, row in tqdm_notebook(df_strat.iterrows()):     
            if start_nr <= cnt:            
                try:
                    response = chain.invoke({
                                        "stanza": {row.Text},
                                        "context": {row.Context}, 
                                    }) #get answer from llm with the 
                except InternalServerError:
                    print(f"Internal Server Error at line {lnr}")
                except RateLimitError:
                    print(f"RateLimitError with model {model_label}.\nCounter:  {cnt}")
                    print("Starting next model...")
                    break
        
                responses.append(response)
                fout.write(f"{lnr}: {response}\n\n")
                time.sleep(2)
            cnt += 1
    
        all_responses[model_label] = responses

end = time.time()
duration = end - start
print(f"\n\ntime: {duration} (secs)")

### Evaluation 

In [407]:
dr = pd.DataFrame(all_responses, index=df_strat.index)
dr['gold'] = df_strat.label.values
#for some reason we have to remove the white space in the answers from gemini
dr.gemini = dr.gemini.str.rstrip(' \n')
dr.to_csv("results_exp_4_raw.tsv", sep='\t')
dr.head(2)


Unnamed: 0,gpt4o,gemini,gold
1269,ANNOTATION: JOY#LOVE\nREASONING: The stanza co...,ANNOTATION: LOVE#SADNESS\nREASONING: The stanz...,JOY#LOVE
4189,ANNOTATION: NONE\nREASONING: The stanza contai...,ANNOTATION: NONE\nREASONING: The stanza contai...,NONE


In [408]:

#clean up the responses
dr.gpt4o = extract_terms(dr.gpt4o)
dr.gemini = extract_terms(dr.gemini)
#dr.claude = extract_terms(dr.claude)

dr.gold = dr.gold.str.split('#')

dr.head()

Unnamed: 0,gpt4o,gemini,gold
1269,"[JOY, LOVE]","[LOVE, SADNESS]","[JOY, LOVE]"
4189,[NONE],[NONE],[NONE]
1990,[NONE],[NONE],[NONE]
2252,"[JOY, LOVE]",,"[JOY, LOVE]"
6571,"[SADNESS, LOVE]","[SADNESS, LOVE]","[JOY, SADNESS]"


In [410]:
mlb = MultiLabelBinarizer()

mlb.fit([emotions])
gold = mlb.transform(dr.gold)
gpt4o = mlb.transform(dr.gpt4o)
#claude = mlb.transform(dr.claude)
#gemini = mlb.transform(dr.gemini)

In [411]:
gold = pd.DataFrame(gold, columns=mlb.classes_)
gpt4o = pd.DataFrame(gpt4o, columns=mlb.classes_)
#gemini = pd.DataFrame(gemini, columns=mlb.classes_)
#claude = pd.DataFrame(claude, columns=mlb.classes_)

In [None]:
print("*** F1-SCORE ***\n")

for modelname, modelresults in zip(['gpt4o', 'claude', 'gemini'], [gpt4o, claude, gemini]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(f1_score(gold[emotion], modelresults[emotion]), 2)}")
    print()


In [412]:
for modelname, modelresults in zip(['gpt4o'], [gpt4o]):
    print(f"**{modelname}**")
    for emotion in emotions:
        print(f"{emotion}: {round(f1_score(gold[emotion], modelresults[emotion]), 2)}")
    print()


**gpt4o**
AGITATION: 0.19
ANGER: 0.66
FEAR: 0.66
JOY: 0.6
LOVE: 0.68
SADNESS: 0.71
NONE: 0.55



## 5. Experiment: Binary Classification. Stratified Sampling. Short/Long Prompt. 

In [9]:
model = ChatOpenAI(model_name = 'gpt-4o')

### Prompt

In [68]:
#short prompt template used in Exp. 1, 3. The placeholders 'stanza' and 'context' are filled when the template is called.
short_binary_prompt_text = """Your task is to annotate the representation of a specific emotion in a stanza of a poem, the emotion {emotion}.

A stanza can represent this one emotion {emotion}, or it represents either other emotions or no emotions. 
If {emotion} is present, annotate the stanza with the label {emotion}. If the emotion is not present, label this stanza as OTHER. 

Here is the stanza

{stanza}

Take into account that the stanza is part of a poem that reads like this before the stanza just quoted:

{context}

In your answer, don't repeat the stanza or the text in your answer! Only use the label {emotion} or OTHER in your answer.
Use capital letters for the labels.

"""

short_binary_prompt = ChatPromptTemplate.from_template(short_binary_prompt_text)

###  Running the experiment 
Data creation is done in the experimental loop

In [19]:
emotion_list = ['Agitation','Anger', 'Fear', 'Joy', 'Love', 'Sadness']

start = time.time()
all_responses = {}            #collect all raw answers from llms
start_nr = 0                  #allows to start later at a specific point, if the process was interrupted before
cnt = 0

with open(f"logging_5_exp_{date_string()}.txt", "w") as fout:

    for emotion in tqdm_notebook(emotion_list):
        #build testdata    
        dfs_1 = df_strat[df_strat[emotion]==1.0].sample(n=60, random_state=42)
        dfs_2 = df_strat[df_strat[emotion]==0.0].sample(n=60, random_state=42)
        df_select = pd.concat([dfs_1, dfs_2])
        df_select = df_select.sample(n=len(df_select), random_state=1239)
        df_select.to_csv(f'bc_{emotion}.csv', sep='\t')
        fout.write(f"emotion: {emotion.upper()}\n\n") 
        
        responses = []            #list of llm responses
        chain = short_binary_prompt | model | StrOutputParser()   #create langchain pipeline
            
        for lnr, row in tqdm_notebook(df_select.iterrows()):     
            response = chain.invoke({
                                "stanza": {row.Text},
                                "context": {row.Context},
                                "emotion": {emotion.upper()}
                            }) #get answer from llm with the 
    
            responses.append(response)
            fout.write(f"{lnr}: {response}\n\n")
    
        all_responses[emotion] = responses

end = time.time()
duration = end - start
print(f"\n\ntime: {duration} (secs)")

  0%|          | 0/6 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]



time: 425.36023354530334 (secs)


## 6. Experiment: *Strict* binary classification

Is there a correlation between the amount of other emotions in a category and the classification performance? If this is the case, the classification results should change markedly, when we use a strict binary classification- We us the term 'strict binary' to refer to a setup, where we make sure that each text segment either shows emotion A (*and only this emotion*) or doesn't show emotion A (*could be None or any other emotion*). 

As many stanzas have more than one emotion, first we have to make sure that we have enough examples for these

In [71]:
model = ChatOpenAI(model_name = 'gpt-4o')

### Prompt

In [69]:
#short prompt template used in Exp. 1, 3. The placeholders 'stanza' and 'context' are filled when the template is called.
short_binary_prompt_text = """Your task is to annotate the representation of a specific emotion in a stanza of a poem, the emotion {emotion}.

A stanza can represent this one emotion {emotion}, or it represents either other emotions or no emotions. 
If {emotion} is present, annotate the stanza with the label {emotion}. If the emotion is not present, label this stanza as OTHER. 

Here is the stanza

{stanza}

Take into account that the stanza is part of a poem that reads like this before the stanza just quoted:

{context}

In your answer, don't repeat the stanza or the text in your answer! Only use the label {emotion} or OTHER in your answer.
Use capital letters for the labels.

"""

short_binary_prompt = ChatPromptTemplate.from_template(short_binary_prompt_text)

### Data Preparation

In [133]:
emotions = ['Agitation','Anger', 'Fear', 'Joy', 'Love', 'Sadness']

In [53]:
    
    
print(len(no_emotions(df, sample_size='max')))

3017


In [135]:
data = {}
length = 6
SAMPLE_SIZE=60
for i, emotion in enumerate(emotions):
    values = length*[0]
    values[i] = 1
    #get one emotion=1 and all other emotions=0
    data[emotion] = df.query(f"Agitation=={values[0]} and \
                     Anger=={values[1]} and \
                     Fear=={values[2]} and \
                     Joy=={values[3]} and \
                     Love=={values[4]} and \
                     Sadness=={values[5]}").sample(n=SAMPLE_SIZE, random_state=42)

    #negative category:
    df_none = df.query(f"{emotion}==0.").sample(n=SAMPLE_SIZE, random_state=43)
    data[emotion] = pd.concat([data[emotion], df_none]).sample(n=2*SAMPLE_SIZE, random_state=43)


In [137]:
#sanity check
len(data['Love'])

120

### Running the experiment

In [138]:
emotion_list = ['Agitation','Anger', 'Fear', 'Joy', 'Love', 'Sadness']

start = time.time()
all_responses = {}            #collect all raw answers from llms
start_nr = 0                  #allows to start later at a specific point, if the process was interrupted before
cnt = 0

with open(f"logging_6_exp_{date_string()}.txt", "w") as fout:

    for emotion in tqdm_notebook(emotion_list):
        df_select = data[emotion]
        df_select.to_csv(f'bcs_{emotion}.csv', sep='\t')
        fout.write(f"emotion: {emotion.upper()}\n\n") 
        
        responses = []            #list of llm responses
        chain = short_binary_prompt | model | StrOutputParser()   #create langchain pipeline
            
        for lnr, row in tqdm_notebook(df_select.iterrows()):     
            response = chain.invoke({
                                "stanza": {row.Text},
                                "context": {row.Context},
                                "emotion": {emotion.upper()}
                            }) #get answer from llm with the 
    
            responses.append(response)
            fout.write(f"{lnr}: {response}\n\n")
    
        all_responses[emotion] = responses

end = time.time()
duration = end - start
print(f"\n\ntime: {duration} (secs)")

  0%|          | 0/6 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]



time: 376.23755621910095 (secs)


### Evaluation

In [139]:
dr = pd.DataFrame(all_responses)
dr.head(3)


Unnamed: 0,Agitation,Anger,Fear,Joy,Love,Sadness
0,OTHER,OTHER,OTHER,OTHER,OTHER,{'SADNESS'}
1,OTHER,OTHER,OTHER,{'JOY'},OTHER,OTHER
2,OTHER,OTHER,OTHER,OTHER,OTHER,OTHER


In [140]:
def extract_terms(series, term):
    return series.str.contains(f"{term.upper()}").astype(int)


In [141]:
for emotion in emotion_list:
    score = accuracy_score(data[emotion][emotion], extract_terms(dr[emotion], emotion))
    print(f"{emotion}: {round(score, 2)}")

Agitation: 0.52
Anger: 0.77
Fear: 0.79
Joy: 0.71
Love: 0.69
Sadness: 0.82


In [142]:
scores = []
for emotion in emotion_list:
    score = f1_score(data[emotion][emotion], extract_terms(dr[emotion], emotion))
    print(f"{emotion}: {round(score, 2)}")
    scores.append(score)

Agitation: 0.41
Anger: 0.71
Fear: 0.79
Joy: 0.65
Love: 0.66
Sadness: 0.82


In [143]:
bc_strict = pd.Series(scores, index=bc_loose.index)
bc_strict.to_csv("results_binare_classification_negative_is_all_None.csv")
bc_strict

Agitation    0.408163
Anger        0.708333
Fear         0.793388
Joy          0.653465
Love         0.660550
Sadness      0.819672
dtype: float64

In [144]:
bc_loose = pd.DataFrame.from_dict({'Agitation': 0.5, 'Anger': 0.56, 'Fear': 0.75, 'Joy': 0.46, 'Love': 0.66 , 'Sadness': 0.76}, orient='index', columns=['value'])
bc_loose

Unnamed: 0,value
Agitation,0.5
Anger,0.56
Fear,0.75
Joy,0.46
Love,0.66
Sadness,0.76
