In [1]:
import pandas as pd
import numpy as np
from prettytable import PrettyTable

from sklearn.metrics import cohen_kappa_score

In [2]:
# --- Text generators: annotation files for the sentiment-story dataset ---
ANNOTATION_BASE_PATH = '../../../../03_datasets/sentiment-analysis_stories/annotations'

# Display name of each text generator -> its annotation JSONL file.
MODEL = {
    'Llama-3.3-70B': f'{ANNOTATION_BASE_PATH}/25-04-24_Annotator-Training_Llama3-3.jsonl',
    'Llama-2-7B': f'{ANNOTATION_BASE_PATH}/25-04-24_Annotator-Training_Llama2.jsonl',
    'Mistral-7B': f'{ANNOTATION_BASE_PATH}/25-04-24_Annotator-Training_Mistral.jsonl',
}

# --- Image generators: annotation files for the animals dataset ---
# NOTE: ANNOTATION_BASE_PATH is re-pointed here; the MODEL paths above were
# already expanded, so they keep referring to the sentiment-story directory.
ANNOTATION_BASE_PATH = '../../../../03_datasets/animals/annotations'

# Display name of each image generator -> its annotation JSONL file.
IMAGE_MODEL = {
    'Stable Diffusion 3.5': f'{ANNOTATION_BASE_PATH}/25-05-05_Stable-Diffusion-3.5.jsonl',
    'Stable Cascade': f'{ANNOTATION_BASE_PATH}/25-05-05_Stable-Cascade.jsonl',
    'FLUX.1-dev': f'{ANNOTATION_BASE_PATH}/25-05-05_FLUX.1-dev.jsonl',
}


In [3]:
# Sentiment labels, in the order they are tried.
sentiments = ['positive', 'neutral', 'negative']


def get_sentiment(match):
    """Return the first label of ``sentiments`` for which ``label in match`` holds.

    ``match`` may be any container supporting ``in`` (list membership or
    substring search for a string). Returns ``None`` when no label is found.
    """
    return next((label for label in sentiments if label in match), None)
        
# Title-match labels, in the order they are tried for container inputs.
title_match = ['match', 'no_match']


def get_title_match(match):
    """Return the title-match label found in ``match``, or ``None``.

    For a list-like ``match`` the labels of ``title_match`` are tested by
    membership in their declared order.

    For a string ``match`` the test is a substring search. Because ``'match'``
    is itself a substring of ``'no_match'``, the labels are tried longest-first
    so that a value such as ``'no_match'`` is not misreported as ``'match'``.
    """
    candidates = title_match
    if isinstance(match, str):
        # Longest label first: avoids 'match' shadowing 'no_match'.
        candidates = sorted(title_match, key=len, reverse=True)
    for m in candidates:
        if m in match:
            return m
    return None
        
# Known annotator tags, in the order they are tried.
annotators = ['saaf', 'deri', 'vode']


def get_annotator(match):
    """Return the first tag of ``annotators`` contained in ``match``, else ``None``."""
    hits = [tag for tag in annotators if tag in match]
    return hits[0] if hits else None
        
# Animal-type labels, in the order they are tried.
animal_match = ['animal_match', 'animal_no_match']


def get_animal_match(match):
    """Return the first label of ``animal_match`` found ``in`` ``match``, else ``None``."""
    return next((label for label in animal_match if label in match), None)
        
# Animal-count labels, in the order they are tried.
count_match = ['count_match', 'count_no_match']


def get_count_match(match):
    """Return the first label of ``count_match`` found ``in`` ``match``, else ``None``."""
    found = filter(lambda label: label in match, count_match)
    return next(found, None)

def get_oracle_ratings_per_annotator(path, annotator_ids=('deri', 'vode')):
    """Load a text-annotation JSONL file and split the ratings by annotator.

    Each record is expected to provide ``meta['id']``, ``_annotator_id`` and
    ``accept``. The annotator tag, sentiment label and title-match label are
    derived with ``get_annotator``, ``get_sentiment`` and ``get_title_match``.

    Parameters
    ----------
    path : str
        Path of a line-delimited JSON file.
    annotator_ids : sequence of two str, optional
        Tags of the two annotators to compare (default ``('deri', 'vode')``).
        Any other length raises ``ValueError`` when unpacking.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        One frame per annotator with columns ``id``, ``oracle`` and
        ``oracle_title_match``. Both frames are stable-sorted by ``id`` and
        re-indexed from 0, so that row ``i`` refers to the same item in both
        frames whenever both annotators rated the same set of items. The
        callers compare the frames position by position, which is only
        meaningful under that alignment.
    """
    df = pd.read_json(path, orient='records', lines=True)
    df['id'] = df['meta'].apply(lambda m: m['id'])
    df['annotator'] = df['_annotator_id'].apply(get_annotator)
    df['oracle'] = df['accept'].apply(get_sentiment)
    df['oracle_title_match'] = df['accept'].apply(get_title_match)

    columns = ['id', 'oracle', 'oracle_title_match']
    df_an1, df_an2 = (
        df.loc[df['annotator'] == annotator_id, columns]
        # mergesort is stable: repeated ids keep their file order
        .sort_values('id', kind='mergesort')
        .reset_index(drop=True)
        for annotator_id in annotator_ids
    )

    return df_an1, df_an2
    
def get_oracle_ratings_per_annotator_image(path, annotator_ids=('deri', 'vode')):
    """Load an image-annotation JSONL file and split the ratings by annotator.

    Each record is expected to provide ``meta['id']``, ``_annotator_id`` and
    ``accept``. The annotator tag, animal-type label and animal-count label are
    derived with ``get_annotator``, ``get_animal_match`` and ``get_count_match``.

    Parameters
    ----------
    path : str
        Path of a line-delimited JSON file.
    annotator_ids : sequence of two str, optional
        Tags of the two annotators to compare (default ``('deri', 'vode')``).
        Any other length raises ``ValueError`` when unpacking.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        One frame per annotator with columns ``id``, ``oracle_animal`` and
        ``oracle_count``. Both frames are stable-sorted by ``id`` and
        re-indexed from 0, so that row ``i`` refers to the same item in both
        frames whenever both annotators rated the same set of items. The
        callers compare the frames position by position, which is only
        meaningful under that alignment.
    """
    df = pd.read_json(path, orient='records', lines=True)
    df['id'] = df['meta'].apply(lambda m: m['id'])
    df['annotator'] = df['_annotator_id'].apply(get_annotator)
    df['oracle_animal'] = df['accept'].apply(get_animal_match)
    df['oracle_count'] = df['accept'].apply(get_count_match)

    columns = ['id', 'oracle_animal', 'oracle_count']
    df_an1, df_an2 = (
        df.loc[df['annotator'] == annotator_id, columns]
        # mergesort is stable: repeated ids keep their file order
        .sort_values('id', kind='mergesort')
        .reset_index(drop=True)
        for annotator_id in annotator_ids
    )

    return df_an1, df_an2

def pretty_print_latex(latex_str):
    r"""Re-indent LaTeX source according to its ``\begin``/``\end`` nesting.

    Row terminators of the form ``" \\ "`` (surrounded by spaces) are first
    turned into line breaks, then every line is indented by four spaces per
    open environment:

    * a line that opens more environments than it closes is emitted at the
      current depth and deepens the following lines;
    * a line that closes more than it opens is dedented before being emitted;
    * a line whose ``\begin`` and ``\end`` counts balance leaves the depth alone.

    The depth never drops below zero, so a stray ``\end`` cannot suppress the
    indentation of later environments. Detection is a plain substring count,
    so any command starting with ``\begin`` or ``\end`` is counted.

    Parameters
    ----------
    latex_str : str
        LaTeX source, e.g. the result of ``PrettyTable.get_latex_string()``.

    Returns
    -------
    str
        The re-indented source, lines joined with ``"\n"``.
    """
    lines = latex_str.replace(r" \\ ", r" \\" + "\n").splitlines()
    indent_unit = "    "
    formatted_lines = []
    depth = 0
    for line in lines:
        net_opened = line.count(r"\begin") - line.count(r"\end")
        if net_opened < 0:
            # Closing line: step out before writing it, never below top level.
            depth = max(depth + net_opened, 0)
        formatted_lines.append(indent_unit * depth + line)
        if net_opened > 0:
            # Opening line: only the lines after it are nested deeper.
            depth += net_opened
    return "\n".join(formatted_lines)
    

In [4]:
# Inter-annotator agreement (Cohen's kappa) per text generator, plus the mean.
t = PrettyTable(['Generator', 'Kappa Sentiment', 'Kappa Title'])

sentiment_kappas = []
title_kappas = []
# The loop variable is `generator`, not `id`, so the builtin `id` is not
# rebound in the notebook namespace.
for generator, path in MODEL.items():
    df_an1, df_an2 = get_oracle_ratings_per_annotator(path)
    kappa_sentiment = cohen_kappa_score(df_an1['oracle'], df_an2['oracle'])
    kappa_title = cohen_kappa_score(df_an1['oracle_title_match'], df_an2['oracle_title_match'])

    sentiment_kappas.append(kappa_sentiment)
    title_kappas.append(kappa_title)

    t.add_row([generator, round(kappa_sentiment, 2), round(kappa_title, 2)])

# Unweighted mean over the generators.
t.add_row([
    'Avg',
    round(sum(sentiment_kappas) / len(sentiment_kappas), 2),
    round(sum(title_kappas) / len(title_kappas), 2),
])
t

Generator,Kappa Sentiment,Kappa Title
Llama-3.3-70B,0.84,-0.01
Llama-2-7B,0.72,-0.02
Mistral-7B,0.72,0.17
Avg,0.76,0.05


In [5]:
# Emit the current table `t` as indented LaTeX source; print() keeps the
# line breaks as plain text.
l = pretty_print_latex(latex_str=t.get_latex_string())
print(l)

\begin{tabular}{ccc}
    Generator & Kappa Sentiment & Kappa Title \\
    Llama-3.3-70B & 0.84 & -0.01 \\
    Llama-2-7B & 0.72 & -0.02 \\
    Mistral-7B & 0.72 & 0.17 \\
    Avg & 0.76 & 0.05 \\
\end{tabular}


In [6]:
# Inter-annotator agreement (Cohen's kappa) per image generator, plus the mean.
t = PrettyTable(['Generator', 'Kappa Animal Type', 'Kappa Animal Count'])

animal_kappas = []
count_kappas = []
# The loop variable is `generator`, not `id`, so the builtin `id` is not
# rebound in the notebook namespace.
for generator, path in IMAGE_MODEL.items():
    df_an1, df_an2 = get_oracle_ratings_per_annotator_image(path)
    kappa_animal = cohen_kappa_score(df_an1['oracle_animal'], df_an2['oracle_animal'])
    kappa_count = cohen_kappa_score(df_an1['oracle_count'], df_an2['oracle_count'])

    # cohen_kappa_score can come back as NaN (0/0, see the RuntimeWarning in
    # the output). The notebook's convention is to report that case as 1;
    # it is applied to both metrics so the 'Avg' row can never become NaN.
    kappa_animal = float(np.nan_to_num(kappa_animal, nan=1))
    kappa_count = float(np.nan_to_num(kappa_count, nan=1))

    animal_kappas.append(kappa_animal)
    count_kappas.append(kappa_count)

    t.add_row([generator, round(kappa_animal, 2), round(kappa_count, 2)])

# Average over the image generators actually iterated (IMAGE_MODEL), not over
# the text generators in MODEL.
t.add_row([
    'Avg',
    round(sum(animal_kappas) / len(animal_kappas), 2),
    round(sum(count_kappas) / len(count_kappas), 2),
])
t

  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


Generator,Kappa Animal Type,Kappa Animal Count
Stable Diffusion 3.5,1.0,0.71
Stable Cascade,0.83,0.79
FLUX.1-dev,1.0,0.98
Avg,0.94,0.82


In [7]:
# Emit the current table `t` as indented LaTeX source; print() keeps the
# line breaks as plain text.
l = pretty_print_latex(latex_str=t.get_latex_string())
print(l)

\begin{tabular}{ccc}
    Generator & Kappa Animal Type & Kappa Animal Count \\
    Stable Diffusion 3.5 & 1.0 & 0.71 \\
    Stable Cascade & 0.83 & 0.79 \\
    FLUX.1-dev & 1.0 & 0.98 \\
    Avg & 0.94 & 0.82 \\
\end{tabular}
