In [14]:
import krippendorff
from typing import List
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score, cohen_kappa_score
import numpy as np
from scipy.stats import bootstrap, t, norm
from pathlib import Path
import json
import shutil
from statistics import mode, mean, median
from sequence_level_trigger_warning_assignment.utility import major_warning, infer_label, multilabel_to_multiclass
from scipy import stats
from typing import Dict

import plotly.graph_objects as go


ImportError: cannot import name '_r' from 'sequence_level_trigger_warning_assignment.utility' (/home/mike4537/code/git-webis-de/code-research/computational-ethics/sequence-level-trigger-warning-assignment/sequence_level_trigger_warning_assignment/utility.py)

In [45]:
# Directory whose files are aggregated by _aggregate_scores().
scores_dir = Path("../../resources/classification-results/scores")
# NOTE(review): identical to scores_dir — looks like a copy-paste; confirm the intended predictions path.
predictions_dir = Path("../../resources/classification-results/scores")
# Target directory for the SVG figures written by the plotting cells.
output_dir = Path("../../resources/classification-results/figures")
# Creates only the last path component; the parent directories must already exist.
output_dir.mkdir(exist_ok=True)
# Fill colours per warning (semi-transparent so overlapping sample points stay visible).
warning_colors = {
    "misogyny": 'rgba(3, 66, 87, .4)',
    "racism": 'rgba(2, 82, 43, .4)',
    "ableism": 'rgba(91, 24, 27, .4)',
    "homophobia": 'rgba(62, 35, 93, .4)',
    "death": 'rgba(58, 145, 165, .4)',
    "violence": 'rgba(52, 152, 80, .4)',
    "abduction": 'rgba(230, 71, 39, .4)',
    "war": 'rgba(153, 81, 230, .4)',
}

# Opaque colours per model; related models share a hue (lighter / darker shade).
model_colors = {
    "gpt3-5-turbo": 'rgba(187, 134, 243, 1)',
    "gpt4-turbo": 'rgba(109, 52, 167, 1)',
    "llama-7b": 'rgba(89, 186, 97, 1)',
    "llama-13b": 'rgba(18, 117, 62, 1)',
    "mistral-7b": 'rgba(255, 111, 82, 1)',
    # NOTE(review): the green channel used to read 552, which is outside 0..255.
    # 52 is assumed (dark shade of the mistral hue, like the other model pairs) — confirm.
    "mixtral-8x7b": 'rgba(158, 52, 35, 1)',
    "multilabel": 'rgba(211, 248, 247, 1)',
    "multilabel-extended": 'rgba(144, 215, 222, 1)',
    # NOTE(review): same colour as "multilabel-extended" — confirm this is intended.
    "multilabel-negatives": 'rgba(144, 215, 222, 1)',
    "binary": 'rgba(104, 180, 194, 1)',
    "binary-extended": 'rgba(58, 145, 165, 1)',
    "multiclass": 'rgba(21, 106, 130, 1)',
}

# Left-to-right order of warnings / models in figures and tables.
warning_plot_order = ["violence", "death", "war", "abduction", "racism", "homophobia", "misogyny", "ableism"]
model_plot_order = ["binary", "binary-extended", "multiclass", "multilabel", "multilabel-extended", "gpt3-5-turbo", "gpt4-turbo", "mistral-7b", "mixtral-8x7b", "llama-7b", "llama-13b"]

def _r(value, decimals=2, pct=False):
    v = round(value, decimals)
    if pct:
        v = v * 100
    v = str(v)
    if len(v) == 3:
        v = v + "0"
    return v


In [43]:
def _aggregate_scores(scores_dir: Path):
    """aggregate the scores 

    :param scores_dir: {
        "model": {"distribution": {"aggregation": {"dsi:" {"warning": {"measure": [values over samples]}}}}}}
    }
    """
    results = {}
    for scores_file in scores_dir.glob("*"):
        scores = json.loads(open(scores_file).read())
        model = scores_file.stem[:-3].removeprefix("acl24-")\
            .removeprefix("fanbert-").removesuffix("-20ep")\
            .removesuffix("-1e-5lr").removesuffix("-2e-5lr").removesuffix("-5e-5lr")
        aggregation = "minority" if model.endswith("minority") else "majority"
        model = model.removesuffix("-minority").removesuffix("-majority")
        distribution = "id" if model.endswith("id") else "ood"
        model = model.removesuffix("-id").removesuffix("-ood")
        for warning, score_dict in scores.items():
            for measure, value in score_dict.items():
                results.setdefault(model, {}).setdefault(distribution, {}).setdefault(aggregation, {}).setdefault(warning, {}).setdefault(measure, []).append(value)

    return results

# Aggregate all score files once; `scores` is the nested dict the later cells read from.
scores = _aggregate_scores(scores_dir)
# Bare expression: displays the whole nested dict as the cell output (large).
scores


{'gpt4-turbo': {'ood': {'majority': {'total': {'accuracy': [0.63,
      0.65,
      0.65,
      0.67,
      0.63],
     'f1': [0.63, 0.65, 0.65, 0.67, 0.63],
     'precision': [0.63, 0.65, 0.65, 0.67, 0.63],
     'recall': [0.63, 0.65, 0.65, 0.67, 0.63],
     'f1_cls1': [0.7093596059113301,
      0.7245657568238214,
      0.7286063569682152,
      0.729113924050633,
      0.7135922330097088],
     'f1_cls0': [0.49572649572649574,
      0.5316455696202531,
      0.5194805194805195,
      0.563265306122449,
      0.48245614035087714],
     'pr': [0.76875, 0.759375, 0.778125, 0.734375, 0.7875]},
    'misogyny': {'accuracy': [0.57, 0.75, 0.72, 0.75, 0.57],
     'f1': [0.57, 0.75, 0.73, 0.75, 0.57],
     'precision': [0.57, 0.75, 0.72, 0.75, 0.57],
     'recall': [0.57, 0.75, 0.72, 0.75, 0.57],
     'f1_cls1': [0.6530612244897959,
      0.75,
      0.7441860465116279,
      0.761904761904762,
      0.6382978723404256],
     'f1_cls0': [0.45161290322580644,
      0.75,
      0.70270270270270

In [None]:
# Plots (individual plots for each distribution and aggregation): 
# """{"model": {"distribution": {"aggregation": {"warning": {"measure": [values over samples]}}}}}"""
def _scatter_individual_values(model, model_scores, aggregation, distribution, measure='accuracy'):
    """Plot all sampled values of one model per warning, with the per-warning mean on top.

    The figure is written to ``output_dir`` as "<model>-<aggregation>-<distribution>.svg"
    and then shown. If ``model_scores`` has no entry for the requested distribution or
    aggregation, a message is printed once and no figure is produced.

    :param model: model name; used for the title and the output file name
    :param model_scores: {"distribution": {"aggregation": {"warning": {"measure": [values over samples]}}}}
    :param aggregation: aggregation key to plot
    :param distribution: distribution key to plot
    :param measure: key of the measure whose values are plotted
    """
    # Check availability once, up front, instead of once per warning.
    if distribution not in model_scores:
        print(f"{distribution} is not in {model}")
        return
    if aggregation not in model_scores[distribution]:
        print(f"{aggregation} is not in {model} ({distribution})")
        return

    per_warning = model_scores[distribution][aggregation]
    fig = go.Figure()
    for warning in warning_plot_order:
        if warning == 'total':
            continue

        y = per_warning[warning][measure]
        color = warning_colors[warning]
        # individual sample values
        fig.add_trace(go.Scatter(x=[warning]*len(y), y=y, mode='markers', name=warning,
                                 marker_color=color))
        # mean: once with the default trace mode, once as a horizontal tick mark
        _mean = np.mean(y)
        fig.add_trace(go.Scatter(x=[warning], y=[_mean],
                                 name=f"{warning}",
                                 marker_color=color))
        fig.add_trace(go.Scatter(x=[warning], y=[_mean], mode='markers', name=warning,
                                 marker_symbol="line-ew",
                                 marker_line_color=color, marker_color=color,
                                 marker_line_width=2))

    axis_style = {'ticks': 'outside', 'showline': True, "linecolor": 'black'}
    fig.update_layout(title_text=f"{model}-{aggregation}-{distribution}",
                      plot_bgcolor='white',
                      yaxis_range=[0.3, 1],
                      showlegend=False,
                      xaxis=dict(axis_style),
                      yaxis=dict(axis_style, gridcolor='lightgrey'))
    fig.write_image(output_dir / f"{model}-{aggregation}-{distribution}.svg")
    fig.show()

# One set of per-model figures for every (distribution, aggregation) pair.
combinations = [
    (dist, agg)
    for dist in ("id", "ood")
    for agg in ("minority", "majority")
]

for distribution, aggregation in combinations:
    for model in scores:
        model_scores = scores[model]
        _scatter_individual_values(model, model_scores, aggregation, distribution, measure='accuracy')

In [47]:
# Plots (plots with errors, aggregated by model (all values), keep aggregation separate):
# scores layout: {"model": {"distribution": {"aggregation": {"warning": {"measure": [values over samples]}}}}}

fig = go.Figure()

combinations = [("id","minority"),("id","majority"),("ood","minority"),("ood","majority")]
# Despite the name, this holds the pooled raw accuracy values (all warnings, all samples):
# {"distribution": {"aggregation": {"model": [values]}}}
means = {}

for distribution, aggregation in combinations:
    # aggregate values for all models across warning
    for model, model_scores in scores.items():
        # Checked once per model instead of once per warning.
        if distribution not in model_scores:
            print(f"{distribution} is not in {model}")
            continue
        for warning in warning_plot_order:
            if warning == 'total':
                continue
            y = model_scores[distribution][aggregation][warning]['accuracy']
            means.setdefault(distribution, {})\
                .setdefault(aggregation, {})\
                .setdefault(model, []).extend(y)


for distribution, aggregation in combinations:
    # .get(): a combination that no model provides is skipped instead of raising KeyError.
    for model, y in means.get(distribution, {}).get(aggregation, {}).items():
        name = f"{model[:4]}-{model[-3:]}-{aggregation[:2]}-{distribution[-2:]}"
        _mean = np.mean(y)
        # 95% t-interval around the mean. The scale is the standard error of the mean,
        # which uses the sample standard deviation (ddof=1); np.std defaults to ddof=0.
        conf_int = stats.t.interval(0.95,
                                    len(y)-1,
                                    loc=_mean,
                                    scale=stats.sem(y))
        fig.add_trace(go.Bar(x=[name], y=[_mean],
                             name=f"{name}",
                             marker_color=model_colors[model],
                             error_y={"type": 'data',
                                      "symmetric": False,
                                      "arrayminus": [_mean-conf_int[0]],
                                      "array": [conf_int[1]-_mean],
                                      "thickness": 1.5,
                                      "width": 3
                                      }
                             ))

fig.update_layout(title_text="all",
                  plot_bgcolor='white',
                  yaxis_range=[0.0, 1],
                  showlegend=False,
                  xaxis={'ticks': 'outside',
                         'showline': True,
                         "linecolor": 'black'},
                  yaxis={'ticks': 'outside',
                         'showline': True,
                         "linecolor": 'black',
                         "gridcolor": 'lightgrey'})
fig.write_image(output_dir / "aggregated.svg")
fig.show()
    

In [42]:
# Plots (boxes, aggregated by model (all values), keep aggregation separate):
# scores layout: {"model": {"distribution": {"aggregation": {"warning": {"measure": [values over samples]}}}}}

fig = go.Figure()

combinations = [("id","minority"),("id","majority"),("ood","minority"),("ood","majority")]
# Despite the name, this holds the pooled raw accuracy values (all warnings, all samples):
# {"distribution": {"aggregation": {"model": [values]}}}
means = {}

for distribution, aggregation in combinations:
    # aggregate values for all models across warning
    for model, model_scores in scores.items():
        # Checked once per model instead of once per warning.
        if distribution not in model_scores:
            print(f"{distribution} is not in {model}")
            continue
        for warning in warning_plot_order:
            if warning == 'total':
                continue
            y = model_scores[distribution][aggregation][warning]['accuracy']
            means.setdefault(distribution, {})\
                .setdefault(aggregation, {})\
                .setdefault(model, []).extend(y)


for distribution, aggregation in combinations:
    # .get(): a combination that no model provides is skipped instead of raising KeyError.
    for model, y in means.get(distribution, {}).get(aggregation, {}).items():
        name = f"{model[:4]}-{model[-3:]}-{aggregation[:2]}-{distribution[-2:]}"
        fig.add_trace(go.Box(x=[name]*len(y),
                             y=y,
                             name=f"{name}",
                             line_width=1,
                             # An alpha component belongs in rgba(), not rgb().
                             marker_color='rgba(104, 180, 194, 1)'))

fig.update_layout(title_text="all",
                  plot_bgcolor='white',
                  yaxis_range=[0.0, 1],
                  showlegend=False,
                  xaxis={'ticks': 'outside',
                         'showline': True,
                         "linecolor": 'black'},
                  yaxis={'ticks': 'outside',
                         'showline': True,
                         "linecolor": 'black',
                         "gridcolor": 'lightgrey'})
fig.write_image(output_dir / "boxes.svg")
fig.show()
    

In [None]:
# Plots (plots for warnings, mean of all models (+CI) + mean of best and worst model)
# scores layout: {"model": {"distribution": {"aggregation": {"warning": {"measure": [values over samples]}}}}}
measure = 'accuracy'
combinations = [("id","minority"), ("id","majority"), ("ood","minority"),("ood","majority")]
# All three dicts are keyed by "<distribution>-<aggregation>-<warning>".
y_by_warning = {}               # values of all models pooled
top_model_mean_by_warning = {}  # highest per-model mean
bot_model_mean_by_warning = {}  # lowest per-model mean

for distribution, aggregation in combinations:
    # Resolve once per combination which models provide this distribution,
    # so the message is printed once per model rather than once per warning.
    available = []
    for model, model_scores in scores.items():
        if distribution not in model_scores:
            print(f"{distribution} is not in {model}")
            continue
        available.append(model_scores[distribution][aggregation])

    for warning in warning_plot_order:
        name = f"{distribution}-{aggregation}-{warning}"
        for per_warning in available:
            y = per_warning[warning][measure]
            y_by_warning.setdefault(name, []).extend(y)
            model_mean = np.mean(y)
            top_model_mean_by_warning[name] = max(model_mean, top_model_mean_by_warning.get(name, 0))
            bot_model_mean_by_warning[name] = min(model_mean, bot_model_mean_by_warning.get(name, 1))


for distribution, aggregation in combinations:
    fig = go.Figure()
    for warning in warning_plot_order:
        name = f"{distribution}-{aggregation}-{warning}"
        if name not in y_by_warning:
            # no model provided this combination
            continue
        y = y_by_warning[name]
        _mean = np.mean(y)
        # 95% t-interval around the mean. The scale is the standard error of the mean,
        # which uses the sample standard deviation (ddof=1); np.std defaults to ddof=0.
        conf_int = stats.t.interval(0.95, len(y)-1, loc=_mean,
                                    scale=stats.sem(y))
        fig.add_trace(go.Bar(x=[warning], y=[_mean], name=warning,
                             marker_color=warning_colors[warning],
                             error_y={"type": 'data',
                                      "symmetric": False,
                                      "arrayminus": [_mean-conf_int[0]],
                                      "array": [conf_int[1]-_mean],
                                      "thickness": 1.5,
                                      "width": 3
                                      }
                             ))

        # Horizontal tick marks at the best and the worst per-model mean.
        for extreme in (top_model_mean_by_warning[name], bot_model_mean_by_warning[name]):
            fig.add_trace(go.Scatter(x=[warning], y=[extreme],
                                     name=f"{warning}",
                                     mode='markers',
                                     marker_symbol="line-ew",
                                     marker_line_color='black',
                                     marker_color='black',
                                     marker_line_width=2
                                     ))

    # The figure spans all models, so the title names the combination (matching the file
    # name) rather than whichever model the aggregation loop happened to visit last.
    fig.update_layout(title_text=f"{distribution}-{aggregation}-warnings",
                      plot_bgcolor='white',
                      yaxis_range=[0.0, 1],
                      showlegend=False,
                      xaxis={'ticks': 'outside',
                             'showline': True,
                             "linecolor": 'black'},
                      yaxis={'ticks': 'outside',
                             'showline': True,
                             "linecolor": 'black',
                             "gridcolor": 'lightgrey'})
    fig.write_image(output_dir / f"{distribution}-{aggregation}-warnings.svg")
    fig.show()

In [None]:
# Plots (plots with errors, aggregated by distribution (all values)): 

In [48]:
# Table - best model by warning
# NOTE(review): `measure` is not read anywhere in this cell; the lookups below use literal keys.
measure = 'accuracy'
# (distribution, aggregation) pairs; one table is printed per pair.
combinations = [("id","minority"), ("id","majority"), ("ood","minority"),("ood","majority")]

def _model_acc(scores):
    """Format the mean 'accuracy' and mean 'pr' of one score dict as two table cells.

    :param scores: {"measure": [values over samples]}; must contain 'accuracy' and 'pr'
    :return: "<mean accuracy> & <mean pr>", each number rendered by ``_r``
    """
    # Single expression: the previous version ended in a dangling line continuation.
    return f"{_r(np.mean(scores['accuracy']))} & {_r(np.mean(scores['pr']))}"

def _model_f1(scores):
    """Format the mean 'f1_cls1' and mean 'f1_cls0' of one score dict as two table cells.

    :param scores: {"measure": [values over samples]}; must contain 'f1_cls1' and 'f1_cls0'
    :return: "<mean f1_cls1> & <mean f1_cls0> " (with a trailing space)
    """
    cls1 = _r(np.mean(scores['f1_cls1']))
    cls0 = _r(np.mean(scores['f1_cls0']))
    return cls1 + " & " + cls0 + " "

print(scores.keys())
for distribution, aggregation in combinations:
    print(distribution, aggregation)
    # One LaTeX row per model: an "accuracy & pr" pair per warning, then the 'total' pair.
    for model in model_plot_order:
        per_warning = scores[model][distribution][aggregation]
        cells = [_model_acc(per_warning[warning]) for warning in warning_plot_order]
        cells.append(_model_acc(per_warning['total']))
        m = "\t& ".join(cells)
        print(f"{model.title()}\t& {m} \\\\")

    # Final row: per warning, the mean over models of each model's mean accuracy / pr.
    w = ""
    for warning in warning_plot_order:
        try:
            mean_acc = np.mean([np.mean(scores[model][distribution][aggregation][warning]['accuracy'])
                                for model in model_plot_order])
            mean_pr = np.mean([np.mean(scores[model][distribution][aggregation][warning]['pr'])
                               for model in model_plot_order])
            w += f"{_r(mean_acc)} & {_r(mean_pr)}\t& "
        except Exception as err:
            # Best effort: report the failure and the raw values, then leave this column out.
            # `Exception` rather than a bare except, so Ctrl-C still interrupts the cell.
            print(f"could not average {warning} ({distribution}, {aggregation}): {err!r}")
            print([scores[model][distribution][aggregation][warning]['accuracy'] for model in model_plot_order])
            print([scores[model][distribution][aggregation][warning]['pr'] for model in model_plot_order])
    print(f"Mean\t& {w} \\\\")


# Mean 'pr' of the prompted LLMs vs. the fine-tuned classifiers, per combination.
# NOTE(review): the group membership is taken from the two model lists of the original
# cell; there the `ft` / `llm` variable names were attached to the opposite groups and
# each loop kept only the value of its last model. Confirm the grouping.
llm_models = ["gpt3-5-turbo", "gpt4-turbo", "mistral-7b", "mixtral-8x7b", "llama-7b", "llama-13b"]
ft_models = ["binary", "binary-extended", "multiclass", "multilabel", "multilabel-extended"]

for distribution, aggregation in combinations:
    # Every model contributes one mean per warning, so the mean over all
    # (model, warning) pairs equals the mean over models of their per-warning means.
    mean_pr_llm = np.mean([np.mean(scores[model][distribution][aggregation][warning]['pr'])
                           for model in llm_models
                           for warning in warning_plot_order])
    mean_pr_ft = np.mean([np.mean(scores[model][distribution][aggregation][warning]['pr'])
                          for model in ft_models
                          for warning in warning_plot_order])
    print(f"{distribution} {aggregation} {mean_pr_ft} {mean_pr_llm} {np.abs(mean_pr_llm-mean_pr_ft)}")

dict_keys(['gpt4-turbo', 'mistral-7b', 'gpt3-5-turbo', 'binary', 'multilabel', 'llama-7b', 'binary-extended', 'llama-13b', 'mixtral-8x7b', 'multilabel-extended', 'multiclass', 'multilabel-negatives'])
id minority
Binary	& 0.80 & 0.44	& 0.81 & 0.45	& 0.70 & 0.42	& 0.76 & 0.53	& 0.74 & 0.57	& 0.64 & 0.74	& 0.63 & 0.46	& 0.59 & 0.83	& 0.71 & 0.56 \\
Binary-Extended	& 0.77 & 0.43	& 0.74 & 0.48	& 0.71 & 0.43	& 0.67 & 0.60	& 0.68 & 0.38	& 0.72 & 0.58	& 0.65 & 0.38	& 0.71 & 0.55	& 0.71 & 0.48 \\
Multiclass	& 0.78 & 0.28	& 0.78 & 0.29	& 0.77 & 0.27	& 0.86 & 0.36	& 0.81 & 0.32	& 0.91 & 0.41	& 0.79 & 0.28	& 0.90 & 0.40	& 0.82 & 0.32 \\
Multilabel	& 0.58 & 0.73	& 0.48 & 0.90	& 0.56 & 0.82	& 0.68 & 0.76	& 0.51 & 0.77	& 0.55 & 0.93	& 0.51 & 0.86	& 0.68 & 0.75	& 0.57 & 0.82 \\
Multilabel-Negatives	& 0.58 & 0.72	& 0.47 & 0.89	& 0.55 & 0.80	& 0.63 & 0.81	& 0.51 & 0.82	& 0.54 & 0.94	& 0.53 & 0.88	& 0.63 & 0.74	& 0.56 & 0.83 \\
Multilabel-Extended	& 0.61 & 0.62	& 0.48 & 0.86	& 0.54 & 0.78	& 0.67 & 0.73	