# Imports and Installs

In [1]:
from google.colab import drive
drive.mount('/content/drive')
!pip install spacy==3.1.4
!python -m spacy download de_core_news_md
!python -m spacy download en_core_web_sm
!pip install datasets transformers seqeval
!pip install -q iterative-stratification
!pip install pandas-ods-reader
!pip install weasyprint==52.5 
!pip install pdf2image
!apt-get install poppler-utils
!pip install pyyaml==5.4.1 # plotly bug
!pip install kaleido

%load_ext autoreload
%autoreload 2

Mounted at /content/drive
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==3.1.4
  Downloading spacy-3.1.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[K     |████████████████████████████████| 5.9 MB 7.4 MB/s 
Collecting typing-extensions<4.0.0.0,>=3.7.4
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Collecting pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 27.4 MB/s 
Collecting thinc<8.1.0,>=8.0.12
  Downloading thinc-8.0.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (660 kB)
[K     |████████████████████████████████| 660 kB 45.0 MB/s 
Installing collected packages: typing-extensions, pydantic, thinc, spacy
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 4.1.1
    Uninstalling typing-extensions-4.1.1:
 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-md==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_md-3.1.0/de_core_news_md-3.1.0-py3-none-any.whl (47.8 MB)
[K     |████████████████████████████████| 47.8 MB 1.3 MB/s 
Installing collected packages: de-core-news-md
Successfully installed de-core-news-md-3.1.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_md')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[K     |████████████████████████████████| 13.6 MB 6.9 MB/s 
Installing collected packages: en-core-web-sm
  Attempting uninstall: en-core-web-sm
    Found existing installation: 

In [2]:
import spacy
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from spacy import displacy
import re
import pickle
import torch
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoTokenizer, AutoModel, get_scheduler, BertTokenizer, BertForMaskedLM
from torch.optim import AdamW
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from torch.utils.data import Dataset as TDataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import networkx as nx
import seaborn as sns                      
import matplotlib.pyplot as plt
from seqeval.metrics import classification_report
from sklearn.metrics import confusion_matrix, classification_report as classification_report_sk
from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from pandas_ods_reader import read_ods
from IPython.core.display import display as display_html, HTML as jupyter_HTML
from weasyprint import HTML, CSS
from pdf2image import convert_from_path
import datetime
import functools
import os
import json
import itertools
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import plotly.express as px
pd.plotting.backend = "plotly"

pd.set_option('display.max_columns', None)

  defaults = yaml.load(f)


In [3]:
PROJECT_PATH = "/content/drive/My Drive/master thesis/"
FONDSFORSTE_DATSET_PATH = "/content/drive/My Drive/master thesis/data/fondsforste/"
FORSTVERMESSUNG_DATSET_PATH = "/content/drive/My Drive/master thesis/data/forstvermessung/"
REHBEIN_DATA_PATH = "/content/drive/My Drive/master thesis/data/causal_language_DE_release1.0/"
DUNIETZ_DATA_PATH = "/content/drive/My Drive/master thesis/data/BECAUSE/"
CURRENT_TIME = datetime.datetime.now().strftime('%Y_%m_%d_%H%M')

import sys
sys.path.append(PROJECT_PATH + "code/historic/")

from helper_functions import *
from model import *
from metrics import *
from visualization import *
from training_utils import *

In [4]:
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [5]:
num_relations = pd.DataFrame({
    "German (SF & FV)": {
        "Cause": 488,
        "Effect": 498,
        "Actor": 52,
        "Aff.": 80,
        "Sup.": 12,
        "Contr.": 53,
    },
    "English (BECAUSE)": {
        "Cause": 10,
        "Effect": 10,
        "Actor": "-",
        "Aff.": "-",
        "Sup.": "-",
        "Contr.": "-",
    }
})

print(num_relations.to_latex())

\begin{tabular}{lrl}
\toprule
{} &  German (SF \& FV) & English (BECAUSE) \\
\midrule
Cause  &               488 &                10 \\
Effect &               498 &                10 \\
Actor  &                52 &                 - \\
Aff.   &                80 &                 - \\
Sup.   &                12 &                 - \\
Contr. &                53 &                 - \\
\bottomrule
\end{tabular}



# Params and Config

In [6]:
tqdm.pandas()

causal_arguments = ["Cause", "Effect", "Affected", "Actor", "Controlling", "Support", "Trigger"]

entity_options = {
    'colors': {
        'Cause': '#99FCE0',
        'Effect': '#6779CB',
        'Affected': '#84F72D',
        'Actor': '#108482',
        'Controlling': '#E3AF32',
        'Support': '#C44C6D',
        'Trigger': '#C5E95E',
    },
    'ents': causal_arguments
}

config = {
    "seed": 42,
    "test_size": 0.2,
    "val_size": 0.1,
    "models_directory": PROJECT_PATH + f"models/{CURRENT_TIME}/",
    "rehbein_model_file_path": PROJECT_PATH + "models/pretrained/CBERT_rehbein",
    "dunietz_model_file_path": PROJECT_PATH + "models/pretrained/CBERT_dunietz",
    "rehbein_dunietz_model_file_path": PROJECT_PATH + "models/pretrained/CBERT_rehbein_dunietz",
    "fondsforste_rehbein_dunietz_model_file_path": PROJECT_PATH + "models/pretrained/CBERT_fondsforste_rehbein_dunietz",
    "forstvermessung_rehbein_dunietz_model_file_path": PROJECT_PATH + "models/pretrained/CBERT_forstvermessung_rehbein_dunietz",
    "fondsforste_forstvermessung_rehbein_dunietz_model_file_path": PROJECT_PATH + "models/pretrained/CBERT_fondsforste_forstvermessung_rehbein_dunietz",
    "retrain_pretrained": False,
    "model_checkpoints": [
                        #   "dbmdz/bert-base-german-uncased", 
                          "dbmdz/bert-base-german-cased",
                          "dbmdz/bert-base-german-europeana-cased",
                          "bert-base-multilingual-cased",
    ],
    "batch_size": 2,
    "max_length": 500,
    "num_epochs": 75,
    "learning_rate": 2.5e-5,
    "bert_embedding_size": 768,
    "bert_dropout": 0.1,
    "early_stop_patience": 5,
    "causal_arguments": causal_arguments,
    "nlp_ger": spacy.load('de_core_news_md'),
    "nlp_eng": spacy.load("en_core_web_sm"),
    "entity_options": entity_options,
    "val_epochs": 5,
    "add_coreference": False,
    "debug": False,
    "time": CURRENT_TIME,
    "use_normalized": False,
}
    
if config["debug"]:
    config["num_epochs"] = 1
    config["model_checkpoints"] = config["model_checkpoints"][:1]
    config["models_directory"] = config["models_directory"][:-1] + "_debug/"
    config["rehbein_model_file_path"] = config["rehbein_model_file_path"] + "_debug"
    config["dunietz_model_file_path"] = config["dunietz_model_file_path"] + "_debug"
    config["rehbein_dunietz_model_file_path"] = config["rehbein_dunietz_model_file_path"] + "_debug"
    config["fondsforste_rehbein_dunietz_model_file_path"] = config["fondsforste_rehbein_dunietz_model_file_path"] + "_debug"
    config["forstvermessung_rehbein_dunietz_model_file_path"] = config["forstvermessung_rehbein_dunietz_model_file_path"] + "_debug"
    config["fondsforste_forstvermessung_rehbein_dunietz_model_file_path"] = config["fondsforste_forstvermessung_rehbein_dunietz_model_file_path"] + "_debug"

try:
    os.mkdir(config["models_directory"])
except:
    print(f"Model directory {config['models_directory']} already created")

config["label_dict"] = {
    'B-Trigger': 0,
    'O': 1, # "Trigger" and "O" 012 so that Trigger-Detection task is automatically aligned
    'B-Actor': 2,
    'I-Actor': 3,
    'B-Affected': 4,
    'I-Affected': 5,
    'B-Cause': 6,
    'I-Cause': 7,
    'B-Controlling': 8,
    'I-Controlling': 9,
    'B-Effect': 10,
    'I-Effect': 11,
    'B-Support': 12,
    'I-Support': 13,
}

config["type_dict"] = {
    "Purpose": 0,
    "Motivation": 1,
    "Consequence": 2,
    "None": 3,
}

config["degree_dict"] = {
    "Facilitate": 0,
    "Inhibit": 1,
    "None": 2,
}

config["label_list"] = list(config["label_dict"].keys())
config["type_list"] = list(config["type_dict"].keys())
config["degree_list"] = list(config["degree_dict"].keys())

metric = load_metric("seqeval")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# https://stackoverflow.com/questions/26646362/numpy-array-is-not-json-serializable
class NumpyEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

print(config)

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

{'seed': 42, 'test_size': 0.2, 'val_size': 0.1, 'models_directory': '/content/drive/My Drive/master thesis/models/2022_08_15_1902/', 'rehbein_model_file_path': '/content/drive/My Drive/master thesis/models/pretrained/CBERT_rehbein', 'dunietz_model_file_path': '/content/drive/My Drive/master thesis/models/pretrained/CBERT_dunietz', 'rehbein_dunietz_model_file_path': '/content/drive/My Drive/master thesis/models/pretrained/CBERT_rehbein_dunietz', 'fondsforste_rehbein_dunietz_model_file_path': '/content/drive/My Drive/master thesis/models/pretrained/CBERT_fondsforste_rehbein_dunietz', 'forstvermessung_rehbein_dunietz_model_file_path': '/content/drive/My Drive/master thesis/models/pretrained/CBERT_forstvermessung_rehbein_dunietz', 'fondsforste_forstvermessung_rehbein_dunietz_model_file_path': '/content/drive/My Drive/master thesis/models/pretrained/CBERT_fondsforste_forstvermessung_rehbein_dunietz', 'retrain_pretrained': False, 'model_checkpoints': ['dbmdz/bert-base-german-cased', 'dbmdz/b

# Visualize Results

In [7]:
baseline_corpus_name = "evaluation_data"
baseline_model_name = "baseline"
baseline_time = "2022_07_22_1950"

baseline_results_path = PROJECT_PATH + f"output/{baseline_time}_baseline/"  + f"{baseline_corpus_name}_predictions_{baseline_time}_{baseline_model_name}.json"

with open(baseline_results_path, "r") as f:
    baseline_results = json.load(f)

baseline_results.keys()

dict_keys(['dbmdz/bert-base-german-cased'])

In [8]:
transfer_learning_names_dict = {
    "no_transfer": "\multirow{3}{*}{\makecell{---}}",
    "rehbein": "RULE" "\multirow{3}{*}{\makecell{\textbf{R&R}}}",
    "dunietz": "RULE" "\multirow{3}{*}{\makecell{\textbf{BEC}}}",
    "rehbein_dunietz": "RULE" "\multirow{3}{*}{\textbf{\makecell{R&R, \\\\ BEC}}}",
}

transfer_learning_names_dict_base = {
    "no_transfer": "\u2014",
    "rehbein": "R&R",
    "dunietz": "BEC",
    "rehbein_dunietz": "R&R, BEC",
}


model_names_dict = {
    "dbmdz/bert-base-german-cased": "\textbf{Ger.}",
    "dbmdz/bert-base-german-europeana-cased": "\textbf{Europ.}",
    "bert-base-multilingual-cased": "\textbf{Multi.}",
}


metrics_dict = {
    "MCC": "MCC",
    "accuracy": "Accuracy",
    "consequence_f1": "\makecell{Consequence\\\\ F1}",
    "motivation_f1": "\makecell{Motivation\\\\ F1}",
    "purpose_f1": "\makecell{Purpose\\\\ F1}",
    "facilitate_f1": "Facilitate F1",
    "inhibit_f1": "Inhibit F1",
    "precision": "Precision",
    "recall": "Recall",
    "f1": "F1",
    "strict": "Strict",
    "relaxed": "Relaxed",
}

In [9]:
# proposed: [no finetuning, rehbein, dunietz, rehbein_dunietz]
corpus_name = "evaluation_data"

# normal
time = "2022_07_19_2043" 
name = "normal_valid"

# normalized
norm_time = "2022_07_23_2143"
norm_name = "normalization"

# coref
coref_time = "2022_07_23_2143"
coref_name = "coref"

In [10]:
def bold_largest(s):
    mean_part = s.str.split().str[0].astype(float)
    std_part = s.str.split().str[1:].str.join(" ")

    bolded_res = ["\textbf{" + str(mean) + " " + stddev + "}" if mean == mean_part.max() and mean > 0 else str(mean) + " " + stddev for mean, stddev in zip(mean_part, std_part)]
    return pd.Series(bolded_res, name=s.name, index=s.index)


## Trigger Results

In [14]:
combined_results = []

# baseline
baseline_metric_results = {}
metric_results_model = {}
for match_metric in ["precision", "recall", "f1"]:
    match_metric_mean = np.mean([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["detect_trigger_results"][f"overall_{match_metric}_strict"] for fold in range(5)]) * 100
    match_metric_std = np.std([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["detect_trigger_results"][f"overall_{match_metric}_strict"] for fold in range(5)]) * 100
    metric_results_model["\textbf{" +metrics_dict[match_metric] + "}"] = f"{match_metric_mean:.1f} ({match_metric_std:.0f})"

baseline_metric_results["Baseline"] = metric_results_model

# experiment
transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
for tl_name in transfer_learning_names:

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    metric_results = {}
    for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

        metric_results_model = {}
        for match_metric in ["precision", "recall", "f1"]:
            match_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["detect_trigger_results"][f"overall_{match_metric}_strict"] for fold in range(5)]) * 100
            match_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["detect_trigger_results"][f"overall_{match_metric}_strict"] for fold in range(5)]) * 100
            metric_results_model["\textbf{" +metrics_dict[match_metric] + "}"] = f"{match_metric_mean:.1f} ({match_metric_std:.0f})"

        metric_results[model_names_dict[model_kind]] = metric_results_model

    combined_results.append(pd.DataFrame(metric_results).T)

combined_results.append(pd.DataFrame(baseline_metric_results).T)
combined_results = pd.concat(combined_results)

index_df = pd.DataFrame({
    " ": flatten([[transfer_learning_names_dict[tl_name]] * 3 for tl_name in transfer_learning_names]) + ["Baseline"],
    "  ": combined_results.index,
})

combined_results.index = pd.MultiIndex.from_frame(index_df)
# sort in order
combined_results = combined_results[["\textbf{" + c_arg + "}" for c_arg in ["Precision", "Recall", "F1"]]]

combined_results

metric_kind_results = combined_results
metric_kind_results = metric_kind_results.T

index_df = pd.DataFrame({
    "  ": ["\textbf{Trigger}"] * 3,
    " ": metric_kind_results.index,
})

metric_kind_results.index = pd.MultiIndex.from_frame(index_df)
metric_kind_results = metric_kind_results.T

# make largest result bold
metric_kind_results = metric_kind_results.apply(bold_largest, axis=0)
display(metric_kind_results)

# convert to latex and change some things
latex_df = metric_kind_results.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
latex_df = latex_df.replace("RULE", "\hline\n")
latex_df = latex_df.replace("llllllll", "clcccccc")
latex_df = latex_df.replace(".0", "")
latex_df = latex_df.replace("\multicolumn{3}{l}", "\multicolumn{3}{c}")
latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\hline\n\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{Precision\}", r"\\textbf{\\makecell{Transfer \\\\ Learning}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{Precision}", latex_df, flags=re.MULTILINE)
print(latex_df)


Unnamed: 0_level_0,Unnamed: 1_level_0,\textbf{Trigger},\textbf{Trigger},\textbf{Trigger}
Unnamed: 0_level_1,Unnamed: 1_level_1,\textbf{Precision},\textbf{Recall},\textbf{F1}
,,,,
\multirow{3}{*}{\makecell{---}},\textbf{Ger.},\textbf{50.1 (3)},56.7 (2),\textbf{53.1 (2)}
\multirow{3}{*}{\makecell{---}},\textbf{Europ.},49.3 (2),53.2 (2),51.1 (2)
\multirow{3}{*}{\makecell{---}},\textbf{Multi.},45.1 (4),51.9 (2),48.2 (2)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Ger.},48.5 (4),54.7 (1),51.4 (3)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Europ.},49.0 (3),53.1 (4),50.8 (2)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Multi.},45.5 (4),50.9 (2),48.0 (3)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Ger.},42.6 (2),56.9 (2),48.6 (1)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Europ.},44.5 (1),\textbf{57.3 (2)},50.0 (1)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Multi.},41.7 (3),52.2 (4),46.3 (3)


\begin{tabular}{lllll}
\toprule
         &    & \multicolumn{3}{c}{\textbf{Trigger}} \\
\textbf{\makecell{Transfer \\ Learning}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Precision} &    \textbf{Recall} &        \textbf{F1} \\

\midrule
\multirow{3}{*}{\makecell{---}} & \textbf{Ger.} &  \textbf{50.1 (3)} &           56.7 (2) &  \textbf{53.1 (2)} \\
         & \textbf{Europ.} &           49.3 (2) &           53.2 (2) &           51.1 (2) \\
         & \textbf{Multi.} &           45.1 (4) &           51.9 (2) &           48.2 (2) \\
\hline
\multirow{3}{*}{\makecell{\textbf{BEC}}} & \textbf{Ger.} &           48.5 (4) &           54.7 (1) &           51.4 (3) \\
         & \textbf{Europ.} &           49 (3) &           53.1 (4) &           50.8 (2) \\
         & \textbf{Multi.} &           45.5 (4) &           50.9 (2) &           48 (3) \\
\hline
\multirow{3}{*}{\makecell{\textbf{R\&R}}} & \textbf{Ger.} &           42.6 (2) &           56.9 (2) &           48.6 (1) \\
         & \text

## Argument Results

In [11]:
trigger_res = []
for metric_kind in ["strict", "relaxed"]:
    combined_results = []

    # baseline
    baseline_metric_results = {}
    metric_results_model = {}
    trigger_mean = np.mean([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
    trigger_std = np.std([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
    metric_results_model["\textbf{" + "Trigger" + "}"] = f"{trigger_mean:.1f} ({trigger_std:.0f})"

    for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]:
        c_arg_mean = np.mean([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
        c_arg_std = np.std([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
        
        if c_arg == "Controlling": c_arg = "Contr."
        if c_arg == "Affected": c_arg = "Aff."
        if c_arg == "Support": c_arg = "Sup."
        metric_results_model["\textbf{" + c_arg + "}"] = f"{c_arg_mean:.1f} ({c_arg_std:.0f})"

    baseline_metric_results["Baseline"] = metric_results_model


    # experiment
    transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
    for tl_name in transfer_learning_names:

        results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

        with open(results_path, "r") as f:
            results = json.load(f)

        metric_results = {}
        for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

            metric_results_model = {}
            trigger_mean = np.mean([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
            trigger_std = np.std([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
            metric_results_model["\textbf{" + "Trigger" + "}"] = f"{trigger_mean:.1f} ({trigger_std:.0f})"

            for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]:
                c_arg_mean = np.mean([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                c_arg_std = np.std([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                
                if c_arg == "Controlling": c_arg = "Contr."
                if c_arg == "Affected": c_arg = "Aff."
                if c_arg == "Support": c_arg = "Sup."
                metric_results_model["\textbf{" + c_arg + "}"] = f"{c_arg_mean:.1f} ({c_arg_std:.0f})"

            metric_results[model_names_dict[model_kind]] = metric_results_model

        combined_results.append(pd.DataFrame(metric_results).T)

    combined_results.append(pd.DataFrame(baseline_metric_results).T)
    combined_results = pd.concat(combined_results)

    index_df = pd.DataFrame({
        " ": flatten([[transfer_learning_names_dict[tl_name]] * 3 for tl_name in transfer_learning_names]) + ["Baseline"],
        "  ": combined_results.index,
    })

    combined_results.index = pd.MultiIndex.from_frame(index_df)

    # make largest result bold
    combined_results = combined_results.apply(bold_largest, axis=0)

    combined_results_sub = combined_results[["\textbf{" + c_arg + "}" for c_arg in ["Cause", "Effect", "Actor", "Aff.", "Sup.", "Contr."]]]
    display(combined_results_sub)

    # convert to latex and change some things
    latex_df = combined_results_sub.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
    latex_df = latex_df.replace("RULE", "\hline\n")
    latex_df = latex_df.replace("llllllll", "clcccccc")
    latex_df = latex_df.replace(".0", "")
    latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
    latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\hline\n\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
    latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{Cause\}", r"\\textbf{\\makecell{Transfer \\\\ Learning}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{Cause}", latex_df, flags=re.MULTILINE)
    print(metric_kind)
    print(latex_df)

detect_trigger_res = combined_results["\textbf{Trigger}"]
display(pd.DataFrame(detect_trigger_res))
latex_df = detect_trigger_res.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
latex_df = latex_df.replace("RULE", "\hline\n")
latex_df = latex_df.replace("lll", "clc")
latex_df = latex_df.replace(".0", "")
latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\hline\n\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{Trigger\}", r"\\textbf{\\makecell{Transfer \\\\ Learning}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{Trigger}", latex_df, flags=re.MULTILINE)
print("Trigger")
print(latex_df)

Unnamed: 0,Unnamed: 1,\textbf{Cause},\textbf{Effect},\textbf{Actor},\textbf{Aff.},\textbf{Sup.},\textbf{Contr.}
,,,,,,,
\multirow{3}{*}{\makecell{---}},\textbf{Ger.},25.6 (3),24.7 (3),32.7 (12),20.0 (8),0.0 (0),8.0 (5)
\multirow{3}{*}{\makecell{---}},\textbf{Europ.},23.5 (5),23.5 (5),\textbf{34.3 (16)},16.1 (7),0.0 (0),3.5 (5)
\multirow{3}{*}{\makecell{---}},\textbf{Multi.},23.9 (3),22.5 (4),27.8 (14),9.9 (6),0.0 (0),5.6 (8)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Ger.},27.3 (4),25.8 (5),29.6 (10),20.2 (8),0.0 (0),\textbf{9.0 (3)}
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Europ.},22.0 (4),22.9 (2),29.5 (9),17.8 (5),0.0 (0),5.4 (5)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Multi.},26.8 (5),25.2 (4),31.4 (16),13.9 (4),0.0 (0),8.3 (5)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Ger.},26.4 (3),29.8 (3),31.3 (8),20.6 (3),0.0 (0),8.7 (4)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Europ.},27.9 (4),29.4 (3),30.6 (12),\textbf{25.2 (6)},0.0 (0),8.8 (6)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Multi.},26.4 (4),26.8 (4),27.5 (8),19.4 (8),0.0 (0),7.9 (5)


strict
\begin{tabular}{clcccccc}
\toprule
\textbf{\makecell{Transfer \\ Learning}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Cause} &    \textbf{Effect} &      \textbf{Actor} &      \textbf{Aff.} & \textbf{Sup.} &   \textbf{Contr.} \\

\midrule
\multirow{3}{*}{\makecell{---}} & \textbf{Ger.} &           25.6 (3) &           24.7 (3) &           32.7 (12) &           20 (8) &       0 (0) &           8 (5) \\
         & \textbf{Europ.} &           23.5 (5) &           23.5 (5) &  \textbf{34.3 (16)} &           16.1 (7) &       0 (0) &           3.5 (5) \\
         & \textbf{Multi.} &           23.9 (3) &           22.5 (4) &           27.8 (14) &            9.9 (6) &       0 (0) &           5.6 (8) \\
\hline
\multirow{3}{*}{\makecell{\textbf{BEC}}} & \textbf{Ger.} &           27.3 (4) &           25.8 (5) &           29.6 (10) &           20.2 (8) &       0 (0) &  \textbf{9 (3)} \\
         & \textbf{Europ.} &           22 (4) &           22.9 (2) &            29.5 (9) &           17

Unnamed: 0,Unnamed: 1,\textbf{Cause},\textbf{Effect},\textbf{Actor},\textbf{Aff.},\textbf{Sup.},\textbf{Contr.}
,,,,,,,
\multirow{3}{*}{\makecell{---}},\textbf{Ger.},49.6 (2),52.7 (3),44.5 (17),23.5 (8),0.0 (0),24.8 (10)
\multirow{3}{*}{\makecell{---}},\textbf{Europ.},47.6 (3),51.8 (4),\textbf{47.7 (15)},22.4 (8),0.0 (0),15.9 (6)
\multirow{3}{*}{\makecell{---}},\textbf{Multi.},46.3 (2),52.5 (3),29.7 (16),16.6 (9),0.0 (0),8.7 (6)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Ger.},51.5 (4),54.8 (2),42.6 (13),27.2 (13),0.0 (0),\textbf{29.5 (6)}
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Europ.},47.1 (2),50.9 (5),39.3 (13),23.3 (8),0.0 (0),18.7 (7)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Multi.},50.4 (3),54.5 (2),38.0 (15),18.1 (7),0.0 (0),13.0 (7)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Ger.},51.5 (4),58.2 (4),36.7 (13),27.0 (7),0.0 (0),19.9 (2)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Europ.},\textbf{53.2 (2)},56.9 (3),39.1 (15),\textbf{28.8 (7)},0.0 (0),25.2 (10)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Multi.},52.0 (4),55.0 (5),34.7 (9),26.2 (8),0.0 (0),22.4 (6)


relaxed
\begin{tabular}{clcccccc}
\toprule
\textbf{\makecell{Transfer \\ Learning}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Cause} &    \textbf{Effect} &      \textbf{Actor} &      \textbf{Aff.} & \textbf{Sup.} &    \textbf{Contr.} \\

\midrule
\multirow{3}{*}{\makecell{---}} & \textbf{Ger.} &           49.6 (2) &           52.7 (3) &           44.5 (17) &           23.5 (8) &       0 (0) &          24.8 (10) \\
         & \textbf{Europ.} &           47.6 (3) &           51.8 (4) &  \textbf{47.7 (15)} &           22.4 (8) &       0 (0) &           15.9 (6) \\
         & \textbf{Multi.} &           46.3 (2) &           52.5 (3) &           29.7 (16) &           16.6 (9) &       0 (0) &            8.7 (6) \\
\hline
\multirow{3}{*}{\makecell{\textbf{BEC}}} & \textbf{Ger.} &           51.5 (4) &           54.8 (2) &           42.6 (13) &          27.2 (13) &       0 (0) &  \textbf{29.5 (6)} \\
         & \textbf{Europ.} &           47.1 (2) &           50.9 (5) &           39.3 (13) 

Unnamed: 0,Unnamed: 1,\textbf{Trigger}
,,
\multirow{3}{*}{\makecell{---}},\textbf{Ger.},\textbf{53.1 (2)}
\multirow{3}{*}{\makecell{---}},\textbf{Europ.},51.1 (2)
\multirow{3}{*}{\makecell{---}},\textbf{Multi.},48.2 (2)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Ger.},51.4 (3)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Europ.},50.8 (2)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Multi.},48.0 (3)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Ger.},48.6 (1)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Europ.},50.0 (1)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Multi.},46.3 (3)


Trigger
\begin{tabular}{clc}
\toprule
\textbf{\makecell{Transfer \\ Learning}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Trigger} \\

\midrule
\multirow{3}{*}{\makecell{---}} & \textbf{Ger.} &  \textbf{53.1 (2)} \\
         & \textbf{Europ.} &           51.1 (2) \\
         & \textbf{Multi.} &           48.2 (2) \\
\hline
\multirow{3}{*}{\makecell{\textbf{BEC}}} & \textbf{Ger.} &           51.4 (3) \\
         & \textbf{Europ.} &           50.8 (2) \\
         & \textbf{Multi.} &           48 (3) \\
\hline
\multirow{3}{*}{\makecell{\textbf{R\&R}}} & \textbf{Ger.} &           48.6 (1) \\
         & \textbf{Europ.} &           50 (1) \\
         & \textbf{Multi.} &           46.3 (3) \\
\hline
\multirow{3}{*}{\textbf{\makecell{R\&R, \\ BEC}}} & \textbf{Ger.} &           49.2 (2) \\
         & \textbf{Europ.} &           50.4 (3) \\
         & \textbf{Multi.} &           47.1 (3) \\
\hline
\multicolumn{2}{c}{\textbf{Baseline}} &           33.8 (3) \\
\bottomrule
\end{tabular}



### Visualization

In [None]:
combined_results_all = []
for metric_kind in ["strict", "relaxed"]:
    combined_results = []

    # baseline
    metric_results = {}
    metric_results_model = {}
    trigger_mean = np.mean([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
    trigger_std = np.std([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
    metric_results_model["Trigger"] = trigger_mean

    for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]:
        c_arg_mean = np.mean([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
        c_arg_std = np.std([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
        metric_results_model[c_arg] = c_arg_mean

    metric_results["Baseline"] = metric_results_model
    combined_results.append(pd.DataFrame(metric_results).T)


    # experiment
    transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
    for tl_name in transfer_learning_names:

        results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

        with open(results_path, "r") as f:
            results = json.load(f)

        metric_results = {}
        for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

            metric_results_model = {}
            trigger_mean = np.mean([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
            trigger_std = np.std([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
            metric_results_model["Trigger"] = trigger_mean

            for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]:
                c_arg_mean = np.mean([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                c_arg_std = np.std([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                metric_results_model[c_arg] = c_arg_mean

            metric_results[model_kind] = metric_results_model

        combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results)

    index_df = pd.DataFrame({
        "Transfer": ["Baseline"] + flatten([[tl_name] * 3 for tl_name in ["\u2014", "BEC", "R&R", "R&R, BEC"]]),
        "BERT": ["Baseline"] + ["German", "Europeana", "Multilingual"]*4,
    })

    combined_results.index = pd.MultiIndex.from_frame(index_df)

    combined_results_sub = combined_results[[c_arg for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]]]
    combined_results_sub["kind"] = metric_kind.title()

    combined_results_all.append(combined_results_sub)

combined_results_all = pd.concat(combined_results_all, axis=0).reset_index()
combined_results_all = combined_results_all[combined_results_all["Transfer"] != "Baseline"].copy()
combined_results_all["Macro F1"] = combined_results_all[["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]].mean(axis=1)

combined_results_all

Unnamed: 0,Transfer,BERT,Cause,Effect,Actor,Affected,Support,Controlling,kind,Macro F1
1,—,German,25.555826,24.743564,32.740436,19.964839,0.0,8.039497,Strict,18.50736
2,—,Europeana,23.470145,23.488848,34.250275,16.098103,0.0,3.529412,Strict,16.806131
3,—,Multilingual,23.924953,22.48036,27.841897,9.900142,0.0,5.6,Strict,14.957892
4,BEC,German,27.306735,25.81467,29.592922,20.220313,0.0,9.006326,Strict,18.656828
5,BEC,Europeana,22.047002,22.85132,29.471264,17.75929,0.0,5.365079,Strict,16.248993
6,BEC,Multilingual,26.75208,25.235085,31.383459,13.918475,0.0,8.301342,Strict,17.598407
7,R&R,German,26.389672,29.764783,31.306295,20.575285,0.0,8.742946,Strict,19.463163
8,R&R,Europeana,27.877048,29.36005,30.555556,25.199219,0.0,8.814132,Strict,20.301001
9,R&R,Multilingual,26.440911,26.785411,27.54741,19.356811,0.0,7.857576,Strict,17.99802
10,"R&R, BEC",German,28.687005,31.025641,33.039614,20.46904,0.0,5.810486,Strict,19.838631


In [None]:
import plotly.express as px

fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=("Relaxed", "Strict"),
    shared_xaxes=True,
    vertical_spacing=0.05
)

strict_f1 = combined_results_all.query("kind == 'Strict'")[["Transfer", "BERT", "Macro F1"]]
relaxed_f1 = combined_results_all.query("kind == 'Relaxed'")[["Transfer", "BERT", "Macro F1"]]

import plotly
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

for i, bert in enumerate(strict_f1["BERT"].unique()):
    fig.append_trace(go.Scatter(
        x=strict_f1.query("BERT == @bert")["Transfer"],
        y=strict_f1.query("BERT == @bert")["Macro F1"],
        line=dict(width=2, color=cols[i]),
        marker=dict(color=cols[i]),
        name=bert,
    ), row=2, col=1)

for i, bert in enumerate(relaxed_f1["BERT"].unique()):
    fig.append_trace(go.Scatter(
        x=relaxed_f1.query("BERT == @bert")["Transfer"],
        y=relaxed_f1.query("BERT == @bert")["Macro F1"],
        line=dict(width=2, color=cols[i]),
        marker=dict(color=cols[i]),
        showlegend=False,
    ), row=1, col=1)

fig.update_layout(
    autosize=False,
    width=1000,
    height=900,
    xaxis2_title="Transfer Learning Data",
    yaxis_title="Macro F1 Score",
    yaxis2_title="Macro F1 Score",
    legend_title="Model",
    font=dict(
        size=18,
    ),
    template="plotly_white"
)

fig['layout']["annotations"][0]["font"]["size"] = 20
fig['layout']["annotations"][1]["font"]["size"] = 20

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.show()

In [None]:
fig = go.Figure()

strict_f1 = combined_results_all.query("kind == 'Strict'")[["Transfer", "BERT", "Macro F1"]]
relaxed_f1 = combined_results_all.query("kind == 'Relaxed'")[["Transfer", "BERT", "Macro F1"]]

for i, bert in enumerate(strict_f1["BERT"].unique()):
    fig.add_trace(go.Scatter(
        x=strict_f1.query("BERT == @bert")["Transfer"],
        y=strict_f1.query("BERT == @bert")["Macro F1"],
        line=dict(width=2.5, color=cols[i]),
        marker=dict(color=cols[i], size=10),
        name=bert + " - Strict " ,
    ))

    fig.add_trace(go.Scatter(
        x=relaxed_f1.query("BERT == @bert")["Transfer"],
        y=relaxed_f1.query("BERT == @bert")["Macro F1"],
        line=dict(width=2.5, color=cols[i], dash='dot'),
        marker=dict(color=cols[i], size=10),
        name=bert + " - Relaxed " ,
    ))

fig.update_layout(
    autosize=False,
    width=750,
    height=600,
    xaxis_title="Transfer Learning Data",
    yaxis_title="Macro F1 Score",
    # yaxis2_title="Macro F1 Score",
    legend_title="Models",
    font=dict(
        size=18,
    ),
    template="plotly_white",
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.show()

fig.write_image("arguments_comparison.png")

## Combine Trigger

In [None]:
from sklearn.metrics import f1_score

combined_results = []

# Baseline
baseline_metric_results = {}
metric_results_model = {}
for combine_metric in ["MCC", "accuracy"]:
    combine_metric_mean = np.mean([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["combine_trigger_results"][combine_metric] for fold in range(5)]) * 100
    combine_metric_std = np.std([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["combine_trigger_results"][combine_metric] for fold in range(5)]) * 100
    metric_results_model["\textbf{" + metrics_dict[combine_metric] + "}"] = f"{combine_metric_mean:.1f} ({combine_metric_std:.0f})"
        
f1_1s = []
f1_0s = []
for fold in range(5):
    trigger_res = pd.DataFrame(baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["trigger_results"])

    f1_0 = f1_score(
        flatten(pd.DataFrame(baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["trigger_results"])["true_combine_triggers"].to_list()),
        flatten(pd.DataFrame(baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["trigger_results"])["pred_combine_triggers"].to_list()),
        pos_label=0
    )
    f1_0s.append(f1_0*100)

    f1_1 = f1_score(
        flatten(pd.DataFrame(baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["trigger_results"])["true_combine_triggers"].to_list()),
        flatten(pd.DataFrame(baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["trigger_results"])["pred_combine_triggers"].to_list()),
        pos_label=1
    )
    f1_1s.append(f1_1*100)

metric_results_model["\textbf{" + "\\makecell{Connected \\\\ Trigger F1}" + "}"] = f"{np.mean(f1_1s):.1f} ({np.std(f1_1s):.0f})"
metric_results_model["\textbf{" + "\\makecell{Separate \\\\ Trigger F1}" + "}"] = f"{np.mean(f1_0s):.1f} ({np.std(f1_0s):.0f})"

baseline_metric_results["Baseline"] = metric_results_model


transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
for tl_name in transfer_learning_names:

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    metric_results = {}
    for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

        metric_results_model = {}
        for combine_metric in ["MCC", "accuracy"]:
            combine_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["combine_trigger_results"][combine_metric] for fold in range(5)]) * 100
            combine_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["combine_trigger_results"][combine_metric] for fold in range(5)]) * 100
            metric_results_model["\textbf{" + metrics_dict[combine_metric] + "}"] = f"{combine_metric_mean:.1f} ({combine_metric_std:.0f})"

        # manual computation because not done in validation...
        f1_1s = []
        f1_0s = []
        for fold in range(5):
            trigger_res = pd.DataFrame(results[model_kind]["oof_results_all"][fold]["trigger_results"])

            f1_0 = f1_score(
                flatten(pd.DataFrame(results[model_kind]["oof_results_all"][fold]["trigger_results"])["true_combine_triggers"].to_list()),
                flatten(pd.DataFrame(results[model_kind]["oof_results_all"][fold]["trigger_results"])["pred_combine_triggers"].to_list()),
                pos_label=0
            )
            f1_0s.append(f1_0*100)

            f1_1 = f1_score(
                flatten(pd.DataFrame(results[model_kind]["oof_results_all"][fold]["trigger_results"])["true_combine_triggers"].to_list()),
                flatten(pd.DataFrame(results[model_kind]["oof_results_all"][fold]["trigger_results"])["pred_combine_triggers"].to_list()),
                pos_label=1
            )
            f1_1s.append(f1_1*100)

        metric_results_model["\textbf{" + "\\makecell{Connected \\\\ Trigger F1}" + "}"] = f"{np.mean(f1_1s):.1f} ({np.std(f1_1s):.0f})"
        metric_results_model["\textbf{" + "\\makecell{Separate \\\\ Trigger F1}" + "}"] = f"{np.mean(f1_0s):.1f} ({np.std(f1_0s):.0f})"

        metric_results[model_names_dict[model_kind]] = metric_results_model

    combined_results.append(pd.DataFrame(metric_results).T)

combined_results.append(pd.DataFrame(baseline_metric_results).T)
combined_results = pd.concat(combined_results)

index_df = pd.DataFrame({
    " ": flatten([[transfer_learning_names_dict[tl_name]] * 3 for tl_name in transfer_learning_names]) + ["Baseline"] ,
    "  ": combined_results.index,
})

combined_results.index = pd.MultiIndex.from_frame(index_df)

# make largest result bold
combined_results = combined_results.apply(bold_largest, axis=0)

# add trigger result
# combined_results["\textbf{Trigger F1}"] = detect_trigger_res

# sort in order
combined_results = combined_results[["\textbf{" + metric + "}" for metric in ["MCC", "Accuracy", "\\makecell{Connected \\\\ Trigger F1}", "\\makecell{Separate \\\\ Trigger F1}"]]]
# index_df = pd.DataFrame({
#     " ": ["\makecell{Trigger\\\\Detection}"] + ["\makecell{Trigger\\\\Combination}"]*4,
#     "  ": combined_results.columns,
# })
# combined_results.columns = pd.MultiIndex.from_frame(index_df)
display(combined_results)

# convert to latex and change some things
latex_df = combined_results.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
latex_df = latex_df.replace("RULE", "\hline\n")
latex_df = latex_df.replace("llllll", "clcccc")
latex_df = latex_df.replace("\multicolumn{4}{l}", "\multicolumn{4}{c}")
latex_df = latex_df.replace(".0", "")
latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\hline\n\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{MCC\}", r"\\textbf{\\makecell{Transfer \\\\ Learning}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{MCC}", latex_df, flags=re.MULTILINE)
print(latex_df)

Unnamed: 0,Unnamed: 1,\textbf{MCC},\textbf{Accuracy},\textbf{\makecell{Connected \\ Trigger F1}},\textbf{\makecell{Separate \\ Trigger F1}}
,,,,,
\multirow{3}{*}{\makecell{---}},\textbf{Ger.},75.1 (5),87.4 (2),86.3 (2),88.3 (3)
\multirow{3}{*}{\makecell{---}},\textbf{Europ.},74.3 (5),87.0 (3),85.8 (3),87.9 (3)
\multirow{3}{*}{\makecell{---}},\textbf{Multi.},75.0 (7),87.7 (3),85.9 (4),88.9 (3)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Ger.},77.6 (4),88.8 (2),87.4 (2),\textbf{89.9 (2)}
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Europ.},76.1 (4),87.7 (2),86.8 (2),88.5 (2)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Multi.},77.1 (4),88.7 (2),87.1 (2),89.8 (2)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Ger.},73.4 (3),86.5 (2),85.3 (2),87.5 (1)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Europ.},\textbf{78.1 (2)},88.8 (1),\textbf{87.9 (2)},89.6 (1)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Multi.},73.6 (6),86.7 (3),85.5 (3),87.7 (3)


\begin{tabular}{clcccc}
\toprule
\textbf{\makecell{Transfer \\ Learning}} & \textbf{\makecell{BERT \\ Model}} & \textbf{MCC} &  \textbf{Accuracy} & \textbf{\makecell{Connected \\ Trigger F1}} & \textbf{\makecell{Separate \\ Trigger F1}} \\

\midrule
\multirow{3}{*}{\makecell{---}} & \textbf{Ger.} &           75.1 (5) &           87.4 (2) &                                    86.3 (2) &                                   88.3 (3) \\
         & \textbf{Europ.} &           74.3 (5) &           87 (3) &                                    85.8 (3) &                                   87.9 (3) \\
         & \textbf{Multi.} &           75 (7) &           87.7 (3) &                                    85.9 (4) &                                   88.9 (3) \\
\hline
\multirow{3}{*}{\makecell{\textbf{BEC}}} & \textbf{Ger.} &           77.6 (4) &           88.8 (2) &                                    87.4 (2) &                          \textbf{89.9 (2)} \\
         & \textbf{Europ.} &           76.1 

## Type Classification

In [None]:
combined_results = []

# Baseline
baseline_metric_results = {}
metric_results_model = {}
for type_metric in ["MCC", "accuracy", "consequence_f1", "motivation_f1", "purpose_f1"]:
    type_metric_mean = np.mean([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["classify_type_results"][type_metric] for fold in range(5)]) * 100
    type_metric_std = np.std([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["classify_type_results"][type_metric] for fold in range(5)]) * 100
    metric_results_model["\textbf{" + metrics_dict[type_metric] + "}"] = f"{type_metric_mean:.1f} ({type_metric_std:.0f})"

baseline_metric_results["Baseline"] = metric_results_model

# Experiments
transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
for tl_name in transfer_learning_names:

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    metric_results = {}
    for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

        metric_results_model = {}
        for type_metric in ["MCC", "accuracy", "consequence_f1", "motivation_f1", "purpose_f1"]:
            type_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["classify_type_results"][type_metric] for fold in range(5)]) * 100
            type_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["classify_type_results"][type_metric] for fold in range(5)]) * 100
            
            metric_results_model["\textbf{" + metrics_dict[type_metric] + "}"] = f"{type_metric_mean:.1f} ({type_metric_std:.0f})"

        metric_results[model_names_dict[model_kind]] = metric_results_model

    combined_results.append(pd.DataFrame(metric_results).T)

combined_results.append(pd.DataFrame(baseline_metric_results).T)
combined_results = pd.concat(combined_results)

index_df = pd.DataFrame({
    " ": flatten([[transfer_learning_names_dict[tl_name]] * 3 for tl_name in transfer_learning_names]) + ["Baseline"],
    "  ": combined_results.index,
})

combined_results.index = pd.MultiIndex.from_frame(index_df)

# make largest result bold
combined_results = combined_results.apply(bold_largest, axis=0)

# sort in order
combined_results = combined_results[["\textbf{" + metric + "}" for metric in ["MCC", "Accuracy", "\makecell{Consequence\\\\ F1}", "\makecell{Motivation\\\\ F1}", "\makecell{Purpose\\\\ F1}"]]]
display(combined_results)

# convert to latex and change some things
latex_df = combined_results.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
latex_df = latex_df.replace("RULE", "\hline\n")
latex_df = latex_df.replace("lllllll", "clccccc")
latex_df = latex_df.replace(".0", "")
latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\hline\n\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{MCC\}", r"\\textbf{\\makecell{Transfer \\\\ Learning}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{MCC}", latex_df, flags=re.MULTILINE)
print(latex_df)

Unnamed: 0,Unnamed: 1,\textbf{MCC},\textbf{Accuracy},\textbf{\makecell{Consequence\\ F1}},\textbf{\makecell{Motivation\\ F1}},\textbf{\makecell{Purpose\\ F1}}
,,,,,,
\multirow{3}{*}{\makecell{---}},\textbf{Ger.},58.1 (9),73.8 (5),82.3 (6),55.2 (8),74.2 (6)
\multirow{3}{*}{\makecell{---}},\textbf{Europ.},56.2 (6),72.8 (4),81.6 (4),49.8 (5),76.0 (4)
\multirow{3}{*}{\makecell{---}},\textbf{Multi.},55.3 (8),72.3 (5),81.0 (4),52.3 (6),73.8 (6)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Ger.},57.1 (4),73.6 (2),82.0 (3),52.8 (3),74.7 (6)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Europ.},56.9 (5),73.5 (3),81.9 (3),50.7 (5),75.7 (6)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Multi.},50.8 (4),69.7 (3),78.4 (3),46.3 (2),73.8 (3)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Ger.},59.6 (6),74.7 (4),82.3 (4),\textbf{57.3 (6)},76.3 (5)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Europ.},59.2 (5),74.6 (3),82.2 (3),54.4 (6),\textbf{78.1 (5)}
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Multi.},51.0 (3),69.1 (2),78.0 (3),48.3 (5),72.9 (3)


\begin{tabular}{clccccc}
\toprule
\textbf{\makecell{Transfer \\ Learning}} & \textbf{\makecell{BERT \\ Model}} & \textbf{MCC} &  \textbf{Accuracy} & \textbf{\makecell{Consequence\\ F1}} & \textbf{\makecell{Motivation\\ F1}} & \textbf{\makecell{Purpose\\ F1}} \\

\midrule
\multirow{3}{*}{\makecell{---}} & \textbf{Ger.} &           58.1 (9) &           73.8 (5) &                             82.3 (6) &                            55.2 (8) &                         74.2 (6) \\
         & \textbf{Europ.} &           56.2 (6) &           72.8 (4) &                             81.6 (4) &                            49.8 (5) &                         76 (4) \\
         & \textbf{Multi.} &           55.3 (8) &           72.3 (5) &                             81 (4) &                            52.3 (6) &                         73.8 (6) \\
\hline
\multirow{3}{*}{\makecell{\textbf{BEC}}} & \textbf{Ger.} &           57.1 (4) &           73.6 (2) &                             82 (3) &               

In [None]:
# confusion matrix
transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
for tl_name in transfer_learning_names:

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    type_conf_df = pd.DataFrame(
        sum(pd.DataFrame(results['dbmdz/bert-base-german-cased']["oof_results_all"][i]['classify_type_results']["confusion_matrix"]) for i in range(5)),
    )

    index_df = pd.DataFrame({
        " ": ["\multirow{3}{*}{\textbf{Predicted}}"]*3,
        "  ": ["\textbf{" + n + "}" for n in type_conf_df.index],
    })
    col_df = pd.DataFrame({
        " ": ["\textbf{Ground Truth}"]*3,
        "  ": ["\textbf{" + n + "}" for n in type_conf_df.index],
    })
    type_conf_df.columns = pd.MultiIndex.from_frame(col_df)
    type_conf_df.index = pd.MultiIndex.from_frame(index_df)

    latex_df = type_conf_df.to_latex()
    latex_df = latex_df.replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
    latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
    latex_df = latex_df.replace("\multicolumn{3}{l}", "\multicolumn{3}{c}")
    latex_df = latex_df.replace("llrrr", "llccc")


    print(tl_name)
    print(latex_df)
    print()

no_transfer
\begin{tabular}{llccc}
\toprule
                                    &   & \multicolumn{3}{c}{\textbf{Ground Truth}} \\
                                    &    &      \textbf{Purpose} & \textbf{Motivation} & \textbf{Consequence} \\

\midrule
\multirow{3}{*}{\textbf{Predicted}} & \textbf{Purpose} &                   132 &                  27 &                   16 \\
                                    & \textbf{Motivation} &                    31 &                  93 &                   50 \\
                                    & \textbf{Consequence} &                    14 &                  45 &                  290 \\
\bottomrule
\end{tabular}


dunietz
\begin{tabular}{llccc}
\toprule
                                    &   & \multicolumn{3}{c}{\textbf{Ground Truth}} \\
                                    &    &      \textbf{Purpose} & \textbf{Motivation} & \textbf{Consequence} \\

\midrule
\multirow{3}{*}{\textbf{Predicted}} & \textbf{Purpose} &                   133 &

### Visualization

In [None]:
combined_results = []

# Experiments
transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
for tl_name in transfer_learning_names:

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    metric_results = {}
    for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

        metric_results_model = {}
        for type_metric in ["MCC", "accuracy", "consequence_f1", "motivation_f1", "purpose_f1"]:
            type_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["classify_type_results"][type_metric] for fold in range(5)]) * 100
            type_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["classify_type_results"][type_metric] for fold in range(5)]) * 100
            
            metric_results_model[type_metric] = type_metric_mean

        metric_results[model_names_dict[model_kind]] = metric_results_model

    combined_results.append(pd.DataFrame(metric_results).T)

combined_results = pd.concat(combined_results)

index_df = pd.DataFrame({
    "Transfer": flatten([[tl_name] * 3 for tl_name in ["\u2014", "BEC", "R&R", "R&R, BEC"]]),
    "BERT": ["German", "Europeana", "Multilingual"]*4,
})

combined_results.index = pd.MultiIndex.from_frame(index_df)

# sort in order
combined_results = combined_results[[metric for metric in ["MCC", "accuracy", "consequence_f1", "motivation_f1", "purpose_f1"]]]
combined_results.columns = ["MCC", "Accuracy", "Consequence F1", "Motivation F1", "Purpose F1"]
combined_results = combined_results.reset_index()
display(combined_results)

Unnamed: 0,Transfer,BERT,MCC,Accuracy,Consequence F1,Motivation F1,Purpose F1
0,—,German,58.103017,73.84584,82.342635,55.215067,74.23764
1,—,Europeana,56.225264,72.815284,81.639359,49.769027,76.030648
2,—,Multilingual,55.32217,72.284506,81.015626,52.263534,73.758175
3,BEC,German,57.13266,73.642481,82.01589,52.817326,74.661781
4,BEC,Europeana,56.894796,73.522769,81.870106,50.734034,75.68278
5,BEC,Multilingual,50.750935,69.659153,78.399143,46.333549,73.75181
6,R&R,German,59.640316,74.712025,82.303024,57.311122,76.280255
7,R&R,Europeana,59.224353,74.582834,82.176026,54.425211,78.141443
8,R&R,Multilingual,50.980308,69.125288,78.047147,48.340538,72.938482
9,"R&R, BEC",German,59.890441,75.276698,82.945876,56.246675,77.45749


In [None]:
import plotly.express as px
import plotly

fig = go.Figure()

cols = plotly.colors.DEFAULT_PLOTLY_COLORS

for i, bert in enumerate(combined_results["BERT"].unique()):
    fig.add_trace(go.Scatter(
        x=combined_results.query("BERT == @bert")["Transfer"],
        y=combined_results.query("BERT == @bert")["MCC"],
        line=dict(width=2, color=cols[i]),
        marker=dict(color=cols[i], size=10),
        name=bert
    ))

fig.update_layout(
    autosize=False,
    width=750,
    height=450,
    xaxis_title="Transfer Learning Data",
    yaxis_title="MCC",
    legend_title="Models",
    font=dict(
        size=18,
    ),
    template="plotly_white",
)
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.show()

fig.write_image("type_comparison.png")

## Degree Results

In [None]:
combined_results = []

# Baseline
baseline_metric_results = {}
metric_results_model = {}
for degree_metric in ["MCC", "accuracy", "facilitate_f1", "inhibit_f1"]:
    degree_metric_mean = np.mean([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["classify_degree_results"][degree_metric] for fold in range(5)]) * 100
    degree_metric_std = np.std([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["classify_degree_results"][degree_metric] for fold in range(5)]) * 100
    metric_results_model["\textbf{" + metrics_dict[degree_metric] + "}"] = f"{degree_metric_mean:.1f} ({degree_metric_std:.0f})"

baseline_metric_results["Baseline"] = metric_results_model

# Experiments
transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
for tl_name in transfer_learning_names:

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    metric_results = {}
    for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

        metric_results_model = {}
        for degree_metric in ["MCC", "accuracy", "facilitate_f1", "inhibit_f1"]:
            degree_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["classify_degree_results"][degree_metric] for fold in range(5)]) * 100
            degree_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["classify_degree_results"][degree_metric] for fold in range(5)]) * 100
            
            metric_results_model["\textbf{" + metrics_dict[degree_metric] + "}"] = f"{degree_metric_mean:.1f} ({degree_metric_std:.0f})"

        metric_results[model_names_dict[model_kind]] = metric_results_model

    combined_results.append(pd.DataFrame(metric_results).T)

combined_results.append(pd.DataFrame(baseline_metric_results).T)
combined_results = pd.concat(combined_results)

index_df = pd.DataFrame({
    " ": flatten([[transfer_learning_names_dict[tl_name]] * 3 for tl_name in transfer_learning_names]) + ["Baseline"],
    "  ": combined_results.index,
})

combined_results.index = pd.MultiIndex.from_frame(index_df)

# make largest result bold
combined_results = combined_results.apply(bold_largest, axis=0)

# sort in order
combined_results = combined_results[["\textbf{" + metric + "}" for metric in ["MCC", "Accuracy", "Facilitate F1", "Inhibit F1"]]]
display(combined_results)

# convert to latex and change some things
latex_df = combined_results.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
latex_df = latex_df.replace("RULE", "\hline\n")
latex_df = latex_df.replace("llllll", "clcccc")
latex_df = latex_df.replace(".0", "")
latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\hline\n\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{MCC\}", r"\\textbf{\\makecell{Transfer \\\\ Learning}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{MCC}", latex_df, flags=re.MULTILINE)
print(latex_df)

Unnamed: 0,Unnamed: 1,\textbf{MCC},\textbf{Accuracy},\textbf{Facilitate F1},\textbf{Inhibit F1}
,,,,,
\multirow{3}{*}{\makecell{---}},\textbf{Ger.},52.7 (14),95.8 (1),97.8 (1),50.4 (12)
\multirow{3}{*}{\makecell{---}},\textbf{Europ.},42.9 (16),95.5 (1),97.7 (0),37.7 (14)
\multirow{3}{*}{\makecell{---}},\textbf{Multi.},42.1 (17),94.9 (2),97.4 (1),42.7 (16)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Ger.},55.4 (15),95.8 (2),97.8 (1),54.8 (14)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Europ.},38.7 (12),95.3 (1),97.5 (0),32.5 (13)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Multi.},52.4 (12),95.8 (1),97.8 (1),51.9 (13)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Ger.},64.5 (18),96.4 (2),98.1 (1),65.7 (17)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Europ.},58.9 (23),96.5 (2),98.2 (1),57.6 (22)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Multi.},46.5 (20),95.1 (2),97.4 (1),45.4 (18)


\begin{tabular}{clcccc}
\toprule
\textbf{\makecell{Transfer \\ Learning}} & \textbf{\makecell{BERT \\ Model}} & \textbf{MCC} &  \textbf{Accuracy} & \textbf{Facilitate F1} & \textbf{Inhibit F1} \\

\midrule
\multirow{3}{*}{\makecell{---}} & \textbf{Ger.} &           52.7 (14) &           95.8 (1) &               97.8 (1) &           50.4 (12) \\
         & \textbf{Europ.} &           42.9 (16) &           95.5 (1) &               97.7 (0) &           37.7 (14) \\
         & \textbf{Multi.} &           42.1 (17) &           94.9 (2) &               97.4 (1) &           42.7 (16) \\
\hline
\multirow{3}{*}{\makecell{\textbf{BEC}}} & \textbf{Ger.} &           55.4 (15) &           95.8 (2) &               97.8 (1) &           54.8 (14) \\
         & \textbf{Europ.} &           38.7 (12) &           95.3 (1) &               97.5 (0) &           32.5 (13) \\
         & \textbf{Multi.} &           52.4 (12) &           95.8 (1) &               97.8 (1) &           51.9 (13) \\
\hline
\multirow

In [None]:
# confusion matrix
transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
for tl_name in transfer_learning_names:

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    degree_conf_df = pd.DataFrame(
        sum(pd.DataFrame(results['dbmdz/bert-base-german-cased']["oof_results_all"][i]['classify_degree_results']["confusion_matrix"]) for i in range(5)),
    )

    index_df = pd.DataFrame({
        " ": ["\multirow{2}{*}{\textbf{Predicted}}"]*2,
        "  ": ["\textbf{" + n + "}" for n in degree_conf_df.index],
    })
    col_df = pd.DataFrame({
        " ": ["\textbf{Ground Truth}"]*2,
        "  ": ["\textbf{" + n + "}" for n in degree_conf_df.index],
    })
    degree_conf_df.columns = pd.MultiIndex.from_frame(col_df)
    degree_conf_df.index = pd.MultiIndex.from_frame(index_df)

    latex_df = degree_conf_df.to_latex()
    latex_df = latex_df.replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
    latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
    latex_df = latex_df.replace("\multicolumn{2}{l}", "\multicolumn{2}{c}")

    print(tl_name)
    print(latex_df)
    print()

no_transfer
\begin{tabular}{llrr}
\toprule
                                    &   & \multicolumn{2}{c}{\textbf{Ground Truth}} \\
                                    &    &   \textbf{Facilitate} & \textbf{Inhibit} \\

\midrule
\multirow{2}{*}{\textbf{Predicted}} & \textbf{Facilitate} &                   654 &               24 \\
                                    & \textbf{Inhibit} &                     5 &               15 \\
\bottomrule
\end{tabular}


dunietz
\begin{tabular}{llrr}
\toprule
                                    &   & \multicolumn{2}{c}{\textbf{Ground Truth}} \\
                                    &    &   \textbf{Facilitate} & \textbf{Inhibit} \\

\midrule
\multirow{2}{*}{\textbf{Predicted}} & \textbf{Facilitate} &                   652 &               22 \\
                                    & \textbf{Inhibit} &                     7 &               17 \\
\bottomrule
\end{tabular}


rehbein
\begin{tabular}{llrr}
\toprule
                                    &   & \mu

### Visualization

In [None]:
combined_results = []

# Experiments
transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
for tl_name in transfer_learning_names:

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    metric_results = {}
    for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

        metric_results_model = {}
        for degree_metric in ["MCC", "accuracy", "facilitate_f1", "inhibit_f1"]:
            degree_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["classify_degree_results"][degree_metric] for fold in range(5)]) * 100
            degree_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["classify_degree_results"][degree_metric] for fold in range(5)]) * 100
            
            metric_results_model[metrics_dict[degree_metric]] = degree_metric_mean

        metric_results[model_names_dict[model_kind]] = metric_results_model

    combined_results.append(pd.DataFrame(metric_results).T)

combined_results = pd.concat(combined_results)

index_df = pd.DataFrame({
    "Transfer": flatten([[tl_name] * 3 for tl_name in ["\u2014", "BEC", "R&R", "R&R, BEC"]]),
    "BERT": ["German", "Europeana", "Multilingual"]*4,
})

combined_results.index = pd.MultiIndex.from_frame(index_df)

# sort in order
combined_results = combined_results[[metric for metric in ["MCC", "Accuracy", "Facilitate F1", "Inhibit F1"]]]
combined_results = combined_results.reset_index()

display(combined_results)


Unnamed: 0,Transfer,BERT,MCC,Accuracy,Facilitate F1,Inhibit F1
0,—,German,52.650525,95.821854,97.815216,50.376068
1,—,Europeana,42.906329,95.537604,97.68264,37.650794
2,—,Multilingual,42.053704,94.94759,97.35398,42.666667
3,BEC,German,55.373159,95.805037,97.7975,54.794539
4,BEC,Europeana,38.650574,95.260705,97.538164,32.539683
5,BEC,Multilingual,52.400175,95.824655,97.813077,51.939394
6,R&R,German,64.458383,96.402517,98.095169,65.748918
7,R&R,Europeana,58.917214,96.542189,98.195202,57.561497
8,R&R,Multilingual,46.492481,95.088215,97.422969,45.434343
9,"R&R, BEC",German,69.128754,96.960991,98.398653,69.803922


In [None]:
fig = go.Figure()

for i, bert in enumerate(combined_results["BERT"].unique()):
    fig.add_trace(go.Scatter(
        x=combined_results.query("BERT == @bert")["Transfer"],
        y=combined_results.query("BERT == @bert")["MCC"],
        line=dict(width=2, color=cols[i]),
        marker=dict(color=cols[i], size=10),
        name=bert
    ))

fig.update_layout(
    autosize=False,
    width=750,
    height=450,
    xaxis_title="Transfer Learning Data",
    yaxis_title="MCC",
    legend_title="Models",
    font=dict(
        size=18,
    ),
    template="plotly_white",
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.show()

fig.write_image("degree_comparison.png")

## Results matched relations

In [None]:
metric_kind_results = []
for metric_kind in ["strict", "relaxed"]:

    combined_results = []

    # baseline
    baseline_metric_results = {}
    metric_results_model = {}
    for match_metric in ["precision", "recall", "f1"]:
        match_metric_mean = np.mean([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
        match_metric_std = np.std([baseline_results['dbmdz/bert-base-german-cased']["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
        metric_results_model["\textbf{" +metrics_dict[match_metric] + "}"] = f"{match_metric_mean:.1f} ({match_metric_std:.0f})"

    baseline_metric_results["Baseline"] = metric_results_model

    # experiment
    transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
    for tl_name in transfer_learning_names:

        results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

        with open(results_path, "r") as f:
            results = json.load(f)

        metric_results = {}
        for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

            metric_results_model = {}
            for match_metric in ["precision", "recall", "f1"]:
                match_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                match_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                metric_results_model["\textbf{" +metrics_dict[match_metric] + "}"] = f"{match_metric_mean:.1f} ({match_metric_std:.0f})"

            metric_results[model_names_dict[model_kind]] = metric_results_model

        combined_results.append(pd.DataFrame(metric_results).T)

    combined_results.append(pd.DataFrame(baseline_metric_results).T)
    combined_results = pd.concat(combined_results)

    index_df = pd.DataFrame({
        " ": flatten([[transfer_learning_names_dict[tl_name]] * 3 for tl_name in transfer_learning_names]) + ["Baseline"],
        "  ": combined_results.index,
    })

    combined_results.index = pd.MultiIndex.from_frame(index_df)
    # sort in order
    combined_results = combined_results[["\textbf{" + c_arg + "}" for c_arg in ["Precision", "Recall", "F1"]]]
    metric_kind_results.append(combined_results)

metric_kind_results = pd.concat(metric_kind_results, axis=1)
metric_kind_results = metric_kind_results.T

index_df = pd.DataFrame({
    "  ": ["\textbf{Strict}"] * 3 + ["\textbf{Relaxed}"] * 3,
    " ": metric_kind_results.index,
})

metric_kind_results.index = pd.MultiIndex.from_frame(index_df)
metric_kind_results = metric_kind_results.T

# make largest result bold
metric_kind_results = metric_kind_results.apply(bold_largest, axis=0)
display(metric_kind_results)

# convert to latex and change some things
latex_df = metric_kind_results.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
latex_df = latex_df.replace("RULE", "\hline\n")
latex_df = latex_df.replace("llllllll", "clcccccc")
latex_df = latex_df.replace(".0", "")
latex_df = latex_df.replace("\multicolumn{3}{l}", "\multicolumn{3}{c}")
latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\hline\n\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{Precision\}", r"\\textbf{\\makecell{Transfer \\\\ Learning}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{Precision}", latex_df, flags=re.MULTILINE)
print(latex_df)


Unnamed: 0_level_0,Unnamed: 1_level_0,\textbf{Strict},\textbf{Strict},\textbf{Strict},\textbf{Relaxed},\textbf{Relaxed},\textbf{Relaxed}
Unnamed: 0_level_1,Unnamed: 1_level_1,\textbf{Precision},\textbf{Recall},\textbf{F1},\textbf{Precision},\textbf{Recall},\textbf{F1}
,,,,,,,
\multirow{3}{*}{\makecell{---}},\textbf{Ger.},6.4 (1),8.2 (1),7.2 (1),\textbf{60.5 (4)},78.0 (3),\textbf{68.0 (2)}
\multirow{3}{*}{\makecell{---}},\textbf{Europ.},6.3 (1),7.7 (1),7.0 (1),58.2 (2),70.8 (2),63.8 (1)
\multirow{3}{*}{\makecell{---}},\textbf{Multi.},6.2 (2),8.3 (2),7.1 (2),54.3 (4),73.9 (4),62.4 (3)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Ger.},7.7 (1),10.0 (2),8.7 (2),59.1 (3),77.6 (3),67.0 (2)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Europ.},5.9 (1),7.2 (1),6.5 (1),59.6 (2),72.0 (5),65.1 (1)
RULE\multirow{3}{*}{\makecell{\textbf{BEC}}},\textbf{Multi.},7.2 (1),9.5 (1),8.1 (1),56.1 (4),74.0 (3),63.7 (2)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Ger.},7.4 (1),11.5 (2),9.0 (1),51.9 (4),80.1 (3),62.9 (3)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Europ.},\textbf{8.9 (3)},\textbf{13.0 (4)},\textbf{10.6 (4)},53.6 (1),79.0 (4),63.8 (2)
RULE\multirow{3}{*}{\makecell{\textbf{R&R}}},\textbf{Multi.},6.4 (1),9.8 (2),7.7 (2),51.5 (3),78.5 (3),62.1 (3)


\begin{tabular}{clcccccc}
\toprule
         &    & \multicolumn{3}{c}{\textbf{Strict}} & \multicolumn{3}{c}{\textbf{Relaxed}} \\
\textbf{\makecell{Transfer \\ Learning}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Precision} &    \textbf{Recall} &        \textbf{F1} & \textbf{Precision} &    \textbf{Recall} &        \textbf{F1} \\

\midrule
\multirow{3}{*}{\makecell{---}} & \textbf{Ger.} &            6.4 (1) &            8.2 (1) &            7.2 (1) &  \textbf{60.5 (4)} &           78 (3) &  \textbf{68 (2)} \\
         & \textbf{Europ.} &            6.3 (1) &            7.7 (1) &            7 (1) &           58.2 (2) &           70.8 (2) &           63.8 (1) \\
         & \textbf{Multi.} &            6.2 (2) &            8.3 (2) &            7.1 (2) &           54.3 (4) &           73.9 (4) &           62.4 (3) \\
\hline
\multirow{3}{*}{\makecell{\textbf{BEC}}} & \textbf{Ger.} &            7.7 (1) &           10 (2) &            8.7 (2) &           59.1 (3) &           77.6 (3) &    

### make plot

In [None]:
metric_kind_results = []
for metric_kind in ["strict", "relaxed"]:

    combined_results = []

    # experiment
    transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
    for tl_name in transfer_learning_names:

        results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

        with open(results_path, "r") as f:
            results = json.load(f)

        metric_results = {}
        for model_kind in ['dbmdz/bert-base-german-cased', 'dbmdz/bert-base-german-europeana-cased', 'bert-base-multilingual-cased']:

            metric_results_model = {}
            for match_metric in ["precision", "recall", "f1"]:
                match_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                match_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                metric_results_model[metrics_dict[match_metric]] = match_metric_mean

            metric_results[model_kind] = metric_results_model

        combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results)

    index_df = pd.DataFrame({
        "Transfer": flatten([[tl_name] * 3 for tl_name in ["\u2014", "BEC", "R&R", "R&R, BEC"]]),
        "BERT": ["German", "Europeana", "Multilingual"]*4,
    })

    combined_results.index = pd.MultiIndex.from_frame(index_df)
    metric_kind_results.append(combined_results)

metric_kind_results = pd.concat(metric_kind_results, axis=1)
metric_kind_results = metric_kind_results.T

index_df = pd.DataFrame({
    "Kind": ["Strict"] * 3 + ["Relaxed"] * 3,
    "Metric": metric_kind_results.index,
})

metric_kind_results.index = pd.MultiIndex.from_frame(index_df)
metric_kind_results = metric_kind_results.T

metric_kind_results

Unnamed: 0_level_0,Kind,Strict,Strict,Strict,Relaxed,Relaxed,Relaxed
Unnamed: 0_level_1,Metric,Precision,Recall,F1,Precision,Recall,F1
Transfer,BERT,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
—,German,6.369652,8.189504,7.154313,60.529212,77.981956,68.037232
—,Europeana,6.34887,7.734372,6.967698,58.167063,70.809795,63.820719
—,Multilingual,6.199671,8.28084,7.074482,54.255725,73.877296,62.412126
BEC,German,7.657585,10.03458,8.67251,59.051532,77.580118,66.967
BEC,Europeana,5.94736,7.183153,6.490945,59.564204,72.041252,65.058139
BEC,Multilingual,7.155599,9.482547,8.137445,56.111112,74.003301,63.682266
R&R,German,7.421696,11.490246,9.004887,51.868575,80.148352,62.878363
R&R,Europeana,8.929016,12.997718,10.574318,53.619928,79.040531,63.840748
R&R,Multilingual,6.361227,9.761542,7.693403,51.451666,78.540577,62.108618
"R&R, BEC",German,8.753766,12.733431,10.365927,54.304967,79.718223,64.540688


In [None]:
fig = make_subplots(
    rows=2, cols=1,
    subplot_titles=("Relaxed", "Strict"),
    shared_xaxes=True,
    vertical_spacing=0.05
)

strict_f1 = metric_kind_results["Strict"]["F1"].reset_index()
relaxed_f1 = metric_kind_results["Relaxed"]["F1"].reset_index()

for i, bert in enumerate(strict_f1["BERT"].unique()):
    fig.append_trace(go.Scatter(
        x=strict_f1.query("BERT == @bert")["Transfer"],
        y=strict_f1.query("BERT == @bert")["F1"],
        line=dict(width=2, color=cols[i]),
        marker=dict(color=cols[i]),
        name=bert + " - Strict "
    ), row=2, col=1)

for i, bert in enumerate(relaxed_f1["BERT"].unique()):
    fig.append_trace(go.Scatter(
        x=relaxed_f1.query("BERT == @bert")["Transfer"],
        y=relaxed_f1.query("BERT == @bert")["F1"],
        line=dict(width=2, color=cols[i], dash="dot"),
        marker=dict(color=cols[i]),
        showlegend=False,
        name=bert + " - Relaxed " ,
    ), row=1, col=1)

fig.update_layout(
    autosize=False,
    width=750,
    height=900,
    xaxis2_title="Transfer Learning Data",
    yaxis_title="F1 Score",
    yaxis2_title="F1 Score",
    legend_title="BERT Models",
    font=dict(
        size=18,
    ),
    template="plotly_white"
)

fig['layout']["annotations"][0]["font"]["size"] = 20
fig['layout']["annotations"][1]["font"]["size"] = 20

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.show()

In [None]:
import plotly.express as px
import plotly
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

fig = go.Figure()

strict_f1 = metric_kind_results["Strict"]["F1"].reset_index()

for i, bert in enumerate(strict_f1["BERT"].unique()):
    fig.add_trace(go.Scatter(
        x=strict_f1.query("BERT == @bert")["Transfer"],
        y=strict_f1.query("BERT == @bert")["F1"],
        line=dict(width=2, color=cols[i]),
        marker=dict(color=cols[i]),
        name=bert + "<br>Strict "
    ))

fig.update_layout(
    autosize=False,
    width=750,
    height=450,
    xaxis_title="Transfer Learning Data",
    yaxis_title="F1 Score",
    # yaxis2_title="F1 Score",
    legend_title="Model",
    font=dict(
        size=18,
    ),
    template="plotly_white"
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

fig.show()

fig.write_image("match_relation_comparison_strict.png")

In [None]:
fig = go.Figure()

relaxed_f1 = metric_kind_results["Relaxed"]["F1"].reset_index()

for i, bert in enumerate(relaxed_f1["BERT"].unique()):
    fig.add_trace(go.Scatter(
        x=relaxed_f1.query("BERT == @bert")["Transfer"],
        y=relaxed_f1.query("BERT == @bert")["F1"],
        line=dict(width=2, color=cols[i], dash="dot"),
        marker=dict(color=cols[i]),
        name=bert + "<br>Relaxed " ,
    ))

fig.update_layout(
    autosize=False,
    width=750,
    height=450,
    xaxis_title="Transfer Learning Data",
    yaxis_title="F1 Score",
    # yaxis2_title="F1 Score",
    legend_title="Models",
    font=dict(
        size=18,
    ),
    template="plotly_white"
)

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.show()

fig.write_image("match_relation_comparison_relaxed.png")

## Mutliple Relations per sentence

In [None]:
oof_results = pd.DataFrame(results['dbmdz/bert-base-german-europeana-cased']["oof_predictions"])
is_mutliple_relation = oof_results.query("kind == 'Ground Truth'").groupby("id").count()["tokens"] > 1
mutliple_relation_ids = list(is_mutliple_relation.index[is_mutliple_relation])
oof_results["token_len"] = oof_results["tokens"].str.len()
oof_results_multiple = oof_results[oof_results["id"].isin(mutliple_relation_ids)]

oof_results_multiple

Unnamed: 0,tokens,labels,kind,type,degree,relation_id,id,fold,token_len
24,"[Die, Staatsforstverwaltung, arbeitet, seit, e...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Predicted,Purpose,Facilitate,0,65,0,40
25,"[Die, Staatsforstverwaltung, arbeitet, seit, e...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Predicted,Purpose,Facilitate,1,65,0,40
26,"[Die, Staatsforstverwaltung, arbeitet, seit, e...","[B-Actor, I-Actor, O, O, O, O, O, O, O, B-Effe...",Ground Truth,Purpose,Facilitate,0,65,0,40
27,"[Die, Staatsforstverwaltung, arbeitet, seit, e...","[B-Actor, I-Actor, O, O, O, O, O, O, O, B-Effe...",Ground Truth,Purpose,Facilitate,1,65,0,40
66,"[Außer, den, genannten, verdienen, noch, die, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Predicted,Consequence,Facilitate,0,170,0,43
...,...,...,...,...,...,...,...,...,...
4870,"[Aufnahme, von, Verti, ., Während, der, Vornah...","[O, O, O, O, B-Cause, O, O, O, O, O, O, B-Trig...",Predicted without label,Purpose,Facilitate,2,1666,4,87
4873,"[strumente, und, Belielte, zu, vermessen, sind...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Predicted without label,Consequence,Facilitate,0,1676,4,64
4874,"[Zur, Flächenberechnung, im, Grossen, verwende...","[B-Trigger, B-Cause, I-Cause, I-Cause, B-Trigg...",Predicted without label,Purpose,Facilitate,0,1677,4,27
4875,"[Zur, Flächenberechnung, im, Grossen, verwende...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Predicted without label,Purpose,Facilitate,1,1677,4,27


In [None]:
calc_relation_metrics(oof_results_multiple.query("type != 'None'"), config)

{'relaxed': {'f1': 0.7711138310893513,
  'fn': 137,
  'fp': 50,
  'precision': 0.863013698630137,
  'recall': 0.6969026548672567,
  'tp': 315},
 'strict': {'f1': 0.08567931456548347,
  'fn': 417,
  'fp': 330,
  'precision': 0.0958904109589041,
  'recall': 0.07743362831858407,
  'tp': 35}}

In [None]:
calc_relation_metrics(oof_results.query("type != 'None'"), config)


{'relaxed': {'f1': 0.6417177914110429,
  'fn': 175,
  'fp': 409,
  'precision': 0.5611587982832618,
  'recall': 0.7492836676217765,
  'tp': 523},
 'strict': {'f1': 0.08834355828220859,
  'fn': 626,
  'fp': 860,
  'precision': 0.07725321888412018,
  'recall': 0.10315186246418338,
  'tp': 72}}

## Compare sample sentences

Samples: 347

In [None]:
oof_results = pd.DataFrame(results['dbmdz/bert-base-german-europeana-cased']["oof_predictions"])
is_single_relation = oof_results.query("kind == 'Ground Truth'").groupby("id").count()["tokens"] == 1
single_relation_ids = list(is_single_relation.index[is_single_relation])
oof_results["token_len"] = oof_results["tokens"].str.len()
possible_ids = oof_results[oof_results["id"].isin(single_relation_ids)].query("kind == 'Ground Truth' and type != 'None' and token_len < 30")["id"].unique()


In [None]:
baseline_oof_results = pd.DataFrame(baseline_results['dbmdz/bert-base-german-cased']["oof_predictions"])

In [None]:
def connect_labels(labels):
    new_labels = [labels[0]]
    for i in range(1, len(labels)):
        
        l = labels[i]
        l_prev = labels[i-1]

        if l[2:] == l_prev[2:] and l[0] == "B":
            new_labels.append(f"I-{l[2:]}")
        else:
            new_labels.append(l)

    return new_labels

In [None]:
input_id = 1052

output_str = ""
for i, tl_name in enumerate(transfer_learning_names):

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    oof_results = pd.DataFrame(results['dbmdz/bert-base-german-cased']["oof_predictions"])

    sample_data = oof_results.query("id == @input_id").copy()
    baseline_sample_data = baseline_oof_results.query("id == @input_id").copy()

    sample_data["labels"] = sample_data["labels"].apply(connect_labels)
    baseline_sample_data["labels"] = baseline_sample_data["labels"].apply(connect_labels)

    if i == 0:
        gt_data = sample_data.query("kind == 'Ground Truth' & relation_id == 0").iloc[0]
        output_str += f"""<div style="vertical-align:top; display: inline-block; width:120px; text-align: center;">
            <h3>Ground<br>Truth</h3>
        </div>"""
        output_str += """<div style="vertical-align:top; display: inline-block; width: 640px">"""

        output_str += f"Causal Type: <u>{gt_data['type']}</u>, Degree: <u>{gt_data['degree']}</u>"
        output_str += visualize_annotated_labels(gt_data["tokens"], gt_data["labels"], config)

        output_str += """</div><hr>"""


        gt_data_base = baseline_sample_data.query("kind == 'Predicted without label' & relation_id == 0").iloc[0]
        output_str += f"""<div style="vertical-align:top; display: inline-block; width:120px; text-align: center;">
            <h3>Baseline</h3>
        </div>"""
        output_str += """<div style="vertical-align:top; display: inline-block; width: 640px">"""

        output_str += f"Causal Type: <u>{gt_data_base['type']}</u>, Degree: <u>{gt_data_base['degree']}</u>"
        output_str += visualize_annotated_labels(gt_data_base["tokens"], gt_data_base["labels"], config)

        output_str += """</div><hr>"""

        

    matches = get_matches_from_sample(sample_data, config)

    for j, match in enumerate(matches):
        output_str += f"""<div style="vertical-align:top; display: inline-block; width:120px; text-align: center;">
            <h3>Transfer Data:<br>{transfer_learning_names_dict_base[tl_name]}</h3>
        </div>"""

        output_str += f"""<div style="vertical-align:top; display: inline-block; width: 640px;">"""
        pred_rel_id = match["pred_rel_id"]
        if pred_rel_id != -1:
            pred_data = sample_data.query("kind == 'Predicted without label' & relation_id == @pred_rel_id").iloc[0]
            output_str += f"Causal Type: <u>{pred_data['type']}</u>, Degree: <u>{pred_data['degree']}</u>"
            output_str += visualize_annotated_labels(pred_data["tokens"], pred_data["labels"], config)
        else:
            output_str += get_empty_with_layout_string("Predicted")

        output_str += """</div>"""

        break

    if i != len(transfer_learning_names)-1:
        output_str += "<hr>"

for arg in config["causal_arguments"]:
    output_str = output_str.replace(f'<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{arg}</span>', "")

# output_str = output_str.replace("padding: 0.45em 0.6em;", "")
# output_str = output_str.replace("border-radius: 0.35em;", "")
output_str = output_str.replace("padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1;", "padding: 0.25em 0em;")
output_str = output_str.replace('class="entities" style="', 'class="entities" style="font-size: 1.15em;')

output_str += """
<ul class="legend">
    <li><span class="trigger"></span>Trigger</li>
    <li><span class="cause"></span>Cause</li>
    <li><span class="effect"></span>Effect</li>
    <li><span class="actor"></span>Actor</li>
    <li><span class="affected"></span>Affected</li>
    <li><span class="support"></span>Support</li>
    <li><span class="controller"></span>Controller</li>
</ul>
"""
output_str += """
<style>
    /* basic positioning */
    .legend { list-style: none; }
    .legend li { float: left; margin-right: 10px; }
    .legend span { border: 1px solid #ccc; float: left; width: 12px; height: 12px; margin: 2px; }
    /* your colors */
    .legend .trigger { background-color: #C5E95E; }
    .legend .cause { background-color: #99FCE0; }
    .legend .effect { background-color: #6779CB; }
    .legend .actor { background-color: #108482; }
    .legend .affected { background-color: #84F72D; }
    .legend .support { background-color: #C44C6D; }
    .legend .controller { background-color: #E3AF32; }
</style>
"""

display_html(jupyter_HTML(output_str))
html = HTML(string=output_str)

css_1cm = [CSS(string="@page {height: 1cm; page-break-inside: always;}}}")]
css = [CSS(string='body { font-size: 12px; font-family: arial !important}')]
render = html.render(stylesheets=css + css_1cm)


css += [ CSS(string="@page {height: " + str(len(render.pages)-5) + "cm; margin: 0mm}}}")]
html.write_pdf("sent_example1.pdf", stylesheets=css)



In [None]:
input_id = 1649

output_str = ""
for i, tl_name in enumerate(transfer_learning_names):

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    oof_results = pd.DataFrame(results['dbmdz/bert-base-german-cased']["oof_predictions"])

    sample_data = oof_results.query("id == @input_id").copy()
    baseline_sample_data = baseline_oof_results.query("id == @input_id").copy()

    sample_data["labels"] = sample_data["labels"].apply(connect_labels)
    baseline_sample_data["labels"] = baseline_sample_data["labels"].apply(connect_labels)

    if i == 0:
        gt_data = sample_data.query("kind == 'Ground Truth' & relation_id == 0").iloc[0]
        output_str += f"""<div style="vertical-align:top; display: inline-block; width:120px; text-align: center;">
            <h3>Ground<br>Truth</h3>
        </div>"""
        output_str += """<div style="vertical-align:top; display: inline-block; width: 640px">"""

        output_str += f"Causal Type: <u>{gt_data['type']}</u>, Degree: <u>{gt_data['degree']}</u>"
        output_str += visualize_annotated_labels(gt_data["tokens"], gt_data["labels"], config)

        output_str += """</div><hr>"""


        gt_data_base = baseline_sample_data.query("kind == 'Predicted without label' & relation_id == 0").iloc[0]
        output_str += f"""<div style="vertical-align:top; display: inline-block; width:120px; text-align: center;">
            <h3>Baseline</h3>
        </div>"""
        output_str += """<div style="vertical-align:top; display: inline-block; width: 640px">"""

        output_str += f"Causal Type: <u>{gt_data_base['type']}</u>, Degree: <u>{gt_data_base['degree']}</u>"
        output_str += visualize_annotated_labels(gt_data_base["tokens"], gt_data_base["labels"], config)

        output_str += """</div><hr>"""

        

    matches = get_matches_from_sample(sample_data, config)

    for j, match in enumerate(matches):
        output_str += f"""<div style="vertical-align:top; display: inline-block; width:120px; text-align: center;">
            <h3>Transfer Data:<br>{transfer_learning_names_dict_base[tl_name]}</h3>
        </div>"""

        output_str += f"""<div style="vertical-align:top; display: inline-block; width: 640px;">"""
        pred_rel_id = match["pred_rel_id"]
        if pred_rel_id != -1:
            pred_data = sample_data.query("kind == 'Predicted without label' & relation_id == @pred_rel_id").iloc[0]
            output_str += f"Causal Type: <u>{pred_data['type']}</u>, Degree: <u>{pred_data['degree']}</u>"
            output_str += visualize_annotated_labels(pred_data["tokens"], pred_data["labels"], config)
        else:
            output_str += get_empty_with_layout_string("Predicted")

        output_str += """</div>"""

        break

    if i != len(transfer_learning_names)-1:
        output_str += "<hr>"

for arg in config["causal_arguments"]:
    output_str = output_str.replace(f'<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{arg}</span>', "")

# output_str = output_str.replace("padding: 0.45em 0.6em;", "")
# output_str = output_str.replace("border-radius: 0.35em;", "")
output_str = output_str.replace("padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1;", "padding: 0.25em 0em;")
output_str = output_str.replace('class="entities" style="', 'class="entities" style="font-size: 1.15em;')

output_str += """
<ul class="legend">
    <li><span class="trigger"></span>Trigger</li>
    <li><span class="cause"></span>Cause</li>
    <li><span class="effect"></span>Effect</li>
    <li><span class="actor"></span>Actor</li>
    <li><span class="affected"></span>Affected</li>
    <li><span class="support"></span>Support</li>
    <li><span class="controller"></span>Controller</li>
</ul>
"""
output_str += """
<style>
    /* basic positioning */
    .legend { list-style: none; }
    .legend li { float: left; margin-right: 10px; }
    .legend span { border: 1px solid #ccc; float: left; width: 12px; height: 12px; margin: 2px; }
    /* your colors */
    .legend .trigger { background-color: #C5E95E; }
    .legend .cause { background-color: #99FCE0; }
    .legend .effect { background-color: #6779CB; }
    .legend .actor { background-color: #108482; }
    .legend .affected { background-color: #84F72D; }
    .legend .support { background-color: #C44C6D; }
    .legend .controller { background-color: #E3AF32; }
</style>
"""

display_html(jupyter_HTML(output_str))
html = HTML(string=output_str)

css_1cm = [CSS(string="@page {height: 1cm; page-break-inside: always;}}}")]
css = [CSS(string='body { font-size: 12px; font-family: arial !important}')]
render = html.render(stylesheets=css + css_1cm)


css += [ CSS(string="@page {height: " + str(len(render.pages)-5) + "cm; margin: 0mm}}}")]
html.write_pdf("sent_example2.pdf", stylesheets=css)



In [None]:
input_id = 916

output_str = ""
for i, tl_name in enumerate(transfer_learning_names):

    results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{tl_name}.json"

    with open(results_path, "r") as f:
        results = json.load(f)

    oof_results = pd.DataFrame(results['dbmdz/bert-base-german-cased']["oof_predictions"])

    sample_data = oof_results.query("id == @input_id").copy()
    baseline_sample_data = baseline_oof_results.query("id == @input_id").copy()

    sample_data["labels"] = sample_data["labels"].apply(connect_labels)
    baseline_sample_data["labels"] = baseline_sample_data["labels"].apply(connect_labels)

    if i == 0:
        gt_data = sample_data.query("kind == 'Ground Truth' & relation_id == 0").iloc[0]
        output_str += f"""<div style="vertical-align:top; display: inline-block; width:120px; text-align: center;">
            <h3>Ground<br>Truth</h3>
        </div>"""
        output_str += """<div style="vertical-align:top; display: inline-block; width: 640px">"""

        output_str += f"Causal Type: <u>{gt_data['type']}</u>, Degree: <u>{gt_data['degree']}</u>"
        output_str += visualize_annotated_labels(gt_data["tokens"], gt_data["labels"], config)

        output_str += """</div><hr>"""


        gt_data_base = baseline_sample_data.query("kind == 'Predicted without label' & relation_id == 0").iloc[0]
        output_str += f"""<div style="vertical-align:top; display: inline-block; width:120px; text-align: center;">
            <h3>Baseline</h3>
        </div>"""
        output_str += """<div style="vertical-align:top; display: inline-block; width: 640px">"""

        output_str += f"Causal Type: <u>{gt_data_base['type']}</u>, Degree: <u>{gt_data_base['degree']}</u>"
        output_str += visualize_annotated_labels(gt_data_base["tokens"], gt_data_base["labels"], config)

        output_str += """</div><hr>"""

        

    matches = get_matches_from_sample(sample_data, config)

    for j, match in enumerate(matches):
        output_str += f"""<div style="vertical-align:top; display: inline-block; width:120px; text-align: center;">
            <h3>Transfer Data:<br>{transfer_learning_names_dict_base[tl_name]}</h3>
        </div>"""

        output_str += f"""<div style="vertical-align:top; display: inline-block; width: 640px;">"""
        pred_rel_id = match["pred_rel_id"]
        if pred_rel_id != -1:
            pred_data = sample_data.query("kind == 'Predicted without label' & relation_id == @pred_rel_id").iloc[0]
            output_str += f"Causal Type: <u>{pred_data['type']}</u>, Degree: <u>{pred_data['degree']}</u>"
            output_str += visualize_annotated_labels(pred_data["tokens"], pred_data["labels"], config)
        else:
            output_str += get_empty_with_layout_string("Predicted")

        output_str += """</div>"""

        break

    if i != len(transfer_learning_names)-1:
        output_str += "<hr>"

for arg in config["causal_arguments"]:
    output_str = output_str.replace(f'<span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem">{arg}</span>', "")

# output_str = output_str.replace("padding: 0.45em 0.6em;", "")
# output_str = output_str.replace("border-radius: 0.35em;", "")
output_str = output_str.replace("padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1;", "padding: 0.25em 0em;")
output_str = output_str.replace('class="entities" style="', 'class="entities" style="font-size: 1.15em;')

output_str += """
<ul class="legend">
    <li><span class="trigger"></span>Trigger</li>
    <li><span class="cause"></span>Cause</li>
    <li><span class="effect"></span>Effect</li>
    <li><span class="actor"></span>Actor</li>
    <li><span class="affected"></span>Affected</li>
    <li><span class="support"></span>Support</li>
    <li><span class="controller"></span>Controller</li>
</ul>
"""
output_str += """
<style>
    /* basic positioning */
    .legend { list-style: none; }
    .legend li { float: left; margin-right: 10px; }
    .legend span { border: 1px solid #ccc; float: left; width: 12px; height: 12px; margin: 2px; }
    /* your colors */
    .legend .trigger { background-color: #C5E95E; }
    .legend .cause { background-color: #99FCE0; }
    .legend .effect { background-color: #6779CB; }
    .legend .actor { background-color: #108482; }
    .legend .affected { background-color: #84F72D; }
    .legend .support { background-color: #C44C6D; }
    .legend .controller { background-color: #E3AF32; }
</style>
"""

display_html(jupyter_HTML(output_str))
html = HTML(string=output_str)

css_1cm = [CSS(string="@page {height: 1cm; page-break-inside: always;}}}")]
css = [CSS(string='body { font-size: 12px; font-family: arial !important}')]
render = html.render(stylesheets=css + css_1cm)


css += [ CSS(string="@page {height: " + str(len(render.pages)-5) + "cm; margin: 0mm}}}")]
html.write_pdf("sent_example3.pdf", stylesheets=css)



# Visualize all Results

In [None]:
results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_{transfer_learning_names[-1]}.json"

with open(results_path, "r") as f:
    results = json.load(f)

oof_results = pd.DataFrame(results['dbmdz/bert-base-german-cased']["oof_predictions"])


In [None]:
output = "<h1> Fondsforste Prediction Quality Check <h1>"

for id in [916]: #tqdm(sorted(oof_results["id"].unique()[:])):

    # no relation in GT or predicted
    sample_data = oof_results.query("id == @id")
    if len(sample_data["type"].unique()) == 1 and sample_data["type"].unique()[0] == "None":
        continue

    # if len(sample_data["tokens"].iloc[0]) > 30: continue


    output += visualize_id_html(id, oof_results, config)

display_html(jupyter_HTML(output))


In [None]:
oof_results.query("id == 398")

Unnamed: 0,tokens,labels,kind,type,degree,relation_id,id,fold
1177,[],[],Ground Truth,,,0,398,1
1178,[],[],Predicted,,,0,398,1


In [None]:
html = HTML(string=output)
css = CSS(
    string="font: serif"
)

# datetime apparently 2 hours behind?
html.write_pdf("forstvermessung_predictions_v1-2.pdf", stylesheets=[CSS(string='body { font-size: 10px; font-family: arial !important }')])

# Evaluate Normalization

## Arguments

In [None]:
for metric_kind in ["strict", "relaxed"]:
    combined_results = []

    # experiment
    for t, n in [[time, name], [norm_time, norm_name]]:

        results_path = PROJECT_PATH + f"output/{t}_{n}/"  + f"{corpus_name}_predictions_{t}_rehbein_dunietz.json"

        with open(results_path, "r") as f:
            results = json.load(f)

        metric_results = {}
        for model_kind in ['dbmdz/bert-base-german-cased']:

            metric_results_model = {}
            trigger_mean = np.mean([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
            trigger_std = np.std([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
            metric_results_model["\textbf{" + "Trigger" + "}"] = f"{trigger_mean:.1f} ({trigger_std:.0f})"

            for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]:
                c_arg_mean = np.mean([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                c_arg_std = np.std([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                
                if c_arg == "Controlling": c_arg = "Contr."
                if c_arg == "Affected": c_arg = "Aff."
                if c_arg == "Support": c_arg = "Sup."
                metric_results_model["\textbf{" + c_arg + "}"] = f"{c_arg_mean:.1f} ({c_arg_std:.0f})"

            metric_results[model_names_dict[model_kind]] = metric_results_model

        combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results)

    index_df = pd.DataFrame({
        " ": ["\multirow{3}{*}{\textbf{Original}}"] * 1 + ["RULE\multirow{3}{*}{\textbf{Normalization}}"] * 1,
        "  ": combined_results.index,
    })

    combined_results.index = pd.MultiIndex.from_frame(index_df)

    # make largest result bold
    combined_results = combined_results.apply(bold_largest, axis=0)

    combined_results_sub = combined_results[["\textbf{" + c_arg + "}" for c_arg in ["Trigger", "Cause", "Effect", "Actor", "Aff.", "Sup.", "Contr."]]]
    display(combined_results_sub)

    # convert to latex and change some things
    latex_df = combined_results_sub.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
    latex_df = latex_df.replace("RULE", "\hline\n")
    latex_df = latex_df.replace("lllllllll", "clccccccc")
    latex_df = latex_df.replace(".0", "") 
    latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
    latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
    latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{Trigger\}", r"\\textbf{\\makecell{Pre-processing}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{Trigger}", latex_df, flags=re.MULTILINE)
    print(metric_kind)
    print(latex_df)

Unnamed: 0,Unnamed: 1,\textbf{Trigger},\textbf{Cause},\textbf{Effect},\textbf{Actor},\textbf{Aff.},\textbf{Sup.},\textbf{Contr.}
,,,,,,,,
\multirow{3}{*}{\textbf{Original}},\textbf{Ger.},49.2 (2),28.7 (2),\textbf{31.0 (5)},33.0 (13),20.5 (6),0.0 (0),5.8 (5)
RULE\multirow{3}{*}{\textbf{Normalization}},\textbf{Ger.},\textbf{50.9 (1)},\textbf{29.6 (3)},29.8 (4),\textbf{35.4 (9)},\textbf{21.8 (4)},0.0 (0),\textbf{7.3 (5)}


strict
\begin{tabular}{clccccccc}
\toprule
\textbf{\makecell{Pre-processing}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Trigger} &     \textbf{Cause} &    \textbf{Effect} &     \textbf{Actor} &      \textbf{Aff.} & \textbf{Sup.} &   \textbf{Contr.} \\

\midrule
\multirow{3}{*}{\textbf{Original}} & \textbf{Ger.} &           49.2 (2) &           28.7 (2) &  \textbf{31 (5)} &          33 (13) &           20.5 (6) &       0 (0) &           5.8 (5) \\
\hline
\multirow{3}{*}{\textbf{Normalization}} & \textbf{Ger.} &  \textbf{50.9 (1)} &  \textbf{29.6 (3)} &           29.8 (4) &  \textbf{35.4 (9)} &  \textbf{21.8 (4)} &       0 (0) &  \textbf{7.3 (5)} \\
\bottomrule
\end{tabular}



Unnamed: 0,Unnamed: 1,\textbf{Trigger},\textbf{Cause},\textbf{Effect},\textbf{Actor},\textbf{Aff.},\textbf{Sup.},\textbf{Contr.}
,,,,,,,,
\multirow{3}{*}{\textbf{Original}},\textbf{Ger.},49.2 (2),52.7 (2),\textbf{59.4 (3)},36.0 (13),\textbf{28.7 (8)},0.0 (0),18.7 (3)
RULE\multirow{3}{*}{\textbf{Normalization}},\textbf{Ger.},\textbf{50.9 (1)},\textbf{53.9 (4)},58.9 (3),\textbf{38.3 (12)},27.8 (6),0.0 (0),\textbf{23.0 (5)}


relaxed
\begin{tabular}{clccccccc}
\toprule
\textbf{\makecell{Pre-processing}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Trigger} &     \textbf{Cause} &    \textbf{Effect} &      \textbf{Actor} &      \textbf{Aff.} & \textbf{Sup.} &    \textbf{Contr.} \\

\midrule
\multirow{3}{*}{\textbf{Original}} & \textbf{Ger.} &           49.2 (2) &           52.7 (2) &  \textbf{59.4 (3)} &           36 (13) &  \textbf{28.7 (8)} &       0 (0) &           18.7 (3) \\
\hline
\multirow{3}{*}{\textbf{Normalization}} & \textbf{Ger.} &  \textbf{50.9 (1)} &  \textbf{53.9 (4)} &           58.9 (3) &  \textbf{38.3 (12)} &           27.8 (6) &       0 (0) &  \textbf{23 (5)} \\
\bottomrule
\end{tabular}



### Visualization

In [None]:
combined_results_all = []
for metric_kind in ["strict", "relaxed"]:
    combined_results = []

    # experiment
    for t, n in [[time, name], [norm_time, norm_name]]:

        transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
        for tl_name in transfer_learning_names:
            results_path = PROJECT_PATH + f"output/{t}_{n}/"  + f"{corpus_name}_predictions_{t}_{tl_name}.json"

            with open(results_path, "r") as f:
                results = json.load(f)

            metric_results = {}
            for model_kind in ['dbmdz/bert-base-german-cased']:

                metric_results_model = {}
                trigger_mean = np.mean([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
                trigger_std = np.std([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
                metric_results_model["Trigger"] = trigger_mean

                for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]:
                    c_arg_mean = np.mean([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                    c_arg_std = np.std([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                    
                    metric_results_model[c_arg] = c_arg_mean

                metric_results[tl_name] = metric_results_model

            combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results, axis=0)

    index_df = pd.DataFrame({
        "Preprocessing": ["Original"]*4 + ["Normalization"]*4,
        "Transfer": ["\u2014", "BEC", "R&R", "R&R, BEC"]*2,
    })

    combined_results.index = pd.MultiIndex.from_frame(index_df)

    combined_results_sub = combined_results[[c_arg for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]]]
    combined_results_sub["kind"] = metric_kind.title()

    combined_results_all.append(combined_results_sub)

combined_results_all = pd.concat(combined_results_all, axis=0).reset_index()
combined_results_all["Macro F1"] = combined_results_all[["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]].mean(axis=1)
combined_results_all

Unnamed: 0,Preprocessing,Transfer,Cause,Effect,Actor,Affected,Support,Controlling,kind,Macro F1
0,Original,—,25.555826,24.743564,32.740436,19.964839,0.0,8.039497,Strict,18.50736
1,Original,BEC,27.306735,25.81467,29.592922,20.220313,0.0,9.006326,Strict,18.656828
2,Original,R&R,26.389672,29.764783,31.306295,20.575285,0.0,8.742946,Strict,19.463163
3,Original,"R&R, BEC",28.687005,31.025641,33.039614,20.46904,0.0,5.810486,Strict,19.838631
4,Normalization,—,26.092504,23.801175,30.600733,19.558196,0.0,6.906832,Strict,17.826573
5,Normalization,BEC,26.330017,25.372231,32.826236,18.913373,0.0,6.145455,Strict,18.264552
6,Normalization,R&R,28.484691,29.678244,31.996537,19.936011,0.0,9.312787,Strict,19.901378
7,Normalization,"R&R, BEC",29.622923,29.842456,35.359408,21.81115,0.0,7.276936,Strict,20.652145
8,Original,—,49.550221,52.690052,44.521596,23.482625,0.0,24.811714,Relaxed,32.509368
9,Original,BEC,51.52557,54.763312,42.583226,27.203276,0.0,29.472395,Relaxed,34.257963


In [None]:
import plotly.express as px

fig = go.Figure()

strict_f1 = combined_results_all.query("kind == 'Strict'")[["Preprocessing", "Transfer", "Macro F1"]]
relaxed_f1 = combined_results_all.query("kind == 'Relaxed'")[["Preprocessing", "Transfer", "Macro F1"]]

import plotly
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

for i, pre in enumerate(strict_f1["Preprocessing"].unique()):
    fig.add_trace(go.Scatter(
        x=strict_f1.query("Preprocessing == @pre")["Transfer"],
        y=strict_f1.query("Preprocessing == @pre")["Macro F1"],
        line=dict(width=2, color=cols[i]),
        marker=dict(
            color=cols[i],
            size=10,
        ),        
        name=pre + " - Strict " ,
    ))

    fig.add_trace(go.Scatter(
        x=relaxed_f1.query("Preprocessing == @pre")["Transfer"],
        y=relaxed_f1.query("Preprocessing == @pre")["Macro F1"],
        line=dict(width=2, color=cols[i], dash='dot'),
        marker=dict(
            color=cols[i],
            size=10,
        ),
        name=pre + " - Relaxed " ,
    ))

fig.update_layout(
    autosize=False,
    width=1200,
    height=600,
    xaxis_title="Transfer Learning Data",
    yaxis_title="Macro F1 Score",
    # yaxis2_title="Macro F1 Score",
    legend_title="Models",
    font=dict(
        size=18,
    ),
    template="plotly_white",
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#292828')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#292828')

fig.show()

fig.write_image("arguments_normalization_comp.png")

## Matched Relations

In [None]:
metric_kind_results = []
for metric_kind in ["strict", "relaxed"]:

    combined_results = []

    # experiment
    for t, n in [[time, name], [norm_time, norm_name]]:

        results_path = PROJECT_PATH + f"output/{t}_{n}/"  + f"{corpus_name}_predictions_{t}_rehbein_dunietz.json"

        with open(results_path, "r") as f:
            results = json.load(f)

        metric_results = {}
        for model_kind in ['dbmdz/bert-base-german-cased']:

            metric_results_model = {}
            for match_metric in ["precision", "recall", "f1"]:
                match_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                match_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                metric_results_model["\textbf{" +metrics_dict[match_metric] + "}"] = f"{match_metric_mean:.1f} ({match_metric_std:.0f})"

            metric_results[model_names_dict[model_kind]] = metric_results_model

        combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results)

    index_df = pd.DataFrame({
        " ": ["\multirow{3}{*}{\textbf{Original}}"] * 1 + ["RULE\multirow{3}{*}{\textbf{Normalization}}"] * 1,
        "  ": combined_results.index,
    })


    combined_results.index = pd.MultiIndex.from_frame(index_df)
    # sort in order
    combined_results = combined_results[["\textbf{" + c_arg + "}" for c_arg in ["Precision", "Recall", "F1"]]]
    metric_kind_results.append(combined_results)

metric_kind_results = pd.concat(metric_kind_results, axis=1)
metric_kind_results = metric_kind_results.T

index_df = pd.DataFrame({
    "  ": ["\textbf{Strict}"] * 3 + ["\textbf{Relaxed}"] * 3,
    " ": metric_kind_results.index,
})

metric_kind_results.index = pd.MultiIndex.from_frame(index_df)
metric_kind_results = metric_kind_results.T

# make largest result bold
metric_kind_results = metric_kind_results.apply(bold_largest, axis=0)
display(metric_kind_results)

# convert to latex and change some things
latex_df = metric_kind_results.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
latex_df = latex_df.replace("RULE", "\hline\n")
latex_df = latex_df.replace("llllllll", "clcccccc")
latex_df = latex_df.replace(".0", "")
latex_df = latex_df.replace("\multicolumn{3}{l}", "\multicolumn{3}{c}")
latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{Precision\}", r"\\textbf{\\makecell{Transfer \\\\ Learning}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{Precision}", latex_df, flags=re.MULTILINE)
print(latex_df)


Unnamed: 0_level_0,Unnamed: 1_level_0,\textbf{Strict},\textbf{Strict},\textbf{Strict},\textbf{Relaxed},\textbf{Relaxed},\textbf{Relaxed}
Unnamed: 0_level_1,Unnamed: 1_level_1,\textbf{Precision},\textbf{Recall},\textbf{F1},\textbf{Precision},\textbf{Recall},\textbf{F1}
,,,,,,,
\multirow{3}{*}{\textbf{Original}},\textbf{Ger.},\textbf{8.8 (2)},\textbf{12.7 (3)},\textbf{10.4 (2)},54.3 (1),79.7 (3),64.5 (0)
RULE\multirow{3}{*}{\textbf{Normalization}},\textbf{Ger.},8.4 (1),12.3 (1),10.0 (1),\textbf{54.7 (2)},\textbf{80.1 (1)},\textbf{65.0 (1)}


\begin{tabular}{clcccccc}
\toprule
                                            &    & \multicolumn{3}{c}{\textbf{Strict}} & \multicolumn{3}{c}{\textbf{Relaxed}} \\
\textbf{\makecell{Transfer \\ Learning}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Precision} &    \textbf{Recall} &        \textbf{F1} & \textbf{Precision} &    \textbf{Recall} &        \textbf{F1} \\

\midrule
\multirow{3}{*}{\textbf{Original}} & \textbf{Ger.} &   \textbf{8.8 (2)} &  \textbf{12.7 (3)} &  \textbf{10.4 (2)} &           54.3 (1) &           79.7 (3) &           64.5 (0) \\
\hline
\multirow{3}{*}{\textbf{Normalization}} & \textbf{Ger.} &            8.4 (1) &           12.3 (1) &           10 (1) &  \textbf{54.7 (2)} &  \textbf{80.1 (1)} &  \textbf{65 (1)} \\
\bottomrule
\end{tabular}



### Visualization

In [None]:
combined_results_all = []
for metric_kind in ["strict", "relaxed"]:
    combined_results = []

    # experiment
    for t, n in [[time, name], [norm_time, norm_name]]:

        # experiment
        transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
        for tl_name in transfer_learning_names:

            results_path = PROJECT_PATH + f"output/{t}_{n}/"  + f"{corpus_name}_predictions_{t}_{tl_name}.json"

            with open(results_path, "r") as f:
                results = json.load(f)

            metric_results = {}
            model_kind = 'dbmdz/bert-base-german-cased'

            metric_results_model = {}
            for match_metric in ["precision", "recall", "f1"]:
                match_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                match_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                metric_results_model[metrics_dict[match_metric]] = match_metric_mean
                metric_results_model["Preprocessing"] = n
                metric_results_model["Transfer"] = transfer_learning_names_dict_base[tl_name]
                metric_results_model["kind"] = metric_kind.title()

            metric_results[model_kind] = metric_results_model

            combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results)
    combined_results_all.append(combined_results)

combined_results_all = pd.concat(combined_results_all, axis=0).reset_index()

combined_results_all

Unnamed: 0,index,F1,Precision,Preprocessing,Recall,Transfer,kind
0,dbmdz/bert-base-german-cased,7.154313,6.369652,normal_valid,8.189504,—,Strict
1,dbmdz/bert-base-german-cased,8.67251,7.657585,normal_valid,10.03458,BEC,Strict
2,dbmdz/bert-base-german-cased,9.004887,7.421696,normal_valid,11.490246,R&R,Strict
3,dbmdz/bert-base-german-cased,10.365927,8.753766,normal_valid,12.733431,"R&R, BEC",Strict
4,dbmdz/bert-base-german-cased,8.943326,7.882818,normalization,10.424318,—,Strict
5,dbmdz/bert-base-german-cased,9.049178,7.992814,normalization,10.473233,BEC,Strict
6,dbmdz/bert-base-german-cased,9.709657,7.974674,normalization,12.451503,R&R,Strict
7,dbmdz/bert-base-german-cased,9.997281,8.408707,normalization,12.334364,"R&R, BEC",Strict
8,dbmdz/bert-base-german-cased,68.037232,60.529212,normal_valid,77.981956,—,Relaxed
9,dbmdz/bert-base-german-cased,66.967,59.051532,normal_valid,77.580118,BEC,Relaxed


In [None]:
import plotly.express as px

fig = make_subplots(
    rows=2, cols=1,
    # subplot_titles=("Strict", "Relaxed"),
    subplot_titles=(" ", " "),
    shared_xaxes=True,
    vertical_spacing=0.05
)

strict_f1 = combined_results_all.query("kind == 'Strict'")[["Preprocessing", "Transfer", "F1"]]
relaxed_f1 = combined_results_all.query("kind == 'Relaxed'")[["Preprocessing", "Transfer", "F1"]]

fig = go.Figure()

preprocess_dict = {
    "normal_valid": "Original",
    "normalization": "Normalized",
}
for i, pre in enumerate(strict_f1["Preprocessing"].unique()):
    fig.add_trace(go.Scatter(
        x=strict_f1.query("Preprocessing == @pre")["Transfer"],
        y=strict_f1.query("Preprocessing == @pre")["F1"],
        line=dict(width=2, color=cols[i]),
        marker=dict(color=cols[i]),
        name=preprocess_dict[pre] + "<br>Strict "
    ))

fig.update_layout(
    autosize=False,
    width=750,
    height=450,
    xaxis_title="Transfer Learning Data",
    yaxis_title="F1 Score",
    # yaxis2_title="F1 Score",
    legend_title="Model",
    font=dict(
        size=18,
    ),
    template="plotly_white"
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

fig.show()

fig.write_image("matched_rel_norm_comb_strict.png")

In [None]:
import plotly.express as px

fig = make_subplots(
    rows=2, cols=1,
    # subplot_titles=("Strict", "Relaxed"),
    subplot_titles=(" ", " "),
    shared_xaxes=True,
    vertical_spacing=0.05
)

relaxed_f1 = combined_results_all.query("kind == 'Relaxed'")[["Preprocessing", "Transfer", "F1"]]

fig = go.Figure()

preprocess_dict = {
    "normal_valid": "Original",
    "normalization": "Normalized",
}
for i, pre in enumerate(relaxed_f1["Preprocessing"].unique()):
    fig.add_trace(go.Scatter(
        x=relaxed_f1.query("Preprocessing == @pre")["Transfer"],
        y=relaxed_f1.query("Preprocessing == @pre")["F1"],
        line=dict(width=2, color=cols[i], dash="dot"),
        marker=dict(color=cols[i]),
        name=preprocess_dict[pre] + "<br>Relaxed "
    ))

fig.update_layout(
    autosize=False,
    width=750,
    height=450,
    xaxis_title="Transfer Learning Data",
    yaxis_title="F1 Score",
    # yaxis2_title="F1 Score",
    legend_title="Model",
    font=dict(
        size=18,
    ),
    template="plotly_white"
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

fig.show()

fig.write_image("matched_rel_norm_comb_relaxed.png")

## Sanity Check

In [None]:
# normal results
results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_rehbein_dunietz.json"

with open(results_path, "r") as f:
    results= json.load(f)

oof_results = pd.DataFrame(results["dbmdz/bert-base-german-cased"]["oof_predictions"]).query("kind != 'Predicted without label'").drop_duplicates("id")
display(oof_results)


# norm results
results_path_norm = PROJECT_PATH + f"output/{norm_time}_{norm_name}/"  + f"{corpus_name}_predictions_{norm_time}_rehbein_dunietz.json"

with open(results_path_norm, "r") as f:
    results_norm = json.load(f)

oof_results_norm = pd.DataFrame(results_norm["dbmdz/bert-base-german-cased"]["oof_predictions"]).query("kind != 'Predicted without label'").drop_duplicates("id")
oof_results_norm

Unnamed: 0,tokens,labels,kind,type,degree,relation_id,id,fold
0,"[[, Datei, :, /media, /, bernhard, /, 40FE-DE2...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Ground Truth,,,0,0,0
2,"[Während, der, beherzte, Weidmann, im, Osten, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Predicted,Motivation,Facilitate,0,10,0
4,"[In, diesem, Wechsel, der, Verhältnisse, sind,...","[B-Cause, I-Cause, I-Cause, I-Cause, I-Cause, ...",Predicted,Consequence,Facilitate,0,11,0
6,"[Die, in, Verwaltung, des, Ackerbauministerium...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Ground Truth,,,0,14,0
8,"[Die, Ursachen, dieser, Erscheinung, sind, ein...","[O, B-Trigger, B-Effect, I-Effect, B-Trigger, ...",Predicted,Consequence,Facilitate,0,16,0
...,...,...,...,...,...,...,...,...
4759,"[Zur, Flächenberechnung, im, Grossen, verwende...","[O, O, O, O, O, O, O, O, O, O, B-Cause, I-Caus...",Predicted,Consequence,Facilitate,0,1677,4
4765,"[Die, Flächen-Grössen, sind, in, der, zugehöri...","[O, O, O, O, O, O, O, O, O, O, O, O, O, B-Caus...",Predicted,Consequence,Facilitate,0,1679,4
4767,"[VI, .]","[O, O]",Ground Truth,,,0,1680,4
4769,"[VII, ., Taxatorische, Vorerhebungen, und, Ber...","[O, O, B-Effect, I-Effect, I-Effect, I-Effect,...",Predicted,Purpose,Facilitate,0,1688,4


Unnamed: 0,tokens,labels,kind,type,degree,relation_id,id,fold
0,"[[, Datei, :, /media, /, bernhard, /, 40FE-DE2...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Ground Truth,,,0,0,0
2,"[Während, der, beherzte, Weidmann, im, Osten, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Predicted,Motivation,Facilitate,0,10,0
4,"[In, diesem, Wechsel, der, Verhältnisse, sind,...","[B-Cause, I-Cause, I-Cause, I-Cause, I-Cause, ...",Predicted,Consequence,Facilitate,0,11,0
6,"[Die, in, Verwaltung, des, Ackerbauministerium...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Ground Truth,,,0,14,0
8,"[Die, Ursachen, dieser, Erscheinung, sind, ein...","[O, B-Trigger, B-Effect, I-Effect, B-Trigger, ...",Predicted,Motivation,Facilitate,0,16,0
...,...,...,...,...,...,...,...,...
4752,"[Zur, Flächenberechnung, im, Großen, verwendet...","[O, O, O, O, O, O, O, O, O, O, B-Cause, I-Caus...",Predicted,Consequence,Facilitate,0,1677,4
4758,"[Die, Flächen-Grössen, sind, in, der, zugehöri...","[B-Affected, I-Affected, O, O, O, O, O, O, O, ...",Predicted,Consequence,Facilitate,0,1679,4
4760,"[VI, .]","[O, O]",Ground Truth,,,0,1680,4
4762,"[VII, ., Taxatorische, Vorerhebungen, und, Ber...","[O, O, B-Effect, I-Effect, I-Effect, I-Effect,...",Predicted,Purpose,Facilitate,0,1688,4


# Evaluate Coref

## Arguments

In [None]:
for metric_kind in ["strict", "relaxed"]:
    combined_results = []

    # experiment
    for t, n in [[time, name], [coref_time, coref_name]]:

        results_path = PROJECT_PATH + f"output/{t}_{n}/"  + f"{corpus_name}_predictions_{t}_rehbein_dunietz.json"

        with open(results_path, "r") as f:
            results = json.load(f)

        metric_results = {}
        for model_kind in ['dbmdz/bert-base-german-cased']:

            metric_results_model = {}
            trigger_mean = np.mean([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
            trigger_std = np.std([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
            metric_results_model["\textbf{" + "Trigger" + "}"] = f"{trigger_mean:.1f} ({trigger_std:.0f})"

            for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]:
                c_arg_mean = np.mean([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                c_arg_std = np.std([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                
                if c_arg == "Controlling": c_arg = "Contr."
                if c_arg == "Affected": c_arg = "Aff."
                if c_arg == "Support": c_arg = "Sup."
                metric_results_model["\textbf{" + c_arg + "}"] = f"{c_arg_mean:.1f} ({c_arg_std:.0f})"

            metric_results[model_names_dict[model_kind]] = metric_results_model

        combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results)

    index_df = pd.DataFrame({
        " ": ["\multirow{3}{*}{\textbf{Original}}"] * 1 + ["RULE\multirow{3}{*}{\textbf{Coreference}}"] * 1,
        "  ": combined_results.index,
    })

    combined_results.index = pd.MultiIndex.from_frame(index_df)

    # make largest result bold
    combined_results = combined_results.apply(bold_largest, axis=0)

    combined_results_sub = combined_results[["\textbf{" + c_arg + "}" for c_arg in ["Trigger", "Cause", "Effect", "Actor", "Aff.", "Sup.", "Contr."]]]
    display(combined_results_sub)

    # convert to latex and change some things
    latex_df = combined_results_sub.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
    latex_df = latex_df.replace("RULE", "\hline\n")
    latex_df = latex_df.replace("lllllllll", "clccccccc")
    latex_df = latex_df.replace(".0", "")
    latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
    latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
    latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{Trigger\}", r"\\textbf{\\makecell{Pre-processing}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{Trigger}", latex_df, flags=re.MULTILINE)
    print(metric_kind)
    print(latex_df)

Unnamed: 0,Unnamed: 1,\textbf{Trigger},\textbf{Cause},\textbf{Effect},\textbf{Actor},\textbf{Aff.},\textbf{Sup.},\textbf{Contr.}
,,,,,,,,
\multirow{3}{*}{\textbf{Original}},\textbf{Ger.},49.2 (2),\textbf{28.7 (2)},\textbf{31.0 (5)},33.0 (13),20.5 (6),0.0 (0),5.8 (5)
RULE\multirow{3}{*}{\textbf{Coreference}},\textbf{Ger.},\textbf{49.7 (2)},28.1 (5),29.7 (4),\textbf{33.3 (11)},\textbf{21.8 (3)},0.0 (0),\textbf{10.1 (7)}


strict
\begin{tabular}{clccccccc}
\toprule
\textbf{\makecell{Pre-processing}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Trigger} &     \textbf{Cause} &    \textbf{Effect} &      \textbf{Actor} &      \textbf{Aff.} & \textbf{Sup.} &    \textbf{Contr.} \\

\midrule
\multirow{3}{*}{\textbf{Original}} & \textbf{Ger.} &           49.2 (2) &  \textbf{28.7 (2)} &  \textbf{31 (5)} &           33 (13) &           20.5 (6) &       0 (0) &            5.8 (5) \\
\hline
\multirow{3}{*}{\textbf{Coreference}} & \textbf{Ger.} &  \textbf{49.7 (2)} &           28.1 (5) &           29.7 (4) &  \textbf{33.3 (11)} &  \textbf{21.8 (3)} &       0 (0) &  \textbf{10.1 (7)} \\
\bottomrule
\end{tabular}



Unnamed: 0,Unnamed: 1,\textbf{Trigger},\textbf{Cause},\textbf{Effect},\textbf{Actor},\textbf{Aff.},\textbf{Sup.},\textbf{Contr.}
,,,,,,,,
\multirow{3}{*}{\textbf{Original}},\textbf{Ger.},49.2 (2),\textbf{52.7 (2)},\textbf{59.4 (3)},36.0 (13),\textbf{28.7 (8)},0.0 (0),18.7 (3)
RULE\multirow{3}{*}{\textbf{Coreference}},\textbf{Ger.},\textbf{49.7 (2)},52.5 (2),58.8 (3),\textbf{38.7 (12)},27.3 (4),0.0 (0),\textbf{26.1 (8)}


relaxed
\begin{tabular}{clccccccc}
\toprule
\textbf{\makecell{Pre-processing}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Trigger} &     \textbf{Cause} &    \textbf{Effect} &      \textbf{Actor} &      \textbf{Aff.} & \textbf{Sup.} &    \textbf{Contr.} \\

\midrule
\multirow{3}{*}{\textbf{Original}} & \textbf{Ger.} &           49.2 (2) &  \textbf{52.7 (2)} &  \textbf{59.4 (3)} &           36 (13) &  \textbf{28.7 (8)} &       0 (0) &           18.7 (3) \\
\hline
\multirow{3}{*}{\textbf{Coreference}} & \textbf{Ger.} &  \textbf{49.7 (2)} &           52.5 (2) &           58.8 (3) &  \textbf{38.7 (12)} &           27.3 (4) &       0 (0) &  \textbf{26.1 (8)} \\
\bottomrule
\end{tabular}



### Visualization

In [None]:
combined_results_all = []
for metric_kind in ["strict", "relaxed"]:
    combined_results = []

    # experiment
    for t, n in [[time, name], [coref_time, coref_name]]:

        transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
        for tl_name in transfer_learning_names:
            results_path = PROJECT_PATH + f"output/{t}_{n}/"  + f"{corpus_name}_predictions_{t}_{tl_name}.json"

            with open(results_path, "r") as f:
                results = json.load(f)

            metric_results = {}
            for model_kind in ['dbmdz/bert-base-german-cased']:

                metric_results_model = {}
                trigger_mean = np.mean([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
                trigger_std = np.std([results[model_kind]["oof_results_all"][fold]['detect_trigger_results'][f"overall_f1_{metric_kind}"] for fold in range(5)]) * 100
                metric_results_model["Trigger"] = trigger_mean

                for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]:
                    c_arg_mean = np.mean([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                    c_arg_std = np.std([results[model_kind]["oof_results_all"][fold]["detect_args_results"][f"{c_arg}_{metric_kind}_f1"] for fold in range(5)]) * 100
                    
                    metric_results_model[c_arg] = c_arg_mean

                metric_results[tl_name] = metric_results_model

            combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results, axis=0)

    index_df = pd.DataFrame({
        "Preprocessing": ["Original"]*4 + ["Coreference"]*4,
        "Transfer": ["\u2014", "BEC", "R&R", "R&R, BEC"]*2,
    })

    combined_results.index = pd.MultiIndex.from_frame(index_df)

    combined_results_sub = combined_results[[c_arg for c_arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]]]
    combined_results_sub["kind"] = metric_kind.title()

    combined_results_all.append(combined_results_sub)

combined_results_all = pd.concat(combined_results_all, axis=0).reset_index()
combined_results_all["Macro F1"] = combined_results_all[["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"]].mean(axis=1)
combined_results_all

Unnamed: 0,Preprocessing,Transfer,Cause,Effect,Actor,Affected,Support,Controlling,kind,Macro F1
0,Original,—,25.555826,24.743564,32.740436,19.964839,0.0,8.039497,Strict,18.50736
1,Original,BEC,27.306735,25.81467,29.592922,20.220313,0.0,9.006326,Strict,18.656828
2,Original,R&R,26.389672,29.764783,31.306295,20.575285,0.0,8.742946,Strict,19.463163
3,Original,"R&R, BEC",28.687005,31.025641,33.039614,20.46904,0.0,5.810486,Strict,19.838631
4,Coreference,—,24.643189,24.22743,26.004368,16.529214,0.0,4.311111,Strict,15.952552
5,Coreference,BEC,27.864601,27.366088,27.781494,20.346927,0.0,7.565359,Strict,18.487411
6,Coreference,R&R,28.642831,28.103135,31.493268,18.868041,0.0,7.256778,Strict,19.060675
7,Coreference,"R&R, BEC",28.093781,29.684476,33.300144,21.823187,0.0,10.112379,Strict,20.502328
8,Original,—,49.550221,52.690052,44.521596,23.482625,0.0,24.811714,Relaxed,32.509368
9,Original,BEC,51.52557,54.763312,42.583226,27.203276,0.0,29.472395,Relaxed,34.257963


In [None]:
import plotly.express as px

fig = go.Figure()

strict_f1 = combined_results_all.query("kind == 'Strict'")[["Preprocessing", "Transfer", "Macro F1"]]
relaxed_f1 = combined_results_all.query("kind == 'Relaxed'")[["Preprocessing", "Transfer", "Macro F1"]]

import plotly
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

for i, pre in enumerate(strict_f1["Preprocessing"].unique()):
    fig.add_trace(go.Scatter(
        x=strict_f1.query("Preprocessing == @pre")["Transfer"],
        y=strict_f1.query("Preprocessing == @pre")["Macro F1"],
        line=dict(width=2, color=cols[i]),
        marker=dict(
            color=cols[i],
            size=10,
        ),        
        name=pre + " - Strict " ,
    ))

    fig.add_trace(go.Scatter(
        x=relaxed_f1.query("Preprocessing == @pre")["Transfer"],
        y=relaxed_f1.query("Preprocessing == @pre")["Macro F1"],
        line=dict(width=2, color=cols[i], dash='dot'),
        marker=dict(
            color=cols[i],
            size=10,
        ),
        name=pre + " - Relaxed " ,
    ))

fig.update_layout(
    autosize=False,
    width=1200,
    height=600,
    xaxis_title="Transfer Learning Data",
    yaxis_title="Macro F1 Score",
    # yaxis2_title="Macro F1 Score",
    legend_title="Models",
    font=dict(
        size=18,
    ),
    template="plotly_white",
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#292828')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#292828')

fig.show()

fig.write_image("arguments_coref_comp.png")

## Matched Relations

In [None]:
metric_kind_results = []
for metric_kind in ["strict", "relaxed"]:

    combined_results = []

    # experiment
    for t, n in [[time, name], [coref_time, coref_name]]:

        results_path = PROJECT_PATH + f"output/{t}_{n}/"  + f"{corpus_name}_predictions_{t}_rehbein_dunietz.json"

        with open(results_path, "r") as f:
            results = json.load(f)

        metric_results = {}
        for model_kind in ['dbmdz/bert-base-german-cased']:

            metric_results_model = {}
            for match_metric in ["precision", "recall", "f1"]:
                match_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                match_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                metric_results_model["\textbf{" +metrics_dict[match_metric] + "}"] = f"{match_metric_mean:.1f} ({match_metric_std:.0f})"

            metric_results[model_names_dict[model_kind]] = metric_results_model

        combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results)

    index_df = pd.DataFrame({
        " ": ["\multirow{3}{*}{\textbf{Original}}"] * 1 + ["RULE\multirow{3}{*}{\textbf{Corefernce}}"] * 1,
        "  ": combined_results.index,
    })


    combined_results.index = pd.MultiIndex.from_frame(index_df)
    # sort in order
    combined_results = combined_results[["\textbf{" + c_arg + "}" for c_arg in ["Precision", "Recall", "F1"]]]
    metric_kind_results.append(combined_results)

metric_kind_results = pd.concat(metric_kind_results, axis=1)
metric_kind_results = metric_kind_results.T

index_df = pd.DataFrame({
    "  ": ["\textbf{Strict}"] * 3 + ["\textbf{Relaxed}"] * 3,
    " ": metric_kind_results.index,
})

metric_kind_results.index = pd.MultiIndex.from_frame(index_df)
metric_kind_results = metric_kind_results.T

# make largest result bold
metric_kind_results = metric_kind_results.apply(bold_largest, axis=0)
display(metric_kind_results)

# convert to latex and change some things
latex_df = metric_kind_results.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
latex_df = latex_df.replace("RULE", "\hline\n")
latex_df = latex_df.replace("llllllll", "clcccccc")
latex_df = latex_df.replace(".0", "")
latex_df = latex_df.replace("\multicolumn{3}{l}", "\multicolumn{3}{c}")
latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"Baseline[\s\&]+Baseline", r"\\multicolumn{2}{c}{\\textbf{Baseline}}", latex_df, flags=re.MULTILINE)
latex_df = re.sub(r"^\s*\&\s*&\s*\\textbf\{Precision\}", r"\\textbf{\\makecell{Pre-processing}} & \\textbf{\\makecell{BERT \\\\ Model}} & \\textbf{Precision}", latex_df, flags=re.MULTILINE)
print(latex_df)


Unnamed: 0_level_0,Unnamed: 1_level_0,\textbf{Strict},\textbf{Strict},\textbf{Strict},\textbf{Relaxed},\textbf{Relaxed},\textbf{Relaxed}
Unnamed: 0_level_1,Unnamed: 1_level_1,\textbf{Precision},\textbf{Recall},\textbf{F1},\textbf{Precision},\textbf{Recall},\textbf{F1}
,,,,,,,
\multirow{3}{*}{\textbf{Original}},\textbf{Ger.},\textbf{8.8 (2)},\textbf{12.7 (3)},\textbf{10.4 (2)},\textbf{54.3 (1)},79.7 (3),64.5 (0)
RULE\multirow{3}{*}{\textbf{Corefernce}},\textbf{Ger.},7.1 (2),10.6 (2),8.5 (2),\textbf{54.3 (3)},\textbf{81.3 (2)},\textbf{65.0 (2)}


\begin{tabular}{clcccccc}
\toprule
                                         &    & \multicolumn{3}{c}{\textbf{Strict}} & \multicolumn{3}{c}{\textbf{Relaxed}} \\
\textbf{\makecell{Pre-processing}} & \textbf{\makecell{BERT \\ Model}} & \textbf{Precision} &    \textbf{Recall} &        \textbf{F1} & \textbf{Precision} &    \textbf{Recall} &        \textbf{F1} \\

\midrule
\multirow{3}{*}{\textbf{Original}} & \textbf{Ger.} &   \textbf{8.8 (2)} &  \textbf{12.7 (3)} &  \textbf{10.4 (2)} &  \textbf{54.3 (1)} &           79.7 (3) &           64.5 (0) \\
\hline
\multirow{3}{*}{\textbf{Corefernce}} & \textbf{Ger.} &            7.1 (2) &           10.6 (2) &            8.5 (2) &  \textbf{54.3 (3)} &  \textbf{81.3 (2)} &  \textbf{65 (2)} \\
\bottomrule
\end{tabular}



### Visualization

In [None]:
combined_results_all = []
for metric_kind in ["strict", "relaxed"]:
    combined_results = []

    # experiment
    for t, n in [[time, name], [coref_time, coref_name]]:

        # experiment
        transfer_learning_names = ["no_transfer", "dunietz", "rehbein", "rehbein_dunietz"]
        for tl_name in transfer_learning_names:

            results_path = PROJECT_PATH + f"output/{t}_{n}/"  + f"{corpus_name}_predictions_{t}_{tl_name}.json"

            with open(results_path, "r") as f:
                results = json.load(f)

            metric_results = {}
            model_kind = 'dbmdz/bert-base-german-cased'

            metric_results_model = {}
            for match_metric in ["precision", "recall", "f1"]:
                match_metric_mean = np.mean([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                match_metric_std = np.std([results[model_kind]["oof_results_all"][fold]["match_relations_results"][metric_kind][match_metric] for fold in range(5)]) * 100
                metric_results_model[metrics_dict[match_metric]] = match_metric_mean
                metric_results_model["Preprocessing"] = n
                metric_results_model["Transfer"] = transfer_learning_names_dict_base[tl_name]
                metric_results_model["kind"] = metric_kind.title()

            metric_results[model_kind] = metric_results_model

            combined_results.append(pd.DataFrame(metric_results).T)

    combined_results = pd.concat(combined_results)
    combined_results_all.append(combined_results)

combined_results_all = pd.concat(combined_results_all, axis=0).reset_index()

combined_results_all

Unnamed: 0,index,F1,Precision,Preprocessing,Recall,Transfer,kind
0,dbmdz/bert-base-german-cased,7.154313,6.369652,normal_valid,8.189504,—,Strict
1,dbmdz/bert-base-german-cased,8.67251,7.657585,normal_valid,10.03458,BEC,Strict
2,dbmdz/bert-base-german-cased,9.004887,7.421696,normal_valid,11.490246,R&R,Strict
3,dbmdz/bert-base-german-cased,10.365927,8.753766,normal_valid,12.733431,"R&R, BEC",Strict
4,dbmdz/bert-base-german-cased,7.564494,6.700181,coref,8.724825,—,Strict
5,dbmdz/bert-base-german-cased,8.698892,7.622586,coref,10.152759,BEC,Strict
6,dbmdz/bert-base-german-cased,8.213675,6.709829,coref,10.609322,R&R,Strict
7,dbmdz/bert-base-german-cased,8.454396,7.058418,coref,10.586515,"R&R, BEC",Strict
8,dbmdz/bert-base-german-cased,68.037232,60.529212,normal_valid,77.981956,—,Relaxed
9,dbmdz/bert-base-german-cased,66.967,59.051532,normal_valid,77.580118,BEC,Relaxed


In [None]:
fig = make_subplots(
    rows=2, cols=1,
    # subplot_titles=("Strict", "Relaxed"),
    subplot_titles=(" ", " "),
    shared_xaxes=True,
    vertical_spacing=0.05
)

strict_f1 = combined_results_all.query("kind == 'Strict'")[["Preprocessing", "Transfer", "F1"]]
relaxed_f1 = combined_results_all.query("kind == 'Relaxed'")[["Preprocessing", "Transfer", "F1"]]

fig = go.Figure()

preprocess_dict = {
    "normal_valid": "Original",
    "coref": "Coreference",
}
for i, pre in enumerate(strict_f1["Preprocessing"].unique()):
    fig.add_trace(go.Scatter(
        x=strict_f1.query("Preprocessing == @pre")["Transfer"],
        y=strict_f1.query("Preprocessing == @pre")["F1"],
        line=dict(width=2, color=cols[i]),
        marker=dict(color=cols[i]),
        name=preprocess_dict[pre] + "<br>Strict "
    ))

fig.update_layout(
    autosize=False,
    width=750,
    height=450,
    xaxis_title="Transfer Learning Data",
    yaxis_title="F1 Score",
    # yaxis2_title="F1 Score",
    legend_title="Model",
    font=dict(
        size=18,
    ),
    template="plotly_white"
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

fig.show()

fig.write_image("matched_rel_coref_comb_strict.png")

In [None]:
fig = make_subplots(
    rows=2, cols=1,
    # subplot_titles=("Strict", "Relaxed"),
    subplot_titles=(" ", " "),
    shared_xaxes=True,
    vertical_spacing=0.05
)

relaxed_f1 = combined_results_all.query("kind == 'Relaxed'")[["Preprocessing", "Transfer", "F1"]]

fig = go.Figure()

preprocess_dict = {
    "normal_valid": "Original",
    "coref": "Coreference",
}
for i, pre in enumerate(relaxed_f1["Preprocessing"].unique()):
    fig.add_trace(go.Scatter(
        x=relaxed_f1.query("Preprocessing == @pre")["Transfer"],
        y=relaxed_f1.query("Preprocessing == @pre")["F1"],
        line=dict(width=2, color=cols[i], dash="dot"),
        marker=dict(color=cols[i]),
        name=preprocess_dict[pre] + "<br>Relaxed "
    ))

fig.update_layout(
    autosize=False,
    width=750,
    height=450,
    xaxis_title="Transfer Learning Data",
    yaxis_title="F1 Score",
    # yaxis2_title="F1 Score",
    legend_title="Model",
    font=dict(
        size=18,
    ),
    template="plotly_white"
)

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#595959')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#595959')

fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))

fig.show()

fig.write_image("matched_rel_coref_comb_relaxed.png")

## How well are coreferences predicted?

In [None]:
# normal results
results_path = PROJECT_PATH + f"output/{time}_{name}/"  + f"{corpus_name}_predictions_{time}_rehbein_dunietz.json"

with open(results_path, "r") as f:
    results= json.load(f)

oof_results = pd.DataFrame(results["dbmdz/bert-base-german-cased"]["oof_predictions"]).query("kind != 'Predicted without label'")
oof_results.head()


# coref results
results_path_coref = PROJECT_PATH + f"output/{coref_time}_{coref_name}/"  + f"{corpus_name}_predictions_{coref_time}_rehbein_dunietz.json"

with open(results_path_coref, "r") as f:
    results_coref = json.load(f)

oof_results_coref = pd.DataFrame(results_coref["dbmdz/bert-base-german-cased"]["oof_predictions"]).query("kind != 'Predicted without label'")
oof_results_coref.head()

Unnamed: 0,tokens,labels,kind,type,degree,relation_id,id,fold
0,"[[, Datei, :, /media, /, bernhard, /, 40FE-DE2...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Ground Truth,,,0,0,0
1,"[[, Datei, :, /media, /, bernhard, /, 40FE-DE2...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Predicted,,,0,0,0
2,"[Während, der, beherzte, Weidmann, im, Osten, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Predicted,Motivation,Facilitate,0,10,0
3,"[Während, der, beherzte, Weidmann, im, Osten, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",Ground Truth,Motivation,Facilitate,0,10,0
4,"[In, diesem, Wechsel, der, Verhältnisse, sind,...","[B-Cause, I-Cause, I-Cause, I-Cause, I-Cause, ...",Predicted,Consequence,Facilitate,0,11,0


In [None]:
correct_labels = []
pred_labels = []
random_labels = []

for id in tqdm(oof_results["id"].unique()[:]):
    orig_samples = oof_results.query("id == @id")
    coref_samples = oof_results_coref.query("id == @id")

    for rel_id in orig_samples["relation_id"].unique():
        orig_labels_gt = np.array(orig_samples.query("kind == 'Ground Truth' and relation_id == @rel_id").iloc[0]["labels"])
        coref_labels_gt = np.array(coref_samples.query("kind == 'Ground Truth' and relation_id == @rel_id").iloc[0]["labels"])

        # orig_labels_pred = np.array(orig_samples.query("kind == 'Predicted' and relation_id == @rel_id").iloc[0])
        coref_labels_pred = np.array(coref_samples.query("kind == 'Predicted' and relation_id == @rel_id").iloc[0]["labels"])

        if any(orig_labels_gt != coref_labels_gt): 

            coref_pos = np.where(orig_labels_gt != coref_labels_gt)[0]
            correct_labels.append(coref_labels_gt[coref_pos])
            pred_labels.append(coref_labels_pred[coref_pos])

            random_pos = np.random.choice(config["causal_arguments"] + ["O"], len(coref_pos))
            random_labels.append(random_pos)

100%|██████████| 1698/1698 [00:26<00:00, 63.95it/s]


In [None]:
# Pred - True
pred_labels_no_prefix = np.array([label[2:] if label!="O" else "O" for label in flatten(pred_labels)])
true_labels_no_prefix = np.array([label[2:] if label!="O" else "O" for label in flatten(correct_labels)])
# tags_no_other = sorted(set(pred_labels_no_prefix + true_labels_no_prefix) - set(["O"]))

# preserve right order
# tags_no_other = [arg for arg in ["Cause", "Effect", "Actor", "Affected", "Support", "Controlling"] if arg in tags_no_other]

# report = classification_report_sk(true_labels_no_prefix, pred_labels_no_prefix, zero_division=False, labels=tags_no_other, output_dict=True)
# report_df = pd.DataFrame(report)


# # Random - True
# pred_labels_no_prefix = flatten(random_labels)
# report_rand = classification_report_sk(true_labels_no_prefix, pred_labels_no_prefix, zero_division=False, labels=tags_no_other, output_dict=True)

# report_rand_df = pd.DataFrame(report_rand)


In [None]:
from sklearn.metrics import matthews_corrcoef

results = {}
for arg in np.unique(np.concatenate([true_labels_no_prefix])):

    mcc = matthews_corrcoef(true_labels_no_prefix==arg, pred_labels_no_prefix==arg)
    results["\textbf{" + arg + "}"] = np.round(mcc*100, 2)

results["\textbf{Overall}"] = np.round(matthews_corrcoef(true_labels_no_prefix, pred_labels_no_prefix) * 100, 2)

print(pd.DataFrame(results, index=["\textbf{Coreference MCC}"]).to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}"))

\begin{tabular}{lrrrrr}
\toprule
{} &  \textbf{Actor} &  \textbf{Affected} &  \textbf{Cause} &  \textbf{Effect} &  \textbf{Overall} \\
\midrule
\textbf{Coreference MCC} &            -2.6 &               39.9 &           -3.93 &             15.4 &              9.03 \\
\bottomrule
\end{tabular}



In [None]:
confusion_matrix(true_labels_no_prefix, pred_labels_no_prefix)

array([[ 0,  0,  0,  0, 25],
       [ 0, 10,  4,  0, 36],
       [ 1,  0,  6,  6, 42],
       [ 0,  0, 17, 10, 49],
       [ 0,  0,  0,  0,  0]])

In [None]:
matthews_corrcoef(true_labels_no_prefix, pred_labels_no_prefix)

0.0902764898399391

In [None]:
pred_labels_no_prefix

array(['Affected', 'Affected', 'Affected', 'Affected', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'Effect', 'Effect', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'Cause', 'Cause', 'Cause', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'Effect', 'Effect', 'Effect', 'Effect', 'O', 'Affected',
       'Affected', 'O', 'O', 'Cause', 'Cause', 'Cause', 'Cause', 'Cause',
       'Cause', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'Effect',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'Cause', 'Cause', 'Cause', 'Cause',
       'Cause', 'Cause', 'Cause', 'Cause', 'Cause', 'Cause', 'Cause',
       'Cause', 'Cause', 'Cause', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'Affected', 'Affected', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'Cause', 'Cause', 'O', 'O', 'O', 'O', 'O', 'O',
       'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [None]:
true_labels_no_prefix

array(['Affected', 'Affected', 'Affected', 'Affected', 'Affected',
       'Affected', 'Affected', 'Affected', 'Affected', 'Affected',
       'Affected', 'Cause', 'Cause', 'Effect', 'Effect', 'Effect',
       'Effect', 'Effect', 'Effect', 'Effect', 'Effect', 'Effect',
       'Effect', 'Effect', 'Effect', 'Effect', 'Effect', 'Effect',
       'Effect', 'Actor', 'Actor', 'Actor', 'Actor', 'Actor', 'Actor',
       'Actor', 'Actor', 'Actor', 'Actor', 'Actor', 'Actor', 'Actor',
       'Actor', 'Cause', 'Cause', 'Cause', 'Cause', 'Affected',
       'Affected', 'Affected', 'Affected', 'Affected', 'Cause', 'Cause',
       'Cause', 'Cause', 'Cause', 'Cause', 'Affected', 'Affected',
       'Affected', 'Affected', 'Affected', 'Affected', 'Affected',
       'Affected', 'Affected', 'Effect', 'Effect', 'Effect', 'Effect',
       'Effect', 'Effect', 'Effect', 'Actor', 'Actor', 'Actor',
       'Affected', 'Effect', 'Cause', 'Cause', 'Cause', 'Cause', 'Cause',
       'Affected', 'Affected', 'Effect', 'Ef

In [None]:
matthews_corrcoef(["a", "a", "a", "b"], ["a", "a", "b", "a"])

-0.3333333333333333

In [None]:
y_true = ["a", "a", "a", "b"]
y_pred = ["a", "b", "a", "c"]
matthews_corrcoef(y_true, y_pred)

0.12909944487358055

In [None]:
res = defaultdict(list)

for i in tqdm(range(10000)):
    for arg in tags_no_other:
        score = f1_score(
            true_labels_no_prefix, np.random.choice(config["causal_arguments"] + ["O"], len(true_labels_no_prefix)), 
            zero_division=False, labels=[arg], average="micro"
        )

        res[arg].append(score)

for arg in res:
    print(arg, np.mean(res[arg]))

  0%|          | 0/10000 [00:00<?, ?it/s]


NameError: ignored

In [None]:
comb_df = pd.DataFrame({
    "Model": report_df[tags_no_other].loc["f1-score"] * 100,
    "Random": report_df[tags_no_other].loc["support"].astype(int).apply(lambda n_a: n_a / ((4*n_a) + report_df.loc["support", "micro avg"]/2)) * 100, #report_rand_df[tags_no_other].loc["f1-score"] * 100,
}).T.round(2).astype(str)

comb_df.columns = ["\textbf{" + arg + "}" for arg in comb_df.columns]
comb_df.index = ["\textbf{" + arg + "}" for arg in comb_df.index]

comb_df = comb_df.apply(bold_largest, axis=0)
# comb_df.loc["\textbf{Num Tokens}"] = report_df[tags_no_other].loc["support"].astype(int).values
display(comb_df)

latex_df = comb_df.to_latex().replace('textbackslash ', '').replace("\{", "{").replace("\}", "}")
latex_df = latex_df.replace("\\textbf{Random}", "\\hline\n\\textbf{Random}")
latex_df = latex_df.replace("lllll", "lcccc")
latex_df = latex_df.replace("\textbf{Random}", "\\hline\n\textbf{Random}")
latex_df = latex_df.replace(".0", "")
latex_df = re.sub(r"^[\\\s\&]+$", "", latex_df, flags=re.MULTILINE)
print(latex_df)
