In [None]:
# This notebook allows replicating the experiments described in the "Experiments" section of the thesis

In [None]:
import os
# Step one directory up from wherever the kernel started, presumably so that the relative data
# paths and project imports used below resolve (TODO confirm the expected start directory).
# The change is relative, so running this cell twice in one kernel moves up two levels.
os.chdir("..")

In [None]:
import contextlib
import io
import re

import nltk
import pandas as pd
import plotly.express as px
import torch

from experiments.enhancing_with_ner import test_enhancing_text_used_to_train_re
from experiments.hyperparameters import optuna_hp_space, optuna_hp_space_scientific
from experiments.hyperparameters import test_hyperparameters_impact
from experiments.model_size import test_ner_quality_depending_on_dataset_size, test_re_quality_depending_on_dataset_size
from experiments.sample_languages import run_experiments_linguistic, perform_four_variations_linguistic, train_and_evaluate_on_language_subsets
from named_entity.named_entity_model import NamedEntityModel
from relations.relations_model import RelationsModel
from utils.enhancement import enhance_with_nothing, enhance_with_entity, enhance_with_brackets, \
    enhance_with_entity_differentiated, enhance_with_special_characters, enhance_entities_only
from utils.evaluation import evaluate_with_division_between_column
from utils.optuna_reader import read_optuna_logs
from utils.overlap import create_full_matrix
from utils.overlap import remove_overlapping_entities
from utils.prediction import train_re_on_ner, predict_joint_models
from utils.preprocessing import filter_out_wrong_data
nltk.download('punkt')

In [None]:
# Raise the pandas display limits (rows shown, characters shown per column) so the large
# result tables displayed later in the notebook are not truncated.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_colwidth', 500)

In [None]:
import wandb

wandb.init(mode="disabled")

# Hyperparameter optimization

In [None]:
# Text files that receive the stdout of each hyperparameter search. The search cells below write
# them; the "Analysis" cells read them back through analyze_optuna_results / compare_two_studies.
NER_BROAD_FILE_PATH='results/hyperparameter_optimization_ner_broad.txt'
RE_BROAD_FILE_PATH='results/hyperparameter_optimization_re_broad.txt'
NER_SCIENTIFIC_FILE_PATH='results/hyperparameter_optimization_ner_scientific.txt'
RE_SCIENTIFIC_FILE_PATH='results/hyperparameter_optimization_re_scientific.txt'

In [None]:
# Data and models for the hyperparameter searches: read the merged train/test TSV files, keep a
# 2% random sample of the training rows (seed 42), pass both frames through filter_out_wrong_data,
# and create NER/RE models with their default constructor arguments.
# NOTE(review): these paths have no "data/" prefix, whereas the "Freezing hyperparameters" cell
# reads "data/merged_train.tsv" — confirm which location is correct.
torch.cuda.empty_cache()
train_df = pd.read_csv("merged_train.tsv", sep="\t")
test_df = pd.read_csv("merged_test.tsv", sep="\t")
train_df=train_df.sample(frac=0.02,random_state=42)
train_df = filter_out_wrong_data(train_df)
test_df = filter_out_wrong_data(test_df)
ner_model=NamedEntityModel()
re_model=RelationsModel()

## Broad search

In [None]:
%%capture captured
ner_model.perform_hyperparameter_search(space=optuna_hp_space,train_df=train_df, study_name="ner_hyperparameter_search_broad")
with open(NER_BROAD_FILE_PATH, 'w') as f:
    f.write(captured.stdout)

In [None]:
%%capture captured
re_model.perform_hyperparameter_search(space=optuna_hp_space,train_df=train_df, study_name="re_hyperparameter_search_broad")
with open(RE_BROAD_FILE_PATH, 'w') as f:
    f.write(captured.stdout)

## Scientific-based search

In [None]:
%%capture captured
ner_model.perform_hyperparameter_search(space=optuna_hp_space_scientific,train_df=train_df, study_name="ner_hyperparameter_search_scientific")
with open(NER_SCIENTIFIC_FILE_PATH, 'w') as f:
    f.write(captured.stdout)

In [None]:
%%capture captured
re_model.perform_hyperparameter_search(space=optuna_hp_space_scientific,train_df=train_df, study_name="re_hyperparameter_search_scientific")
with open(RE_SCIENTIFIC_FILE_PATH, 'w') as f:
    f.write(captured.stdout)

## Analysis

In [None]:
def plot_histogram(df, column, yaxis_range=None):
    """Show a histogram of the average "metric" value per bin of `column`.

    Args:
        df: frame containing `column` and a "metric" column.
        column: name of the column placed on the x axis.
        yaxis_range: optional [min, max] pair for the y axis; when falsy the
            axis range is left to plotly.
    """
    layout_overrides = {}
    if yaxis_range:
        layout_overrides["yaxis"] = {"range": yaxis_range}
    figure = px.histogram(df, x=column, y="metric", histfunc='avg', title=f"{column} impact on metric")
    if layout_overrides:
        figure.update_layout(**layout_overrides)
    figure.show()

def plot_scatter(df, column, yaxis_range=None):
    """Show a scatter plot of "metric" against `column`.

    Args:
        df: frame containing `column` and a "metric" column.
        column: name of the column placed on the x axis.
        yaxis_range: optional [min, max] pair for the y axis; when falsy the
            axis range is left to plotly.
    """
    layout_overrides = {}
    if yaxis_range:
        layout_overrides["yaxis"] = {"range": yaxis_range}
    figure = px.scatter(df, x=column, y="metric", title=f"{column} impact on metric")
    if layout_overrides:
        figure.update_layout(**layout_overrides)
    figure.show()


def analyze_optuna_results(
    file_path,
    all_histograms=False,
    exclude_columns=("metric", "trial_number", "trial_runtime"),
    calculate_correlation=True,
    yaxis_range=None
):
    """Plot and tabulate the trials stored in one hyperparameter-search log.

    Args:
        file_path: path of the log file, parsed with read_optuna_logs.
        all_histograms: when True every parameter is drawn as a histogram;
            otherwise only int64 columns are, and the rest are scatter plots.
        exclude_columns: column names that are not treated as parameters
            (neither plotted per parameter nor correlated with the metric).
        calculate_correlation: when True, display the Spearman correlation of
            every parameter with the "metric" column.
        yaxis_range: optional [min, max] pair applied to the y axis of every plot.
    """
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    print(f"Analyzing: {file_name}")
    df = read_optuna_logs(file_path)
    parameter_columns = [column for column in df.columns if column not in exclude_columns]
    for column in parameter_columns:
        if all_histograms or df[column].dtype == 'int64':
            plot_histogram(df, column, yaxis_range)
        else:
            plot_scatter(df, column, yaxis_range)
    # The metric over time is always drawn, even though trial_number is excluded by default.
    plot_scatter(df, "trial_number", yaxis_range)
    if calculate_correlation:
        correlation_data = [
            {
                'Parameter': column,
                'Spearman Correlation': round(df[column].corr(df['metric'], method='spearman'), 2),
            }
            for column in parameter_columns
        ]
        # Naming the columns explicitly keeps the sort below valid when no parameter is left.
        correlation_df = pd.DataFrame(correlation_data, columns=['Parameter', 'Spearman Correlation'])
        print("Spearman Correlations with Metric:")
        display(correlation_df.sort_values(by='Spearman Correlation', ascending=False))
    display(df.sort_values(by="metric", ascending=False))

def compare_two_studies(exploratory_file_path, literature_based_file_path, yaxis_range=None):
    """Plot the metric of two searches against the trial number in one scatter plot.

    Args:
        exploratory_file_path: log of the search labelled "exploratory".
        literature_based_file_path: log of the search labelled "literature-based".
        yaxis_range: optional [min, max] pair for the y axis.
    """
    labelled_studies = []
    for study_label, log_path in (
        ("exploratory", exploratory_file_path),
        ("literature-based", literature_based_file_path),
    ):
        study_df = read_optuna_logs(log_path)
        study_df['study'] = study_label
        labelled_studies.append(study_df)
    combined_df = pd.concat(labelled_studies, ignore_index=True)

    fig = px.scatter(combined_df, x='trial_number', y='metric', color='study', title="Trial number vs F1")
    # The legend is hidden although the points are coloured per study.
    layout_changes = dict(showlegend=False)
    if yaxis_range:
        layout_changes["yaxis"] = dict(range=yaxis_range)
    fig.update_layout(**layout_changes)
    # One tick per trial number from 0 to 49.
    fig.update_xaxes(tickvals=list(range(0,50)))
    fig.show()

In [None]:
analyze_optuna_results(file_path=NER_BROAD_FILE_PATH, yaxis_range=[0.75,0.85])

In [None]:
analyze_optuna_results(file_path=RE_BROAD_FILE_PATH, yaxis_range=[0.9,1])

In [None]:
analyze_optuna_results(file_path=NER_SCIENTIFIC_FILE_PATH, yaxis_range=[0.75,0.85], all_histograms=True)

In [None]:
analyze_optuna_results(file_path=RE_SCIENTIFIC_FILE_PATH, yaxis_range=[0.9,1], all_histograms=True)

In [None]:
compare_two_studies(NER_BROAD_FILE_PATH, NER_SCIENTIFIC_FILE_PATH, [0.75,0.85])

In [None]:
compare_two_studies(RE_BROAD_FILE_PATH, RE_SCIENTIFIC_FILE_PATH, [0.9,1])

## Freezing hyperparameters

In [None]:
# Data and models for the hyperparameter-freezing experiment: a 10% random sample of the training
# rows (seed 42), both frames passed through filter_out_wrong_data, and one model per task, each
# constructed with its own directory under models/.
torch.cuda.empty_cache()
train_df = pd.read_csv("data/merged_train.tsv", sep="\t")
test_df = pd.read_csv("data/merged_test.tsv", sep="\t")
train_df=train_df.sample(frac=0.1,random_state=42)
train_df = filter_out_wrong_data(train_df)
test_df = filter_out_wrong_data(test_df)
ner_model=NamedEntityModel('models/freezing_hyperparameters')
re_model=RelationsModel('models/freezing_re_hyperparameters')

In [None]:
ner_results_freezing=test_hyperparameters_impact(ner_model,train_df,test_df)

In [None]:
ner_results_freezing

In [None]:
# Hyperparameter-impact run for the relation-extraction model. This must receive re_model:
# passing ner_model would only repeat the NER run under the RE result name.
re_results_freezing=test_hyperparameters_impact(re_model,train_df,test_df)

In [None]:
re_results_freezing

# Dataset size impact

In [None]:
SIZES=[
    100,500,1000,2000,5000,10000,20000,50000,100000,200000,
    # 300000,400000,500000
]

In [None]:
# Results of the dataset-size experiments, keyed by dataset name. Only the NER run is active in
# the loop below, so total_results_re and total_results_re_no_tags are created but stay empty
# unless the commented-out RE runs are re-enabled.
total_results_ner={}
total_results_re={}
total_results_re_no_tags={}
for dataset in [
    'merged',
    # 'en-full',
    # 'pl'
]:
    print(f"DATASET: {dataset}")
    # NOTE(review): this reads data/<dataset>_corpora_{train,test}.tsv, while other cells read
    # merged_train.tsv / data/merged_train.tsv — confirm these are the intended files.
    train_df = pd.read_csv(f"data/{dataset}_corpora_train.tsv", sep="\t")
    test_df = pd.read_csv(f"data/{dataset}_corpora_test.tsv", sep="\t")
    train_df = filter_out_wrong_data(train_df)
    test_df = filter_out_wrong_data(test_df)
    ner_model=NamedEntityModel()
    # re_model=RelationsModel()
    # NER quality for every training-set size listed in SIZES (sampling seed 42).
    ner_results=test_ner_quality_depending_on_dataset_size(model=ner_model, train_df=train_df, test_df=test_df, sizes=SIZES, random_state=42)
    total_results_ner[dataset]=ner_results
    # Disabled: RE quality per size with entity tags kept in the text.
    # re_results=test_re_quality_depending_on_dataset_size(model=re_model, train_df=train_df, test_df=test_df, sizes=SIZES, random_state=42, remove_tags=False)
    # total_results_re[dataset]=re_results
    # Disabled: RE quality per size with entity tags removed and no enhancement.
    # re_no_tags_results=test_re_quality_depending_on_dataset_size(model=re_model, train_df=train_df, test_df=test_df, sizes=SIZES, random_state=42, enhancement_func=enhance_with_nothing,
    #                                                              remove_tags=True)
    # total_results_re_no_tags[dataset]=re_no_tags_results

In [None]:
total_results_ner

In [None]:
# NOTE(review): total_results_ner_2 is not assigned in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel — confirm whether it should be defined or removed.
total_results_ner_2

In [None]:
total_results_re

In [None]:
total_results_re_no_tags

# Training base NER

In [None]:
# Train a NER model on a 50% random sample of the training rows (seed 42), constructed with the
# 'models/default_50_ner' directory, then evaluate it on the filtered test set.
# NOTE(review): later cells construct NamedEntityModel with 'models/base_ner', which is a
# different path from the one used here — confirm where that model is produced.
torch.cuda.empty_cache()
train_df = pd.read_csv("data/merged_train.tsv", sep="\t")
test_df = pd.read_csv("data/merged_test.tsv", sep="\t")
train_df=train_df.sample(frac=0.5,random_state=42)
train_df = filter_out_wrong_data(train_df)
test_df = filter_out_wrong_data(test_df)
ner_model=NamedEntityModel('models/default_50_ner')
ner_model.train(train_df=train_df)
ner_model.evaluate(df=test_df)

# Joining The Models Together

In [None]:
# Setup for the joint NER + RE experiment: all training rows, shuffled with seed 42 (frac=1),
# a NER model constructed with 'models/base_ner' and an RE model with default arguments.
# NOTE(review): these paths have no "data/" prefix, whereas the "Training base NER" cell reads
# "data/merged_train.tsv" — confirm which location is correct.
torch.cuda.empty_cache()
train_df = pd.read_csv("merged_train.tsv", sep="\t")
test_df = pd.read_csv("merged_test.tsv", sep="\t")
train_df=train_df.sample(frac=1,random_state=42)
train_df = filter_out_wrong_data(train_df)
test_df = filter_out_wrong_data(test_df)
ner_model=NamedEntityModel('models/base_ner')
re_model=RelationsModel()

In [None]:
# read parameter needs to be set to False if predicting for the first time (and lacking NER prediction results for a given dataset subset)
test_enhancing_text_used_to_train_re(train_df, test_df, ner_model, re_model, results_file='results_base_ner.pkl', read=True, train_ner=False)

# Model Variant Comparison

In [None]:
# Shared data for the model-variant comparison: all training rows, shuffled with seed 42.
# NOTE(review): these paths have no "data/" prefix, whereas the "Training base NER" cell reads
# "data/merged_train.tsv" — confirm which location is correct.
torch.cuda.empty_cache()
train_df = pd.read_csv("merged_train.tsv", sep="\t")
test_df = pd.read_csv("merged_test.tsv", sep="\t")
train_df=train_df.sample(frac=1,random_state=42)
train_df = filter_out_wrong_data(train_df)
test_df = filter_out_wrong_data(test_df)

## DistilBERT

In [None]:
# DistilBERT variant: NER and RE models built on 'distilbert-base-multilingual-cased'.
# NOTE(review): the RE path is spelled 'distillbert_re' (double "l") while the NER path is
# 'distilbert_ner' — confirm the spelling matches the directory on disk before changing it.
ner_model=NamedEntityModel(model_type='distilbert-base-multilingual-cased',model_path='models/distilbert_ner')
re_model=RelationsModel(model_type='distilbert-base-multilingual-cased',model_path='models/distillbert_re')
# Disabled: training and evaluating the DistilBERT NER model.
# ner_model.train(train_df=train_df)
# distilbert_results_ner=ner_model.evaluate(df=test_df)
distilbert_results_re=train_re_on_ner(ner_model=ner_model, re_model=re_model, train_df=train_df, test_df=test_df, enhancement_func=enhance_with_special_characters, results_file='results_distilbert_ner.pkl', read=True)
# After the first call, the prediction results are surely saved, so we can set read to True

In [None]:
distilbert_results_re_enhance_with_nothing=train_re_on_ner(ner_model=ner_model, re_model=re_model, train_df=train_df, test_df=test_df, enhancement_func=enhance_with_nothing, results_file='results_distilbert_ner.pkl', read=True)

## XLMRoBERTa

In [None]:
ner_model=NamedEntityModel(model_type='xlm-roberta-base',model_path='models/xlmroberta_ner')
re_model=RelationsModel(model_type='xlm-roberta-base',model_path='models/xlmroberta_re')
# ner_model.train(train_df=train_df)
# xlmroberta_results_ner=ner_model.evaluate(df=test_df)
xlmroberta_results_re=train_re_on_ner(ner_model=ner_model, re_model=re_model, train_df=train_df, test_df=test_df, enhancement_func=enhance_with_special_characters, results_file='results_xlm_roberta.pkl', read=True)
# After the first call, the prediction results are surely saved, so we can set read to True

In [None]:
xlmroberta_results_re

In [None]:
ner_model=NamedEntityModel(model_type='xlm-roberta-base',model_path='models/xlmroberta_ner')
re_model=RelationsModel(model_type='xlm-roberta-base',model_path='models/xlmroberta_re')
# ner_model.train(train_df=train_df)
# xlmroberta_results_ner=ner_model.evaluate(df=test_df)
xlmroberta_results_re_no_tags=train_re_on_ner(ner_model=ner_model, re_model=re_model, train_df=train_df, test_df=test_df, enhancement_func=enhance_with_nothing, results_file='results_xlm_roberta.pkl', read=True)
# After the first call, the prediction results are surely saved, so we can set read to True

In [None]:
xlmroberta_results_re_no_tags

# Prediction Error Analysis

In [None]:
# Setup for the prediction error analysis: all training rows (shuffled, seed 42) and models
# constructed with the 'models/base_ner' and 'models/re_entity_with_special_characters' paths.
# NOTE(review): these paths have no "data/" prefix, whereas the "Training base NER" cell reads
# "data/merged_train.tsv" — confirm which location is correct.
torch.cuda.empty_cache()
train_df = pd.read_csv("merged_train.tsv", sep="\t")
test_df = pd.read_csv("merged_test.tsv", sep="\t")
train_df=train_df.sample(frac=1,random_state=42)
train_df = filter_out_wrong_data(train_df)
test_df = filter_out_wrong_data(test_df)
ner_model=NamedEntityModel(model_path='models/base_ner')
re_model=RelationsModel(model_path='models/re_entity_with_special_characters')

## Model predictions detailed dataframe generator

In [None]:
prediction_results=predict_joint_models(test_df, ner_model, re_model, enhance_function=enhance_with_special_characters)
prediction_results

In [None]:
def normalize_text(text):
    """Return `text` reduced to its alphanumeric characters, each lower-cased.

    Whitespace, punctuation and every other non-alphanumeric character is
    dropped, so two strings that differ only in those compare equal.
    """
    alphanumeric_chars = filter(str.isalnum, text)
    # Lower-casing is applied per character, not to the joined string.
    return ''.join(map(str.lower, alphanumeric_chars)).strip()

In [None]:
# Add a normalised copy of every gold and predicted entity column.
for entity_column in ('entity_1', 'predicted_entity_1', 'entity_2', 'predicted_entity_2'):
    prediction_results[f'{entity_column}_norm'] = prediction_results[entity_column].apply(normalize_text)

# A row is a NER error when either normalised entity differs from its prediction.
entity_1_differs = prediction_results['entity_1_norm'] != prediction_results['predicted_entity_1_norm']
entity_2_differs = prediction_results['entity_2_norm'] != prediction_results['predicted_entity_2_norm']
wrong_ner_results = prediction_results[entity_1_differs | entity_2_differs]

# A row is an RE error when the predicted label differs from the gold label.
label_differs = prediction_results['label'] != prediction_results['predicted_label']
wrong_re_results = prediction_results[label_differs]

# Report both error counts, then store the rows for manual inspection.
for wrong_rows in (wrong_ner_results, wrong_re_results):
    print(len(wrong_rows))
wrong_ner_results.to_csv('results/wrong_ner_results.csv', index=False)
wrong_re_results.to_csv('results/wrong_re_results.csv', index=False)

In [None]:
len(test_df)

# Entity1 vs Entity2

In [None]:
# Evaluate the NER model on the test set and report the F1 of each entity slot separately.
ner_results=ner_model.evaluate(df=test_df)
print(f"Entity 1 F1: {ner_results['eval_Entity1_f1']}")
# The second line reports Entity2, so it must be labelled as such.
print(f"Entity 2 F1: {ner_results['eval_Entity2_f1']}")

## F1 per relation

In [None]:
# NOTE(review): f1_per_relation_df is first assigned in the next cell, so on a fresh kernel
# run top to bottom this cell raises NameError — confirm whether it should follow that cell.
f1_per_relation_df

In [None]:
f1_per_relation_df=evaluate_with_division_between_column(model=re_model, test_df=test_df, column_name="label")
relation_counts = train_df.groupby('label').size().reset_index(name='num_examples')
combined_df = pd.merge(f1_per_relation_df, relation_counts, on="label")
display(combined_df)

In [None]:
correlation = combined_df['num_examples'].corr(combined_df['f1'], method='spearman')
print(f"Correlation between number of examples and F1 score: {correlation:.2f}")

In [None]:
# Static scatter of per-relation F1 against the number of training examples, labelled by relation.
# NOTE(review): the figure is assigned but never shown here, and `fig` is overwritten by the
# FigureWidget built in the next cell — confirm whether this cell is still needed.
fig = px.scatter(combined_df, x='num_examples', y='f1',
                 title="Dependency of relation's F1 score on number of examples",
                height=600, labels={"num_examples": "Number of examples", "f1":"f1"}, text="label"
                )

In [None]:
import plotly.graph_objects as go
import random
from itertools import cycle

positions = ['top left', 'top center', 'top right', 'middle right', 'bottom right', 'bottom center', 'bottom left', 'middle left']
cycled_list = cycle(positions)

def update_point(trace, points, selector):
    """Click handler: move the label of every clicked marker to the next text position.

    Reads the notebook-level `scatter` trace, `fig` widget and `cycled_list`
    iterator. `trace` and `selector` are part of the callback signature and
    are not used.
    """
    if not points.point_inds:
        return  # no marker was hit, so there is nothing to redraw
    p = list(scatter.textposition)  # current position of every label
    for i in points.point_inds:  # all selected point indices
        p[i] = next(cycled_list)  # advance the shared cycle once per selected point
    # Push the new positions in a single batched update instead of once per selected point.
    with fig.batch_update():
        scatter.textposition = p

def random_text_position(x):
    """Return one randomly chosen plotly text position for each element of `x`.

    Only `len(x)` is used; the values in `x` do not influence the choice.
    """
    positions = ['top left', 'top center', 'top right', 'middle left', 'middle right', 'bottom left', 'bottom center', 'bottom right']  # you can add more: left center ...
    chosen_positions = []
    for _ in range(len(x)):
        chosen_positions.append(random.choice(positions))
    return chosen_positions

# Interactive scatter of F1 against the number of training examples, one labelled marker per relation.
fig = go.FigureWidget()

fig.add_trace(go.Scatter(
    x=combined_df['num_examples'],
    y=combined_df['f1'],
    mode="markers+text",
    name="Markers and Text",
    text=combined_df['label'],
    # Every label starts at a randomly chosen position.
    textposition=random_text_position(combined_df['label']),
))

# Keep a handle on the trace; update_point reads and rewrites its textposition.
scatter = fig.data[0]

# Clicking a marker moves its label to the next position in the cycle.
scatter.on_click(update_point)
fig.update_layout(width=1000, height=1000)

fig



## F1 per language

In [None]:
f1_per_language_ner=evaluate_with_division_between_column(model=ner_model, test_df=test_df, column_name="lang")
display(f1_per_language_ner)

In [None]:
f1_per_language_ner

In [None]:
f1_per_language_re=evaluate_with_division_between_column(model=re_model, test_df=test_df, column_name="lang")
display(f1_per_language_re)

In [None]:
f1_per_language_re

# Linguistic Experiments

In [None]:
torch.cuda.empty_cache()
ner_model=NamedEntityModel()
re_model=RelationsModel()

In [None]:
# Language family
# Each pair is run with identical settings; the two lists are passed as the first and second
# positional arguments of perform_four_variations_linguistic, in that order.
language_family_pairs = [
    (["pl"], ["pt"]),
    (["es"], ["pt"]),
    (["pl"], ["ru"]),
    (["ar"], ["ru"]),
]
language_family_results=[]
for first_languages, second_languages in language_family_pairs:
    language_family_results.append(perform_four_variations_linguistic(
        first_languages,
        second_languages,
        ner_model,
        re_model,
        enhance_function=enhance_with_special_characters,
        average_type="micro",
        downsample_number=5000,
        downsample_main=500,
        train_monolingual=False,
        number_of_runs=5
    ))
display(pd.DataFrame(language_family_results))

In [None]:
# SVO
# All runs share the second language list ["fa"]. The first run leaves train_monolingual at
# its default; the other two pass train_monolingual=False explicitly.
svo_runs = [
    (["ko"], {}),
    (["it"], {"train_monolingual": False}),
    (["pl"], {"train_monolingual": False}),
]
svo_results=[]
for first_languages, extra_options in svo_runs:
    svo_results.append(perform_four_variations_linguistic(
        first_languages,
        ["fa"],
        ner_model,
        re_model,
        enhance_function=enhance_with_special_characters,
        average_type="micro",
        downsample_number=5000,
        downsample_main=500,
        number_of_runs=5,
        **extra_options
    ))
display(pd.DataFrame(svo_results))

In [None]:
# SVO but the other way round
# Three runs that share the second language list ["pl"] and use smaller downsampling values
# (2000 / 200) than the other linguistic cells.
svo_other_results=[]
svo_other_results.append(perform_four_variations_linguistic(
    ["fa"],
    ["pl"],
    ner_model,
    re_model,
    enhance_function=enhance_with_special_characters,
    average_type="micro",
    downsample_number=2000,
    downsample_main=200,
    number_of_runs=5
))  # both parentheses are needed: one closes the call, one closes append(
svo_other_results.append(perform_four_variations_linguistic(
    ["ko"],
    ["pl"],
    ner_model,
    re_model,
    enhance_function=enhance_with_special_characters,
    average_type="micro",
    downsample_number=2000,
    downsample_main=200,
    number_of_runs=5
))
svo_other_results.append(perform_four_variations_linguistic(
    ["it"],
    ["pl"],
    ner_model,
    re_model,
    enhance_function=enhance_with_special_characters,
    average_type="micro",
    downsample_number=2000,
    downsample_main=200,
    number_of_runs=5
))
display(pd.DataFrame(svo_other_results))

In [None]:
# Cross-script
# Both runs share the second language list ["nl"]. The first run leaves train_monolingual at
# its default; the second passes train_monolingual=False explicitly.
cross_script_runs = [
    (["fr"], {}),
    (["ru"], {"train_monolingual": False}),
]
cross_script_result=[]
for first_languages, extra_options in cross_script_runs:
    cross_script_result.append(perform_four_variations_linguistic(
        first_languages,
        ["nl"],
        ner_model,
        re_model,
        enhance_function=enhance_with_special_characters,
        average_type="micro",
        downsample_number=5000,
        downsample_main=500,
        number_of_runs=5,
        **extra_options
    ))
display(pd.DataFrame(cross_script_result))

In [None]:
# Simple vs complex language
# Both runs share the second language list ["es"] and leave train_monolingual at its default.
simple_language_results=[]
for first_languages in (["pl"], ["nl"]):
    simple_language_results.append(perform_four_variations_linguistic(
        first_languages,
        ["es"],
        ner_model,
        re_model,
        enhance_function=enhance_with_special_characters,
        average_type="micro",
        downsample_number=5000,
        downsample_main=500,
        number_of_runs=5
    ))
display(pd.DataFrame(simple_language_results))

In [None]:
simple_language_results

In [None]:
train_df=pd.read_csv('data/fr_corpora_train.tsv',sep='\t')
test_df=pd.read_csv('data/it_corpora_test.tsv',sep='\t')
len(train_df)

In [None]:
train_df=remove_overlapping_entities(train_df,test_df)
len(train_df)

In [None]:
train_and_evaluate_on_language_subsets(
    train_df,
    test_df,
    ner_model,
    re_model,
    enhance_function=enhance_with_special_characters,
    average_type='micro',
)

# Dataset Error Analysis

In [None]:
# Unfiltered train and test rows stacked into one frame for the dataset error analysis.
# pd.concat keeps each frame's own index here, so index labels in `df` can repeat.
# NOTE(review): these paths have no "data/" prefix, whereas the "Training base NER" cell reads
# "data/merged_train.tsv" — confirm which location is correct.
train_df=pd.read_csv('merged_train.tsv',sep='\t')
test_df=pd.read_csv('merged_test.tsv',sep='\t')
df=pd.concat([train_df,test_df])

In [None]:
def add_problem(df, description, pattern=None, problems=None, contains=True, func=None):
    """Flag the rows of `df` that show one data problem and report how many there are.

    The rows are selected either by `func` (a row predicate applied with
    `df.apply(..., axis=1)`) or, when no `func` is given, by matching the regex
    `pattern` against the "text" column. Missing text never matches, so with
    `contains=False` rows without text are flagged.

    Side effect: the boolean column "is_problem" of `df` is (over)written on
    every call, and the returned rows carry that column.

    Args:
        df: frame to check; needs a "text" column when `pattern` is used.
        description: human-readable name of the problem, used in the report.
        pattern: regex searched in "text"; ignored when `func` is given.
        problems: optional list that receives a
            {"description", "row_count"} summary of this check.
        contains: flag rows matching `pattern` (True) or rows not matching it (False).
        func: optional callable taking a row and returning a truthy value for problem rows.

    Returns:
        The rows of `df` flagged as problematic.

    Raises:
        ValueError: if neither `pattern` nor `func` is provided.
    """
    if func:  # If a function is provided
        df['is_problem'] = df.apply(func, axis=1)
    elif pattern is None:
        raise ValueError("add_problem needs either a regex `pattern` or a row predicate `func`")
    else:
        matches = df['text'].str.contains(pattern, regex=True, na=False)
        df['is_problem'] = matches if contains else ~matches
    problem_df = df[df['is_problem'] == True]
    affected, total = len(problem_df), len(df)
    # An empty frame has no affected share; report 0 instead of dividing by zero.
    share = round(100 * affected / total, 2) if total else 0.0
    problem = {"description": description, 'row_count': affected}
    print(f"Problem: {description}. Number of examples affected: {affected} out of {total} ({share}%)")
    print("**************************")
    # The summary list is optional; the default of None used to crash on append.
    if problems is not None:
        problems.append(problem)
    return problem_df

def entity_mismatch_with_text_tag(row, entity_number):
    """Tell whether the entity column disagrees with the span tagged in the text.

    Looks at the text between the first `<eN>` tag and the next `</eN>` tag
    (N = `entity_number`) and returns True when the value of `entity_N` is not
    contained in that span, ignoring case. Returns False when either value is
    not a string or the opening tag is absent.
    """
    entity_value = row[f"entity_{entity_number}"]
    if not isinstance(entity_value, str):
        return False
    text = row['text']
    if not isinstance(text, str):
        return False

    opening_tag = f"<e{entity_number}>"
    if opening_tag not in text:
        return False
    # Text after the first opening tag, cut at the first closing tag (or kept whole if none).
    after_opening_tag = text.split(opening_tag)[1]
    tagged_span = after_opening_tag.split(f"</e{entity_number}>")[0]
    return entity_value.lower() not in tagged_span.lower()

def is_tag_inside_word(row):
    """Tell whether an entity tag in the row's text is glued to a word character.

    True when an opening tag directly follows a word character or a closing
    tag is directly followed by one. A non-string text counts as empty.
    """
    text = row['text']
    if not isinstance(text, str):
        text = ""

    # One pattern per tag boundary that may touch a word character.
    for pattern in (r"\w<e1>", r"</e1>\w", r"\w<e2>", r"</e2>\w"):
        if re.search(pattern, text):
            return True
    return False

In [None]:
# Run every data-quality check on the combined frame. `problems` collects one summary dict per
# check and `problem_dfs` the matching rows, both in the order the checks are run.
problems=[]
problem_dfs=[]
# Rows whose text does not contain both tagged entities in either order.
problem_dfs.append(add_problem(
    df=df,
    description="Does not contain correct entity tag pattern",
    pattern=r"(<e1>.*</e1>.*<e2>.*</e2>)|(<e2>.*</e2>.*<e1>.*</e1>)",
    problems=problems,
    contains=False
))
# Rows where entity_1 or entity_2 is not found inside its tagged span.
problem_dfs.append(add_problem(df=df, description="Entity mismatch with text tags",
func=lambda row: entity_mismatch_with_text_tag(row, 1) or entity_mismatch_with_text_tag(row, 2),
problems=problems))
# Rows with an opening tag immediately followed by its closing tag.
problem_dfs.append(add_problem(df=df,description="Empty entity in text",pattern=r'(.*<e1></e1>.*)|(.*<e2></e2>.*)', problems=problems))
# Rows where the same opening tag occurs more than once.
problem_dfs.append(add_problem(df=df,description="Multiple entities in text",pattern=r"(.*<e1>.*<e1>.*)|(.*<e2>.*<e2>.*)",problems=problems))
# Rows where <e2> opens before </e1> closes (only this nesting order is checked).
problem_dfs.append(add_problem(df=df, description="Overlapping entities", pattern=r"<e1>.*<e2>.*</e1>.*</e2>", problems=problems))
# Rows whose entity string does not occur anywhere in the text (case-insensitive).
problem_dfs.append(add_problem(df=df, description="Missing entity_1 in text", func=lambda row: isinstance(row['entity_1'], str) and isinstance(row['text'], str) and row['entity_1'].lower() not in row['text'].lower(), problems=problems))
problem_dfs.append(add_problem(df=df, description="Missing entity_2 in text", func=lambda row: isinstance(row['entity_2'], str) and isinstance(row['text'], str) and row['entity_2'].lower() not in row['text'].lower(), problems=problems))

In [None]:
import pandas as pd

def df_percentage_contained(df1, df2):
    """Calculate the percentage of rows in df2 that are also in df1.

    Rows are compared on the columns the two frames have in common. Every row
    of df2 is counted at most once, so the result always lies between 0 and 100.

    Args:
        df1: frame used as the lookup set.
        df2: frame whose rows are looked up in df1.

    Returns:
        The percentage rounded to the nearest integer; 0 when df2 is empty.
    """
    if len(df2) == 0:
        return 0  # Avoid division by zero for empty dataframes
    shared_columns = [column for column in df1.columns if column in df2.columns]
    # De-duplicate the lookup side first: an inner merge pairs every duplicate in df1 with every
    # matching row of df2, which would count one df2 row several times and exceed 100%.
    lookup_rows = df1[shared_columns].drop_duplicates()
    matched_rows = df2.merge(lookup_rows, how='inner')
    return round((len(matched_rows) / len(df2)) * 100)

# Pairwise overlap between the problem sets. The row is the frame passed first to
# df_percentage_contained and the column is the frame passed second.
descriptions = [entry['description'] for entry in problems]
percentage_matrix = [
    [df_percentage_contained(row_frame, column_frame) for column_frame in problem_dfs]
    for row_frame in problem_dfs
]
percentage_df = pd.DataFrame(percentage_matrix, columns=descriptions, index=descriptions)
display(percentage_df)

In [None]:
problem_dfs[1][problem_dfs[1]['lang']=='en'].head(5)

In [None]:
all_problems_df = pd.concat(problem_dfs).drop_duplicates()
unique_problems_ratio = len(all_problems_df) / len(df)
unique_problems_ratio

## Overlap analysis

In [None]:
full_matrix_all = create_full_matrix("results/train_overlap_all.csv")
full_matrix_distinct = create_full_matrix("results/train_overlap_distinct.csv")
display(full_matrix_all)
display(full_matrix_distinct)

## Miscellaneous

### Dataset size graphs

In [None]:
import plotly.graph_objects as go

# Data for the graphs (extracted from the provided table)
dataset_sizes = [100, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000, 200000, 300000, 400000, 500000]

# The 'pl' series have values for the first six sizes only; the remaining seven are None.

# RE with tags
re_with_tags_merged = [0.58, 0.82, 0.90, 0.92, 0.94, 0.94, 0.95, 0.96, 0.96, 0.96, 0.97, 0.97, 0.96]
re_with_tags_en_full = [0.51, 0.85, 0.88, 0.90, 0.92, 0.95, 0.96, 0.96, 0.96, 0.96, 0.97, 0.96, 0.97]
re_with_tags_pl = [0.46, 0.86, 0.91, 0.93, 0.97, 0.97] + [None] * 7

# RE no tags
re_no_tags_merged = [0.53, 0.68, 0.72, 0.73, 0.77, 0.77, 0.79, 0.82, 0.82, 0.84, 0.85, 0.85, 0.84]
re_no_tags_en_full = [0.57, 0.68, 0.73, 0.73, 0.77, 0.79, 0.81, 0.83, 0.84, 0.85, 0.85, 0.85, 0.86]
re_no_tags_pl = [0.58, 0.75, 0.77, 0.78, 0.83, 0.90] + [None] * 7

# NER
ner_merged = [0.50, 0.67, 0.71, 0.73, 0.77, 0.80, 0.81, 0.83, 0.84, 0.86, 0.86, 0.87, 0.88]
ner_en_full = [0.52, 0.71, 0.74, 0.77, 0.79, 0.81, 0.81, 0.85, 0.84, 0.86, 0.87, 0.87, 0.88]
ner_pl = [0.55, 0.71, 0.78, 0.82, 0.87, 0.86] + [None] * 7

# One figure per model type
fig_re_with_tags = go.Figure()
fig_re_no_tags = go.Figure()
fig_ner = go.Figure()

# Each figure gets one line per dataset, in the order merged, en-full, pl.
series_names = ['merged', 'en-full', 'pl']
series_per_figure = [
    (fig_re_with_tags, [re_with_tags_merged, re_with_tags_en_full, re_with_tags_pl]),
    (fig_re_no_tags, [re_no_tags_merged, re_no_tags_en_full, re_no_tags_pl]),
    (fig_ner, [ner_merged, ner_en_full, ner_pl]),
]
for target_figure, score_series in series_per_figure:
    for dataset, name in zip(score_series, series_names):
        target_figure.add_trace(go.Scatter(x=dataset_sizes, y=dataset, mode='lines+markers', name=name))

# Update layout for each graph for better visualization
# Give the three figures the same layout; only the title differs.
for fig, title in zip([fig_re_with_tags, fig_re_no_tags, fig_ner],
                      ['Performance of RE with tags', 'Performance of RE without tags', 'Performance of NER']):
    fig.update_layout(
        title=title,
        xaxis_title='dataset size',
        yaxis_title='F1 score',
        xaxis_type='log',  # Log scale for better visibility on large ranges of x
        yaxis_range=[0, 1],  # F1 scores range from 0 to 1
        legend_title='dataset type'
    )

# Show the figures
fig_re_with_tags.show()
fig_re_no_tags.show()
fig_ner.show()


## Calculating baseline F1 values for merged train

In [None]:
import pandas as pd
from sklearn.metrics import f1_score
import numpy as np
from collections import Counter

def random_baseline_f1(labels, random_state=None):
    """Weighted F1 of a baseline that guesses labels at random.

    Each prediction is drawn independently from the set of distinct labels,
    with probability equal to the label's relative frequency in `labels`.

    Args:
        labels: the true labels.
        random_state: optional seed that makes the result reproducible. When
            None, numpy's global random state is used, as before.

    Returns:
        The weighted-average F1 score of the random predictions.
    """
    label_counts = Counter(labels)
    total_count = sum(label_counts.values())
    label_probabilities = np.array(list(label_counts.values())) / total_count
    # Fall back to the module-level generator so unseeded calls behave as they did before.
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    random_predictions = rng.choice(a=list(label_counts.keys()),
                                    p=label_probabilities,
                                    size=len(labels))
    return f1_score(labels, random_predictions, average='weighted')

def most_frequent_baseline_f1(labels):
    """Weighted F1 of a baseline that always predicts the most common label.

    Returns:
        The weighted-average F1 score obtained when every element of `labels`
        is predicted to be the single most frequent label.
    """
    ranked_labels = Counter(labels).most_common(1)
    majority_label = ranked_labels[0][0]
    majority_predictions = [majority_label for _ in range(len(labels))]
    return f1_score(labels, majority_predictions, average='weighted')

# Baseline F1 values on the unfiltered train and test labels.
# NOTE(review): these paths have no "data/" prefix, whereas the "Training base NER" cell reads
# "data/merged_train.tsv" — confirm which location is correct.
train_df = pd.read_csv('merged_train.tsv', sep='\t')
test_df = pd.read_csv('merged_test.tsv', sep='\t')
train_labels = train_df['label']
test_labels = test_df['label']
# The random baseline draws from numpy's global random state and no seed is set in this cell,
# so its two values can change between runs.
train_random_f1 = random_baseline_f1(train_labels)
train_most_frequent_f1 = most_frequent_baseline_f1(train_labels)
test_random_f1 = random_baseline_f1(test_labels)
test_most_frequent_f1 = most_frequent_baseline_f1(test_labels)
print(f"Train Random Baseline F1: {train_random_f1}")
print(f"Train Most Frequent Baseline F1: {train_most_frequent_f1}")
print(f"Test Random Baseline F1: {test_random_f1}")
print(f"Test Most Frequent Baseline F1: {test_most_frequent_f1}")

## Analyzing the labeled errors for NER

In [None]:
df=pd.read_csv('results/error_analysis_ner.csv')
correct_df=df[df['is_model_wrong']=='no']
wrong_df=df[df['is_model_wrong']=='yes']

In [None]:
round(correct_df['issue'].value_counts(normalize=True) * 100,2)

In [None]:
round(wrong_df['issue'].value_counts(normalize=True) * 100,0)

## Analyzing the labeled errors for RE

In [None]:
df=pd.read_csv('results/error_analysis_re.csv')
correct_df=df[df['is_model_wrong']=='no']
wrong_df=df[df['is_model_wrong']=='yes']

In [None]:
print(len(df),len(correct_df),len(wrong_df))

In [None]:
print(len(wrong_df[wrong_df["predicted_label"]=="birth-place"]))

In [None]:
wrong_df['predicted_label'].value_counts()

In [None]:
wrong_df