In [1]:
import contextlib
import io
import os

import pandas as pd
import torch

from experiments.hyperparameters import optuna_hp_space, optuna_hp_space_scientific
from named_entity.named_entity_model import NamedEntityModel
from relations.relations_model import RelationsModel

In [None]:
# Let pandas render up to 10000 rows so long result tables are not truncated.
pd.set_option('display.max_rows', 10000)

# Switch the working directory to the parent of the folder the notebook was
# started in (presumably so the relative data/result paths below resolve —
# confirm). The starting folder is remembered on the first run, so re-running
# this cell no longer climbs one more level each time.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, ".."))

# Hyperparameter optimization

In [None]:
# Text files that receive the captured stdout of each hyperparameter search.
# Broad search space:
NER_BROAD_FILE_PATH = "results/hyperparameter_optimization_ner_broad.txt"
RE_BROAD_FILE_PATH = "results/hyperparameter_optimization_re_broad.txt"
# Scientific search space:
NER_SCIENTIFIC_FILE_PATH = "results/hyperparameter_optimization_ner_scientific.txt"
RE_SCIENTIFIC_FILE_PATH = "results/hyperparameter_optimization_re_scientific.txt"

In [None]:
# Release cached GPU memory before new models are created.
torch.cuda.empty_cache()

# Training data: load, keep a reproducible 2% sample, then pass it through
# filter_out_wrong_data.
train_df = filter_out_wrong_data(
    pd.read_csv("merged_train.tsv", sep="\t").sample(frac=0.02, random_state=42)
)
# Test data: loaded in full and passed through the same filter.
test_df = filter_out_wrong_data(pd.read_csv("merged_test.tsv", sep="\t"))

# Model wrappers built with their default arguments.
ner_model = NamedEntityModel()
re_model = RelationsModel()

## Broad search

In [None]:
# Broad-space hyperparameter search for the NER model; everything the search
# prints to stdout is saved to NER_BROAD_FILE_PATH.
#
# The earlier `%%capture captured` magic binds `captured` only after the whole
# cell has finished, so reading `captured.stdout` inside the same cell raised a
# NameError on a fresh kernel (or silently reused a stale capture). stdout is
# therefore redirected explicitly. Unlike `%%capture`, stderr and rich display
# output stay visible in the notebook.
ner_broad_stdout = io.StringIO()
with contextlib.redirect_stdout(ner_broad_stdout):
    ner_model.perform_hyperparameter_search(
        space=optuna_hp_space,
        train_df=train_df,
        study_name="ner_hyperparameter_search_broad",
    )

# Create the output folder if it is missing, then persist the captured log.
os.makedirs(os.path.dirname(NER_BROAD_FILE_PATH) or ".", exist_ok=True)
with open(NER_BROAD_FILE_PATH, 'w') as f:
    f.write(ner_broad_stdout.getvalue())

In [None]:
# Broad-space hyperparameter search for the relations model; everything the
# search prints to stdout is saved to RE_BROAD_FILE_PATH.
#
# `%%capture captured` assigns `captured` only once the cell has finished, so
# the former in-cell `captured.stdout` held the output of an earlier capture
# cell (or did not exist yet) and the wrong log ended up in this file. stdout
# is redirected explicitly instead. Unlike `%%capture`, stderr and rich display
# output stay visible in the notebook.
re_broad_stdout = io.StringIO()
with contextlib.redirect_stdout(re_broad_stdout):
    re_model.perform_hyperparameter_search(
        space=optuna_hp_space,
        train_df=train_df,
        study_name="re_hyperparameter_search_broad",
    )

# Create the output folder if it is missing, then persist the captured log.
os.makedirs(os.path.dirname(RE_BROAD_FILE_PATH) or ".", exist_ok=True)
with open(RE_BROAD_FILE_PATH, 'w') as f:
    f.write(re_broad_stdout.getvalue())

## Scientific-based search

In [None]:
# Scientific-space hyperparameter search for the NER model; everything the
# search prints to stdout is saved to NER_SCIENTIFIC_FILE_PATH.
#
# `%%capture captured` assigns `captured` only once the cell has finished, so
# the former in-cell `captured.stdout` held the output of an earlier capture
# cell (or did not exist yet) and the wrong log ended up in this file. stdout
# is redirected explicitly instead. Unlike `%%capture`, stderr and rich display
# output stay visible in the notebook.
ner_scientific_stdout = io.StringIO()
with contextlib.redirect_stdout(ner_scientific_stdout):
    ner_model.perform_hyperparameter_search(
        space=optuna_hp_space_scientific,
        train_df=train_df,
        study_name="ner_hyperparameter_search_scientific",
    )

# Create the output folder if it is missing, then persist the captured log.
os.makedirs(os.path.dirname(NER_SCIENTIFIC_FILE_PATH) or ".", exist_ok=True)
with open(NER_SCIENTIFIC_FILE_PATH, 'w') as f:
    f.write(ner_scientific_stdout.getvalue())

In [None]:
# Scientific-space hyperparameter search for the relations model; everything
# the search prints to stdout is saved to RE_SCIENTIFIC_FILE_PATH.
#
# `%%capture captured` assigns `captured` only once the cell has finished, so
# the former in-cell `captured.stdout` held the output of an earlier capture
# cell (or did not exist yet) and the wrong log ended up in this file. stdout
# is redirected explicitly instead. Unlike `%%capture`, stderr and rich display
# output stay visible in the notebook.
re_scientific_stdout = io.StringIO()
with contextlib.redirect_stdout(re_scientific_stdout):
    re_model.perform_hyperparameter_search(
        space=optuna_hp_space_scientific,
        train_df=train_df,
        study_name="re_hyperparameter_search_scientific",
    )

# Create the output folder if it is missing, then persist the captured log.
os.makedirs(os.path.dirname(RE_SCIENTIFIC_FILE_PATH) or ".", exist_ok=True)
with open(RE_SCIENTIFIC_FILE_PATH, 'w') as f:
    f.write(re_scientific_stdout.getvalue())

In [None]:
def analyze_optuna_results(file_path):
    """Plot how each logged hyperparameter relates to the trial metric.

    The log at ``file_path`` is parsed with ``read_optuna_logs``. For every
    column of the resulting frame except ``metric`` and ``trial_number`` one
    figure is shown:

    * integer columns: a histogram of the average ``metric`` per value;
    * all other columns: a scatter plot with an OLS trendline, each point
      labelled with its ``trial_number``.

    Parameters
    ----------
    file_path : str
        Path to a saved hyperparameter-search log file.

    Returns
    -------
    None
        The figures are displayed as a side effect.
    """
    # NOTE(review): `read_optuna_logs` and `px` (presumably plotly.express) are
    # not imported in the visible cells — confirm they are in scope.

    # Use the function argument here; the previous code referenced an undefined
    # global (`ner_broad_file_path`) and failed with a NameError.
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    print(f"Analyzing: {file_name}")

    df = read_optuna_logs(file_path)
    for column in df.columns:
        if column in ("metric", "trial_number"):
            continue
        # Any integer dtype (not only int64) is treated as a discrete parameter.
        if pd.api.types.is_integer_dtype(df[column]):
            fig = px.histogram(df, x=column, y="metric", title=f"{column} impact on metric", histfunc='avg')
        else:
            fig = px.scatter(df, x=column, y="metric", text='trial_number', trendline="ols", title=f"{column} impact on metric")
        fig.show()

## Analysis

In [None]:
# Plots for the NER broad-search log.
analyze_optuna_results(NER_BROAD_FILE_PATH)

In [None]:
# Plots for the relations-model broad-search log.
analyze_optuna_results(RE_BROAD_FILE_PATH)

In [None]:
# Plots for the NER scientific-search log.
analyze_optuna_results(NER_SCIENTIFIC_FILE_PATH)

In [None]:
# Plots for the relations-model scientific-search log.
analyze_optuna_results(RE_SCIENTIFIC_FILE_PATH)

# Dataset size impact

In [None]:
# Dataset sizes handed to test_model_quality_depending_on_dataset_size.
SIZES = [
    100, 1_000, 5_000, 10_000, 20_000, 50_000,
    100_000, 200_000, 300_000, 400_000, 500_000,
]

In [None]:
# Release cached GPU memory before new models are created.
torch.cuda.empty_cache()

# Training data: load, keep a reproducible 50% sample, then pass it through
# filter_out_wrong_data.
train_df = filter_out_wrong_data(
    pd.read_csv("merged_train.tsv", sep="\t").sample(frac=0.5, random_state=42)
)
# Test data: loaded in full and passed through the same filter.
test_df = filter_out_wrong_data(pd.read_csv("merged_test.tsv", sep="\t"))

# Model wrappers built with their default arguments.
ner_model = NamedEntityModel()
re_model = RelationsModel()

In [None]:
# Run the dataset-size experiment for the NER model over every entry of SIZES.
ner_results = test_model_quality_depending_on_dataset_size(
    model=ner_model,
    train_df=train_df,
    test_df=test_df,
    sizes=SIZES,
    random_state=42,
)

In [None]:
# Run the dataset-size experiment for the relations model over every entry of SIZES.
re_results = test_model_quality_depending_on_dataset_size(
    model=re_model,
    train_df=train_df,
    test_df=test_df,
    sizes=SIZES,
    random_state=42,
)

In [None]:
# Render the NER results of the dataset-size experiment.
display(ner_results)

In [None]:
# Render the relations-model results of the dataset-size experiment.
display(re_results)

# Joining The Models Together

In [None]:
# Release cached GPU memory before new models are created.
torch.cuda.empty_cache()

# Training data: load, keep a reproducible 50% sample, then pass it through
# filter_out_wrong_data.
train_df = filter_out_wrong_data(
    pd.read_csv("merged_train.tsv", sep="\t").sample(frac=0.5, random_state=42)
)
# Test data: loaded in full and passed through the same filter.
test_df = filter_out_wrong_data(pd.read_csv("merged_test.tsv", sep="\t"))

# Model wrappers built with their default arguments.
ner_model = NamedEntityModel()
re_model = RelationsModel()

In [None]:
# `read` must be set to False when predicting for the first time, i.e. when no
# NER prediction results exist yet for the given dataset subset.
# (A stray trailing ':' after the call made this cell a SyntaxError; removed.)
test_enhancing_text_used_to_train_re(
    train_df,
    test_df,
    ner_model,
    re_model,
    results_file='results_optimized_ner.txt',
    read=True,
)

# Model Variant Comparison

In [None]:
# Release cached GPU memory before the model variants are created.
torch.cuda.empty_cache()

# Training data: load, keep a reproducible 50% sample, then pass it through
# filter_out_wrong_data.
train_df = filter_out_wrong_data(
    pd.read_csv("merged_train.tsv", sep="\t").sample(frac=0.5, random_state=42)
)
# Test data: loaded in full and passed through the same filter.
test_df = filter_out_wrong_data(pd.read_csv("merged_test.tsv", sep="\t"))


## DistilBERT

In [None]:
# DistilBERT variants of both models.
ner_model = NamedEntityModel(
    model_name='distilbert-base-multilingual-cased',
    model_path='models/distilbert_ner',
)
# The relations model must be bound to `re_model`. It used to be assigned to
# `ner_model`, which discarded the NER model just created above and left
# `re_model` pointing at whatever an earlier cell had defined.
# NOTE(review): 'distillbert_re' is spelled with a double "l", unlike
# 'distilbert_ner' — confirm the intended folder name before changing it.
re_model = RelationsModel(
    model_name='distilbert-base-multilingual-cased',
    model_path='models/distillbert_re',
)

# Train the NER model and evaluate it on the test set.
ner_model.train(train_df=train_df, base_config=None)
distilbert_results_ner = ner_model.evaluate(df=test_df)

# NOTE(review): `TODO` is an unresolved placeholder for the enhancement
# function; this call raises a NameError until a real function is supplied.
# NOTE(review): 'results_optimized_ner.txt' is also used as the results file in
# other sections of this notebook — confirm the runs are meant to share it.
distilbert_results_re = train_re_on_ner(
    ner_model=ner_model,
    re_model=re_model,
    train_df=train_df,
    test_df=test_df,
    enhancement_func=TODO,
    results_file='results_optimized_ner.txt',
    read=False,
)
# After the first call, the prediction results are surely saved, so we can set read to True

## XLMRoBERTa

In [None]:
# XLM-RoBERTa variants of both models.
ner_model = NamedEntityModel(
    model_name='xlm-roberta-base',
    model_path='models/xlmroberta_ner',
)
# The relations model must be bound to `re_model`. It used to be assigned to
# `ner_model`, which discarded the NER model just created above and left
# `re_model` pointing at whatever an earlier cell had defined.
re_model = RelationsModel(
    model_name='xlm-roberta-base',
    model_path='models/xlmroberta_re',
)

# Train the NER model and evaluate it on the test set.
ner_model.train(train_df=train_df, base_config=None)
xlmroberta_results_ner = ner_model.evaluate(df=test_df)

# The result used to be stored under the garbled name
# `xlmroberta_results_nererta_results_re`; it now follows the
# `<variant>_results_re` pattern of the DistilBERT cell.
# NOTE(review): `TODO` is an unresolved placeholder for the enhancement
# function; this call raises a NameError until a real function is supplied.
# NOTE(review): 'results_optimized_ner.txt' is also used as the results file in
# other sections of this notebook — confirm the runs are meant to share it.
xlmroberta_results_re = train_re_on_ner(
    ner_model=ner_model,
    re_model=re_model,
    train_df=train_df,
    test_df=test_df,
    enhancement_func=TODO,
    results_file='results_optimized_ner.txt',
    read=False,
)
# After the first call, the prediction results are surely saved, so we can set read to True

# Prediction Error Analysis

In [None]:
# Release cached GPU memory before the models are created.
torch.cuda.empty_cache()

# Training data: load, keep a reproducible 50% sample, then pass it through
# filter_out_wrong_data.
train_df = filter_out_wrong_data(
    pd.read_csv("merged_train.tsv", sep="\t").sample(frac=0.5, random_state=42)
)
# Test data: loaded in full and passed through the same filter.
test_df = filter_out_wrong_data(pd.read_csv("merged_test.tsv", sep="\t"))

# Models built from the paths held in BEST_NER_MODEL / BEST_RE_MODEL.
# NOTE(review): neither constant is defined in the visible cells — confirm
# where they come from.
ner_model = NamedEntityModel(model_path=BEST_NER_MODEL)
re_model = RelationsModel(model_path=BEST_RE_MODEL)

## F1 per relation

In [None]:
# Evaluate the relations model split by the "relation" column (presumably one
# score per relation type — confirm against the helper).
# (A stray trailing ':' after the call made this cell a SyntaxError; removed.)
evaluate_with_division_between_column(model=re_model, test_df=test_df, column_name="relation")

## Entity 1 vs Entity 2

# Linguistic Experiments

# Dataset Error Analysis