In [1]:
from pathlib import Path
from typing import List

import logging
import random

import pandas as pd

from enunlg.data_management.pipelinecorpus import TextPipelineCorpus


logger = logging.getLogger('enunlg-scripts.corpus_to_humeval_csv')

In [2]:

corpus_files = {
    # 'multitask_webnlg_sv': '../outputs/2024-06-02/22-08-48/evaluation-output.corpus',
    # 'multitask_webnlg_rdf': '../outputs/2024-06-02/22-08-53/evaluation-output.corpus',
    # 'multitask_webnlg_rdf_role-delex': '../outputs/2024-06-02/22-34-11/evaluation-output.corpus',
    # 'multitask_webnlg_rdf_agent-pred-delex': None,
    # 'singletask_webnlg_sv': '../outputs/2024-06-02/23-28-16/evaluation-output.corpus',
    'singletask_webnlg_sv_role-delex': '../outputs/2024-06-03/14-41-21/evaluation-output.corpus',
    # 'singletask_webnlg_rdf': '../outputs/2024-06-02/23-28-21/evaluation-output.corpus',
    'singletask_webnlg_rdf_role-delex': '../outputs/2024-06-02/23-39-09/evaluation-output.corpus',
    # 'singletask_webnlg_rdf_agent-pred-delex': '../outputs/2024-06-02/22-42-08/evaluation-output.corpus',
    'llm_webnlg_sv': "../for-analysis/llm/webnlg_slot-value.txt",
    'llm_webnlg_rdf': "../for-analysis/llm/webnlg_rdf.txt",
    # 'multitask_e2e_sv': '../outputs/2024-06-02/21-57-28/evaluation-output.corpus',
    # 'multitask_e2e_rdf': '../outputs/2024-06-02/21-57-33/evaluation-output.corpus',
    'singletask_e2e_sv': '../outputs/2024-06-02/22-49-07/evaluation-output.corpus',
    'singletask_e2e_rdf': '../outputs/2024-06-02/23-20-53/evaluation-output.corpus',
    'llm_e2e_sv': "../for-analysis/llm/e2e_slot-value.txt",
    'llm_e2e_rdf': "../for-analysis/llm/e2e_rdf.txt",
    'ref_webnlg': "../for-analysis/enriched-webnlg_refs.txt",
    'ref_e2e': "../tmp.txt",
}

In [3]:


def sample_ids() -> List[str]:
    sample = []
    for idx in range(1, 500, 10):
        i2 = random.choice((1, 2, 3))
        print(f"Id{idx}-Id{i2}")
        sample.append(f"Id{idx}-Id{i2}")
    return sample


In [4]:

# Load all the results corpora
corpora_for_analysis = {}
for sys_corpus_format_delex in corpus_files:
    if corpus_files[sys_corpus_format_delex] is not None:
        corpus_fp = Path(corpus_files[sys_corpus_format_delex])
        corpora_for_analysis[sys_corpus_format_delex] = TextPipelineCorpus.load(corpus_fp)

In [5]:

metadata_columns = ["id", "system", "corpus", "format", "delex"]
dfs_for_analysis = {}
for key in corpora_for_analysis:
    print(key)
    parts = key.split("_")
    system = parts[0]
    corpus = parts[1]
    if parts[2:]:
        mr_type = parts[2]
    else:
        mr_type = "none"
        delex = "none"
    if parts[3:]:
        delex = parts[3]
    else:
        if corpus == "e2e":
            delex = 'name-near-exact-match'
        else:
            delex = 'dbpedia-ontology-classes'
    df_metadata = [system, corpus, mr_type, delex]
    if "llm" in key:
        delex = "none"
        df_metadata[-1] = delex
        annotation_layers = ["raw_input", "GPT4_output", "Llama_output"]
        layer_column_labels = [f"{mr_type}", f"GPT4_{mr_type}", f"Llama_{mr_type}"]
    elif "ref" in key:
        annotation_layers = ["raw_output"]
        layer_column_labels = ["ref"]
    else:
        annotation_layers = ["best_output_relexed"]
        layer_column_labels = [f"{system}_{corpus}_{mr_type}_{delex}"]
    rows = []
    for entry in corpora_for_analysis[key]:
        # if entry.metadata.get('id') in sampled_ids:
        row = [entry.metadata.get('id')]  # + df_metadata
        for layer_name in annotation_layers:
            row.append(entry[layer_name])
        rows.append(row)
    print(df_metadata)
    dfs_for_analysis[key] = pd.DataFrame(rows, columns=['id'] + layer_column_labels)
print(len(dfs_for_analysis))

singletask_webnlg_sv_role-delex
['singletask', 'webnlg', 'sv', 'role-delex']
singletask_webnlg_rdf_role-delex
['singletask', 'webnlg', 'rdf', 'role-delex']
llm_webnlg_sv
['llm', 'webnlg', 'sv', 'none']
llm_webnlg_rdf
['llm', 'webnlg', 'rdf', 'none']
singletask_e2e_sv
['singletask', 'e2e', 'sv', 'name-near-exact-match']
singletask_e2e_rdf
['singletask', 'e2e', 'rdf', 'name-near-exact-match']
llm_e2e_sv
['llm', 'e2e', 'sv', 'none']
llm_e2e_rdf
['llm', 'e2e', 'rdf', 'none']
ref_webnlg
['ref', 'webnlg', 'none', 'dbpedia-ontology-classes']
ref_e2e
['ref', 'e2e', 'none', 'name-near-exact-match']
10


In [6]:

e2e_common_ids = set()
e2e_df = None
for key in dfs_for_analysis:
    if "e2e" in key:
        df = dfs_for_analysis[key]
        print(df.head())
        id_set = set(df['id'])
        if e2e_common_ids:
            print("intersecting")
            e2e_common_ids = e2e_common_ids.intersection(id_set)
        else:
            e2e_common_ids = id_set
        print(len(e2e_common_ids))
        if e2e_df is None:
            e2e_df = df
        else:
            e2e_df = e2e_df.merge(df, on=["id"])

print(len(e2e_df))
e2e_df = e2e_df.loc[:,~e2e_df.columns.str.contains('_[xy]$')]



        id     singletask_e2e_sv_name-near-exact-match
0  Id1-Id1     Blue Spice is a pub in the city centre.
1  Id1-Id2     Blue Spice is a pub in the city centre.
2  Id2-Id1  Blue Spice is a pub in the riverside area.
3  Id2-Id2  Blue Spice is a pub in the riverside area.
4  Id3-Id1     Blue Spice is a pub in the city centre.
4566
        id         singletask_e2e_rdf_name-near-exact-match
0  Id1-Id1  Blue Spice is a pub located in the city centre.
1  Id1-Id2  Blue Spice is a pub located in the city centre.
2  Id2-Id1       Blue Spice is a pub in the riverside area.
3  Id2-Id2       Blue Spice is a pub in the riverside area.
4  Id3-Id1  Blue Spice is a pub located in the city centre.
intersecting
4566
        id                                                 sv  \
0  Id1-Id1  name == Blue Spice <PAIR_SEP> eat type == coff...   
1  Id1-Id2  name == Blue Spice <PAIR_SEP> eat type == coff...   
2  Id2-Id1  name == Blue Spice <PAIR_SEP> eat type == coff...   
3  Id2-Id2  name == Blue Sp

In [7]:
e2e_df

Unnamed: 0,id,singletask_e2e_sv_name-near-exact-match,singletask_e2e_rdf_name-near-exact-match,sv,GPT4_sv,Llama_sv,rdf,GPT4_rdf,Llama_rdf,ref
0,Id1-Id1,Blue Spice is a pub in the city centre.,Blue Spice is a pub located in the city centre.,name == Blue Spice <PAIR_SEP> eat type == coff...,Blue Spice is a coffee shop located in the cit...,Blue Spice is a coffee shop in the city centre.,<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a coffee shop located in the cit...,Blue Spice is a type of coffee shop located in...,A coffee shop in the city centre area called B...
1,Id1-Id2,Blue Spice is a pub in the city centre.,Blue Spice is a pub located in the city centre.,name == Blue Spice <PAIR_SEP> eat type == coff...,Blue Spice is a coffee shop located in the cit...,Blue Spice is a coffee shop located in the cit...,<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a coffee shop located in the cit...,Blue Spice is a type of coffee shop that can b...,Blue Spice is a coffee shop in city centre .
2,Id2-Id1,Blue Spice is a pub in the riverside area.,Blue Spice is a pub in the riverside area.,name == Blue Spice <PAIR_SEP> eat type == coff...,Blue Spice is a coffee shop located by the riv...,"""Blue Spice is a coffee shop located in the ri...",<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a coffee shop located by the riv...,Blue Spice is a type of coffee shop that is lo...,There is a coffee shop Blue Spice in the river...
3,Id2-Id2,Blue Spice is a pub in the riverside area.,Blue Spice is a pub in the riverside area.,name == Blue Spice <PAIR_SEP> eat type == coff...,Blue Spice is a coffee shop located by the riv...,Blue Spice is a coffee shop in the Riverside a...,<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a coffee shop located by the riv...,Blue Spice is a type of eatery located at a co...,"At the riverside , there is a coffee shop call..."
4,Id3-Id1,Blue Spice is a pub in the city centre.,Blue Spice is a pub located in the city centre.,name == Blue Spice <PAIR_SEP> eat type == pub ...,Blue Spice is a pub located in the city centre.,"""Blue Spice is a pub in the city centre.""",<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a pub located in the city centre.,Blue Spice is a type of eatery located in the ...,A pub in the city centre area called Blue Spice .
...,...,...,...,...,...,...,...,...,...,...
4561,Id630-Id2,The Wrestlers is a coffee shop providing Japan...,The Wrestlers is a family friendly Japanese pu...,name == The Wrestlers <PAIR_SEP> eat type == p...,The Wrestlers is a riverside pub offering Japa...,"The Wrestlers, a pub serving Japanese food, is...",<SUBJECT> The Wrestlers <PREDICATE> eat type <...,The Wrestlers is a riverside pub that serves J...,The Wrestlers is a pub serving Japanese food. ...,"The Wrestlers is an expensive , family friendl..."
4562,Id630-Id3,The Wrestlers is a coffee shop providing Japan...,The Wrestlers is a family friendly Japanese pu...,name == The Wrestlers <PAIR_SEP> eat type == p...,The Wrestlers is a pub located in the riversid...,"""The Wrestlers is a Japanese pub in the Rivers...",<SUBJECT> The Wrestlers <PREDICATE> eat type <...,The Wrestlers is a pub located in the riversid...,The Wrestlers pub serves Japanese food and cos...,Near Raja Indian Cuisine in riverside is low r...
4563,Id630-Id4,The Wrestlers is a coffee shop providing Japan...,The Wrestlers is a family friendly Japanese pu...,name == The Wrestlers <PAIR_SEP> eat type == p...,The Wrestlers is a Japanese pub located by the...,"The Wrestlers, a Japanese pub in the riverside...",<SUBJECT> The Wrestlers <PREDICATE> eat type <...,The Wrestlers is a pub located in the riversid...,The Wrestlers is a pub that serves Japanese fo...,The Wrestlers is a Japanese food restaurant wi...
4564,Id630-Id5,The Wrestlers is a coffee shop providing Japan...,The Wrestlers is a family friendly Japanese pu...,name == The Wrestlers <PAIR_SEP> eat type == p...,The Wrestlers is a riverside pub offering Japa...,The Wrestlers is a Japanese pub in the Riversi...,<SUBJECT> The Wrestlers <PREDICATE> eat type <...,The Wrestlers is a riverside pub serving Japan...,The Wrestlers is a pub that serves Japanese fo...,A Family friendly pub The Wrestlers serves Jap...


In [8]:

webnlg_common_ids = set()
webnlg_df = None
for key in dfs_for_analysis:
    if "webnlg" in key:
        df = dfs_for_analysis[key]
        print(df.head())
        id_set = set(df['id'])
        if webnlg_common_ids:
            print("intersecting")
            webnlg_common_ids = webnlg_common_ids.intersection(id_set)
        else:
            webnlg_common_ids = id_set
        print(len(webnlg_common_ids))
        if webnlg_df is None:
            webnlg_df = df
        else:
            webnlg_df = webnlg_df.merge(df, on=["id"])

print(len(webnlg_df))
# webnlg_df = webnlg_df.loc[:,~webnlg_df.columns.str.contains('_[xy]$')]

        id                    singletask_webnlg_sv_role-delex
0  Id1-Id1  The Al Taqaddum Air Base serves the city of Ab...
1  Id1-Id2  Abilene Regional Airport serves the city of Ab...
2  Id2-Id1  Adolfo Suárez Madrid–Barajas Airport is locate...
3  Id2-Id2  Adolfo Suarez Madrid -Barejas airport is locat...
4  Id2-Id3  Adolfo Suarez Madrid -Barajas airport is locat...
4914
        id                   singletask_webnlg_rdf_role-delex
0  Id1-Id1  Abilene, Texas is the operating organisation o...
1  Id1-Id2  Abilene Regional Airport serves the city of Ap...
2  Id2-Id1  Adolfo Suárez Madrid–Barajas Airport is locate...
3  Id2-Id2  The Adolfo Suarez Madrid -Barajas airport is l...
4  Id2-Id3  The Adolfo Suarez Madrid -Barajas Airport is l...
intersecting
4914
        id                                                 sv  \
0  Id1-Id1  name == Abilene Regional Airport <PAIR_SEP> ci...   
1  Id1-Id2  name == Abilene Regional Airport <PAIR_SEP> ci...   
2  Id2-Id1  name == Adolfo Suárez Madr

In [9]:
webnlg_df


Unnamed: 0,id,singletask_webnlg_sv_role-delex,singletask_webnlg_rdf_role-delex,sv,GPT4_sv,Llama_sv,rdf,GPT4_rdf,Llama_rdf,ref
0,Id1-Id1,The Al Taqaddum Air Base serves the city of Ab...,"Abilene, Texas is the operating organisation o...",name == Abilene Regional Airport <PAIR_SEP> ci...,"Abilene Regional Airport serves Abilene, Texas.","Abilene Regional Airport serves Abilene, Texas.",<SUBJECT> Abilene Regional Airport <PREDICATE>...,Abilene Regional Airport serves the city of Ab...,Abilene Regional Airport serves the city of Ab...,"Abilene , Texas is served by the Abilene regio..."
1,Id1-Id2,Abilene Regional Airport serves the city of Ab...,Abilene Regional Airport serves the city of Ap...,name == Abilene Regional Airport <PAIR_SEP> ci...,Abilene Regional Airport serves the city of Ab...,"Abilene Regional Airport serves Abilene, Texas.",<SUBJECT> Abilene Regional Airport <PREDICATE>...,Abilene Regional Airport serves the city of Ab...,Abilene Regional Airport serves the city of Ab...,Abilene Regional Airport serves the city of Ab...
2,Id2-Id1,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suárez Madrid–Barajas Airport is locate...,name == Adolfo Suárez Madrid– Barajas Airport ...,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suárez Madrid–Barajas Airport is locate...,<SUBJECT> Adolfo Suárez Madrid– Barajas Airpor...,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suárez Madrid – Barajas Airport can be ...
3,Id2-Id2,Adolfo Suarez Madrid -Barejas airport is locat...,The Adolfo Suarez Madrid -Barajas airport is l...,name == Adolfo Suárez Madrid– Barajas Airport ...,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suárez Madrid–Barajas Airport is locate...,<SUBJECT> Adolfo Suárez Madrid– Barajas Airpor...,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suarez Madrid- Barajas airport is locat...
4,Id2-Id3,Adolfo Suarez Madrid -Barajas airport is locat...,The Adolfo Suarez Madrid -Barajas Airport is l...,name == Adolfo Suárez Madrid– Barajas Airport ...,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suárez Madrid–Barajas Airport is locate...,<SUBJECT> Adolfo Suárez Madrid– Barajas Airpor...,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suárez Madrid–Barajas Airport is locate...,Adolfo Suarez Madrid- Barajas Airport is locat...
...,...,...,...,...,...,...,...,...,...,...
4909,Id958-Id1,The School of Business and Social Sciences at ...,The School of Business and Social Sciences at ...,name == School of Business and Social Sciences...,The School of Business and Social Sciences at ...,The School of Business and Social Sciences at ...,<SUBJECT> School of Business and Social Scienc...,The School of Business and Social Sciences at ...,* The School of Business and Social Sciences a...,The School of Business and Social Sciences at ...
4910,Id958-Id2,The School of Business and Social Sciences at ...,The School of Business and Social Sciences at ...,name == School of Business and Social Sciences...,The School of Business and Social Sciences at ...,The School of Business and Social Sciences at ...,<SUBJECT> School of Business and Social Scienc...,The School of Business and Social Sciences at ...,The School of Business and Social Sciences at ...,"Established in 1928 , the School of Business a..."
4911,Id959-Id1,"Brussels is the leader of Denmark, where the c...",The School of Business and Social Sciences at ...,name == School of Business and Social Sciences...,The School of Business and Social Sciences at ...,The School of Business and Social Sciences is ...,<SUBJECT> School of Business and Social Scienc...,The School of Business and Social Sciences at ...,The School of Business and Social Sciences is ...,Denmark is led by the Monarchy of Demark and t...
4912,Id959-Id2,The School of Business and Social Sciences at ...,The School of Business and Social Sciences at ...,name == School of Business and Social Sciences...,The School of Business and Social Sciences at ...,The School of Business and Social Sciences is ...,<SUBJECT> School of Business and Social Scienc...,The School of Business and Social Sciences at ...,The School of Business and Social Sciences at ...,The School of Business and Social Sciences at ...


In [10]:
# Sample 50 ids from the e2e corpus
import random
random.seed(42)
e2e_sample_ids = random.choices(list(e2e_common_ids), k=48)
e2e_items = e2e_df.loc[e2e_df['id'].isin(e2e_sample_ids)]
e2e_items

Unnamed: 0,id,singletask_e2e_sv_name-near-exact-match,singletask_e2e_rdf_name-near-exact-match,sv,GPT4_sv,Llama_sv,rdf,GPT4_rdf,Llama_rdf,ref
60,Id27-Id2,Blue Spice is a pub near Crowne Plaza Hotel wi...,Blue Spice is a pub near Crowne Plaza Hotel wi...,name == Blue Spice <PAIR_SEP> eat type == coff...,Blue Spice is a coffee shop with a customer ra...,"""Blue Spice is a highly-rated coffee shop near...",<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a coffee shop with a customer ra...,Blue Spice is a type of eatery located in a co...,"The Blue Spice coffee shop , near Crowne Plaza..."
167,Id48-Id4,Green Man is a pub near All Bar One with a cus...,Green Man is a pub with a customer rating of 3...,name == Green Man <PAIR_SEP> eat type == pub <...,Green Man is a pub with a customer rating of 3...,"""The Green Man pub has a 3 out of 5 customer r...",<SUBJECT> Green Man <PREDICATE> eat type <OBJE...,The Green Man is a pub with a customer rating ...,The Green Man is a pub where you can eat. It h...,"Located near All Bar One , the Green Man pub h..."
200,Id55-Id6,Zizzi is a pub with an average customer rating...,Zizzi is a pub located near Burger King. It ha...,name == Zizzi <PAIR_SEP> eat type == coffee sh...,Zizzi is a coffee shop with an average custome...,"Zizzi, a coffee shop, has an average customer ...",<SUBJECT> Zizzi <PREDICATE> eat type <OBJECT> ...,Zizzi is a coffee shop with an average custome...,Zizzi is a type of coffee shop. It has an aver...,The Zizzi coffee shop is located near Burger K...
473,Id77-Id3,The Cricketers is a restaurant located near Ca...,The Cricketers is a non family - friendly rest...,name == The Cricketers <PAIR_SEP> eat type == ...,The Cricketers is a restaurant with an average...,The Cricketers is a restaurant with average cu...,<SUBJECT> The Cricketers <PREDICATE> eat type ...,The Cricketers is a restaurant with an average...,The Cricketers eat at a restaurant. They have ...,The Cricketers restaurant has an average custo...
563,Id84-Id18,The Cricketers is a family friendly restaurant...,The Cricketers is a family friendly restaurant...,name == The Cricketers <PAIR_SEP> eat type == ...,The Cricketers is a family-friendly restaurant...,The Cricketers is a family-friendly restaurant...,<SUBJECT> The Cricketers <PREDICATE> eat type ...,The Cricketers is a restaurant near Ranch. It ...,The Cricketers eat at a type of restaurant. Th...,"Ranch is located near the low rated , children..."
668,Id93-Id4,Blue Spice is a Chinese restaurant located in ...,Blue Spice provides Chinese food It is located...,name == Blue Spice <PAIR_SEP> eat type == rest...,Blue Spice is a Chinese restaurant in the city...,Blue Spice is a Chinese restaurant in the city...,<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a Chinese restaurant located in ...,Blue Spice is a Chinese restaurant in the city...,Near Rainbow Vegetarian Café Blue Spice restau...
713,Id98-Id4,Blue Spice is a family - friendly English rest...,Blue Spice is a family - friendly English rest...,name == Blue Spice <PAIR_SEP> eat type == rest...,Blue Spice is an English restaurant located in...,Blue Spice is a restaurant in the city centre ...,<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a family-friendly restaurant loc...,Blue Spice is a restaurant that serves English...,A family- friendly English restaurant near Rai...
966,Id120-Id7,Giraffe is a family friendly French restaurant...,Giraffe is a family friendly French restaurant...,name == Giraffe <PAIR_SEP> eat type == restaur...,Giraffe is a French restaurant located riversi...,"Giraffe, a French restaurant, is a family-frie...",<SUBJECT> Giraffe <PREDICATE> eat type <OBJECT...,Giraffe is a family-friendly French restaurant...,The Giraffe restaurant serves French food and ...,If you are looking for a family- friendly rest...
1051,Id126-Id22,Green Man is a family friendly pub that serves...,Green Man is a family friendly Japanese pub ne...,name == Green Man <PAIR_SEP> eat type == pub <...,The Green Man is a family-friendly Japanese pu...,"""Green Man, a Japanese pub, is a family-friend...",<SUBJECT> Green Man <PREDICATE> eat type <OBJE...,Green Man is a family-friendly pub serving Jap...,The Green Man is a pub that serves Japanese fo...,Green Man is a pub that offers Japanese food ....
1134,Id132-Id1,Green Man is a family friendly Japanese restau...,Green Man is a family friendly Japanese restau...,name == Green Man <PAIR_SEP> eat type == resta...,Green Man is a family-friendly Japanese restau...,Green Man is a Japanese restaurant located in ...,<SUBJECT> Green Man <PREDICATE> eat type <OBJE...,Green Man is a Japanese restaurant located riv...,The Green Man is a restaurant that serves Japa...,The Green Man is a family friendly restaurant ...


In [11]:
# Create 12 lists of 28 items each to cover the 48 ids over 7 conditions (3 systems x 2 input reps + 1 ref)
conditions = ["singletask_e2e_sv_name-near-exact-match", "singletask_e2e_rdf_name-near-exact-match", "GPT4_sv", "GPT4_rdf", "Llama_sv", "Llama_rdf", "ref"]
indices = e2e_items.index
from itertools import cycle
count = 0
combos = set()
item_headers = []
for idx in range(1, 31):
    item_headers.extend([f"item{idx}_id", f"item{idx}_sys", f"item{idx}_format", f"item{idx}_input", f"item{idx}_text"])
headers = ["listID"] + item_headers
rows = []
curr_row_lists = []
for idx, condition in zip(cycle(indices), cycle(conditions)):
    item_id = e2e_items.loc[idx, 'id']
    parts = condition.split("_")
    if parts[2:]:
        sys = f"{parts[0]}"
        mr_type = f"{parts[2]}"
    elif parts[1:]:
        sys = parts[0]
        mr_type = parts[1]
    else:
        sys = "ref"
        mr_type = "none"
    rdf_input = e2e_items.loc[idx, 'rdf'].replace("<TRIPLE_SEP> ", "")
    text = e2e_items.loc[idx, condition]
    list_id = len(rows) + 1
    curr_row_lists.append([item_id, sys, mr_type, rdf_input, text])
    if len(curr_row_lists) == 4:
        curr_row_lists.append(["attn1", "dave", "lol", "<SUBJECT> Please mark <PREDICATE> all Input <OBJECT> as missing for this task", "Please rate this Output as Very Disfluent."])
    if len(curr_row_lists) == 24:
        curr_row_lists.append(["attn2", "dave", "lol", "<SUBJECT> Please mark <PREDICATE> all Input <OBJECT> as incorrect for this task", "Please rate this Output as Somewhat Fluent."])
    if len(curr_row_lists) == 30:
        random.shuffle(curr_row_lists)
        row = [list_id]
        for item in curr_row_lists:
            row.extend(item)
        rows.append(row)
        random.shuffle(curr_row_lists)
        row = [list_id]
        for item in curr_row_lists:
            row.extend(item)
        rows.append(row)
        random.shuffle(curr_row_lists)
        row = [list_id]
        for item in curr_row_lists:
            row.extend(item)
        rows.append(row)
        curr_row_lists = []
    count += 1
    if count == 336:
        break
    
# Convert the lists into an input CSV for Lewis

In [12]:
len(rows)

36

In [13]:
e2e_humeval_df = pd.DataFrame(rows, columns=headers)

In [14]:
e2e_humeval_df

Unnamed: 0,listID,item1_id,item1_sys,item1_format,item1_input,item1_text,item2_id,item2_sys,item2_format,item2_input,...,item29_id,item29_sys,item29_format,item29_input,item29_text,item30_id,item30_sys,item30_format,item30_input,item30_text
0,1,Id163-Id9,Llama,sv,<SUBJECT> The Mill <PREDICATE> eat type <OBJEC...,The Mill is a high-end English restaurant in t...,attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,...,Id284-Id4,GPT4,rdf,<SUBJECT> The Vaults <PREDICATE> eat type <OBJ...,The Vaults is a high-priced restaurant in the ...,Id259-Id2,Llama,rdf,<SUBJECT> The Vaults <PREDICATE> eat type <OBJ...,The Vaults is a pub that serves French food at...
1,1,attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,Please rate this Output as Somewhat Fluent.,Id98-Id4,ref,none,<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,...,Id301-Id6,Llama,rdf,<SUBJECT> The Waterman <PREDICATE> eat type <O...,The Waterman is a pub that serves Italian food...,Id48-Id4,singletask,rdf,<SUBJECT> Green Man <PREDICATE> eat type <OBJE...,Green Man is a pub with a customer rating of 3...
2,1,Id166-Id2,Llama,rdf,<SUBJECT> The Mill <PREDICATE> eat type <OBJEC...,The Mill is a restaurant that serves English f...,Id171-Id1,ref,none,<SUBJECT> The Mill <PREDICATE> eat type <OBJEC...,...,Id93-Id4,Llama,rdf,<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a Chinese restaurant in the city...,Id180-Id5,singletask,rdf,<SUBJECT> The Phoenix <PREDICATE> eat type <OB...,The Phoenix is a fast food pub located in the ...
3,4,Id370-Id10,GPT4,sv,<SUBJECT> The Wrestlers <PREDICATE> eat type <...,The Wrestlers is a Japanese restaurant located...,Id547-Id4,ref,none,<SUBJECT> The Punter <PREDICATE> eat type <OBJ...,...,attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,Please rate this Output as Somewhat Fluent.,Id575-Id4,singletask,rdf,<SUBJECT> The Vaults <PREDICATE> eat type <OBJ...,The Vaults is a pub providing Italian food in ...
4,4,Id547-Id4,ref,none,<SUBJECT> The Punter <PREDICATE> eat type <OBJ...,Near Express by Holiday Inn in Riverside is a ...,attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,...,Id463-Id8,Llama,sv,<SUBJECT> The Mill <PREDICATE> eat type <OBJEC...,The Mill is a restaurant serving English food ...,Id120-Id7,ref,none,<SUBJECT> Giraffe <PREDICATE> eat type <OBJECT...,If you are looking for a family- friendly rest...
5,4,Id447-Id2,singletask,rdf,<SUBJECT> The Mill <PREDICATE> eat type <OBJEC...,The Mill is a fast food coffee shop located in...,Id55-Id6,singletask,rdf,<SUBJECT> Zizzi <PREDICATE> eat type <OBJECT> ...,...,Id48-Id4,singletask,sv,<SUBJECT> Green Man <PREDICATE> eat type <OBJE...,Green Man is a pub near All Bar One with a cus...,Id77-Id3,GPT4,sv,<SUBJECT> The Cricketers <PREDICATE> eat type ...,The Cricketers is a restaurant with an average...
6,7,Id313-Id2,Llama,rdf,<SUBJECT> The Waterman <PREDICATE> eat type <O...,The Waterman is a restaurant that serves India...,Id126-Id22,singletask,sv,<SUBJECT> Green Man <PREDICATE> eat type <OBJE...,...,Id276-Id8,Llama,rdf,<SUBJECT> The Vaults <PREDICATE> eat type <OBJ...,The Vaults is a French restaurant in the river...,Id390-Id5,GPT4,sv,<SUBJECT> The Cricketers <PREDICATE> eat type ...,The Cricketers is a high-priced Chinese restau...
7,7,Id197-Id1,singletask,rdf,<SUBJECT> The Phoenix <PREDICATE> eat type <OB...,The Phoenix is a fast food restaurant located ...,Id443-Id5,Llama,rdf,<SUBJECT> The Mill <PREDICATE> eat type <OBJEC...,...,attn1,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,Please rate this Output as Very Disfluent.,Id171-Id1,Llama,rdf,<SUBJECT> The Mill <PREDICATE> eat type <OBJEC...,The Mill is a restaurant that serves English f...
8,7,Id410-Id3,GPT4,rdf,<SUBJECT> The Cricketers <PREDICATE> eat type ...,"The Cricketers is a high-rated, family-friendl...",attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,...,Id284-Id4,GPT4,sv,<SUBJECT> The Vaults <PREDICATE> eat type <OBJ...,The Vaults is a high-end Indian restaurant loc...,Id390-Id5,GPT4,sv,<SUBJECT> The Cricketers <PREDICATE> eat type ...,The Cricketers is a high-priced Chinese restau...
9,10,Id576-Id6,singletask,rdf,<SUBJECT> The Vaults <PREDICATE> eat type <OBJ...,The Vaults is a high - priced pub located near...,Id463-Id8,GPT4,rdf,<SUBJECT> The Mill <PREDICATE> eat type <OBJEC...,...,Id463-Id4,GPT4,sv,<SUBJECT> The Mill <PREDICATE> eat type <OBJEC...,The Mill is an English restaurant located in t...,Id98-Id4,Llama,sv,<SUBJECT> Blue Spice <PREDICATE> eat type <OBJ...,Blue Spice is a restaurant in the city centre ...


In [15]:
e2e_humeval_df.to_csv("e2e-humeval.csv")

In [16]:
# Sample 50 ids from the webnlg corpus
import random
random.seed(42)
webnlg_sample_ids = random.choices(list(webnlg_common_ids), k=48)
webnlg_items = webnlg_df.loc[webnlg_df['id'].isin(webnlg_sample_ids)]
webnlg_items

Unnamed: 0,id,singletask_webnlg_sv_role-delex,singletask_webnlg_rdf_role-delex,sv,GPT4_sv,Llama_sv,rdf,GPT4_rdf,Llama_rdf,ref
27,Id11-Id3,The first runway at Amsterdam Airport Schiphol...,The 1st runway at Amsterdam Airport Schiphol i...,name == Amsterdam Airport Schiphol <PAIR_SEP> ...,Amsterdam Airport Schiphol's 5th runway has an...,Amsterdam Airport Schiphol has a 5th runway wi...,<SUBJECT> Amsterdam Airport Schiphol <PREDICAT...,Amsterdam Airport Schiphol's 5th runway is sur...,Amsterdam Airport Schiphol's 5th runway has an...,The 5th runway of the Amsterdam Airport Schiph...
37,Id16-Id1,The 1st runway at Angola International Airport...,The 1st runway at Angola International Airport...,name == Angola International Airport <PAIR_SEP...,Angola International Airport's first runway su...,The Angola International Airport has a 1st run...,<SUBJECT> Angola International Airport <PREDIC...,The first runway surface type at Angola Intern...,Angola International Airport's 1st runway has ...,Angola International Airport is the first runw...
95,Id1127-Id1,Aaron Deer was born in The Horns of Happiness.,Aaron Deer was followed by The Horns of Happin...,name == Aaron Deer <PAIR_SEP> associated band/...,Aaron Deer is associated with the band The Hor...,Aaron Deer is a member of the band The Horns o...,<SUBJECT> Aaron Deer <PREDICATE> associated ba...,Aaron Deer is associated with The Horns of Hap...,Aaron Deer is associated with the band/musical...,Aaron Deer collaborates with The Horns of Happ...
232,Id143-Id3,California is a food found in Benitoite.,California is affiliated with the Benitoite.,name == California <PAIR_SEP> gemstone == Beni...,California's state gemstone is Benitoite.,California's official gemstone is Benitoite.,<SUBJECT> California <PREDICATE> gemstone <OBJ...,California's gemstone is Benitoite.,California's gemstone is Benitoite.,Benitoite is a gemstone from California .
408,Id69-Id2,New York City is part of New Netherland.,New York City is part of New Netherland.,name == New York City <PAIR_SEP> is part of ==...,New York City is part of New Netherland.,New York City is part of New Netherland.,<SUBJECT> New York City <PREDICATE> is part of...,New York City is part of New Netherland.,New York City is part of New Netherland.,New York city is part of New Netherland .
523,Id171-Id1,The leader of Atlanta is Kasim Reed.,The leader of Atlanta is Kasim Reed.,name == Atlanta <PAIR_SEP> leader == Kasim Reed,Atlanta is led by Kasim Reed.,The city of Atlanta is led by Kasim Reed.,<SUBJECT> Atlanta <PREDICATE> leader <OBJECT> ...,Kasim Reed is a leader in Atlanta.,Kasim Reed is the leader of Atlanta.,Kasim Reed is the leader in Atlanta .
524,Id171-Id2,The leader of Atlanta is Kasim Reed.,The leader of Atlanta is Kasim Reed.,name == Atlanta <PAIR_SEP> leader == Kasim Reed,Atlanta is led by Kasim Reed.,The city of Atlanta is led by Kasim Reed.,<SUBJECT> Atlanta <PREDICATE> leader <OBJECT> ...,Kasim Reed is a leader in Atlanta.,Kasim Reed is the leader of Atlanta.,Kasim Reed is the leader of Atlanta .
558,Id187-Id1,The leader of United States is Barack Obama.,The leader of United States is Barack Obama.,name == United States <PAIR_SEP> leader == Bar...,The United States was led by Barack Obama.,The leader of the United States is Barack Obama.,<SUBJECT> United States <PREDICATE> leader <OB...,Barack Obama is a leader of the United States.,The leader of the United States is Barack Obama.,Barack Obama is the leader of the United States .
737,Id1054-Id2,Ariane 5 was born in Guiana Space Centre.,The full name of Ariane 5 is Guiana Space Centre.,name == Ariane 5 <PAIR_SEP> launch site == Gui...,The Ariane 5 rocket was launched from the Guia...,The Ariane 5 was launched from the Guiana Spac...,<SUBJECT> Ariane 5 <PREDICATE> launch site <OB...,Ariane 5 is launched from the Guiana Space Cen...,The Ariane 5 launch site is the Guiana Space C...,The Ariane 5 was launched from the Guiana Spac...
783,Id1074-Id2,The United States was established in,"The parent company of United States is"" Addict...",name == United States <PAIR_SEP> anthem == The...,"The anthem of the United States is ""The Star-S...","The United States' national anthem is ""The Sta...",<SUBJECT> United States <PREDICATE> anthem <OB...,"The anthem of the United States is ""The Star-S...",The anthem of the United States is The Star-Sp...,The anthem of the United States is the Star Sp...


In [17]:

from typing import List

import abc
import collections
import regex

RegexRule = collections.namedtuple('RegexRule', ("match_expression", "replacement_expression"))

class AbstractTokeniser(abc.ABC):
    @classmethod
    def preprocess(cls, text: str):
        return text

    @classmethod
    def postprocess(cls, toks) -> str:
        return toks

    @classmethod
    @abc.abstractmethod
    def tokenise(cls, text: str) -> str:
        pass

    @classmethod
    def tokenize(cls, text: str) -> str:
        return cls.tokenise(text)

    @classmethod
    def tokenise_to_list(cls, text: str) -> List[str]:
        return cls.tokenise(text).split()

    @classmethod
    def tokenize_to_list(cls, text: str) -> List[str]:
        return cls.tokenise_to_list(text)

class INLG2024Tokenizer(AbstractTokeniser):
    """English tokenisation based mostly on TGen's tokenisation"""
    rules = (RegexRule(r'(([^\p{IsAlnum}\s\.\,−\-_])\2*)', r' \1 '),
             RegexRule(r'([^\p{N}])([,.])([^\p{N}])', r'\1 \2 \3'),
             RegexRule(r'([^\p{N}])([,.])([\p{N}])', r'\1 \2 \3'),
             RegexRule(r'([\p{N}])([,.])([^\p{N}])', r'\1 \2 \3'),
             RegexRule(r'(–-)([^\p{N}])', r'\1 \2'),
             RegexRule(r'(\p{N} *|[^ ])(-)', r'\1\2 '),
             RegexRule(r' ([-−])', r'\1'),
             RegexRule(r' ([\'’´])', r'\1'),
             # Second set keeps apostrophes together with words in most common contractions.
             RegexRule(r'([\'’´]) (s|m|d|ll|re|ve)\s', r' \1\2 '),
             RegexRule(r'(n [\'’´]) (t\s)', r' \1\2 '),
             # Third set of contractions based on Treex.
             RegexRule(r' ([Cc])annot\s', r' \1an not '),
             RegexRule(r' ([Dd]) \' ye\s', r' \1\' ye '),
             RegexRule(r' ([Gg])imme\s', r' \1im me '),
             RegexRule(r' ([Gg])onna\s', r' \1on na '),
             RegexRule(r' ([Gg])otta\s', r' \1ot ta '),
             RegexRule(r' ([Ll])emme\s', r' \1em me '),
             RegexRule(r' ([Mm])ore\'n\s', r' \1ore \'n '),
             RegexRule(r' \' ([Tt])is\s', r' \'\1 is '),
             RegexRule(r' \' ([Tt])was\s', r' \'\1 was '),
             RegexRule(r' ([Ww])anna\s', r' \1an na ')
             )
    detok_rules = (RegexRule(r' (([^\p{IsAlnum}\s\.\,−\-])\2*) ', r'\1 '),
                   RegexRule(r'([^\p{N}]) ([,.]) ([^\p{N}])', r'\1\2 \3'),
                   RegexRule(r'([^\p{N}]) ([,.]) ([\p{N}])', r'\1\2 \3'),
                   RegexRule(r'([\p{N}]) ([,.]) ([^\p{N}])', r'\1\2 \3'),
                   RegexRule(r'(–-) ([^\p{N}])', r'\1\2'),
                   RegexRule(r'(\p{N} *|[^ ])(-) ', r'\1\2'),
                   RegexRule(r'([-−])', r' \1'),
                   # Second set keeps apostrophes together with words in most common contractions.
                   # RegexRule(r'([\'’´]) (s|m|d|ll|re|ve)\s', r' \1\2 '),
                   # RegexRule(r'(n [\'’´]) (t\s)', r' \1\2 '),
                   # Third set of contractions based on Treex.
                   RegexRule(r' ([Cc])an not ', r' \1annot '),
                   RegexRule(r' ([Gg])im me\s', r' \1imme '),
                   RegexRule(r' ([Gg])on na\s', r' \1onna '),
                   RegexRule(r' ([Gg])ot ta\s', r' \1otta '),
                   RegexRule(r' ([Ll])em me ', r' \1emme '),
                   RegexRule(r' ([Ww])an na ', r' \1anna '),
                   # Fourth set removes remaining spaces before punctuation
                   RegexRule(r' ([.,?!;:\'])', r'\1'),
                   RegexRule(r' (__[\p{IsAlnum}][\p{IsAlnum}]*) (-[\p{N}]__) ', r' \1\2 '),
             )

    @classmethod
    def preprocess(cls, text: str):
        """TGen inserts spaces around text for easier regexes"""
        return f" {text} "

    @classmethod
    def postprocess(cls, toks: str):
        """TGen removes extra spaces from the text, so spaces can be used as token separators."""
        toks = regex.sub(r'\s+', ' ', toks).strip().split()
        retval = []
        curr_tok = ""
        for tok in toks:
            if "__" in tok:
                if tok.startswith("__") and not tok.endswith("__"):
                    curr_tok = tok
                elif tok.endswith("__") and not tok.startswith("__"):
                    if curr_tok:
                        retval.append(curr_tok+tok)
                        curr_tok = ""
            else:
                retval.append(tok)
        return " ".join(retval)

    @classmethod
    def tokenise(cls, text: str) -> str:
        intermediate_text = cls.preprocess(text)
        for rule in cls.rules:
            intermediate_text = regex.sub(rule.match_expression, rule.replacement_expression, intermediate_text)
        return cls.postprocess(intermediate_text)

    @classmethod
    def detokenise(cls, text: str) -> str:
        intermediate_text = cls.preprocess(text)
        for rule in cls.detok_rules:
            intermediate_text = regex.sub(rule.match_expression, rule.replacement_expression, intermediate_text)
        return cls.postprocess(intermediate_text)





# Create 12 lists of 28 items each to cover the 48 ids over 7 conditions (3 systems x 2 input reps + 1 ref)
conditions = ["singletask_webnlg_sv_role-delex", "singletask_webnlg_rdf_role-delex", "GPT4_sv", "GPT4_rdf", "Llama_sv", "Llama_rdf", "ref"]
indices = webnlg_items.index
from itertools import cycle
count = 0
combos = set()
item_headers = []
for idx in range(1, 31):
    item_headers.extend([f"item{idx}_id", f"item{idx}_sys", f"item{idx}_format", f"item{idx}_input", f"item{idx}_text"])
headers = ["listID"] + item_headers
rows = []
curr_row_lists = []
for idx, condition in zip(cycle(indices), cycle(conditions)):
    item_id = webnlg_items.loc[idx, 'id']
    parts = condition.split("_")
    if parts[2:]:
        sys = f"{parts[0]}"
        mr_type = f"{parts[2]}"
    elif parts[1:]:
        sys = parts[0]
        mr_type = parts[1]
    else:
        sys = "ref"
        mr_type = "none"
    rdf_input = webnlg_items.loc[idx, 'rdf'].replace("<TRIPLE_SEP> ", "")
    text = webnlg_items.loc[idx, condition]
    list_id = len(rows) + 1
    curr_row_lists.append([item_id, sys, mr_type, rdf_input, text])
    if len(curr_row_lists) == 4:
        curr_row_lists.append(["attn1", "dave", "lol", "<SUBJECT> Please mark <PREDICATE> all Input <OBJECT> as missing for this task", "Please rate this Output as Very Disfluent."])
    if len(curr_row_lists) == 24:
        curr_row_lists.append(["attn2", "dave", "lol", "<SUBJECT> Please mark <PREDICATE> all Input <OBJECT> as incorrect for this task", "Please rate this Output as Somewhat Fluent."])
    if len(curr_row_lists) == 30:
        random.shuffle(curr_row_lists)
        row = [list_id]
        for item in curr_row_lists:
            row.extend(item)
        rows.append(row)
        random.shuffle(curr_row_lists)
        row = [list_id]
        for item in curr_row_lists:
            row.extend(item)
        rows.append(row)
        random.shuffle(curr_row_lists)
        row = [list_id]
        for item in curr_row_lists:
            row.extend(item)
        rows.append(row)
        curr_row_lists = []
    count += 1
    if count == 336:
        break
    
# Convert the lists into an input CSV for Lewis

In [18]:
webnlg_humeval_df = pd.DataFrame(rows, columns=headers)
webnlg_humeval_df

Unnamed: 0,listID,item1_id,item1_sys,item1_format,item1_input,item1_text,item2_id,item2_sys,item2_format,item2_input,...,item29_id,item29_sys,item29_format,item29_input,item29_text,item30_id,item30_sys,item30_format,item30_input,item30_text
0,1,Id120-Id3,Llama,sv,<SUBJECT> Italy <PREDICATE> leader <OBJECT> Se...,The leader of Italy is Sergio Mattarella.,attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,...,Id1443-Id3,GPT4,rdf,<SUBJECT> 10 Hygiea <PREDICATE> average speed ...,10 Hygiea has an average speed of 16.76 kilome...,Id1473-Id3,Llama,rdf,<SUBJECT> Alison O Donnell <PREDICATE> associa...,Alison O Donnell is associated with the bands ...
1,1,attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,Please rate this Output as Somewhat Fluent.,Id171-Id2,ref,none,<SUBJECT> Atlanta <PREDICATE> leader <OBJECT> ...,...,Id504-Id3,Llama,rdf,<SUBJECT> United States <PREDICATE> capital <O...,"Attica, Indiana is part of Fountain County, In...",Id16-Id1,singletask,rdf,<SUBJECT> Angola International Airport <PREDIC...,The 1st runway at Angola International Airport...
2,1,Id137-Id4,Llama,rdf,<SUBJECT> School of Business and Social Scienc...,The School of Business and Social Sciences at ...,Id1304-Id1,ref,none,<SUBJECT> Alison O Donnell <PREDICATE> genre <...,...,Id171-Id1,Llama,rdf,<SUBJECT> Atlanta <PREDICATE> leader <OBJECT> ...,Kasim Reed is the leader of Atlanta.,Id1313-Id2,singletask,rdf,<SUBJECT> Andrew Rayel <PREDICATE> associated ...,A T Charlie Johnson is the editor of Andrew Ra...
3,4,Id1519-Id2,GPT4,sv,<SUBJECT> Atlas II <PREDICATE> launch site <OB...,The Atlas II was launched from Spaceport Flori...,Id1837-Id3,ref,none,<SUBJECT> Alfred Garth Jones <PREDICATE> death...,...,attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,Please rate this Output as Somewhat Fluent.,Id1777-Id3,singletask,rdf,<SUBJECT> FC Terek Grozny <PREDICATE> ground <...,Aleksandr Prudnikov are managed by Grozny. He ...
4,4,Id1837-Id3,ref,none,<SUBJECT> Alfred Garth Jones <PREDICATE> death...,"Manchester born Alfred Garth Jones , died in S...",attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,...,Id703-Id1,Llama,sv,<SUBJECT> 1634: The Bavarian Crisis <PREDICATE...,"""1634: The Bavarian Crisis"" by Eric Flint, pre...",Id187-Id1,ref,none,<SUBJECT> United States <PREDICATE> leader <OB...,Barack Obama is the leader of the United States .
5,4,Id1576-Id3,singletask,rdf,<SUBJECT> Akeem Dent <PREDICATE> debut team <O...,Akeem Dent was born in Houston Texans and died...,Id1127-Id1,singletask,rdf,<SUBJECT> Aaron Deer <PREDICATE> associated ba...,...,Id16-Id1,singletask,sv,<SUBJECT> Angola International Airport <PREDIC...,The 1st runway at Angola International Airport...,Id143-Id3,GPT4,sv,<SUBJECT> California <PREDICATE> gemstone <OBJ...,California's state gemstone is Benitoite.
6,7,Id560-Id1,Llama,rdf,<SUBJECT> Bandeja paisa <PREDICATE> region <OB...,The Bandeja paisa is from the Paisa Region. It...,Id1054-Id2,singletask,sv,<SUBJECT> Ariane 5 <PREDICATE> launch site <OB...,...,Id1531-Id1,Llama,rdf,<SUBJECT> Aaron Hunt <PREDICATE> club <OBJECT>...,Aaron Hunt plays for Vf L Wolfsburg. Vf L Wolf...,Id1382-Id2,GPT4,sv,<SUBJECT> Abdul Taib Mahmud <PREDICATE> reside...,Abdul Taib Mahmud resides in Sarawak and was b...
7,7,Id247-Id3,singletask,rdf,<SUBJECT> United States <PREDICATE> ethnic gro...,"Auburn, Alabama is from the United States wher...",Id1401-Id3,Llama,rdf,<SUBJECT> Airey Neave <PREDICATE> battles <OBJ...,...,attn1,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,Please rate this Output as Very Disfluent.,Id1304-Id1,Llama,rdf,<SUBJECT> Alison O Donnell <PREDICATE> genre <...,Alison O Donnell is a folk rock artist signed ...
8,7,Id1393-Id2,GPT4,rdf,<SUBJECT> Adam Koc <PREDICATE> battles <OBJECT...,"Adam Koc battles in the Polish–Soviet War, whi...",attn2,dave,lol,<SUBJECT> Please mark <PREDICATE> all Input <O...,...,Id1443-Id3,GPT4,sv,<SUBJECT> 10 Hygiea <PREDICATE> average speed ...,10 Hygiea travels at an average speed of 16.76...,Id1382-Id2,GPT4,sv,<SUBJECT> Abdul Taib Mahmud <PREDICATE> reside...,Abdul Taib Mahmud resides in Sarawak and was b...
9,10,Id853-Id3,singletask,rdf,<SUBJECT> Barny Cakes <PREDICATE> country <OBJ...,"Arros negre comes from the region of France, w...",Id703-Id1,GPT4,rdf,<SUBJECT> 1634: The Bavarian Crisis <PREDICATE...,...,Id762-Id3,GPT4,sv,<SUBJECT> Bandeja paisa <PREDICATE> main ingre...,Bandeja paisa is a traditional dish from the P...,Id171-Id2,Llama,sv,<SUBJECT> Atlanta <PREDICATE> leader <OBJECT> ...,The city of Atlanta is led by Kasim Reed.


In [19]:
webnlg_humeval_df.to_csv("webnlg-humeval.csv")

In [20]:
from collections import defaultdict
e2e_refs = defaultdict(list)
for _, (id, ref) in e2e_df[['id', 'ref']].iterrows():
    e2e_refs[id.split('-')[0]].append(ref)

In [21]:
import sacrebleu.metrics as sm
import bert_score

e2e_ref_lists = [[], [], []]
ids = [x.split('-')[0] for x in list(e2e_df.loc[:, 'id'])]
for entry_id in ids:
    entry_refs = e2e_refs[entry_id]
    e2e_ref_lists[0].append(entry_refs[0])
    if len(entry_refs) > 1:
        e2e_ref_lists[1].append(e2e_refs[entry_id][1])
    else:
        e2e_ref_lists[1].append("")
    if len(entry_refs) > 2:
        e2e_ref_lists[2].append(e2e_refs[entry_id][2])
    else:
        e2e_ref_lists[2].append("")
e2e_bs_refs = list(zip(e2e_ref_lists[0], e2e_ref_lists[1], e2e_ref_lists[2]))
bleu = sm.BLEU()
bleu_lower = sm.BLEU(lowercase=True)
for sys in ("singletask_e2e_sv_name-near-exact-match", "singletask_e2e_rdf_name-near-exact-match", "GPT4_sv", "GPT4_rdf", "Llama_sv", "Llama_rdf"):
    texts = list(e2e_df.loc[:, sys])
    bleu_score = bleu.corpus_score(texts, e2e_ref_lists)
    bleu_score_lower = bleu_lower.corpus_score(texts, e2e_ref_lists)
    p, r, f1 = bert_score.score(texts, e2e_bs_refs, rescale_with_baseline=True, lang='en', verbose=True, device="cuda:0")
    p_nb, r_nb, f1_nb = bert_score.score(texts, e2e_bs_refs, rescale_with_baseline=False, lang='en', verbose=True, device="cuda:0")
    print(sys)
    for text in texts[:10]:
        print(text)
    print(bleu_score)
    print(bleu_score_lower)
    print(f"BERTScore: {p.mean()} / {r.mean()} / {f1.mean()}")
    print(f"BERTScore: {p_nb.mean()} / {r_nb.mean()} / {f1_nb.mean()}")
    print("-----")
    

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/37 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 17.47 seconds, 784.09 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/37 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 16.61 seconds, 824.79 sentences/sec
singletask_e2e_sv_name-near-exact-match
Blue Spice is a pub in the city centre.
Blue Spice is a pub in the city centre.
Blue Spice is a pub in the riverside area.
Blue Spice is a pub in the riverside area.
Blue Spice is a pub in the city centre.
Blue Spice is a pub in the city centre.
Blue Spice is a pub in the riverside area.
Blue Spice is a pub in the riverside area.
Clowns is a pub near The Sorrento.
Cocum is a pub near The Rice Boat.
BLEU = 46.56 80.6/56.6/38.7/26.6 (BP = 1.000 ratio = 1.015 hyp_len = 129114 ref_len = 127170)
BLEU = 47.35 81.5/57.5/39.5/27.1 (BP = 1.000 ratio = 1.015 hyp_len = 129114 ref_len = 127170)
BERTScore: 0.7086080312728882 / 0.630554735660553 / 0.663634717464447
BERTScore: 0.9509021639823914 / 0.9377462863922119 / 0.9432302117347717
-----


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/37 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 16.57 seconds, 826.50 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/37 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 16.65 seconds, 822.78 sentences/sec
singletask_e2e_rdf_name-near-exact-match
Blue Spice is a pub located in the city centre.
Blue Spice is a pub located in the city centre.
Blue Spice is a pub in the riverside area.
Blue Spice is a pub in the riverside area.
Blue Spice is a pub located in the city centre.
Blue Spice is a pub located in the city centre.
Blue Spice is a pub in the riverside area.
Blue Spice is a pub in the riverside area.
Clowns is a pub near The Sorrento.
Cocum is a pub near The Rice Boat.
BLEU = 46.05 80.1/55.7/38.1/26.4 (BP = 1.000 ratio = 1.014 hyp_len = 128590 ref_len = 126758)
BLEU = 46.86 81.1/56.7/38.9/27.0 (BP = 1.000 ratio = 1.014 hyp_len = 128590 ref_len = 126758)
BERTScore: 0.6986987590789795 / 0.6256824731826782 / 0.6565150618553162
BERTScore: 0.9492325186729431 / 0.9369252324104309 / 0.9420285224914551
-----


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/89 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 40.39 seconds, 339.16 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/89 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 40.29 seconds, 339.95 sentences/sec
GPT4_sv
Blue Spice is a coffee shop located in the city centre.
Blue Spice is a coffee shop located in the city centre.
Blue Spice is a coffee shop located by the riverside.
Blue Spice is a coffee shop located by the riverside.
Blue Spice is a pub located in the city centre.
Blue Spice is a pub located in the city centre.
Blue Spice is a pub located in the riverside area.
Blue Spice is a pub located riverside.
Clowns is a pub near The Sorrento.
Cocum is a coffee shop near The Rice Boat.
BLEU = 40.88 80.5/51.7/33.1/20.7 (BP = 0.995 ratio = 0.995 hyp_len = 122221 ref_len = 122813)
BLEU = 41.61 81.4/52.6/33.8/21.1 (BP = 0.995 ratio = 0.995 hyp_len = 122221 ref_len = 122813)
BERTScore: 0.7126024961471558 / 0.6520476341247559 / 0.676703929901123
BERTScore: 0.9515752196311951 / 0.9413679838180542 / 0.9454358816146851
-----


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/93 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 42.11 seconds, 325.30 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/93 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 41.89 seconds, 327.03 sentences/sec
GPT4_rdf
Blue Spice is a coffee shop located in the city centre.
Blue Spice is a coffee shop located in the city centre.
Blue Spice is a coffee shop located by the riverside.
Blue Spice is a coffee shop located by the riverside.
Blue Spice is a pub located in the city centre.
Blue Spice is a pub located in the city centre.
Blue Spice is a pub located by the riverside.
Blue Spice is a pub located by the riverside.
Clowns typically eat at pubs and are often found near The Sorrento.
Cocum is a coffee shop located near The Rice Boat.
BLEU = 39.09 77.5/50.0/31.4/19.2 (BP = 1.000 ratio = 1.054 hyp_len = 137320 ref_len = 130301)
BLEU = 39.81 78.5/50.9/32.1/19.6 (BP = 1.000 ratio = 1.054 hyp_len = 137320 ref_len = 130301)
BERTScore: 0.6930341124534607 / 0.656459629535675 / 0.669673502445221
BERTScore: 0.94827800989151 / 0.9421113729476929 / 0.9442493319511414
-----


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/87 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 39.67 seconds, 345.32 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/87 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 39.69 seconds, 345.15 sentences/sec
Llama_sv
Blue Spice is a coffee shop in the city centre.
Blue Spice is a coffee shop located in the city centre.
"Blue Spice is a coffee shop located in the riverside area."
Blue Spice is a coffee shop in the Riverside area.
"Blue Spice is a pub in the city centre."
Blue Spice is a pub in the city centre.
Blue Spice is a pub in the riverside area.
Blue Spice is a pub located in the riverside area.
Clowns eat at a pub near The Sorrento.
Cocum is a coffee shop near The Rice Boat.
BLEU = 34.87 73.2/44.2/27.6/16.5 (BP = 1.000 ratio = 1.015 hyp_len = 127327 ref_len = 125504)
BLEU = 35.80 74.1/45.2/28.5/17.2 (BP = 1.000 ratio = 1.015 hyp_len = 127327 ref_len = 125504)
BERTScore: 0.6516315937042236 / 0.6187602281570435 / 0.6294229626655579
BERTScore: 0.9413020014762878 / 0.9357588291168213 / 0.9374561309814453
-----


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/88 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 44.32 seconds, 309.09 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/88 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/215 [00:00<?, ?it/s]



done in 44.33 seconds, 308.98 sentences/sec
Llama_rdf
Blue Spice is a type of coffee shop located in the city centre.
Blue Spice is a type of coffee shop that can be found in the city centre.
Blue Spice is a type of coffee shop that is located in the riverside area.
Blue Spice is a type of eatery located at a coffee shop in the riverside area.
Blue Spice is a type of eatery located in the city centre.
Blue Spice is a type of eatery in the pub area, located in the city centre.
Blue Spice is a type of eatery located in the riverside area of a pub.
Blue Spice is a type of eatery located in the riverside area of a pub.
Clowns eat at a type of pub. Clowns are near The Sorrento.
Cocum is a type of eatery found in a coffee shop. Cocum is near The Rice Boat.
BLEU = 34.17 70.4/43.9/27.0/16.3 (BP = 1.000 ratio = 1.137 hyp_len = 156842 ref_len = 137950)
BLEU = 35.20 71.8/45.1/28.0/16.9 (BP = 1.000 ratio = 1.137 hyp_len = 156842 ref_len = 137950)
BERTScore: 0.6384944915771484 / 0.6428965926170349 



In [26]:
webnlg_refs = defaultdict(list)
for _, (id, ref) in webnlg_df[['id', 'ref']].iterrows():
    webnlg_refs[id.split('-')[0]].append(ref)
webnlg_ref_lists = [[], [], []]
ids = [x.split('-')[0] for x in list(webnlg_df.loc[:, 'id'])]
for entry_id in ids:
    entry_refs = webnlg_refs[entry_id]
    webnlg_ref_lists[0].append(entry_refs[0])
    if len(entry_refs) > 1:
        webnlg_ref_lists[1].append(webnlg_refs[entry_id][1])
    else:
        webnlg_ref_lists[1].append("")
    if len(entry_refs) > 2:
        webnlg_ref_lists[2].append(webnlg_refs[entry_id][2])
    else:
        webnlg_ref_lists[2].append("")
webnlg_bs_refs = list(zip(webnlg_ref_lists[0], webnlg_ref_lists[1], webnlg_ref_lists[2]))
bleu = sm.BLEU()
bleu_lower = sm.BLEU(lowercase=True)
for sys in ("singletask_webnlg_sv_role-delex", "singletask_webnlg_rdf_role-delex", "GPT4_sv", "GPT4_rdf", "Llama_sv", "Llama_rdf"):
    texts = list(webnlg_df.loc[:, sys])
    bleu_score = bleu.corpus_score(texts, webnlg_ref_lists)
    bleu_score_lower = bleu_lower.corpus_score(texts, webnlg_ref_lists)
    p, r, f1 = bert_score.score(texts, webnlg_bs_refs, rescale_with_baseline=True, lang='en', verbose=True, device="cuda:0")
    p_nb, r_nb, f1_nb = bert_score.score(texts, webnlg_bs_refs, rescale_with_baseline=False, lang='en', verbose=True, device="cuda:0")
    print(sys)
    for text in texts[:10]:
        print(text)
    print(bleu_score)
    print(bleu_score_lower)
    print(f"BERTScore: {p.mean()} / {r.mean()} / {f1.mean()}")
    print(f"BERTScore: {p_nb.mean()} / {r_nb.mean()} / {f1_nb.mean()}")
    print("-----")
    

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/122 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 55.69 seconds, 264.72 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/122 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 55.86 seconds, 263.90 sentences/sec
singletask_webnlg_sv_role-delex
The Al Taqaddum Air Base serves the city of Abilene, Texas.
Abilene Regional Airport serves the city of Abilene Texas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes and Alcobendas.
Adolfo Suarez Madrid -Barejas airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes and Alcobendas.
Adolfo Suarez Madrid -Barajas airport is located in Soldevanahalli, Acharya Dr. Sarvapalli Radhakrishnan Road, Hessarghatta Main Road, Bangalore– 560090.
The runway name of Adolfo Suárez Madrid–Barajas Airport is 18L/36R.
18L/36R is the runway name of Ardmore Airport in New Zealand.
18L/36R is the runway name of Ardmore Airport in New Zealand.
The ICAO Location Identifier of Afonso Pena International Airport is SBCT.
The ICAO Location Identifier of Afonso Pena International Airport is SBCT.
BLEU = 29.62 63.2/38.7/25.8/17.6 (BP = 0.912 ratio = 0.916

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/124 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 60.64 seconds, 243.09 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/124 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 61.10 seconds, 241.28 sentences/sec
singletask_webnlg_rdf_role-delex
Abilene, Texas is the operating organisation of Atlantic City International airport.
Abilene Regional Airport serves the city of Appleton.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes and Alcobendas.
The Adolfo Suarez Madrid -Barajas airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes and Alcobendas.
The Adolfo Suarez Madrid -Barajas Airport is located at San Sebastian de los Reyes in San Sebastian de los Reyes.
The runway name of Adolfo Suárez Madrid–Barajas Airport is 18L/36R.
18L/36R is the runway name of Ardmore Airport, New Zealand.
18L/36R is the runway name of Ardmore Airport, New Zealand.
The ICAO Location Identifier of Afonso Pena International Airport is SBCT.
The ICAO Location Identifier of Afonso Pena International Airport is SBCT.
BLEU = 29.83 58.8/35.6/23.6/16.0 (BP = 1.000 ratio = 1.029 hyp_len = 116939 r

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/140 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 70.88 seconds, 207.98 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/140 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 71.13 seconds, 207.25 sentences/sec
GPT4_sv
Abilene Regional Airport serves Abilene, Texas.
Abilene Regional Airport serves the city of Abilene, Texas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport features a runway named 18L/36R.
Adolfo Suárez Madrid–Barajas Airport has a runway named 18L/36R.
Adolfo Suárez Madrid–Barajas Airport features a runway named 18L/36R.
Afonso Pena International Airport has the ICAO location identifier SBCT.
Afonso Pena International Airport is identified by the ICAO code SBCT.
BLEU = 45.88 78.9/55.2/38.3/26.6 (BP = 1.000 ratio = 1.008 hyp_len = 113516 ref_len = 112591)
BLEU = 46.95 79.9/5

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/140 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 75.59 seconds, 195.04 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/140 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 74.94 seconds, 196.73 sentences/sec
GPT4_rdf
Abilene Regional Airport serves the city of Abilene, Texas.
Abilene Regional Airport serves the city of Abilene, Texas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
The runway at Adolfo Suárez Madrid–Barajas Airport is named 18L/36R.
The runway at Adolfo Suárez Madrid–Barajas Airport is named 18L/36R.
The runway at Adolfo Suárez Madrid–Barajas Airport is named 18L/36R.
Afonso Pena International Airport has the ICAO location identifier SBCT.
Afonso Pena International Airport has the ICAO location identifier SBCT.
BLEU = 46.67 78.5/55.6/39.2/27.7 (BP = 1.000 ratio = 1.050 hyp_len = 121719 ref_len = 115888)
BL

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/137 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 66.62 seconds, 221.28 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/137 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 67.03 seconds, 219.92 sentences/sec
Llama_sv
Abilene Regional Airport serves Abilene, Texas.
Abilene Regional Airport serves Abilene, Texas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, as well as the surrounding areas of Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport has a runway named 18 L/36 R.
Adolfo Suárez Madrid–Barajas Airport has a runway named 18 L/36 R.
Adolfo Suárez Madrid–Barajas Airport has a runway named 18 L/36 R.
Afonso Pena International Airport is located at SBCT.
Afonso Pena International Airport is identified by the ICAO location identifier SBCT.
BLEU = 44.02 77.9/53.2/36.3/25.0 (BP = 1.000 ratio = 1.005 hyp_len = 112627 ref_len = 112099)
BL

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/136 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 72.16 seconds, 204.30 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/136 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/231 [00:00<?, ?it/s]



done in 72.61 seconds, 203.02 sentences/sec
Llama_rdf
Abilene Regional Airport serves the city of Abilene, Texas.
Abilene Regional Airport serves the city of Abilene, Texas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, as well as the surrounding areas of Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
Adolfo Suárez Madrid–Barajas Airport is located in Madrid, Paracuellos de Jarama, San Sebastián de los Reyes, and Alcobendas.
The runway name at Adolfo Suárez Madrid–Barajas Airport is 18 L/36 R.
Adolfo Suárez Madrid–Barajas Airport's runway name is 18 L/36 R.
The runway name at Adolfo Suárez Madrid–Barajas Airport is 18 L/36 R.
Afonso Pena International Airport's ICAO location identifier is SBCT.
Afonso Pena International Airport's ICAO location identifier is SBCT.
BLEU = 44.38 75.5/53.2/37.1/26.0 (BP = 1.000 ratio = 1.045 hyp_len 

In [23]:
webnlg_bs_refs[:10]

[('Abilene , Texas is served by the Abilene regional airport .',
  'Abilene Regional Airport serves the city of Abilene in Texas .',
  ''),
 ('Abilene , Texas is served by the Abilene regional airport .',
  'Abilene Regional Airport serves the city of Abilene in Texas .',
  ''),
 ('Adolfo Suárez Madrid – Barajas Airport can be found in Madrid , Paracuellos de Jarama , San Sebastián de los Reyes and Alcobendas .',
  'Adolfo Suarez Madrid- Barajas airport is located at Madrid , Paracuellos de Jarama , San Sebastián de los Reyes and Alcobendas .',
  'Adolfo Suarez Madrid- Barajas Airport is located in Madrid , Paracuellos de Jarama , San Sebastian de los Reyes and Alcobendas .'),
 ('Adolfo Suárez Madrid – Barajas Airport can be found in Madrid , Paracuellos de Jarama , San Sebastián de los Reyes and Alcobendas .',
  'Adolfo Suarez Madrid- Barajas airport is located at Madrid , Paracuellos de Jarama , San Sebastián de los Reyes and Alcobendas .',
  'Adolfo Suarez Madrid- Barajas Airport is