# Evaluation of Data Integration

In this notebook, we evaluate how effective two relations can be integrated using the soft join operator.
Therefore, we utilize the [Datasets for DeepMatcher paper](https://github.com/anhaidgroup/deepmatcher/blob/master/Datasets.md).

The datasets contain two relations with the same entities from two different sources.
E.g. `iTunes-Amazon` contains song records obtained from iTunes and Amazon. So, the task is to identify same songs.

To test different Models, datasets, ..., there are parameters in the [modifications](#modifications) section.
**Modfy Code in [Modifications](#modifications) Section Only!**

We calculate:
* $ TP = \text{True Matches} \cap \text{Predicted Matches} $
* $ FN = \text{True Matches} \setminus \text{Predicted Matches} $
* $ FP = \text{Predicted Matches} \setminus \text{True Matches} $

To determine the scores:
* $ Precision = \frac{TP}{TP + FP}$
* $ Recall = \frac{TP}{TP + FN}$
* $ F_1 = \frac{2 \cdot precision \cdot recall}{precision + recall} $
* BLEU 1-4

## Imports

In [2]:
import os
import time

import tqdm
import requests
import zipfile

import pandas as pd
import numpy as np

from db.operators import Dummy, InnerSoftJoin

from models import ModelMgr
from models.embedding.SentenceTransformer import SentenceTransformerEmbeddingModel
from models.semantic_validation import LLaMAValidationModel

import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from nltk.translate.bleu_score import SmoothingFunction

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/nico/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/nico/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Modifications

In [3]:
dataset = ("Structured", "iTunes-Amazon") # The tested dataset
data_path = "../data/"  # Data Path (usually remains unchanged)

significant_columns_left = ["Album_Name", "Artist_Name", "Released", "Song_Name", "Time"]
significant_columns_right = ["Album_Name", "Artist_Name", "Released", "Song_Name", "Time"]

## Function and Dataset Declarations

In [4]:
# Available Dataset with meta data

datasets = {
    "Structured": {
        "Beer": {"filename": "beer_raw_data.zip",  "matches": "labeled_data.csv", "key_column": "Label", "label_column": "gold", "drop_columns": ["_id"]},
        "iTunes-Amazon": {"filename": "itunes_amazon_raw_data.zip",  "matches": "labeled_data.csv", "key_column": "Sno", "label_column": "label", "drop_columns": ["Unnamed: 0", "_id"]},
        "Fodors-Zagats": "fodors_zagat_raw_data.zip",
        "DBLP-ACM": "dblp_acm_raw_data.zip",
        "DBLP-GoogleScholar": "dblp_scholar_raw_data.zip",
        "Amazon-Google": "amazon_google_raw_data.zip",
        "Walmart-Amazon": "walmart_amazon_raw_data.zip",
    },
    "Textual": {
        "Abt-Buy": "abt_buy_raw_data.zip",
        "Company": "company_raw_data.zip"
    }
}

dataset_data = datasets[dataset[0]][dataset[1]]

In [5]:
def compute_bleu_representativeness(input_dataset, integrated_dataset, use_n_grams=4, smoothing_function = SmoothingFunction().method1):
    """
    Compute BLEU-based representativeness score between input and integrated datasets.

    :param input_dataset: List of text entries from the input dataset
    :param integrated_dataset: List of text entries from the integrated dataset
    :param use_n_grams: Number of n-grams to use (BLEU 1, 2, 3, 4)
    :param smoothing_function: Smoothing function to use for computing BLEU
    :return: Average BLEU score as the representativeness measure
    """
    assert use_n_grams in (1,2,3,4)
    weights = np.array([0., 0., 0., 0.])
    weights[:use_n_grams] = 1.0/float(use_n_grams)

    integrated_tokens = [word_tokenize(entry.lower()) for entry in integrated_dataset]
    input_tokens = [word_tokenize(entry.lower()) for entry in input_dataset]

    bleu_scores = []

    for input_entry in input_tokens:
        item_bleu_scores = max([sentence_bleu([integrated_entry], input_entry, smoothing_function=smoothing_function, weights=weights) for integrated_entry in integrated_tokens])
        bleu_scores.append(item_bleu_scores)

    return np.average(bleu_scores)

def download_file(url, save_file):
    """
    Download deep matcher dataset

    :param url: URL to download from
    :param save_file: File to save to
    """
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(save_file, "wb") as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

def process_dataset(ds):
    """
    Download deep matcher dataset and return DataFrame with joined data

    :param ds: The dataset which will be downloaded and processed
    """
    url = f"https://pages.cs.wisc.edu/~anhai/data1/deepmatcher_data/{ds[0]}/{ds[1]}/{dataset_data["filename"]}"
    save_path = f"{data_path}{dataset[0]}/{dataset[1]}/"
    save_file = save_path + dataset_data["filename"]

    os.makedirs(save_path, exist_ok=True)

    if not os.path.exists(save_file):
        download_file(url, save_file)

    # Unzip the file
    with zipfile.ZipFile(save_file, "r") as zip_ref:
        zip_ref.extractall(save_path)

    #pd.read_csv(save_path + dataset_data["tableA"], encoding="unicode_escape"),
    #pd.read_csv(save_path + dataset_data["tableB"], encoding="unicode_escape"),

    return pd.read_csv(save_path + dataset_data["matches"], encoding="unicode_escape", skiprows=5)


#table_a, table_b, matches = process_dataset(dataset)
candidates = process_dataset(dataset) # table_a.head(), table_b.head()
candidates.head()

Unnamed: 0.1,Unnamed: 0,_id,ltable.Sno,rtable.Sno,ltable.Album_Name,ltable.Artist_Name,ltable.CopyRight,ltable.Released,ltable.Song_Name,ltable.Time,rtable.Album_Name,rtable.Artist_Name,rtable.CopyRight,rtable.Released,rtable.Song_Name,rtable.Time,label
0,916,916,111,53124,vhs,x ambassadors,2015 kidinakorner/interscope records,30-Jun-15,vhs outro (interlude),1:25,vhs [explicit],x ambassadors,(c) 2015 kidinakorner/interscope records,"June 30, 2015",vhs outro (interlude) [explicit],1:25,1
1,1053,1053,148,50767,title (deluxe),meghan trainor,"2014, 2015 epic records, a division of sony m...",9-Jan-15,credit,2:51,title (deluxe),meghan trainor,"2011 what a music ltd, licence exclusive parl...","January 9, 2015",credit,2:51,1
2,1290,1290,206,41214,slow down (remixes),selena gomez,"2013 hollywood records, inc.",20-Aug-13,slow down (smash mode remix),5:21,slow down remixes,selena gomez,"(c) 2013 hollywood records, inc.","August 20, 2013",slow down (smash mode remix),5:21,1
3,1424,1424,211,19812,slow down (reggae remixes) - single,selena gomez,"2013 hollywood records, inc.",20-Aug-13,slow down (sure shot rockers reggae dub remix),3:15,good for you (remixes),selena gomez,(c) 2015 interscope records,"September 4, 2015",good for you (yellow claw & cesqeaux remix) [f...,3:01,0
4,1706,1706,250,53111,vhs,x ambassadors,2015 kidinakorner/interscope records,30-Jun-15,vhs outro (interlude),1:25,vhs [explicit],x ambassadors,(c) 2015 kidinakorner/interscope records,"June 30, 2015",first show (interlude),0:11,0


## Evaluation

### Determine Matching Set and Data

In [6]:
matches = candidates[candidates[dataset_data["label_column"]] == 1]
matches = matches.drop(columns=dataset_data["drop_columns"])

gt = {(x[f"ltable.{dataset_data['key_column']}"], x[f"rtable.{dataset_data['key_column']}"]) for _, x in matches.iterrows()}
print(str(gt)[0: 50], "...")

matches.head()

{(2743, 17193), (6533, 38335), (5713, 35091), (149 ...


Unnamed: 0,ltable.Sno,rtable.Sno,ltable.Album_Name,ltable.Artist_Name,ltable.CopyRight,ltable.Released,ltable.Song_Name,ltable.Time,rtable.Album_Name,rtable.Artist_Name,rtable.CopyRight,rtable.Released,rtable.Song_Name,rtable.Time,label
0,111,53124,vhs,x ambassadors,2015 kidinakorner/interscope records,30-Jun-15,vhs outro (interlude),1:25,vhs [explicit],x ambassadors,(c) 2015 kidinakorner/interscope records,"June 30, 2015",vhs outro (interlude) [explicit],1:25,1
1,148,50767,title (deluxe),meghan trainor,"2014, 2015 epic records, a division of sony m...",9-Jan-15,credit,2:51,title (deluxe),meghan trainor,"2011 what a music ltd, licence exclusive parl...","January 9, 2015",credit,2:51,1
2,206,41214,slow down (remixes),selena gomez,"2013 hollywood records, inc.",20-Aug-13,slow down (smash mode remix),5:21,slow down remixes,selena gomez,"(c) 2013 hollywood records, inc.","August 20, 2013",slow down (smash mode remix),5:21,1
5,250,53124,vhs,x ambassadors,2015 kidinakorner/interscope records,30-Jun-15,vhs outro (interlude),1:25,vhs [explicit],x ambassadors,(c) 2015 kidinakorner/interscope records,"June 30, 2015",vhs outro (interlude) [explicit],1:25,1
7,252,53004,vhs,x ambassadors,2015 kidinakorner/interscope records,30-Jun-15,vhs outro (interlude),1:25,vhs,x ambassadors,(c) 2015 kidinakorner/interscope records,"June 30, 2015",vhs outro (interlude),1:26,1


In [7]:
columns_left = [col.replace("ltable.", "") for col in matches.columns if col.startswith("ltable")]
columns_right = [col.replace("rtable.", "") for col in matches.columns if col.startswith("rtable")]
print(columns_left, columns_right, sep="\n")

['Sno', 'Album_Name', 'Artist_Name', 'CopyRight', 'Released', 'Song_Name', 'Time']
['Sno', 'Album_Name', 'Artist_Name', 'CopyRight', 'Released', 'Song_Name', 'Time']


In [8]:
data_left = list(set([tuple([row[f"ltable.{col}"] for col in columns_left]) for _, row in matches.iterrows()]))
data_right = list(set([tuple([row[f"rtable.{col}"] for col in columns_right]) for _, row in matches.iterrows()]))
print(data_left[0], data_right[0], sep="\n")

(3359, 'rolling papers', 'wiz khalifa', ' 2011 atlantic recording corporation for the united states and wea international inc. for the world outside of the united states', '28-Mar-11', 'cameras', '4:29')
(11280, 'control', 'disclosure', ' 2013 greco-roman ltd', ' April 2, 2013', 'boiling (medlar remix)', '5:52')


### Execute Operator and Evaluate

In [9]:
def evaluate(em, sv, threshold, method, embedding_comparison, embedding_method):
    key = (threshold, method, embedding_comparison, embedding_method)

    op1 = Dummy("ltable", columns_left, data_left)
    op2 = Dummy("rtable", columns_right, data_right)

    op = InnerSoftJoin(
        op1, op2, em=em, sv=sv,
        threshold=threshold, method=method,
        columns_left=significant_columns_left, columns_right=significant_columns_right,
        embedding_comparison=embedding_comparison,
        embedding_method = embedding_method
    )

    tic = time.time()
    result = op.open().fetch_all()
    toc = time.time()
    pred = {(r[f"ltable.{dataset_data['key_column']}"], r[f"rtable.{dataset_data['key_column']}"]) for r in result}

    tps, fns, fps = gt & pred, gt - pred, pred - gt
    tp, fn, fp = len(tps), len(fns), len(fps)

    # values = {"tp": tp, "fn": fn, "fp": fp}

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # BLEU
    joined_column = [f"ltable.{c}" for c in columns_left] + [f"rtable.{c}" for c in columns_right]

    serialized_results = [", ".join(str(v) for k,v in x.items()) for x in result]
    serialized_ground_truth = [", ".join([str(v[col]) for col in joined_column]) for k, v in matches.iterrows()]

    # print("Sample for matched records: ", serialized_results[0], serialized_ground_truth[0], sep="\n")

    scores = { "Precision": precision, "Recall": recall, "F1 Score": f1_score, "Runtime": toc-tic}

    for i in range(4):
        try:
            scores[f"bleu{i+1}"] = compute_bleu_representativeness(serialized_ground_truth, serialized_results, use_n_grams=i+1)
        except:
            scores[f"bleu{i+1}"] = -1

    print(key, ": ", scores, sep="")

    return key, scores

In [10]:
# Models
m = ModelMgr()
stem = SentenceTransformerEmbeddingModel(m)
lsv = LLaMAValidationModel(m)

thresholds = [1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]

evaluation_results = {}

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
for t in thresholds:
    res = evaluate(stem, lsv, threshold = t, method = "threshold", embedding_comparison = "RECORD_WISE", embedding_method = "FIELD_SERIALIZED")
    evaluation_results[res[0]] = res[1]
    if res[1]["Recall"] == 1.0:
        break

(1.0, 'threshold', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.8213868141174316, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.9, 'threshold', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.5048065185546875, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.8, 'threshold', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.5410277843475342, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.7, 'threshold', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.5418837070465088, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.6, 'threshold', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.5507450103759766, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.5, 'threshold', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Re

In [11]:
for t in thresholds:
    res = evaluate(stem, lsv, threshold = t, method = "threshold", embedding_comparison = "RECORD_WISE", embedding_method = "FULL_SERIALIZED")
    evaluation_results[res[0]] = res[1]
    if res[1]["Recall"] == 1.0:
        break

(1.0, 'threshold', 'RECORD_WISE', 'FULL_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.6876139640808105, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.9, 'threshold', 'RECORD_WISE', 'FULL_SERIALIZED'): {'Precision': 1.0, 'Recall': 0.007575757575757576, 'F1 Score': 0.015037593984962407, 'Runtime': 1.0624871253967285, 'bleu1': 0.11533143994144268, 'bleu2': 0.060385224463451015, 'bleu3': 0.04576146247909286, 'bleu4': 0.03915953854793376}
(0.8, 'threshold', 'RECORD_WISE', 'FULL_SERIALIZED'): {'Precision': 0.058222222222222224, 'Recall': 0.9924242424242424, 'F1 Score': 0.10999160369437448, 'Runtime': 0.6329195499420166, 'bleu1': 0.9976943346508566, 'bleu2': 0.9971338626678805, 'bleu3': 0.9968694393209289, 'bleu4': 0.9966700712267533}
(0.7, 'threshold', 'RECORD_WISE', 'FULL_SERIALIZED'): {'Precision': 0.008778346744696415, 'Recall': 1.0, 'F1 Score': 0.01740391588107324, 'Runtime': 0.9583263397216797, 'bleu1': 1.0, 'bleu2': 1.0, 'bleu3': 1.0, 'bleu4': 1.0}


In [12]:
for t in thresholds:
    res = evaluate(stem, lsv, threshold = t, method = "threshold", embedding_comparison = "COLUMN_WISE", embedding_method = None)
    evaluation_results[res[0]] = res[1]
    if res[1]["Recall"] == 1.0:
        break

(1.0, 'threshold', 'COLUMN_WISE', None): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 1.0187842845916748, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.9, 'threshold', 'COLUMN_WISE', None): {'Precision': 0.8105263157894737, 'Recall': 0.5833333333333334, 'F1 Score': 0.6784140969162996, 'Runtime': 1.036787509918213, 'bleu1': 0.8099082244808259, 'bleu2': 0.7603881066424467, 'bleu3': 0.7361576586079828, 'bleu4': 0.719170905705405}
(0.8, 'threshold', 'COLUMN_WISE', None): {'Precision': 0.8211920529801324, 'Recall': 0.9393939393939394, 'F1 Score': 0.8763250883392226, 'Runtime': 1.136911153793335, 'bleu1': 0.9727556089946999, 'bleu2': 0.964784029218628, 'bleu3': 0.9603010022464545, 'bleu4': 0.9570555055932958}
(0.7, 'threshold', 'COLUMN_WISE', None): {'Precision': 0.6804123711340206, 'Recall': 1.0, 'F1 Score': 0.8098159509202454, 'Runtime': 1.3441667556762695, 'bleu1': 1.0, 'bleu2': 1.0, 'bleu3': 1.0, 'bleu4': 1.0}


In [12]:
res = evaluate(stem, lsv, method = "zero-shot-prompting", threshold = None, embedding_comparison = None, embedding_method = None)
evaluation_results[res[0]] = res[1]

(None, 'zero-shot-prompting', None, None): {'Precision': 0.7040816326530612, 'Recall': 0.5227272727272727, 'F1 Score': 0.6, 'Runtime': 447.9653444290161, 'bleu1': 0.8166178517939763, 'bleu2': 0.7683757500144516, 'bleu3': 0.7451108521427563, 'bleu4': 0.7285428371635542}


In [14]:
for t in thresholds:
    res = evaluate(stem, lsv, threshold = t, method = "both", embedding_comparison = "RECORD_WISE", embedding_method = "FIELD_SERIALIZED")
    evaluation_results[res[0]] = res[1]
    if res[1]["Recall"] == 1.0:
        break

(1.0, 'both', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 2.201704740524292, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.9, 'both', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.4905233383178711, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.8, 'both', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.5249264240264893, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.7, 'both', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.5006089210510254, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.6, 'both', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 0.5346734523773193, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.5, 'both', 'RECORD_WISE', 'FIELD_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Run

In [15]:
for t in thresholds:
    res = evaluate(stem, lsv, threshold = t, method = "both", embedding_comparison = "RECORD_WISE", embedding_method = "FULL_SERIALIZED")
    evaluation_results[res[0]] = res[1]
    if res[1]["Recall"] == 1.0:
        break

(1.0, 'both', 'RECORD_WISE', 'FULL_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 1.7119879722595215, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.9, 'both', 'RECORD_WISE', 'FULL_SERIALIZED'): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 2.382183313369751, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.8, 'both', 'RECORD_WISE', 'FULL_SERIALIZED'): {'Precision': 0.7634408602150538, 'Recall': 0.5378787878787878, 'F1 Score': 0.6311111111111111, 'Runtime': 230.46481084823608, 'bleu1': 0.8091357162524635, 'bleu2': 0.7582338081316166, 'bleu3': 0.7329722199339458, 'bleu4': 0.7147947382289851}
(0.7, 'both', 'RECORD_WISE', 'FULL_SERIALIZED'): {'Precision': 0.71875, 'Recall': 0.5227272727272727, 'F1 Score': 0.6052631578947368, 'Runtime': 582.5686266422272, 'bleu1': 0.8128025953606497, 'bleu2': 0.7636547453904907, 'bleu3': 0.7392707462324616, 'bleu4': 0.7223206264585948}
(0.6, 'both', 'RECORD_WISE', 'FULL_SERIALIZED'): {'Precision': 0.679245

In [16]:
for t in thresholds:
    res = evaluate(stem, lsv, threshold = t, method = "both", embedding_comparison = "COLUMN_WISE", embedding_method = None)
    evaluation_results[res[0]] = res[1]
    if res[1]["Recall"] == 1.0:
        break

(1.0, 'both', 'COLUMN_WISE', None): {'Precision': 0, 'Recall': 0.0, 'F1 Score': 0, 'Runtime': 1.9077739715576172, 'bleu1': -1, 'bleu2': -1, 'bleu3': -1, 'bleu4': -1}
(0.9, 'both', 'COLUMN_WISE', None): {'Precision': 0.7843137254901961, 'Recall': 0.30303030303030304, 'F1 Score': 0.4371584699453552, 'Runtime': 88.06190299987793, 'bleu1': 0.6909997137565955, 'bleu2': 0.6064822302816835, 'bleu3': 0.5688032243799542, 'bleu4': 0.5417118299816917}
(0.8, 'both', 'COLUMN_WISE', None): {'Precision': 0.8481012658227848, 'Recall': 0.5075757575757576, 'F1 Score': 0.6350710900473934, 'Runtime': 126.59877324104309, 'bleu1': 0.7899973431185057, 'bleu2': 0.731787098837809, 'bleu3': 0.7046718153065405, 'bleu4': 0.685944893118298}
(0.7, 'both', 'COLUMN_WISE', None): {'Precision': 0.8089887640449438, 'Recall': 0.5454545454545454, 'F1 Score': 0.6515837104072397, 'Runtime': 129.05220818519592, 'bleu1': 0.8186039965165084, 'bleu2': 0.7706338567683519, 'bleu3': 0.7482203844076682, 'bleu4': 0.7322746594455944}

In [37]:
keys = ["threshold", "method", "embedding_comparison", "embedding_method"]
evaluation_results_list = [v | {ki: vi for ki, vi in zip(keys, k)} for k, v in evaluation_results.items()]
df_evaluation_results = pd.DataFrame.from_records(evaluation_results_list, index=keys)
df_evaluation_results.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Precision,Recall,F1 Score,Runtime,bleu1,bleu2,bleu3,bleu4
threshold,method,embedding_comparison,embedding_method,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,threshold,RECORD_WISE,FIELD_SERIALIZED,0.0,0.0,0.0,0.891951,-1.0,-1.0,-1.0,-1.0
0.9,threshold,RECORD_WISE,FIELD_SERIALIZED,0.0,0.0,0.0,0.492512,-1.0,-1.0,-1.0,-1.0
0.8,threshold,RECORD_WISE,FIELD_SERIALIZED,0.0,0.0,0.0,0.487332,-1.0,-1.0,-1.0,-1.0
0.7,threshold,RECORD_WISE,FIELD_SERIALIZED,0.0,0.0,0.0,0.619905,-1.0,-1.0,-1.0,-1.0
0.6,threshold,RECORD_WISE,FIELD_SERIALIZED,0.0,0.0,0.0,0.607469,-1.0,-1.0,-1.0,-1.0


In [38]:
df_evaluation_results.to_csv("results/EvaluationDataIntegration.csv")