In [1]:
import pandas
import html
import re
import numpy
import sys

from sentence_transformers import SentenceTransformer, util
from math import floor, ceil
#from gensim.models import KeyedVectors
#from gensim.downloader import load

#model = load('word2vec-google-news-300')

class Metrics:
    def __init__(self, metric_dict):
        #save paths to clean, dirty and corrected dataset
        self.clean_path = metric_dict["clean_data_path"]
        self.dirty_path = metric_dict["dirty_data_path"]
        #self.corrected_path = metric_dict["corrected_data_path"]
        
        #read csv-files of clean, dirty and corrected dataset
        self.clean_data = self.read_csv_dataset(metric_dict["clean_data_path"])
        self.dirty_data = self.read_csv_dataset(metric_dict["dirty_data_path"])
        self.corrected_data = self.read_csv_dataset(metric_dict["corrected_data_path"])
        
        #create dictionaries for errorneous cells
        #save clean, dirty and corrected values of erroneous cells seperately
        self.error_clean_val = self.get_dataframes_difference(self.dirty_data, self.clean_data) #clean values
        self.error_dirty_val = self.get_dataframes_difference(self.clean_data, self.dirty_data) #dirty values
        self.error_corrected_val = self.get_error_corrected_val() #corrected values
        
        #save attribute classification
        self.str_attr = metric_dict["str_attr"]
        self.short_str_attr = metric_dict["short_str_attr"]
        self.long_str_attr = metric_dict["long_str_attr"]
        self.numer_attr = metric_dict["numer_attr"]
        
        #calculate "normal" metrics precision, recall, f1
        self.standard_metric = self.get_data_cleaning_evaluation()
        
        #numeric TP
        self.numer_tp = self.get_numer_tp()
        
        #calculate and save fuzzy metrics
        #self.fuzzy_alt_metric = self.get_data_cleaning_evaluation_fuzzy_alt()
        self.fuzzy_jw = self.get_data_cleaning_evaluation_fuzzy_JW()
        self.fuzzy_me = self.get_data_cleaning_evaluation_fuzzy_ME()
        #self.fuzzy_ld_words = self.get_data_cleaning_evaluation_fuzzy_LD_Words() 
        self.fuzzy_ld_char = self.get_data_cleaning_evaluation_fuzzy_LD_Char() 
        #self.fuzzy_ld = self.get_data_cleaning_evaluation_fuzzy_LD() if self.short_str_attr or self.long_str_attr else {"LD Message": "short or long string attributes not declared"}
        self.fuzzy_semantics_sentences = self.get_data_cleaning_evaluation_fuzzy_semantic_sentences()
        
        #fuzzy metrics combined with outlier metric
        if self.numer_attr:
            #self.fuzzy_alt_num_metric = self.get_data_cleaning_evaluation_fuzzy_alt(True)
            self.fuzzy_jw_num = self.get_data_cleaning_evaluation_fuzzy_JW(True)
            self.fuzzy_me_num = self.get_data_cleaning_evaluation_fuzzy_ME(True)
            #self.fuzzy_ld_words_num = self.get_data_cleaning_evaluation_fuzzy_LD_Words(True)
            self.fuzzy_ld_char_num = self.get_data_cleaning_evaluation_fuzzy_LD_Char(True)
            #self.fuzzy_ld_num = self.get_data_cleaning_evaluation_fuzzy_LD(True) if self.short_str_attr or self.long_str_attr else {"LD Num Message": "short or long string attributes not declared"}
            self.fuzzy_semantics_sentences_num = self.get_data_cleaning_evaluation_fuzzy_semantic_sentences(True)
            
        #average metrics
        self.avg_string_metric = self.get_string_metric_avg()
        self.avg_string_semantic_metric = self.get_string_semantic_metric_avg()
        
        #combined evaluation
        self.combined_metric = self.get_combined_score_evaluation()

        
    def print_metrics(self):
        print(self.combined_metric)
        print("")
        print(self.standard_metric)
        #print(self.fuzzy_alt_metric)
        print(self.fuzzy_jw)
        print(self.fuzzy_me)
        #print(self.fuzzy_ld_words)
        print(self.fuzzy_ld_char)
        #print(self.fuzzy_ld)
        #print(self.fuzzy_semantics_words)
        print(self.fuzzy_semantics_sentences)
        print("")
        
        if self.numer_attr:
            #print(self.fuzzy_alt_num_metric)
            print(self.fuzzy_jw_num)
            print(self.fuzzy_me_num)
            #print(self.fuzzy_ld_words_num)
            print(self.fuzzy_ld_char_num)
            #print(self.fuzzy_ld_num)
            #print(self.fuzzy_semantics_words_num)
            print(self.fuzzy_semantics_sentences_num)
            
        else:
            print({"Num Metrics": "numeric attributes not declared"})

        
    def read_csv_dataset(self, dataset_path):
        """
        This method reads a dataset from a csv file path.
        """
        dataframe = pandas.read_csv(dataset_path, sep=",", header="infer", encoding="utf-8", dtype=str,
                                    keep_default_na=False, low_memory=False).applymap(self.value_normalizer)
        return dataframe
    
    @staticmethod
    def value_normalizer(value):
        """
        This method takes a value and minimally normalizes it.
        """
        value = html.unescape(value)
        value = re.sub("[\t\n ]+", " ", value, re.UNICODE)
        value = value.strip("\t\n ")
        return value
    
    def get_string_metric_avg(self):
        if True:
            ld_words_char_avg_p = list(self.fuzzy_ld_char.values())[0]
            ld_words_char_avg_r = list(self.fuzzy_ld_char.values())[1]
            ld_words_char_avg_f1 = list(self.fuzzy_ld_char.values())[2]
            avg_precision = (list(self.fuzzy_jw.values()))[0] + (list(self.fuzzy_me.values()))[0] +  ld_words_char_avg_p / 3
            avg_recall = (list(self.fuzzy_jw.values()))[1] + (list(self.fuzzy_me.values()))[1] +  ld_words_char_avg_r / 3
            avg_f1 = (list(self.fuzzy_jw.values()))[2] + (list(self.fuzzy_me.values()))[2] +  ld_words_char_avg_f1 / 3
        
        else:
            avg_precision = (list(self.fuzzy_jw.values())[0] + list(self.fuzzy_me.values())[0] + list(self.fuzzy_ld.values())[0])/3
            avg_recall = (list(self.fuzzy_jw.values())[1] + list(self.fuzzy_me.values())[1] + list(self.fuzzy_ld.values())[1])/3
            avg_f1 = (list(self.fuzzy_jw.values())[2] + list(self.fuzzy_me.values())[2] + list(self.fuzzy_ld.values())[0])/3

        return {"Average of String Metrics Precision": round(avg_precision,3), "Average of String Metrics Recall": round(avg_recall,3), "Average of String Metrics F1": round(avg_f1,3)}
        
    def get_string_semantic_metric_avg(self):
        avg_precision = (list(self.avg_string_metric.values())[0] + list(self.fuzzy_semantics_sentences.values())[0]) / 2
        avg_recall = (list(self.avg_string_metric.values())[1] + list(self.fuzzy_semantics_sentences.values())[1]) / 2
        avg_f1 = (list(self.avg_string_metric.values())[2] + list(self.fuzzy_semantics_sentences.values())[2]) / 2
        
        return {"Average of String and Semantics Metrics Precision": round(avg_precision,3), "Average of String and Semantics Metrics Recall": round(avg_recall,3), "Average of String and Semantics Metrics F1": round(avg_f1,3)}
        
        
    
    def get_dataframes_difference(self, dataframe_1, dataframe_2):
        """
        This method compares two dataframes and returns the different cells.
        """
        if dataframe_1.shape != dataframe_2.shape:
            sys.stderr.write("Two compared datasets do not have equal sizes!\n")
        difference_dictionary = {}
        difference_dataframe = dataframe_1.where(dataframe_1.values != dataframe_2.values).notna()
        for j in range(dataframe_1.shape[1]):
            for i in difference_dataframe.index[difference_dataframe.iloc[:, j]].tolist():
                difference_dictionary[(i, j)] = dataframe_2.iloc[i, j]
        return difference_dictionary
    
    def get_error_corrected_val(self):
        correction_dict = self.get_dataframes_difference(self.dirty_data, self.corrected_data)
        for key in list(correction_dict):
            if key not in self.error_clean_val:
                del correction_dict[key]
    
        return correction_dict
    
  
    def jaro_winkler_distance(self, s1, s2):
        """
        Compute Jaro-Winkler distance between two strings.
        """
        # If the s are equal
        if (s1 == s2):
            return 1.0

        # Length of two s
        len1 = len(s1)
        len2 = len(s2)

        # Maximum distance upto which matching
        # is allowed
        max_dist = floor(max(len1, len2) / 2) - 1

        # Count of matches
        match = 0

        # Hash for matches
        hash_s1 = [0] * len(s1)
        hash_s2 = [0] * len(s2)

        # Traverse through the first
        for i in range(len1):

            # Check if there is any matches
            for j in range(max(0, i - max_dist), 
                           min(len2, i + max_dist + 1)):

                # If there is a match
                if (s1[i] == s2[j] and hash_s2[j] == 0):
                    hash_s1[i] = 1
                    hash_s2[j] = 1
                    match += 1
                    break

        # If there is no match
        if (match == 0):
            return 0.0

        # Number of transpositions
        t = 0
        point = 0

        # Count number of occurrences
        # where two characters match but
        # there is a third matched character
        # in between the indices
        for i in range(len1):
            if (hash_s1[i]):

                # Find the next matched character
                # in second
                while (hash_s2[point] == 0):
                    point += 1

                if (s1[i] != s2[point]):
                    t += 1
                point += 1
        t = t//2

        # Return the Jaro Similarity
        return (match/ len1 + match / len2 +
                (match - t) / match)/ 3.0
    

    def jaro_winkler_distance_fuzzy(self, clean, dirty, corrected):
        jw_clean_dirty = self.jaro_winkler_distance(clean, dirty)
        jw_clean_corrected = self.jaro_winkler_distance(clean, corrected)


        return jw_clean_corrected - jw_clean_dirty
    
    
    def get_data_cleaning_evaluation_fuzzy_JW(self, num_metric=False):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ec_tp = 0.0
        output_size = 0.0
        
        for cell in self.error_corrected_val:
            output_size += 1
            if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                ec_tp += 1.0
            elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                metric_score = self.jaro_winkler_distance_fuzzy(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                ec_tp += metric_score
                print([self.error_clean_val[cell] ,  self.error_dirty_val[cell],  self.error_corrected_val[cell], metric_score])
                if metric_score >= 0:
                    pc_r += 1
                else:
                    pc_f += 1 
        
        
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy JW Num Precision": round(ec_p, 3),"Fuzzy JW Num Recall": round(ec_r, 3), "Fuzzy JW Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy JW Precision": round(ec_p, 3),"Fuzzy JW Recall": round(ec_r, 3), "Fuzzy JW F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}

    def get_data_cleaning_evaluation(self):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                ec_tp += 1.0
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        return {"Precision": round(ec_p, 3),"Recall": round(ec_r, 3), "F1": round(ec_f, 3), "Amount of fixed data errors": output_size}
    
    def get_single_outlier_score(self, clean, dirty, corrected):
        score = 1 - (abs(int(clean)-int(corrected)) / abs(int(clean) - int(dirty)))
    
        if score >= 0:
            return score

        else:
            return -1
        
    def get_fuzzy_score_outlier(self, clean, dirty, corrected):
        clean = re.findall(r'\d+', clean)
        dirty = re.findall(r'\d+', dirty)
        corrected = re.findall(r'\d+', corrected)

        if len(clean) != len(dirty) or len(dirty) != len(corrected) or len(corrected) != len(clean):
            return 0

        count = 0

        for (o,d,c) in  zip(clean, dirty, corrected):
            count += self.get_single_outlier_score(o, d, c)

        return count/len(clean)
    
    def get_numer_tp(self):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """

        ec_tp = 0.0
        for cell in self.error_corrected_val:
            if cell in self.error_clean_val:
                if cell[1] in self.numer_attr and self.error_corrected_val[cell] != self.error_clean_val[cell]:
                    ec_tp += self.get_fuzzy_score_outlier(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    
        return ec_tp 
    
    
    def get_data_cleaning_evaluation_fuzzy_alt(self, num_metric=False):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """

        ed_tp = 0.0
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                ed_tp += 1.0
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif cell[1] in self.numer_attr and num_metric:
                    ec_tp += self.get_fuzzy_score_outlier(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                elif cell[1] in self.str_attr:
                    ec_tp += self.get_fuzzy_score_string_alt(self.error_corrected_val[cell], self.error_clean_val[cell], self.error_corrected_val[cell])
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        
        if num_metric:
            return {"Fuzzy Alt Num Precision": round(ec_p, 3),"Fuzzy Alt Num Recall": round(ec_r, 3), "Fuzzy Alt Num F1": round(ec_f, 3)}
        else:
            return {"Fuzzy Alt Precision": round(ec_p, 3),"Fuzzy Alt Recall": round(ec_r, 3), "Fuzzy Alt F1": round(ec_f, 3)}
    
    def get_fuzzy_score_string_alt(self, clean, dirty, corrected ):
        if len(clean) != len(corrected) or len(dirty) != len(clean):
            return 0

        count_w = 0.0
        count_r = 0.0
        for o, c, d in zip(clean, corrected, dirty):
            if o != d:
                count_w += 1
                if o == c:
                    count_r += 1
            else:
                if c != d:
                    count_r -= 1

        if count_r <= 0:
            return 0

        return count_r / count_w
    
    def get_data_cleaning_evaluation_fuzzy_ME(self, num_metric=False):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    metric_score = self.monge_elkan_distance_fuzzy([self.error_corrected_val[cell]], [self.error_clean_val[cell]], [self.error_dirty_val[cell]])
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
    
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy ME Num Precision": round(ec_p, 3),"Fuzzy ME Num Recall": round(ec_r, 3), "Fuzzy ME Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy ME Precision": round(ec_p, 3),"Fuzzy ME Recall": round(ec_r, 3), "Fuzzy ME F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        
        
        
    
    def monge_elkan(self, bag1, bag2):
        """
        Compute Monge-Elkan similarity measure between two bags (lists).

        The Monge-Elkan similarity measure is a type of Hybrid similarity measure that combine the benefits of
        sequence-based and set-based methods. This can be effective for domains in which more control is needed
        over the similarity measure. It implicitly uses a secondary similarity measure, such as levenshtein to compute
        over all similarity score.

        Args:
            bag1,bag2 (list): Input lists

            sim_func (function): Secondary similarity function. This is expected to be a sequence-based
                similarity measure (defaults to levenshtein)

        Returns:
            Monge-Elkan similarity score (float)

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None


        Examples:
            >>> monge_elkan(['Niall'], ['Neal'])
            0.8049999999999999
            >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.8677218614718616
            >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=needleman_wunsch)
            2.0
            >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=affine)
            2.25
            >>> monge_elkan([''], ['a'])
            0.0
            >>> monge_elkan(['Niall'], ['Nigel'])
            0.7866666666666667

        References:
            * Principles of Data Integration book
        """

        # if exact match return 1.0
        if bag1 == bag2:
            return 1.0
        # if one of the strings is empty return 0
        if (len(bag1) == 0) or (len(bag2) == 0):
            return 0
        # aggregated sum of all the max sim score of all the elements in bag1
        # with elements in bag2
        sum_of_maxes = 0
        for t1 in bag1:
            max_sim = float('-inf')
            for t2 in bag2:
                max_sim = max(max_sim, self.jaro_winkler_distance(t1, t2))
            sum_of_maxes += max_sim
        sim = float(sum_of_maxes) / float(len(bag1))
        return sim


    def monge_elkan_distance_fuzzy(self, clean, dirty, corrected):
        me_clean_dirty = self.monge_elkan(clean, dirty)
        me_clean_corrected = self.monge_elkan(clean, corrected)
        #print(monge_elkan(clean, dirty))
        #print(monge_elkan(clean, corrected))


        return me_clean_corrected - me_clean_dirty
        
    def get_data_cleaning_evaluation_fuzzy_LD(self, num_metric=False):
        
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ed_tp = 0.0
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                ed_tp += 1.0
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                    
                elif cell[1] in self.numer_attr and num_metric:
                    ec_tp += self.get_fuzzy_score_outlier(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    
                elif cell[1] in self.long_str_attr:
                    metric_score = self.fuzzy_LD_Words(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])       
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
                        
                elif cell[1] in self.short_str_attr:
                    metric_score = self.fuzzy_LD_Char(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
                        
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        
        if num_metric:
            return {"Fuzzy LD Num Precision": round(ec_p, 3),"Fuzzy LD Recall": round(ec_r, 3), "Fuzzy LD F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            return {"Fuzzy LD Precision": round(ec_p, 3),"Fuzzy LD Recall": round(ec_r, 3), "Fuzzy LD F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
    

    def get_data_cleaning_evaluation_fuzzy_LD_Char(self, num_metric=False):
        
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                ec_tp += 1.0
            elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                metric_score = self.fuzzy_LD_Char(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                ec_tp += metric_score
                #print([self.error_clean_val[cell] ,  self.error_dirty_val[cell],  self.error_corrected_val[cell], metric_score])
                if metric_score >= 0:
                    pc_r += 1
                else:
                    pc_f += 1
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Char Num Precision": round(ec_p, 3),"Fuzzy LD Char Num Recall": round(ec_r, 3), "Fuzzy LD Char Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else: 
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Char Precision": round(ec_p, 3),"Fuzzy LD Char Recall": round(ec_r, 3), "Fuzzy LD Char F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
    
    def get_data_cleaning_evaluation_fuzzy_LD_Words(self, num_metric=False):
        
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ed_tp = 0.0
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                ed_tp += 1.0
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    metric_score = self.fuzzy_LD_Words(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])       
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
        
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Words Num Precision": round(ec_p, 3),"Fuzzy LD Words Num Recall": round(ec_r, 3), "Fuzzy LD Words Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Words Precision": round(ec_p, 3),"Fuzzy LD Words Recall": round(ec_r, 3), "Fuzzy LD Words F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}


    def fuzzy_LD_Char(self, clean, dirty, corrected):
        LD_clean_dirty =  self.levenshteinDistanceChar(clean, dirty)
        LD_clean_corrected = self.levenshteinDistanceChar(clean, corrected)
        print([clean, dirty, corrected, LD_clean_dirty, LD_clean_corrected,(LD_clean_corrected-LD_clean_dirty) ])

        return (LD_clean_corrected-LD_clean_dirty)
    
    
    def fuzzy_LD_Words(self, clean, dirty, corrected):
        LD_clean_dirty =  self.levenshteinDistanceWords(clean, dirty)
        LD_clean_corrected = self.levenshteinDistanceWords(clean, corrected, (LD_clean_dirty - LD_clean_corrected) / max([len(clean), len(dirty)]))

       
        return (LD_clean_dirty - LD_clean_corrected) / max([len(clean), len(dirty)])
        
    def levenshteinDistanceWords(self, token1, token2):
        token1 = token1.split()
        token2 = token2.split()
        distances = numpy.zeros((len(token1) + 1, len(token2) + 1))

        for t1 in range(len(token1) + 1):
            distances[t1][0] = t1

        for t2 in range(len(token2) + 1):
            distances[0][t2] = t2

        a = 0
        b = 0
        c = 0

        for t1 in range(1, len(token1) + 1):
            for t2 in range(1, len(token2) + 1):
                if (token1[t1-1] == token2[t2-1]):
                    distances[t1][t2] = distances[t1 - 1][t2 - 1]
                else:
                    a = distances[t1][t2 - 1]
                    b = distances[t1 - 1][t2]
                    c = distances[t1 - 1][t2 - 1]

                    if (a <= b and a <= c):
                        distances[t1][t2] = a + 1
                    elif (b <= a and b <= c):
                        distances[t1][t2] = b + 1
                    else:
                        distances[t1][t2] = c + 1


        return distances[len(token1)][len(token2)]

    def levenshteinDistanceChar(self, token1, token2):
        distances = numpy.zeros((len(token1) + 1, len(token2) + 1))

        for t1 in range(len(token1) + 1):
            distances[t1][0] = t1

        for t2 in range(len(token2) + 1):
            distances[0][t2] = t2

        a = 0
        b = 0
        c = 0

        for t1 in range(1, len(token1) + 1):
            for t2 in range(1, len(token2) + 1):
                if (token1[t1-1] == token2[t2-1]):
                    distances[t1][t2] = distances[t1 - 1][t2 - 1]
                else:
                    a = distances[t1][t2 - 1]
                    b = distances[t1 - 1][t2]
                    c = distances[t1 - 1][t2 - 1]

                    if (a <= b and a <= c):
                        distances[t1][t2] = a + 1
                    elif (b <= a and b <= c):
                        distances[t1][t2] = b + 1
                    else:
                        distances[t1][t2] = c + 1


        return 1 - ((distances[len(token1)][len(token2)])/max([len(token1), len(token2)]))
    
    
    
    
    def get_fuzzy_score_semantic_sentence(self, clean, dirty, corrected):
        model = SentenceTransformer('all-MiniLM-L6-v2')

        embeddings1 = model.encode(clean, convert_to_tensor=True)
        embeddings2 = model.encode(dirty, convert_to_tensor=True)
        embeddings3 = model.encode(corrected, convert_to_tensor=True)

        cosine_scores_clean_dirty = util.cos_sim(embeddings1, embeddings2)
        cosine_scores_clean_corrected = util.cos_sim(embeddings1, embeddings3)

        score = (cosine_scores_clean_corrected[0][0].item() - 0.2) / (0.8) - (cosine_scores_clean_dirty[0][0].item() - 0.2) / (0.8)
        print([clean, dirty, corrected,cosine_scores_clean_dirty,cosine_scores_clean_corrected, score])
        return score
    
    def get_semantic_score(self, s1, s2):
        model = SentenceTransformer('all-MiniLM-L6-v2')

        embeddings1 = model.encode(s1, convert_to_tensor=True)
        embeddings2 = model.encode(s2, convert_to_tensor=True)
        score = util.cos_sim(embeddings1, embeddings2)
        
        return (score[0][0].item() - 0.2) / (0.8)
        
        


    def get_data_cleaning_evaluation_fuzzy_semantic_sentences(self, num_metric=False):

        """
        This method evaluates data cleaning process using fuzzy metrics
        """

        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    metric_score = self.get_fuzzy_score_semantic_sentence(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1

        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy Semantic Sentences Num Precision": round(ec_p, 3),"Fuzzy Semantic Sentences Num Recall": round(ec_r, 3), "Fuzzy Semantic Sentences Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:    
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy Semantic Sentences Precision": round(ec_p, 3),"Fuzzy Semantic Sentences Recall": round(ec_r, 3), "Fuzzy Semantic Sentences F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        
        
    def get_combined_score_evaluation(self, num_metric=False):
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    
                    clean_dirty_semantic_score = self.get_semantic_score(self.error_clean_val[cell], self.error_dirty_val[cell])
                    clean_dirty_string_score = self.get_string_score_avg(self.error_clean_val[cell], self.error_dirty_val[cell])
                    clean_corrected_semantic_score = self.get_semantic_score(self.error_clean_val[cell], self.error_corrected_val[cell])
                    clean_corrected_string_score = self.get_string_score_avg(self.error_clean_val[cell], self.error_corrected_val[cell])
                    
                    combined_score = self.get_combined_score(clean_dirty_semantic_score, clean_dirty_string_score, clean_corrected_semantic_score, clean_corrected_string_score)
                    
                    ec_tp += combined_score
                    if combined_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
                        
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        return {"Combined Precision": round(ec_p, 3),"Combined Recall": round(ec_r, 3), "Combined F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
                        
                        
    def get_string_score_avg(self, s1, s2):
        score_avg = ((self.levenshteinDistanceChar(s1, s2) / max([len(s1), len(s2)])) + self.monge_elkan(s1, s2) + self.jaro_winkler_distance(s1, s2)) / 3
        return score_avg
    
    def get_combined_score(self, cd_semantic, cd_string, cc_semantic, cc_string):
        
        threshold = 0.7
        
        string_score = cc_string - cd_string
        semantic_score = cc_semantic - cd_semantic
        avg_score = (string_score + semantic_score) / 2
        
        
        #semantic score high and string score high
        if cd_semantic >= threshold and cc_string >=threshold:
            
            if cc_semantic >= threshold and cc_string >=threshold:
                return avg_score
            
            elif cc_semantic >= threshold and cc_string < threshold:
                return string_score

            elif cc_semantic < threshold and cc_string >= threshold:
                return semantic_score

            elif cc_semantic < threshold and cc_string < threshold:
                return avg_score
        
        
        #semantic score high and string score low
        elif cd_semantic >= threshold and cd_string < threshold:
            
            if cc_semantic >= threshold and cc_string >=threshold:
                return string_score
            
            elif cc_semantic >= threshold and cc_string < threshold:
                return avg_score
                
            elif cc_semantic < threshold and cc_string >= threshold:
                return avg_score 

            elif cc_semantic < threshold and cc_string < threshold:
                return semantic_score
        
        
        
        
        #semantic score low and string score high
        elif cd_semantic < threshold and cd_string >= threshold:
            
            if cc_semantic >= threshold and cc_string >=threshold:
                return semantic_score
            
            elif cc_semantic >= threshold and cc_string < threshold:
                return avg_score 

            elif cc_semantic < threshold and cc_string >= threshold:
                return avg_score 

            elif cc_semantic < threshold and cc_string < threshold:
                return string_score
        
        
        #semantic score low and string score low
        elif cd_semantic < threshold and cd_string < threshold:
            
            if cc_semantic >= threshold and cc_string >=threshold:
                return avg_score
            
            elif cc_semantic >= threshold and cc_string < threshold:
                return semantic_score

            elif cc_semantic < threshold and cc_string >= threshold:
                return string_score

            elif cc_semantic < threshold and cc_string < threshold:
                return avg_score
            
        return avg_score
        
        
        


     

In [2]:
metric_dict_hospital = {
    "clean_data_path": "../datasets/hospital/clean.csv",
    "dirty_data_path": "../datasets/hospital/dirty.csv",
    "corrected_data_path": "../datasets/hospital/holoclean_correction.csv",
    "str_attr": [2,3,6,7,9, 11, 12, 13, 14, 15, 16, 19],
    "short_str_attr": [3, 6, 7, 9, 13, 17, 18, 19],
    "long_str_attr": [2,11, 12, 14, 15, 16],
    "numer_attr": [17, 18]
    
}

m_hospital = Metrics(metric_dict_hospital)

m_hospital.print_metrics()

print(m_hospital.numer_tp)

print(m_hospital.avg_string_metric)
print(m_hospital.avg_string_semantic_metric)







['andalusia regional hospital', 'andaxusia regionax hospitax', 'huntsville hospital', -0.2601933073424302]
['33700 highway 43', '33700 highway x3', '4370 west main street', -0.373015873015873]
['1720 university blvd', '1720 univxrsity blvd', '1000 first street north', -0.4269726247987118]
['anniston', 'axxistox', 'birmingham', -0.43333333333333335]
['montgomery', 'montxomery', 'centre', -0.3277777777777777]
['calhoun', 'calhoxn', 'jefferson', -0.4021164021164021]
['surgical infection prevention', 'suxgical infection pxevention', 'pneumonia', -0.4238277686553549]
['surgical infection prevention', 'surgical ixfectiox prevextiox', 'pneumonia', -0.37785075716110206]
['hf-2', 'hf-x', 'hf-1', 0.0]
['scip-inf-4', 'scip-inx-4', 'scip-inf-3', 0.0]
['scip-inf-3', 'scxp-xnf-3', 'pn-7', -0.30000000000000004]
['scip-vte-1', 'sxip-vte-1', 'scip-inf-6', -0.19999999999999984]
['ami-5', 'ami-x', 'ami-3', 0.0]
['pn-5c', 'px-5c', 'pn-3b', -0.1333333333333333]
['cac-1', 'xax-1', 'scip-vte-2', -0.200000000

['all heart surgery patients whose blood sugar (blood glucose) is kept under good control in the days right after surgery', 'all heart surgerx patients whose blood sugar (blood glucose) is kept under good control in the daxs right after surgerx', 'surgery patients who were given the right kind of antibiotic to help prevent infection', 0.9747899159663865, 0.33613445378151263, -0.6386554621848739]
['surgery patients needing hair removed from the surgical area before surgery who had hair removed using a safer method (electric clippers or hair removal cream c not a razor)', 'surgeryxpatientsxneedingxhairxremovedxfromxthexsurgicalxareaxbeforexsurgery& xwhoxhadxhairxremovedxusingxaxsaferxmethodx(electricxclippersxorxhairxremovalxcreamxï¿½cxnotxaxrazor)', 'all heart surgery patients whose blood sugar (blood glucose) is kept under good control in the days right after surgery', 0.8202247191011236, 0.2832369942196532, -0.5369877248814704]
['heart failure patients given discharge instructions', '

['heart failure patients given discharge instructions', 'heaxt failuxe patients given dischaxge instxuctions', 'heart attack patients given pci within 90 minutes of arrival', tensor([[0.2643]]), tensor([[0.3274]]), 0.07877290248870848]
['heart attack patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd)', 'hxart attack patixnts givxn ace inhibitor or arb for lxft vxntricular systolic dysfunction (lvsd)', 'heart attack patients given aspirin at arrival', tensor([[0.6695]]), tensor([[0.4970]]), -0.21567624062299734]
['pneumonia patients assessed and given influenza vaccination', 'pneumonix pxtients assessed xnd given influenzx vxccinxtion', 'pneumonia patients given initial antibiotic(s) within 6 hours after arrival', tensor([[0.1655]]), tensor([[0.5278]]), 0.45280676335096354]
['pneumonia patients assessed and given influenza vaccination', 'pxeumoxia patiexts assessed axd givex ixfluexza vaccixatiox', 'pneumonia patients given initial antibiotic(s) within 

['all heart surgery patients whose blood sugar (blood glucose) is kept under good control in the days right after surgery', 'all heart surgerx patients whose blood sugar (blood glucose) is kept under good control in the daxs right after surgerx', 'surgery patients who were given the right kind of antibiotic to help prevent infection', 0.9747899159663865, 0.33613445378151263, -0.6386554621848739]
['surgery patients needing hair removed from the surgical area before surgery who had hair removed using a safer method (electric clippers or hair removal cream c not a razor)', 'surgeryxpatientsxneedingxhairxremovedxfromxthexsurgicalxareaxbeforexsurgery& xwhoxhadxhairxremovedxusingxaxsaferxmethodx(electricxclippersxorxhairxremovalxcreamxï¿½cxnotxaxrazor)', 'all heart surgery patients whose blood sugar (blood glucose) is kept under good control in the days right after surgery', 0.8202247191011236, 0.2832369942196532, -0.5369877248814704]
['heart failure patients given discharge instructions', '

['heart failure patients given discharge instructions', 'heaxt failuxe patients given dischaxge instxuctions', 'heart attack patients given pci within 90 minutes of arrival', tensor([[0.2643]]), tensor([[0.3274]]), 0.07877290248870848]
['heart attack patients given ace inhibitor or arb for left ventricular systolic dysfunction (lvsd)', 'hxart attack patixnts givxn ace inhibitor or arb for lxft vxntricular systolic dysfunction (lvsd)', 'heart attack patients given aspirin at arrival', tensor([[0.6695]]), tensor([[0.4970]]), -0.21567624062299734]
['pneumonia patients assessed and given influenza vaccination', 'pneumonix pxtients assessed xnd given influenzx vxccinxtion', 'pneumonia patients given initial antibiotic(s) within 6 hours after arrival', tensor([[0.1655]]), tensor([[0.5278]]), 0.45280676335096354]
['pneumonia patients assessed and given influenza vaccination', 'pxeumoxia patiexts assessed axd givex ixfluexza vaccixatiox', 'pneumonia patients given initial antibiotic(s) within 

In [3]:
metric_dict_flights = {
    "clean_data_path": "../datasets/flights/clean.csv",
    "dirty_data_path": "../datasets/flights/dirty.csv",
    "corrected_data_path": "../datasets/flights/flights_repaired_holoclean.csv",
    "str_attr": [3,4,5,6],
    "short_str_attr": [],
    "long_str_attr": [],
    "numer_attr": []
    
}

m_flights = Metrics(metric_dict_flights)

m_flights.print_metrics()




['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['8:00 p.m.', 'Thu 20:00', 'thu 20:00', 0.0]
['8:00 a.m.', 'Not Available', 'not available', 0.0]
['8:41 a.m.', 'Not Available', 'not available', 0.0]
['6:00 a.m.', 'Not Available', 'not available', 0.0]
['8:15 a.m.', 'Not Available', 'not available', 0.0]
['1:55 p.m.', 'Not Available', 'not available', 0.0]
['1:33 p.m.', 'Not Available', 'not available', 0.0]
['2:30 p.m.', 'Not Available', 'not available', 0.0]
['8:29 a.m.', 'Not Available', 'not available', 0.0]
['7:35 a.m.', 'Not Available', 'not available', 0.0]
['1:45 p.m.', 'Not Available', 'not available', 0.0]
['7:53 a.m.', 'Not Available', 'not available', 0.0]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['7:10 a.m.', '7:10aDec 1', '7:10adec 1', 0.0]
['6:40 a.m.', '6:40aDec 1', '6:40adec 1', 0.0]
['8:00 a.m.', '8:19aDec 1', '8:19adec 1', 0.0]
['8:41 a.m.', '8:41aDec 1', '8:41adec 1', 0.0]
['11:25 a.m.', '11:25aDec

['12:00 a.m.', '', '12:00 p.m.', tensor([[0.0604]]), tensor([[0.9186]]), 1.0727627063170075]
['12:00 a.m.', '', '12:00 p.m.', tensor([[0.0604]]), tensor([[0.9186]]), 1.0727627063170075]
['8:00 p.m.', 'Thu 20:00', 'thu 20:00', tensor([[0.6278]]), tensor([[0.6278]]), 0.0]
['8:00 a.m.', 'Not Available', 'not available', tensor([[0.0944]]), tensor([[0.0944]]), 0.0]
['8:41 a.m.', 'Not Available', 'not available', tensor([[0.1131]]), tensor([[0.1131]]), 0.0]
['6:00 a.m.', 'Not Available', 'not available', tensor([[0.0819]]), tensor([[0.0819]]), 0.0]
['8:15 a.m.', 'Not Available', 'not available', tensor([[0.1154]]), tensor([[0.1154]]), 0.0]
['1:55 p.m.', 'Not Available', 'not available', tensor([[0.1416]]), tensor([[0.1416]]), 0.0]
['1:33 p.m.', 'Not Available', 'not available', tensor([[0.0964]]), tensor([[0.0964]]), 0.0]
['2:30 p.m.', 'Not Available', 'not available', tensor([[0.1740]]), tensor([[0.1740]]), 0.0]
['8:29 a.m.', 'Not Available', 'not available', tensor([[0.0790]]), tensor([[0

['9:00 p.m.', '', '9:06 p.m.', tensor([[0.0996]]), tensor([[0.8895]]), 0.9874055720865726]
['8:25 p.m.', '', '8:29 p.m.', tensor([[0.1009]]), tensor([[0.9330]]), 1.040127044543624]
['1:30 p.m.', '', '11:54 a.m.', tensor([[0.0917]]), tensor([[0.6118]]), 0.6501350924372673]
['8:00 a.m.', '', '7:10 a.m.', tensor([[0.0580]]), tensor([[0.8452]]), 0.9839552640914917]
['8:25 p.m.', '', '7:27 p.m.', tensor([[0.1009]]), tensor([[0.8280]]), 0.9089192654937506]
['3:36 p.m.', '', '3:27 p.m.', tensor([[0.1057]]), tensor([[0.9260]]), 1.0254989936947823]
['11:08 p.m.', '', '11/30/2011 11:34 p.m.', tensor([[0.0974]]), tensor([[0.7818]]), 0.8554479666054249]
['11:25 p.m.', 'Not Available', 'not available', tensor([[0.1566]]), tensor([[0.1566]]), 0.0]
['6:30 p.m.', 'Not Available', 'not available', tensor([[0.1586]]), tensor([[0.1586]]), 0.0]
['7:15 p.m.', 'Not Available', 'not available', tensor([[0.1982]]), tensor([[0.1982]]), 0.0]
['11:55 p.m.', '', '12:10 a.m. (estimated runway)', tensor([[0.1098]])

['11:05 a.m.', '', '10:46 a.m.', tensor([[0.0643]]), tensor([[0.8120]]), 0.9347326215356588]
['7:22 a.m.', '', '6:56 a.m.', tensor([[0.0822]]), tensor([[0.8325]]), 0.9378513041883707]
['4:29 p.m.', '', '4:16 p.m.', tensor([[0.1064]]), tensor([[0.8686]]), 0.9527633991092443]
['6:08 a.m.', '', '5:58 a.m.', tensor([[0.0456]]), tensor([[0.7816]]), 0.920092985033989]
['7:39 a.m.', '', '7:22 a.m.', tensor([[0.0885]]), tensor([[0.9385]]), 1.0625880770385265]
['2:04 p.m.', '', '1:48 p.m.', tensor([[0.1308]]), tensor([[0.6636]]), 0.6660907343029976]
['8:25 p.m.', '', '8:29 p.m.', tensor([[0.1009]]), tensor([[0.9330]]), 1.040127044543624]
['12:12 p.m.', '', '11:54 a.m.', tensor([[0.1121]]), tensor([[0.7136]]), 0.7518422603607178]
['11:01 a.m.', '', '10:47 a.m.', tensor([[0.0794]]), tensor([[0.8159]]), 0.9206582233309746]
['4:29 p.m.', '', '4:16 p.m.', tensor([[0.1064]]), tensor([[0.8686]]), 0.9527633991092443]
['3:27 p.m.', '', '3:00 p.m.', tensor([[0.1355]]), tensor([[0.8952]]), 0.9496774151921

['11:25 p.m.', '', '11:29 p.m.', tensor([[0.0952]]), tensor([[0.9483]]), 1.0663930047303438]
['5:56 a.m.', '6:09aDec 1', '6:09adec 1', tensor([[0.4448]]), tensor([[0.4448]]), 0.0]
['12:41 a.m.', '12:41aDec 1', '12:41adec 1', tensor([[0.7300]]), tensor([[0.7300]]), 0.0]
['10:19 a.m.', '10:19aDec 1', '10:19adec 1', tensor([[0.6754]]), tensor([[0.6754]]), 0.0]
['10:10 a.m.', '9:59aDec 1', '9:59adec 1', tensor([[0.5917]]), tensor([[0.5917]]), 0.0]
['8:12 a.m.', '8:12aDec 1', '8:12adec 1', tensor([[0.6447]]), tensor([[0.6447]]), 0.0]
['7:25 a.m.', '7:04aDec 1', '7:04adec 1', tensor([[0.3151]]), tensor([[0.3151]]), 0.0]
['10:55 a.m.', '10:42aDec 1', '10:42adec 1', tensor([[0.5480]]), tensor([[0.5480]]), 0.0]
['8:00 a.m.', '8:00aDec 1', '8:00adec 1', tensor([[0.6940]]), tensor([[0.6940]]), 0.0]
['12:12 p.m.', '11:54aDec 1', '11:54adec 1', tensor([[0.5760]]), tensor([[0.5760]]), 0.0]
['11:56 a.m.', '11:56aDec 1', '11:56adec 1', tensor([[0.7375]]), tensor([[0.7375]]), 0.0]
['6:08 a.m.', '5:58aD

['6:57 a.m.', '6:41aDec 1', '6:41adec 1', tensor([[0.5850]]), tensor([[0.5850]]), 0.0]
['7:16 a.m.', '', '7:02 a.m.', tensor([[0.0960]]), tensor([[0.9023]]), 1.0078437253832817]
['6:56 a.m.', '', '5:59 a.m.', tensor([[0.0412]]), tensor([[0.8743]]), 1.0412931023165584]
['7:30 a.m.', '', '7:10 a.m.', tensor([[0.0662]]), tensor([[0.9438]]), 1.0969792678952217]
['2:44 p.m.', '', '2:32 p.m.', tensor([[0.1018]]), tensor([[0.8612]]), 0.9493326488882303]
['7:22 a.m.', '', '7:10 a.m.', tensor([[0.0822]]), tensor([[0.9257]]), 1.0544492397457361]
['8:23 p.m.', '', '8:01 p.m.', tensor([[0.1303]]), tensor([[0.9288]]), 0.9981423802673817]
['1:19 p.m.', '', '11:01 a.m.', tensor([[0.1290]]), tensor([[0.6935]]), 0.705674309283495]
['6:55 p.m.', '', '6:40 p.m.', tensor([[0.0920]]), tensor([[0.9196]]), 1.0344631876796484]
['8:04 a.m.', '', '7:28 a.m.', tensor([[0.1418]]), tensor([[0.6450]]), 0.6290188059210777]
['12:41 p.m.', '', '10:46 a.m.', tensor([[0.0927]]), tensor([[0.7063]]), 0.7670922391116619]
[

['6:55 a.m.', '', '6:55 a.m. dec 02', tensor([[0.0448]]), tensor([[0.7963]]), 0.9393700631335378]
['3:19 p.m.', '', '3:30 p.m.', tensor([[0.1432]]), tensor([[0.9426]]), 0.9992357715964317]
['8:05 p.m.', '', '7:38 p.m.', tensor([[0.0909]]), tensor([[0.8120]]), 0.9014799073338509]
['10:34 a.m.', '', '9:55 a.m.', tensor([[0.0782]]), tensor([[0.8415]]), 0.9541420824825764]
['6:55 a.m.', '6:55 a.m. Dec 02', '6:55 a.m. dec 02', tensor([[0.7963]]), tensor([[0.7963]]), 0.0]
['1:39 p.m.', '', '12:35 p.m.', tensor([[0.0996]]), tensor([[0.7535]]), 0.8173063769936562]
['8:22 p.m.', '', '8:05 p.m.', tensor([[0.1166]]), tensor([[0.8808]]), 0.9552928898483515]
['12:21 a.m.', '12:21 a.m. Dec 02', '12:21 a.m. dec 02', tensor([[0.7813]]), tensor([[0.7813]]), 0.0]
['4:17 p.m.', '', '3:55 p.m.', tensor([[0.1547]]), tensor([[0.7070]]), 0.690411813557148]
['5:50 a.m.', '5:50 a.m.Dec 02', '5:50 a.m.dec 02', tensor([[0.7460]]), tensor([[0.7460]]), 0.0]
['12:29 a.m.', '12:29 a.m. Dec 02', '12:29 a.m. dec 02', 

['8:05 p.m.', '', '7:38 p.m.', tensor([[0.0909]]), tensor([[0.8120]]), 0.9014799073338509]
['10:34 a.m.', '', '9:55 a.m.', tensor([[0.0782]]), tensor([[0.8415]]), 0.9541420824825764]
['9:40 a.m.', '9:40aDec 1', '9:40adec 1', tensor([[0.7027]]), tensor([[0.7027]]), 0.0]
['9:25 a.m.', '9:25aDec 1', '9:25adec 1', tensor([[0.6876]]), tensor([[0.6876]]), 0.0]
['11:55 a.m.', '11:55aDec 1', '11:55adec 1', tensor([[0.7644]]), tensor([[0.7644]]), 0.0]
['9:54 a.m.', '9:54aDec 1', '9:54adec 1', tensor([[0.7419]]), tensor([[0.7419]]), 0.0]
['11:19 a.m.', '11:19aDec 1', '11:19adec 1', tensor([[0.6818]]), tensor([[0.6818]]), 0.0]
['6:55 a.m.', '6:55aDec 2', '6:55adec 2', tensor([[0.7501]]), tensor([[0.7501]]), 0.0]
['7:23 a.m.', '7:23aDec 1', '7:23adec 1', tensor([[0.6394]]), tensor([[0.6394]]), 0.0]
['5:35 a.m.', '5:35aDec 1', '5:35adec 1', tensor([[0.6680]]), tensor([[0.6680]]), 0.0]
['9:21 a.m.', '9:21aDec 2', '9:21adec 2', tensor([[0.6716]]), tensor([[0.6716]]), 0.0]
['10:06 a.m.', '10:06aDec 1'

['11:53 a.m.', '', '12:18 p.m.', tensor([[0.0600]]), tensor([[0.7418]]), 0.8522216929122806]
['6:55 a.m.', '6:55 a.m. Dec 02', '6:55 a.m. dec 02', tensor([[0.7963]]), tensor([[0.7963]]), 0.0]
['12:21 a.m.', '12:21 a.m. Dec 02', '12:21 a.m. dec 02', tensor([[0.7813]]), tensor([[0.7813]]), 0.0]
['5:50 a.m.', '5:50 a.m. Dec 02', '5:50 a.m. dec 02', tensor([[0.7460]]), tensor([[0.7460]]), 0.0]
['12:29 a.m.', '12:29 a.m. Dec 02', '12:29 a.m. dec 02', tensor([[0.8037]]), tensor([[0.8037]]), 0.0]
['12:15 a.m.', '12:15 a.m. Dec 02', '12:15 a.m. dec 02', tensor([[0.7742]]), tensor([[0.7742]]), 0.0]
['9:43 a.m.', '', 'not available', tensor([[0.0496]]), tensor([[0.0962]]), 0.05824217572808266]
['5:11 a.m.', '5:11 a.m.Dec 02', '5:11 a.m.dec 02', tensor([[0.7575]]), tensor([[0.7575]]), 0.0]
['10:30 p.m.', '', '10:28 p.m.', tensor([[0.1090]]), tensor([[0.9609]]), 1.0649103112518787]
['6:55 a.m.', '', '12/2/11 6:35 a.m.', tensor([[0.0448]]), tensor([[0.7169]]), 0.8400595607236028]
['5:50 a.m.', '', 

['9:13 a.m.', '', '9:11 a.m.', tensor([[0.1161]]), tensor([[0.7710]]), 0.8186390250921249]
['3:50 p.m.', '', '3:26 p.m.', tensor([[0.1287]]), tensor([[0.8625]]), 0.9172222390770912]
['4:59 p.m.', '', '4:55 p.m.', tensor([[0.1078]]), tensor([[0.9441]]), 1.0453490912914276]
['8:45 a.m.', '', '8:33 a.m.', tensor([[0.0788]]), tensor([[0.9310]]), 1.0652840416878462]
['8:02 a.m.', '', '7:50 a.m.', tensor([[0.0637]]), tensor([[0.7654]]), 0.8771023992449045]
['2:00 p.m.', '', '1:52 p.m.', tensor([[0.1218]]), tensor([[0.6916]]), 0.7121656369417906]
['9:20 p.m.', '', '9:11 p.m.', tensor([[0.1113]]), tensor([[0.7630]]), 0.8146587945520878]
['4:44 p.m.', '', '4:22 p.m.', tensor([[0.0998]]), tensor([[0.8943]]), 0.9931770339608192]
['7:10 p.m.', '', '6:59 p.m.', tensor([[0.1237]]), tensor([[0.7886]]), 0.8310642279684544]
['3:43 p.m.', '', '3:36 p.m.', tensor([[0.1020]]), tensor([[0.9424]]), 1.050542639568448]
['5:43 p.m.', '', '5:34 p.m.', tensor([[0.0724]]), tensor([[0.9292]]), 1.0709566064178944]


['6:40 p.m.', '', '6:09 p.m.', tensor([[0.0842]]), tensor([[0.8828]]), 0.9982715733349323]
['12:29 a.m.', '', '11:56 p.m.', tensor([[0.0722]]), tensor([[0.7902]]), 0.8974344190210104]
['2:46 p.m.', '2:46P', '2:46p', tensor([[0.8040]]), tensor([[0.8040]]), 0.0]
['10:04 a.m.', '10:08A', '10:08a', tensor([[0.7422]]), tensor([[0.7422]]), 0.0]
['7:39 p.m.', '7:39P', '7:39p', tensor([[0.8269]]), tensor([[0.8269]]), 0.0]
['9:05 a.m.', '9:04A', '9:04a', tensor([[0.4743]]), tensor([[0.4743]]), 0.0]
['11:06 a.m.', '11:06A', '11:06a', tensor([[0.8907]]), tensor([[0.8907]]), 0.0]
['3:21 p.m.', '', '3:07 p.m.', tensor([[0.1354]]), tensor([[0.9085]]), 0.9662788175046444]
['3:30 p.m.', '', '3:13 p.m.', tensor([[0.1164]]), tensor([[0.9101]]), 0.992089007049799]
['11:50 p.m.', '', '11:35 p.m.', tensor([[0.1209]]), tensor([[0.9279]]), 1.0086710564792156]
['4:09 p.m.', '', '3:59 p.m.', tensor([[0.1009]]), tensor([[0.7559]]), 0.8188198693096638]
['3:04 p.m.', '', '2:52 p.m.', tensor([[0.1478]]), tensor([[

['7:39 p.m.', '', '7:35 p.m.', tensor([[0.1196]]), tensor([[0.9717]]), 1.0652140900492668]
['5:05 p.m.', '', '4:57 p.m.', tensor([[0.0655]]), tensor([[0.7645]]), 0.8737723715603352]
['1:40 p.m.', '', '1:34 p.m.', tensor([[0.0884]]), tensor([[0.8944]]), 1.0075495205819607]
['5:34 p.m.', '', '5:24 p.m.', tensor([[0.0810]]), tensor([[0.8985]]), 1.0218342393636703]
['8:53 p.m.', '', '8:38 p.m.', tensor([[0.1072]]), tensor([[0.9225]]), 1.0192061960697174]
['12:21 a.m.', 'Not Available', 'not available', tensor([[0.1121]]), tensor([[0.1121]]), 0.0]
['6:34 p.m.', '', '6:13 p.m.', tensor([[0.0751]]), tensor([[0.9301]]), 1.068724412471056]
['4:23 p.m.', '', '4:18 p.m.', tensor([[0.1308]]), tensor([[0.9267]]), 0.9949457459151745]
['5:50 a.m.', '5:42 a.m. (Estimated runway)', '5:42 a.m. (estimated runway)', tensor([[0.6216]]), tensor([[0.6216]]), 0.0]
['5:11 a.m.', '', '5:21 a.m.', tensor([[0.0612]]), tensor([[0.9103]]), 1.061293762177229]
['6:40 p.m.', '', '6:19 p.m.', tensor([[0.0842]]), tensor

['4:23 p.m.', '', '4:18 p.m.', tensor([[0.1308]]), tensor([[0.9267]]), 0.9949457459151745]
['12:29 a.m.', '12:01 a.m. (Estimated runway)', '12:01 a.m. (estimated runway)', tensor([[0.6493]]), tensor([[0.6493]]), 0.0]
['12:15 a.m.', '11:49 p.m. (Estimated runway)', '11:49 p.m. (estimated runway)', tensor([[0.5598]]), tensor([[0.5598]]), 0.0]
['10:30 p.m.', '10:30 p.m. (Estimated)', '10:30 p.m. (estimated)', tensor([[0.8336]]), tensor([[0.8336]]), 0.0]
['9:20 p.m.', '9:19 p.m. (Estimated)', '9:19 p.m. (estimated)', tensor([[0.8306]]), tensor([[0.8306]]), 0.0]
['12:21 a.m.', '12:21 a.m. (Estimated)', '12:21 a.m. (estimated)', tensor([[0.8554]]), tensor([[0.8554]]), 0.0]
['11:58 a.m.', '', '11:30 a.m.', tensor([[0.0606]]), tensor([[0.9484]]), 1.1097931955009699]
['6:43 p.m.', '', '6:11 p.m.', tensor([[0.0627]]), tensor([[0.9212]]), 1.0730744618922472]
['11:44 a.m.', '', '11:33 a.m.', tensor([[0.0621]]), tensor([[0.9209]]), 1.073436108417809]
['11:12 a.m.', '', '10:49 a.m.', tensor([[0.0779

['10:30 p.m.', '', '10:28 p.m.', tensor([[0.1090]]), tensor([[0.9609]]), 1.0649103112518787]
['6:55 a.m.', '', '12/2/11 6:35 a.m.', tensor([[0.0448]]), tensor([[0.7169]]), 0.8400595607236028]
['3:30 p.m.', '3:20 p.m. (Runway)', '3:20 p.m. (runway)', tensor([[0.6911]]), tensor([[0.6911]]), 0.0]
['12:21 a.m.', '', '12:02 a.m.', tensor([[0.0966]]), tensor([[0.9277]]), 1.038812194019556]
['5:50 a.m.', '', '5:42 a.m. (estimated runway)', tensor([[0.0566]]), tensor([[0.6216]]), 0.7062219455838203]
['12:29 a.m.', '', '11:56 p.m.', tensor([[0.0722]]), tensor([[0.7902]]), 0.8974344190210104]
['12:15 a.m.', '', '11:49 p.m.', tensor([[0.0813]]), tensor([[0.7579]]), 0.8457233291119337]
['11:50 p.m.', '', '11:35 p.m.', tensor([[0.1209]]), tensor([[0.9279]]), 1.0086710564792156]
['3:50 p.m.', '', '3:26 p.m.', tensor([[0.1287]]), tensor([[0.8625]]), 0.9172222390770912]
['5:11 a.m.', '', '5:21 a.m.', tensor([[0.0612]]), tensor([[0.9103]]), 1.061293762177229]
['6:40 p.m.', '', '6:09 p.m.', tensor([[0.0

In [4]:
metric_dict_rayyan = {
    "clean_data_path": "../datasets/rayyan/clean.csv",
    "dirty_data_path": "../datasets/rayyan/dirty.csv",
    "corrected_data_path": "../datasets/rayyan/rayyan_repaired_holoclean.csv",
    "str_attr": [1, 2, 3, 4, 8, 10],
    "short_str_attr": [2, 4, 8],
    "long_str_attr": [1, 3, 10],
    "numer_attr": [6, 7]
    
}

m_rayyan = Metrics(metric_dict_rayyan)

m_rayyan.print_metrics()




['18F-flumazenil: a _-aminobutyric acid A-specific PET radiotracer for the localization of drug-resistant temporal lobe epilepsy.', '18F-flumazenil: a �_-aminobutyric acid A-specific PET radiotracer for the localization of drug-resistant temporal lobe epilepsy.', '18f-flumazenil: a �_-aminobutyric acid a-specific pet radiotracer for the localization of drug-resistant temporal lobe epilepsy.', -0.17914965739426014]
['Mild endoplasmic reticulum stress augments the proinflammatory effect of IL-1_ in pancreatic rat _-cells via the IRE1/XBP1s pathway.', 'Mild endoplasmic reticulum stress augments the proinflammatory effect of IL-1�_ in pancreatic rat �_-cells via the IRE1�/XBP1s pathway.', 'mild endoplasmic reticulum stress augments the proinflammatory effect of il-1�_ in pancreatic rat �_-cells via the ire1�/xbp1s pathway.', -0.16690071446169008]
['Optical turbulence: weak turbulence, condensates and collapsing filaments in the nonlinear Schr_dinger equation', 'Optical turbulence: weak tur

['18F-flumazenil: a _-aminobutyric acid A-specific PET radiotracer for the localization of drug-resistant temporal lobe epilepsy.', '18F-flumazenil: a �_-aminobutyric acid A-specific PET radiotracer for the localization of drug-resistant temporal lobe epilepsy.', '18f-flumazenil: a �_-aminobutyric acid a-specific pet radiotracer for the localization of drug-resistant temporal lobe epilepsy.', 0.9921875, 0.953125, -0.0390625]
['Mild endoplasmic reticulum stress augments the proinflammatory effect of IL-1_ in pancreatic rat _-cells via the IRE1/XBP1s pathway.', 'Mild endoplasmic reticulum stress augments the proinflammatory effect of IL-1�_ in pancreatic rat �_-cells via the IRE1�/XBP1s pathway.', 'mild endoplasmic reticulum stress augments the proinflammatory effect of il-1�_ in pancreatic rat �_-cells via the ire1�/xbp1s pathway.', 0.9777777777777777, 0.9111111111111111, -0.06666666666666665]
['Optical turbulence: weak turbulence, condensates and collapsing filaments in the nonlinear S

['{"V Cus_","H Galvez","M Iriondo","J Campistol","I Mlaga","A Garc_a Cazorla"}', '{"V Cus�_","H Galvez","M Iriondo","J Campistol","I M��laga","A Garc�_a Cazorla"}', '{"v cus�_","h galvez","m iriondo","j campistol","i m��laga","a garc�_a cazorla"}', 0.95, 0.7875, -0.16249999999999998]
['{"O E Havik","P Carlbring","L G _st","T Nordgreen","G Andersson","T Furmark"}', '{"O E Havik","P Carlbring","L G ��st","T Nordgreen","G Andersson","T Furmark"}', '{"o e havik","p carlbring","l g ��st","t nordgreen","g andersson","t furmark"}', 0.9743589743589743, 0.8076923076923077, -0.16666666666666663]
['{"Anna Nidecker","Siamak Ardekani","E Mark Mahone","Firouzeh Tannazi","Jarunee Intrapiromkul","Moody Wharam","Larry J Brant","Alena Horsk"}', '{"Anna Nidecker","Siamak Ardekani","E Mark Mahone","Firouzeh Tannazi","Jarunee Intrapiromkul","Moody Wharam","Larry J Brant","Alena Horsk��"}', '{"anna nidecker","siamak ardekani","e mark mahone","firouzeh tannazi","jarunee intrapiromkul","moody wharam","larry j

['{"D. Russell","P. Linck","R. Hibbs","I. Russell","R. Muntz","N. H. Williams","B. C1 - Frn RCT-s_kningen CN - Pubmed DA - Dec DO - 10.1093/fampra/cmh612 DP - NLM ET - 2004/11/09 Hounsome","R. T. Edwards","C. Wilkinson"}', '{"D. Russell","P. Linck","R. Hibbs","I. Russell","R. Muntz","N. H. Williams","B. C1 - Fr̴n RCT-șkningen CN - Pubmed DA - Dec DO - 10.1093/fampra/cmh612 DP - NLM ET - 2004/11/09 Hounsome","R. T. Edwards","C. Wilkinson"}', '{"d. russell","p. linck","r. hibbs","i. russell","r. muntz","n. h. williams","b. c1 - fr̴n rct-șkningen cn - pubmed da - dec do - 10.1093/fampra/cmh612 dp - nlm et - 2004/11/09 hounsome","r. t. edwards","c. wilkinson"}', 0.9908675799086758, 0.8082191780821918, -0.182648401826484]
['{"T. H. Michel","W. C1 - Frn RCT-s_kningen CN - Pubmed DA - Jul 1 DP - NLM ET - 2000/06/28 Rogers","A. Wagner","H. Wittink","R. Kulich","A. Sukiennik","R. Maciewicz"}', '{"T. H. Michel","W. C1 - Fr̴n RCT-șkningen CN - Pubmed DA - Jul 1 DP - NLM ET - 2000/06/28 Rogers"

['{"Ana Maria Baptista Menezes","David Alejandro Gonzlez","Elaine Cardozo Macedo","Pedro Curi Hallal","Maria de Ftima Maia","Marli Knorst","Samuel Carvalho Dumith","Jeovany Mart_nez-Mesa","Ricardo Bica Noal","Fernando Csar Wehrmeister","Jos Roberto Jardim"}', '{"Ana Maria Baptista Menezes","David Alejandro Gonz��lez","Elaine Cardozo Macedo","Pedro Curi Hallal","Maria de F��tima Maia","Marli Knorst","Samuel Carvalho Dumith","Jeovany Mart�_nez-Mesa","Ricardo Bica Noal","Fernando C̩sar Wehrmeister","Jos̩ Roberto Jardim"}', '{"ana maria baptista menezes","david alejandro gonz��lez","elaine cardozo macedo","pedro curi hallal","maria de f��tima maia","marli knorst","samuel carvalho dumith","jeovany mart�_nez-mesa","ricardo bica noal","fernando c̩sar wehrmeister","jos̩ roberto jardim"}', 0.9732824427480916, 0.8473282442748091, -0.12595419847328249]
['{"Michiel A J Kompier","Etty G A Wielenga-Meijer","Dani_l H J Wigboldus","Toon W Taris"}', '{"Michiel A J Kompier","Etty G A Wielenga-Meijer","D

['{"N. Jakubik","L. Trzpil","F. Raciborski","U. Samolinska-Zawisza","L. Samoliski","D. Paczesny","Z. Halat","J. Marszalkowska","B. Samolinski","P. Samel-Kowalik","A. Lusawa","J. Gutowska","A. Tomaszewska","A. Walkiewicz"}', '{"N. Jakubik","L. Trzpil","F. Raciborski","U. Samolinska-Zawisza","L. Samoli̱ski","D. Paczesny","Z. Halat","J. Marszalkowska","B. Samolinski","P. Samel-Kowalik","A. Lusawa","J. Gutowska","A. Tomaszewska","A. Walkiewicz"}', '{"n. jakubik","l. trzpil","f. raciborski","u. samolinska-zawisza","l. samoli̱ski","d. paczesny","z. halat","j. marszalkowska","b. samolinski","p. samel-kowalik","a. lusawa","j. gutowska","a. tomaszewska","a. walkiewicz"}', 0.9954545454545455, 0.8590909090909091, -0.13636363636363635]
['{"N. M. Pinheiro","R. Almeida-Reis","L. Oliva","M. A. Martins","M. L. V. Oliva","O. A. Theodoro-Junior","E. A. Leick","I. F. L. C. Tibrio","C. M. Prado","R. F. Righetti","B. T. M. Oliveira"}', '{"N. M. Pinheiro","R. Almeida-Reis","L. Oliva","M. A. Martins","M. L. 

['Stomatologiia', 'Stomatologii��a', 'stomatologii��a', tensor([[1.0000]]), tensor([[1.0000]]), 0.0]
["Prvention de l'insuffisance rnale induite par les produits de contrnste iodes", "Pr̩vention de l'insuffisance r̩nale induite par les produits de contrnste iodes", "pr̩vention de l'insuffisance r̩nale induite par les produits de contrnste iodes", tensor([[1.0000]]), tensor([[1.0000]]), 0.0]
['Actas espaolas de psiquiatr_a', 'Actas espa̱olas de psiquiatr�_a', 'actas espa̱olas de psiquiatr�_a', tensor([[1.]]), tensor([[1.]]), 0.0]
['Paliativn_ Schanzova osteotomie p_i nereponibiln_ luxaci kyeln_ho kloubu u pacient s d_tskou mozkovou obrnou v adolescentn_m v_u', 'Paliativn�_ Schanzova osteotomie p��i nereponibiln�_ luxaci ky��eln�_ho kloubu u pacient�� s d��tskou mozkovou obrnou v adolescentn�_m v��u', 'paliativn�_ schanzova osteotomie p��i nereponibiln�_ luxaci ky��eln�_ho kloubu u pacient�� s d��tskou mozkovou obrnou v adolescentn�_m v��u', tensor([[0.9665]]), tensor([[0.9665]]), 0.0]
[

['{"Michela Miani","Decio L Eizirik","Laurence Ladrire","Maikel L Colli","Miriam Cnop"}', '{"Michela Miani","Decio L Eizirik","Laurence Ladri̬re","Maikel L Colli","Miriam Cnop"}', '{"michela miani","decio l eizirik","laurence ladri̬re","maikel l colli","miriam cnop"}', tensor([[1.]]), tensor([[1.]]), 0.0]
['{"R Carraro","M F Rasmussen","L Niskanen","A Harper","N Finer","S R_ssner","M Kunesova","L Van Gaal","M E J Lean","A Astrup","A Rissanen","M J Savolainen","[Collective Name] NN8022-1807 Investigators"}', '{"R Carraro","M F Rasmussen","L Niskanen","A Harper","N Finer","S R̦ssner","M Kunesova","L Van Gaal","M E J Lean","A Astrup","A Rissanen","M J Savolainen","[Collective Name] NN8022-1807 Investigators"}', '{"r carraro","m f rasmussen","l niskanen","a harper","n finer","s r̦ssner","m kunesova","l van gaal","m e j lean","a astrup","a rissanen","m j savolainen","[collective name] nn8022-1807 investigators"}', tensor([[0.9862]]), tensor([[0.9862]]), 0.0]
['{"George Davey Smith","Amanda 

['{"M Caliskan","M Karab_c_oglu","A Citak","N Uzel","K Aydin","A Nayir","D D Soysal"}', '{"M Caliskan","M Karab̦c�_oglu","A Citak","N Uzel","K Aydin","A Nayir","D D Soysal"}', '{"m caliskan","m karab̦c�_oglu","a citak","n uzel","k aydin","a nayir","d d soysal"}', tensor([[0.9952]]), tensor([[0.9952]]), 0.0]
['{"N_ria Lloberas","Marcel la Franquesa","Josep M Cruzado","Josep M Griny_","Ins Rama","Gabriela Alperovich","Immaculada Herrero-Fresneda","Joan Torras","Pepita Gimnez-Bonaf"}', '{"N̼ria Lloberas","Marcel la Franquesa","Josep M Cruzado","Josep M Griny�_","In̩s Rama","Gabriela Alperovich","Immaculada Herrero-Fresneda","Joan Torras","Pepita Gim̩nez-Bonaf̩"}', '{"n̼ria lloberas","marcel la franquesa","josep m cruzado","josep m griny�_","in̩s rama","gabriela alperovich","immaculada herrero-fresneda","joan torras","pepita gim̩nez-bonaf̩"}', tensor([[0.9787]]), tensor([[0.9787]]), 0.0]
['{"Ronaldo Fazzi","Ana Estela Haddad","Maria Salete Nahs Pires Corr_a"}', '{"Ronaldo Fazzi","Ana Estel

['{"T Reinhold","F M_ller-Riemenschneider","S N Willich"}', '{"T Reinhold","F M�_ller-Riemenschneider","S N Willich"}', '{"t reinhold","f m�_ller-riemenschneider","s n willich"}', tensor([[1.]]), tensor([[1.]]), 0.0]
['{"Sren Brostrm","Gunnar Lose"}', '{"S��ren Brostr��m","Gunnar Lose"}', '{"s��ren brostr��m","gunnar lose"}', tensor([[1.0000]]), tensor([[1.0000]]), 0.0]
['{"R Schwarz","J Ernst","M Zenger","E Br_hler","R Schmidt"}', '{"R Schwarz","J Ernst","M Zenger","E Br�_hler","R Schmidt"}', '{"r schwarz","j ernst","m zenger","e br�_hler","r schmidt"}', tensor([[1.]]), tensor([[1.]]), 0.0]
['{"Ana Maria Baptista Menezes","David Alejandro Gonzlez","Elaine Cardozo Macedo","Pedro Curi Hallal","Maria de Ftima Maia","Marli Knorst","Samuel Carvalho Dumith","Jeovany Mart_nez-Mesa","Ricardo Bica Noal","Fernando Csar Wehrmeister","Jos Roberto Jardim"}', '{"Ana Maria Baptista Menezes","David Alejandro Gonz��lez","Elaine Cardozo Macedo","Pedro Curi Hallal","Maria de F��tima Maia","Marli Knorst"

['{"Miquel S_nchez-Marr","Geoff Holmes","Joaqu_n Izquierdo","Joaquim Comas","Ioannis Athanasiadis","Karina Gibert"}', '{"Miquel S����nchez-Marr���","Geoff Holmes","Joaqu���_n Izquierdo","Joaquim Comas","Ioannis Athanasiadis","Karina Gibert"}', '{"miquel s����nchez-marr���","geoff holmes","joaqu���_n izquierdo","joaquim comas","ioannis athanasiadis","karina gibert"}', tensor([[0.9856]]), tensor([[0.9856]]), 0.0]
['{"Ge H.-Y.","Fernndez-de-las-Peas C.","Yue S.-W."}', '{"Ge H.-Y.","Fern��ndez-de-las-Pe̱as C.","Yue S.-W."}', '{"ge h.-y.","fern��ndez-de-las-pe̱as c.","yue s.-w."}', tensor([[1.0000]]), tensor([[1.0000]]), 0.0]
['{"Delia Aguado","Javier Benito","Ignacio A G_mez de Segura","Mariana Abreu","Javier Garc_a-Fernndez"}', '{"Delia Aguado","Javier Benito","Ignacio A G���_mez de Segura","Mariana Abreu","Javier Garc���_a-Fern����ndez"}', '{"delia aguado","javier benito","ignacio a g���_mez de segura","mariana abreu","javier garc���_a-fern����ndez"}', tensor([[1.0000]]), tensor([[1.0000

['18F-flumazenil: a _-aminobutyric acid A-specific PET radiotracer for the localization of drug-resistant temporal lobe epilepsy.', '18F-flumazenil: a �_-aminobutyric acid A-specific PET radiotracer for the localization of drug-resistant temporal lobe epilepsy.', '18f-flumazenil: a �_-aminobutyric acid a-specific pet radiotracer for the localization of drug-resistant temporal lobe epilepsy.', 0.9921875, 0.953125, -0.0390625]
['Mild endoplasmic reticulum stress augments the proinflammatory effect of IL-1_ in pancreatic rat _-cells via the IRE1/XBP1s pathway.', 'Mild endoplasmic reticulum stress augments the proinflammatory effect of IL-1�_ in pancreatic rat �_-cells via the IRE1�/XBP1s pathway.', 'mild endoplasmic reticulum stress augments the proinflammatory effect of il-1�_ in pancreatic rat �_-cells via the ire1�/xbp1s pathway.', 0.9777777777777777, 0.9111111111111111, -0.06666666666666665]
['Optical turbulence: weak turbulence, condensates and collapsing filaments in the nonlinear S

['{"Gerard H Koppelman","Johan C de Jongste","Liset E M Elstgeest","Henri_tte A Smit","Ulrike Gehring","Annemien Haveman-Nies","Marjan Kerkhof","Marga B M Bekkers","Alet H Wijga","Salome Scholtens"}', '{"Gerard H Koppelman","Johan C de Jongste","Liset E M Elstgeest","Henri��tte A Smit","Ulrike Gehring","Annemien Haveman-Nies","Marjan Kerkhof","Marga B M Bekkers","Alet H Wijga","Salome Scholtens"}', '{"gerard h koppelman","johan c de jongste","liset e m elstgeest","henri��tte a smit","ulrike gehring","annemien haveman-nies","marjan kerkhof","marga b m bekkers","alet h wijga","salome scholtens"}', 0.9898477157360406, 0.8426395939086294, -0.14720812182741116]
['{"C L_liger","K Neuber","J Ring","I K_hler"}', '{"C L̦liger","K Neuber","J Ring","I K̦hler"}', '{"c l̦liger","k neuber","j ring","i k̦hler"}', 0.9545454545454546, 0.7727272727272727, -0.18181818181818188]
['{"Marcus R Munaf_","Mike W Peacey","In-Uck Park"}', '{"Marcus R Munaf�_","Mike W Peacey","In-Uck Park"}', '{"marcus r munaf�_"

['{"D. Russell","P. Linck","R. Hibbs","I. Russell","R. Muntz","N. H. Williams","B. C1 - Frn RCT-s_kningen CN - Pubmed DA - Dec DO - 10.1093/fampra/cmh612 DP - NLM ET - 2004/11/09 Hounsome","R. T. Edwards","C. Wilkinson"}', '{"D. Russell","P. Linck","R. Hibbs","I. Russell","R. Muntz","N. H. Williams","B. C1 - Fr̴n RCT-șkningen CN - Pubmed DA - Dec DO - 10.1093/fampra/cmh612 DP - NLM ET - 2004/11/09 Hounsome","R. T. Edwards","C. Wilkinson"}', '{"d. russell","p. linck","r. hibbs","i. russell","r. muntz","n. h. williams","b. c1 - fr̴n rct-șkningen cn - pubmed da - dec do - 10.1093/fampra/cmh612 dp - nlm et - 2004/11/09 hounsome","r. t. edwards","c. wilkinson"}', 0.9908675799086758, 0.8082191780821918, -0.182648401826484]
['{"T. H. Michel","W. C1 - Frn RCT-s_kningen CN - Pubmed DA - Jul 1 DP - NLM ET - 2000/06/28 Rogers","A. Wagner","H. Wittink","R. Kulich","A. Sukiennik","R. Maciewicz"}', '{"T. H. Michel","W. C1 - Fr̴n RCT-șkningen CN - Pubmed DA - Jul 1 DP - NLM ET - 2000/06/28 Rogers"

['{"Ana Maria Baptista Menezes","David Alejandro Gonzlez","Elaine Cardozo Macedo","Pedro Curi Hallal","Maria de Ftima Maia","Marli Knorst","Samuel Carvalho Dumith","Jeovany Mart_nez-Mesa","Ricardo Bica Noal","Fernando Csar Wehrmeister","Jos Roberto Jardim"}', '{"Ana Maria Baptista Menezes","David Alejandro Gonz��lez","Elaine Cardozo Macedo","Pedro Curi Hallal","Maria de F��tima Maia","Marli Knorst","Samuel Carvalho Dumith","Jeovany Mart�_nez-Mesa","Ricardo Bica Noal","Fernando C̩sar Wehrmeister","Jos̩ Roberto Jardim"}', '{"ana maria baptista menezes","david alejandro gonz��lez","elaine cardozo macedo","pedro curi hallal","maria de f��tima maia","marli knorst","samuel carvalho dumith","jeovany mart�_nez-mesa","ricardo bica noal","fernando c̩sar wehrmeister","jos̩ roberto jardim"}', 0.9732824427480916, 0.8473282442748091, -0.12595419847328249]
['{"Michiel A J Kompier","Etty G A Wielenga-Meijer","Dani_l H J Wigboldus","Toon W Taris"}', '{"Michiel A J Kompier","Etty G A Wielenga-Meijer","D

['{"N. Jakubik","L. Trzpil","F. Raciborski","U. Samolinska-Zawisza","L. Samoliski","D. Paczesny","Z. Halat","J. Marszalkowska","B. Samolinski","P. Samel-Kowalik","A. Lusawa","J. Gutowska","A. Tomaszewska","A. Walkiewicz"}', '{"N. Jakubik","L. Trzpil","F. Raciborski","U. Samolinska-Zawisza","L. Samoli̱ski","D. Paczesny","Z. Halat","J. Marszalkowska","B. Samolinski","P. Samel-Kowalik","A. Lusawa","J. Gutowska","A. Tomaszewska","A. Walkiewicz"}', '{"n. jakubik","l. trzpil","f. raciborski","u. samolinska-zawisza","l. samoli̱ski","d. paczesny","z. halat","j. marszalkowska","b. samolinski","p. samel-kowalik","a. lusawa","j. gutowska","a. tomaszewska","a. walkiewicz"}', 0.9954545454545455, 0.8590909090909091, -0.13636363636363635]
['{"N. M. Pinheiro","R. Almeida-Reis","L. Oliva","M. A. Martins","M. L. V. Oliva","O. A. Theodoro-Junior","E. A. Leick","I. F. L. C. Tibrio","C. M. Prado","R. F. Righetti","B. T. M. Oliveira"}', '{"N. M. Pinheiro","R. Almeida-Reis","L. Oliva","M. A. Martins","M. L. 

['Stomatologiia', 'Stomatologii��a', 'stomatologii��a', tensor([[1.0000]]), tensor([[1.0000]]), 0.0]
["Prvention de l'insuffisance rnale induite par les produits de contrnste iodes", "Pr̩vention de l'insuffisance r̩nale induite par les produits de contrnste iodes", "pr̩vention de l'insuffisance r̩nale induite par les produits de contrnste iodes", tensor([[1.0000]]), tensor([[1.0000]]), 0.0]
['Actas espaolas de psiquiatr_a', 'Actas espa̱olas de psiquiatr�_a', 'actas espa̱olas de psiquiatr�_a', tensor([[1.]]), tensor([[1.]]), 0.0]
['Paliativn_ Schanzova osteotomie p_i nereponibiln_ luxaci kyeln_ho kloubu u pacient s d_tskou mozkovou obrnou v adolescentn_m v_u', 'Paliativn�_ Schanzova osteotomie p��i nereponibiln�_ luxaci ky��eln�_ho kloubu u pacient�� s d��tskou mozkovou obrnou v adolescentn�_m v��u', 'paliativn�_ schanzova osteotomie p��i nereponibiln�_ luxaci ky��eln�_ho kloubu u pacient�� s d��tskou mozkovou obrnou v adolescentn�_m v��u', tensor([[0.9665]]), tensor([[0.9665]]), 0.0]
[

['{"Michela Miani","Decio L Eizirik","Laurence Ladrire","Maikel L Colli","Miriam Cnop"}', '{"Michela Miani","Decio L Eizirik","Laurence Ladri̬re","Maikel L Colli","Miriam Cnop"}', '{"michela miani","decio l eizirik","laurence ladri̬re","maikel l colli","miriam cnop"}', tensor([[1.]]), tensor([[1.]]), 0.0]
['{"R Carraro","M F Rasmussen","L Niskanen","A Harper","N Finer","S R_ssner","M Kunesova","L Van Gaal","M E J Lean","A Astrup","A Rissanen","M J Savolainen","[Collective Name] NN8022-1807 Investigators"}', '{"R Carraro","M F Rasmussen","L Niskanen","A Harper","N Finer","S R̦ssner","M Kunesova","L Van Gaal","M E J Lean","A Astrup","A Rissanen","M J Savolainen","[Collective Name] NN8022-1807 Investigators"}', '{"r carraro","m f rasmussen","l niskanen","a harper","n finer","s r̦ssner","m kunesova","l van gaal","m e j lean","a astrup","a rissanen","m j savolainen","[collective name] nn8022-1807 investigators"}', tensor([[0.9862]]), tensor([[0.9862]]), 0.0]
['{"George Davey Smith","Amanda 

['{"M Caliskan","M Karab_c_oglu","A Citak","N Uzel","K Aydin","A Nayir","D D Soysal"}', '{"M Caliskan","M Karab̦c�_oglu","A Citak","N Uzel","K Aydin","A Nayir","D D Soysal"}', '{"m caliskan","m karab̦c�_oglu","a citak","n uzel","k aydin","a nayir","d d soysal"}', tensor([[0.9952]]), tensor([[0.9952]]), 0.0]
['{"N_ria Lloberas","Marcel la Franquesa","Josep M Cruzado","Josep M Griny_","Ins Rama","Gabriela Alperovich","Immaculada Herrero-Fresneda","Joan Torras","Pepita Gimnez-Bonaf"}', '{"N̼ria Lloberas","Marcel la Franquesa","Josep M Cruzado","Josep M Griny�_","In̩s Rama","Gabriela Alperovich","Immaculada Herrero-Fresneda","Joan Torras","Pepita Gim̩nez-Bonaf̩"}', '{"n̼ria lloberas","marcel la franquesa","josep m cruzado","josep m griny�_","in̩s rama","gabriela alperovich","immaculada herrero-fresneda","joan torras","pepita gim̩nez-bonaf̩"}', tensor([[0.9787]]), tensor([[0.9787]]), 0.0]
['{"Ronaldo Fazzi","Ana Estela Haddad","Maria Salete Nahs Pires Corr_a"}', '{"Ronaldo Fazzi","Ana Estel

['{"T Reinhold","F M_ller-Riemenschneider","S N Willich"}', '{"T Reinhold","F M�_ller-Riemenschneider","S N Willich"}', '{"t reinhold","f m�_ller-riemenschneider","s n willich"}', tensor([[1.]]), tensor([[1.]]), 0.0]
['{"Sren Brostrm","Gunnar Lose"}', '{"S��ren Brostr��m","Gunnar Lose"}', '{"s��ren brostr��m","gunnar lose"}', tensor([[1.0000]]), tensor([[1.0000]]), 0.0]
['{"R Schwarz","J Ernst","M Zenger","E Br_hler","R Schmidt"}', '{"R Schwarz","J Ernst","M Zenger","E Br�_hler","R Schmidt"}', '{"r schwarz","j ernst","m zenger","e br�_hler","r schmidt"}', tensor([[1.]]), tensor([[1.]]), 0.0]
['{"Ana Maria Baptista Menezes","David Alejandro Gonzlez","Elaine Cardozo Macedo","Pedro Curi Hallal","Maria de Ftima Maia","Marli Knorst","Samuel Carvalho Dumith","Jeovany Mart_nez-Mesa","Ricardo Bica Noal","Fernando Csar Wehrmeister","Jos Roberto Jardim"}', '{"Ana Maria Baptista Menezes","David Alejandro Gonz��lez","Elaine Cardozo Macedo","Pedro Curi Hallal","Maria de F��tima Maia","Marli Knorst"

['{"Miquel S_nchez-Marr","Geoff Holmes","Joaqu_n Izquierdo","Joaquim Comas","Ioannis Athanasiadis","Karina Gibert"}', '{"Miquel S����nchez-Marr���","Geoff Holmes","Joaqu���_n Izquierdo","Joaquim Comas","Ioannis Athanasiadis","Karina Gibert"}', '{"miquel s����nchez-marr���","geoff holmes","joaqu���_n izquierdo","joaquim comas","ioannis athanasiadis","karina gibert"}', tensor([[0.9856]]), tensor([[0.9856]]), 0.0]
['{"Ge H.-Y.","Fernndez-de-las-Peas C.","Yue S.-W."}', '{"Ge H.-Y.","Fern��ndez-de-las-Pe̱as C.","Yue S.-W."}', '{"ge h.-y.","fern��ndez-de-las-pe̱as c.","yue s.-w."}', tensor([[1.0000]]), tensor([[1.0000]]), 0.0]
['{"Delia Aguado","Javier Benito","Ignacio A G_mez de Segura","Mariana Abreu","Javier Garc_a-Fernndez"}', '{"Delia Aguado","Javier Benito","Ignacio A G���_mez de Segura","Mariana Abreu","Javier Garc���_a-Fern����ndez"}', '{"delia aguado","javier benito","ignacio a g���_mez de segura","mariana abreu","javier garc���_a-fern����ndez"}', tensor([[1.0000]]), tensor([[1.0000

In [5]:
metric_dict_beers = {
    "clean_data_path": "../datasets/beers/clean.csv",
    "dirty_data_path": "../datasets/beers/dirty.csv",
    "corrected_data_path": "../datasets/beers/beers_repaired_holoclean.csv",
    "str_attr":[3,4,6,5,7,9,8,10],
    "short_str_attr": [9, 10],
    "long_str_attr": [2, 3, 8],
    "numer_attr": [4, 5]
    
}

m_beers = Metrics(metric_dict_beers)

m_beers.print_metrics()

['12', '12.0 OZ.', '12.0 oz.', 0.0]
['12', '12.0 OZ.', '12.0 oz.', 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', 0.0]
['12', '12.0 OZ.', '12.0 oz.', 0.0]
['12', '12.0 OZ.', '12.0 oz.', 0.0]
['12', '12.0 OZ.', '12.0 oz.', 0.0]
['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek', 0.0]
['16', '16.0 OZ.', '16.0 oz.', 0.0]
['16', '16.0 OZ.', '16.0 oz.', 0.0]
['16', '16.0 OZ.', '16.0 oz.', 0.0]
['12', '12.0 OZ.', '12.0 oz.', 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', 0.0]
['12', '12.0 OZ.', '12.0 oz.', 0.0]
['12', '12.0 OZ.', '12.0 oz.', 0.0]
['16', '16.0 OZ.', '16.0 oz.', 0.0]
['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek', 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', 0.0]
['12', '12.0 OZ.', '12.0 oz.', 0.0]
['12', '12.0 OZ.', '12.0 oz.', 0.0]
['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek', 0.0]
['12', '12.0 oz. Silo Can', '12.0 oz. silo can', 0.0]
['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek',

['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek', tensor([[0.2980]]), tensor([[0.2980]]), 0.0]
['16', '16.0 OZ.', '16.0 oz.', tensor([[0.3884]]), tensor([[0.3884]]), 0.0]
['16', '16.0 OZ.', '16.0 oz.', tensor([[0.3884]]), tensor([[0.3884]]), 0.0]
['16', '16.0 OZ.', '16.0 oz.', tensor([[0.3884]]), tensor([[0.3884]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.

['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['16', '16.0 OZ.', '16.0 oz.', tensor([[0.3884]]), tensor([[0.3884]]), 0.0]
['16', '16.0 OZ.', '16.0 oz.', tensor([[0.3884]]), tensor([[0.3884]]), 0.0]
['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek', tensor([[0.2980]]), tensor([[0.2980]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek', tensor([[0.2980]]), tensor([[0.2980]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek', tensor([[0.2980]]), tensor([[

['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek', tensor([[0.2980]]), tensor([[0.2980]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '

['16', '16.0 oz. Alumi-Tek', '16.0 oz. alumi-tek', tensor([[0.2980]]), tensor([[0.2980]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 OZ.', '12.0 oz.', tensor([[0.4367]]), tensor([[0.4367]]), 0.0]
['12', '12.0 oz. Alumi-Tek', '12.0 oz. alumi-tek', tensor([[0.3528]]), tensor([[0.3528]]), 0.0]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]


['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['San Francisco', 'San Francisco CA', 'san francisco ca', tensor([[0.9598]]), tensor([[0.9598]]), 0.0]
['Saint Louis', 'Saint Louis MO', 'saint louis mo', tensor([[0.8876]]), tensor([[0.8876]]), 0.0]
['Madison', 'Madison WI', 'madison wi', tensor([[0.8880]]), te

['Denver', 'Denver CO', 'denver co', tensor([[0.9470]]), tensor([[0.9470]]), 0.0]
['Paonia', 'Paonia CO', 'paonia co', tensor([[0.8736]]), tensor([[0.8736]]), 0.0]
['Cincinnati', 'Cincinnati OH', 'cincinnati oh', tensor([[0.9263]]), tensor([[0.9263]]), 0.0]
['Portland', 'Portland ME', 'portland me', tensor([[0.9007]]), tensor([[0.9007]]), 0.0]
['Export', 'Export PA', 'export pa', tensor([[0.6008]]), tensor([[0.6008]]), 0.0]
['Export', 'Export PA', 'export pa', tensor([[0.6008]]), tensor([[0.6008]]), 0.0]
['Rochester', 'Rochester MI', 'rochester mi', tensor([[0.9241]]), tensor([[0.9241]]), 0.0]
['Lansdale', 'Lansdale PA', 'lansdale pa', tensor([[0.8948]]), tensor([[0.8948]]), 0.0]
['Chandler', 'Chandler AZ', 'chandler az', tensor([[0.9080]]), tensor([[0.9080]]), 0.0]
['Silverton', 'Silverton OR', 'silverton or', tensor([[0.9101]]), tensor([[0.9101]]), 0.0]
['Chico', 'Chico CA', 'chico ca', tensor([[0.9067]]), tensor([[0.9067]]), 0.0]
['Brooklyn', 'Brooklyn NY', 'brooklyn ny', tensor([[0

['NY', '', 'ny', tensor([[0.2934]]), tensor([[1.0000]]), 0.8832265436649323]
['AK', '', 'ak', tensor([[0.3642]]), tensor([[1.]]), 0.7947047799825668]
['NM', '', 'nm', tensor([[0.3021]]), tensor([[1.0000]]), 0.8724229782819748]
['MT', '', 'mt', tensor([[0.2334]]), tensor([[1.0000]]), 0.9582683257758617]
['PA', '', 'pa', tensor([[0.2107]]), tensor([[1.0000]]), 0.986633226275444]
['OR', '', 'or', tensor([[0.5177]]), tensor([[1.0000]]), 0.6028344482183456]
['OH', '', 'oh', tensor([[0.6181]]), tensor([[1.]]), 0.47741472721099854]
['TX', '', 'tx', tensor([[0.2450]]), tensor([[1.0000]]), 0.9437654539942741]
['TX', '', 'tx', tensor([[0.2450]]), tensor([[1.0000]]), 0.9437654539942741]
['NY', '', 'ny', tensor([[0.2934]]), tensor([[1.0000]]), 0.8832265436649323]
['NY', '', 'ny', tensor([[0.2934]]), tensor([[1.0000]]), 0.8832265436649323]
['IL', '', 'il', tensor([[0.3903]]), tensor([[1.]]), 0.7621133327484131]
['WI', '', 'wi', tensor([[0.2608]]), tensor([[1.0000]]), 0.9240202605724335]
['CA', '', 

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_na

['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['', 'N/A', '_nan_', tensor([[0.3131]]), tensor([[0.3353]]), 0.02771180123090744]
['San Francisco', 'San Francisco CA', 'san francisco ca', tensor([[0.9598]]), tensor([[0.9598]]), 0.0]
['Saint Louis', 'Saint Louis MO', 'saint louis mo', tensor([[0.8876]]), tensor([[0.8876]]), 0.0]
['Madison', 'Madison WI', 'madison wi', tensor([[0.8880]]), tensor([[0.8880]]), 0.0]
['Hayward', 'Hayward WI', 'hayward wi', tensor([[0.8320]]), tensor([[0.8320]]), 0.0]
['Denton', 'Denton TX', 'denton tx', tensor([[0.9419]]), tensor([[0.9419]]), 0.0]
['Asheville', 'Asheville NC', 'asheville nc', tensor([[0.9642]]), tensor([[0.9642]]), 0.0]
['Detroit', 'Detroit MI', 'detroit mi', tensor(

['Export', 'Export PA', 'export pa', tensor([[0.6008]]), tensor([[0.6008]]), 0.0]
['Export', 'Export PA', 'export pa', tensor([[0.6008]]), tensor([[0.6008]]), 0.0]
['Rochester', 'Rochester MI', 'rochester mi', tensor([[0.9241]]), tensor([[0.9241]]), 0.0]
['Lansdale', 'Lansdale PA', 'lansdale pa', tensor([[0.8948]]), tensor([[0.8948]]), 0.0]
['Chandler', 'Chandler AZ', 'chandler az', tensor([[0.9080]]), tensor([[0.9080]]), 0.0]
['Silverton', 'Silverton OR', 'silverton or', tensor([[0.9101]]), tensor([[0.9101]]), 0.0]
['Chico', 'Chico CA', 'chico ca', tensor([[0.9067]]), tensor([[0.9067]]), 0.0]
['Brooklyn', 'Brooklyn NY', 'brooklyn ny', tensor([[0.9519]]), tensor([[0.9519]]), 0.0]
['Brooklyn', 'Brooklyn NY', 'brooklyn ny', tensor([[0.9519]]), tensor([[0.9519]]), 0.0]
['Durango', 'Durango CO', 'durango co', tensor([[0.8980]]), tensor([[0.8980]]), 0.0]
['Pottstown', 'Pottstown PA', 'pottstown pa', tensor([[0.8965]]), tensor([[0.8965]]), 0.0]
['Boise', 'Boise ID', 'boise id', tensor([[0.85

['PA', '', 'pa', tensor([[0.2107]]), tensor([[1.0000]]), 0.986633226275444]
['OR', '', 'or', tensor([[0.5177]]), tensor([[1.0000]]), 0.6028344482183456]
['OH', '', 'oh', tensor([[0.6181]]), tensor([[1.]]), 0.47741472721099854]
['TX', '', 'tx', tensor([[0.2450]]), tensor([[1.0000]]), 0.9437654539942741]
['TX', '', 'tx', tensor([[0.2450]]), tensor([[1.0000]]), 0.9437654539942741]
['NY', '', 'ny', tensor([[0.2934]]), tensor([[1.0000]]), 0.8832265436649323]
['NY', '', 'ny', tensor([[0.2934]]), tensor([[1.0000]]), 0.8832265436649323]
['IL', '', 'il', tensor([[0.3903]]), tensor([[1.]]), 0.7621133327484131]
['WI', '', 'wi', tensor([[0.2608]]), tensor([[1.0000]]), 0.9240202605724335]
['CA', '', 'ca', tensor([[0.3093]]), tensor([[1.]]), 0.8634266257286072]
['CA', '', 'ca', tensor([[0.3093]]), tensor([[1.]]), 0.8634266257286072]
['OR', '', 'or', tensor([[0.5177]]), tensor([[1.0000]]), 0.6028344482183456]
['SC', '', 'sc', tensor([[0.2631]]), tensor([[1.0000]]), 0.9211630374193192]
['CO', '', 'co'

In [6]:
metric_dict_hospital = {
    "clean_data_path": "../datasets/hospital/clean.csv",
    "dirty_data_path": "../datasets/hospital/dirty.csv",
    "corrected_data_path": "../datasets/hospital/baran_repaired1.csv",
    "str_attr": [2,3,6,7,9, 11, 12, 13, 14, 15, 16, 19],
    "short_str_attr": [3, 6, 7, 9, 13, 17, 18, 19],
    "long_str_attr": [2,11, 12, 14, 15, 16],
    "numer_attr": [17, 18]
    
}

m_hospital1 = Metrics(metric_dict_hospital)

metric_dict_hospital = {
    "clean_data_path": "../datasets/hospital/clean.csv",
    "dirty_data_path": "../datasets/hospital/dirty.csv",
    "corrected_data_path": "../datasets/hospital/baran_repaired2.csv",
    "str_attr": [2,3,6,7,9, 11, 12, 13, 14, 15, 16, 19],
    "short_str_attr": [3, 6, 7, 9, 13, 17, 18, 19],
    "long_str_attr": [2,11, 12, 14, 15, 16],
    "numer_attr": [17, 18]
    
}

m_hospital2 = Metrics(metric_dict_hospital)

metric_dict_hospital = {
    "clean_data_path": "../datasets/hospital/clean.csv",
    "dirty_data_path": "../datasets/hospital/dirty.csv",
    "corrected_data_path": "../datasets/hospital/baran_repaired3.csv",
    "str_attr": [2,3,6,7,9, 11, 12, 13, 14, 15, 16, 19],
    "short_str_attr": [3, 6, 7, 9, 13, 17, 18, 19],
    "long_str_attr": [2,11, 12, 14, 15, 16],
    "numer_attr": [17, 18]
    
}

m_hospital3 = Metrics(metric_dict_hospital)

metric_dict_hospital = {
    "clean_data_path": "../datasets/hospital/clean.csv",
    "dirty_data_path": "../datasets/hospital/dirty.csv",
    "corrected_data_path": "../datasets/hospital/baran_repaired4.csv",
    "str_attr": [2,3,6,7,9, 11, 12, 13, 14, 15, 16, 19],
    "short_str_attr": [3, 6, 7, 9, 13, 17, 18, 19],
    "long_str_attr": [2,11, 12, 14, 15, 16],
    "numer_attr": [17, 18]
    
}

m_hospital4 = Metrics(metric_dict_hospital)

metric_dict_hospital = {
    "clean_data_path": "../datasets/hospital/clean.csv",
    "dirty_data_path": "../datasets/hospital/dirty.csv",
    "corrected_data_path": "../datasets/hospital/baran_repaired5.csv",
    "str_attr": [2,3,6,7,9, 11, 12, 13, 14, 15, 16, 19],
    "short_str_attr": [3, 6, 7, 9, 13, 17, 18, 19],
    "long_str_attr": [2,11, 12, 14, 15, 16],
    "numer_attr": [17, 18]
    
}

m_hospital5 = Metrics(metric_dict_hospital)

def calc_avg(m_dict_list):
    
    result = m_dict_list[0].copy()
    
    
    for i in range(0,3):
        if len(m_dict_list[i]) > 1:
            v = 0
            for j in range(0, len(m_dict_list)):
                v += list(m_dict_list[j].values())[i]
            result[list(m_dict_list[0].keys())[i]] = round(v/len(m_dict_list),3)

    print(result)
    return result
    
print("Average of 5 runs")
hospital_standard_metric_baran  = calc_avg([m_hospital1.standard_metric, m_hospital2.standard_metric, m_hospital3.standard_metric, m_hospital4.standard_metric, m_hospital5.standard_metric])
hospital_fuzzy_jw_baran  = calc_avg([m_hospital1.fuzzy_jw, m_hospital2.fuzzy_jw, m_hospital3.fuzzy_jw, m_hospital4.fuzzy_jw, m_hospital5.fuzzy_jw])
hospital_fuzzy_me_baran  = calc_avg([m_hospital1.fuzzy_me, m_hospital2.fuzzy_me, m_hospital3.fuzzy_me, m_hospital4.fuzzy_me, m_hospital5.fuzzy_me])
#hospital_fuzzy_ld_baran  = calc_avg([m_hospital1.fuzzy_ld, m_hospital2.fuzzy_ld, m_hospital3.fuzzy_ld, m_hospital4.fuzzy_ld, m_hospital5.fuzzy_ld])
#hospital_fuzzy_ld_words_baran  = calc_avg([m_hospital1.fuzzy_ld_words, m_hospital2.fuzzy_ld_words, m_hospital3.fuzzy_ld_words, m_hospital4.fuzzy_ld_words, m_hospital5.fuzzy_ld_words])
hospital_fuzzy_ld_char_baran  = calc_avg([m_hospital1.fuzzy_ld_char, m_hospital2.fuzzy_ld_char, m_hospital3.fuzzy_ld_char, m_hospital4.fuzzy_ld_char, m_hospital5.fuzzy_ld_char])
hospital_fuzzy_semantics_sentences  = calc_avg([m_hospital1.fuzzy_semantics_sentences, m_hospital2.fuzzy_semantics_sentences, m_hospital3.fuzzy_semantics_sentences, m_hospital4.fuzzy_semantics_sentences, m_hospital5.fuzzy_semantics_sentences])

print("")

hospital_fuzzy_jw_num_baran  = calc_avg([m_hospital1.fuzzy_jw_num, m_hospital2.fuzzy_jw_num, m_hospital3.fuzzy_jw_num, m_hospital4.fuzzy_jw_num, m_hospital5.fuzzy_jw_num])
hospital_fuzzy_me_num_baran  = calc_avg([m_hospital1.fuzzy_me_num, m_hospital2.fuzzy_me_num, m_hospital3.fuzzy_me_num, m_hospital4.fuzzy_me_num, m_hospital5.fuzzy_me_num])
#hospital_fuzzy_ld_num_baran  = calc_avg([m_hospital1.fuzzy_ld_num, m_hospital2.fuzzy_ld_num, m_hospital3.fuzzy_ld_num, m_hospital4.fuzzy_ld_num, m_hospital5.fuzzy_ld_num])
#hospital_fuzzy_ld_words_num_baran  = calc_avg([m_hospital1.fuzzy_ld_words_num, m_hospital2.fuzzy_ld_words_num, m_hospital3.fuzzy_ld_words_num, m_hospital4.fuzzy_ld_words_num, m_hospital5.fuzzy_ld_words_num])
hospital_fuzzy_ld_char_baran  = calc_avg([m_hospital1.fuzzy_ld_char_num, m_hospital2.fuzzy_ld_char_num, m_hospital3.fuzzy_ld_char_num, m_hospital4.fuzzy_ld_char_num, m_hospital5.fuzzy_ld_char_num])

['calhoun', 'calhoxn', 'madison', -0.2857142857142857]
['calhoun', 'caxhoun', 'madison', -0.2857142857142857]
['houston', 'housxon', 'madison', -0.2857142857142857]
['coffee', 'coffxx', 'clarke', -0.2222222222222222]
['elmore', 'elmoxe', 'clarke', -0.22222222222222232]
['elmore', 'elxore', 'clarke', -0.22222222222222232]
['fayette', 'fayexxe', 'madison', -0.3809523809523811]
['etowah', 'etowxh', 'clarke', -0.44444444444444453]
['butler', 'butxer', 'clarke', -0.3333333333333335]
['coffee', 'cxffee', 'clarke', -0.3333333333333335]
['scip-vte-1', 'sxip-vte-1', 'sxip-vtf-1', -0.06666666666666654]
['scip-vte-1', 'scip-vtx-1', 'scip-vtf-1', 0.0]
['calhoun', 'calhoxn', 'madison', 0.8571428571428572, 0.2857142857142857, -0.5714285714285715]
['calhoun', 'caxhoun', 'madison', 0.8571428571428572, 0.2857142857142857, -0.5714285714285715]
['houston', 'housxon', 'madison', 0.8571428571428572, 0.2857142857142857, -0.5714285714285715]
['coffee', 'coffxx', 'clarke', 0.6666666666666667, 0.33333333333333

['702 n main st', '702xnxmainxst', '849 south three notch street', tensor([[0.4875]]), tensor([[0.3449]]), -0.1782511547207832]
['702 n main st', '702 x maix st', '849 south three notch street', tensor([[0.6647]]), tensor([[0.3449]]), -0.39978574961423874]
['4370 west main street', '4370xwestxmainxstreet', '1108 ross clark circle', tensor([[0.5069]]), tensor([[0.4446]]), -0.07786206901073456]
['1530 u s highway 43', '1530xuxsxhighwayx43', '1256 military street south', tensor([[0.4913]]), tensor([[0.4192]]), -0.09012050926685333]
['1256 military street south', '1256 military street sxuth', '1530 u s highway 43', tensor([[0.8676]]), tensor([[0.4192]]), -0.5605513975024223]
['2505 u s highway 431 north', '2505xuxsxhighwayx431xnorth', '8000 alabama highway 69', -0.26831863788385535]
['702 n main st', '702xnxmainxst', '849 south three notch street', -0.39865689865689863]
['702 n main st', '702 x maix st', '849 south three notch street', -0.44993894993894995]
['4370 west main street', '4370x

In [8]:
calc_avg([m_hospital1.combined_metric, m_hospital2.combined_metric, m_hospital3.combined_metric, m_hospital4.combined_metric, m_hospital5.combined_metric])

{'Combined Precision': 0.888, 'Combined Recall': 0.472, 'Combined F1': 0.615, 'PC R': 5.0, 'PC F': 7.0}


{'Combined Precision': 0.888,
 'Combined Recall': 0.472,
 'Combined F1': 0.615,
 'PC R': 5.0,
 'PC F': 7.0}