In [1]:
import pandas
import html
import re
import numpy
import sys

from sentence_transformers import SentenceTransformer, util
from math import floor, ceil
#from gensim.models import KeyedVectors
#from gensim.downloader import load

#model = load('word2vec-google-news-300')

class Metrics:
    def __init__(self, metric_dict):
        """
        Load the clean, dirty and corrected datasets and compute all metrics.

        Every metric is computed eagerly here and stored on the instance.

        Args:
            metric_dict (dict): Read for the keys "clean_data_path",
                "dirty_data_path" and "corrected_data_path" (passed to
                read_csv_dataset) and "str_attr", "short_str_attr",
                "long_str_attr" and "numer_attr" (attribute classification).
                The classification values are used in `cell[1] in ...` tests,
                where cell[1] is a column position, so they are presumably
                collections of column positions -- TODO confirm with callers.
        """
        #save paths to the clean and dirty dataset
        self.clean_path = metric_dict["clean_data_path"]
        self.dirty_path = metric_dict["dirty_data_path"]
        #self.corrected_path = metric_dict["corrected_data_path"]
        
        #read csv-files of clean, dirty and corrected dataset
        self.clean_data = self.read_csv_dataset(metric_dict["clean_data_path"])
        self.dirty_data = self.read_csv_dataset(metric_dict["dirty_data_path"])
        self.corrected_data = self.read_csv_dataset(metric_dict["corrected_data_path"])
        
        #create dictionaries for erroneous cells, keyed by (row, column)
        #save clean, dirty and corrected values of erroneous cells separately
        self.error_clean_val = self.get_dataframes_difference(self.dirty_data, self.clean_data) #clean values
        self.error_dirty_val = self.get_dataframes_difference(self.clean_data, self.dirty_data) #dirty values
        #must run after error_clean_val is set: get_error_corrected_val filters by its keys
        self.error_corrected_val = self.get_error_corrected_val() #corrected values
        
        #save attribute classification
        self.str_attr = metric_dict["str_attr"]
        self.short_str_attr = metric_dict["short_str_attr"]
        self.long_str_attr = metric_dict["long_str_attr"]
        self.numer_attr = metric_dict["numer_attr"]
        
        #calculate "normal" metrics precision, recall, f1
        self.standard_metric = self.get_data_cleaning_evaluation()
        
        #summed fuzzy score of numeric cells that were changed but not corrected exactly;
        #the "*_num" metric variants below add this value to their score
        self.numer_tp = self.get_numer_tp()
        
        #calculate and save fuzzy metrics
        #self.fuzzy_alt_metric = self.get_data_cleaning_evaluation_fuzzy_alt()
        self.fuzzy_jw = self.get_data_cleaning_evaluation_fuzzy_JW()
        self.fuzzy_me = self.get_data_cleaning_evaluation_fuzzy_ME()
        #self.fuzzy_ld_words = self.get_data_cleaning_evaluation_fuzzy_LD_Words() 
        self.fuzzy_ld_char = self.get_data_cleaning_evaluation_fuzzy_LD_Char() 
        #self.fuzzy_ld = self.get_data_cleaning_evaluation_fuzzy_LD() if self.short_str_attr or self.long_str_attr else {"LD Message": "short or long string attributes not declared"}
        self.fuzzy_semantics_sentences = self.get_data_cleaning_evaluation_fuzzy_semantic_sentences()
        
        #fuzzy metrics combined with outlier metric
        #only computed when numeric attributes are declared; the "*_num" attributes do not exist otherwise
        if self.numer_attr:
            #self.fuzzy_alt_num_metric = self.get_data_cleaning_evaluation_fuzzy_alt(True)
            self.fuzzy_jw_num = self.get_data_cleaning_evaluation_fuzzy_JW(True)
            self.fuzzy_me_num = self.get_data_cleaning_evaluation_fuzzy_ME(True)
            #self.fuzzy_ld_words_num = self.get_data_cleaning_evaluation_fuzzy_LD_Words(True)
            self.fuzzy_ld_char_num = self.get_data_cleaning_evaluation_fuzzy_LD_Char(True)
            #self.fuzzy_ld_num = self.get_data_cleaning_evaluation_fuzzy_LD(True) if self.short_str_attr or self.long_str_attr else {"LD Num Message": "short or long string attributes not declared"}
            self.fuzzy_semantics_sentences_num = self.get_data_cleaning_evaluation_fuzzy_semantic_sentences(True)
            
        #average metrics
        #avg_string_semantic_metric reads avg_string_metric, so the order matters
        self.avg_string_metric = self.get_string_metric_avg()
        self.avg_string_semantic_metric = self.get_string_semantic_metric_avg()
        
        #combined evaluation
        self.combined_metric = self.get_combined_score_evaluation()

        
    def print_metrics(self):
        print(self.combined_metric)
        print("")
        print(self.standard_metric)
        #print(self.fuzzy_alt_metric)
        print(self.fuzzy_jw)
        print(self.fuzzy_me)
        #print(self.fuzzy_ld_words)
        print(self.fuzzy_ld_char)
        #print(self.fuzzy_ld)
        #print(self.fuzzy_semantics_words)
        print(self.fuzzy_semantics_sentences)
        print("")
        
        if self.numer_attr:
            #print(self.fuzzy_alt_num_metric)
            print(self.fuzzy_jw_num)
            print(self.fuzzy_me_num)
            #print(self.fuzzy_ld_words_num)
            print(self.fuzzy_ld_char_num)
            #print(self.fuzzy_ld_num)
            #print(self.fuzzy_semantics_words_num)
            print(self.fuzzy_semantics_sentences_num)
            
        else:
            print({"Num Metrics": "numeric attributes not declared"})

        
    def read_csv_dataset(self, dataset_path):
        """
        This method reads a dataset from a csv file path.
        """
        dataframe = pandas.read_csv(dataset_path, sep=",", header="infer", encoding="utf-8", dtype=str,
                                    keep_default_na=False, low_memory=False).applymap(self.value_normalizer)
        return dataframe
    
    @staticmethod
    def value_normalizer(value):
        """
        This method takes a value and minimally normalizes it.
        """
        value = html.unescape(value)
        value = re.sub("[\t\n ]+", " ", value, re.UNICODE)
        value = value.strip("\t\n ")
        return value
    
    def get_string_metric_avg(self):
        if True:
            ld_words_char_avg_p = list(self.fuzzy_ld_char.values())[0]
            ld_words_char_avg_r = list(self.fuzzy_ld_char.values())[1]
            ld_words_char_avg_f1 = list(self.fuzzy_ld_char.values())[2]
            avg_precision = (list(self.fuzzy_jw.values()))[0] + (list(self.fuzzy_me.values()))[0] +  ld_words_char_avg_p / 3
            avg_recall = (list(self.fuzzy_jw.values()))[1] + (list(self.fuzzy_me.values()))[1] +  ld_words_char_avg_r / 3
            avg_f1 = (list(self.fuzzy_jw.values()))[2] + (list(self.fuzzy_me.values()))[2] +  ld_words_char_avg_f1 / 3
        
        else:
            avg_precision = (list(self.fuzzy_jw.values())[0] + list(self.fuzzy_me.values())[0] + list(self.fuzzy_ld.values())[0])/3
            avg_recall = (list(self.fuzzy_jw.values())[1] + list(self.fuzzy_me.values())[1] + list(self.fuzzy_ld.values())[1])/3
            avg_f1 = (list(self.fuzzy_jw.values())[2] + list(self.fuzzy_me.values())[2] + list(self.fuzzy_ld.values())[0])/3

        return {"Average of String Metrics Precision": round(avg_precision,3), "Average of String Metrics Recall": round(avg_recall,3), "Average of String Metrics F1": round(avg_f1,3)}
        
    def get_string_semantic_metric_avg(self):
        avg_precision = (list(self.avg_string_metric.values())[0] + list(self.fuzzy_semantics_sentences.values())[0]) / 2
        avg_recall = (list(self.avg_string_metric.values())[1] + list(self.fuzzy_semantics_sentences.values())[1]) / 2
        avg_f1 = (list(self.avg_string_metric.values())[2] + list(self.fuzzy_semantics_sentences.values())[2]) / 2
        
        return {"Average of String and Semantics Metrics Precision": round(avg_precision,3), "Average of String and Semantics Metrics Recall": round(avg_recall,3), "Average of String and Semantics Metrics F1": round(avg_f1,3)}
        
        
    
    def get_dataframes_difference(self, dataframe_1, dataframe_2):
        """
        Compare two dataframes cell by cell and return the differing cells.

        Args:
            dataframe_1 (pandas.DataFrame): Reference dataframe.
            dataframe_2 (pandas.DataFrame): Dataframe whose values are reported.

        Returns:
            dict: Maps (row position, column position) of every cell whose
            value differs to the value of that cell in dataframe_2. Cells are
            listed column by column, rows ascending within a column.

        A warning is written to stderr when the shapes differ; the comparison
        is still attempted afterwards.
        """
        if dataframe_1.shape != dataframe_2.shape:
            sys.stderr.write("Two compared datasets do not have equal sizes!\n")

        # Work on the boolean mask directly. Keys are positions, matching the
        # positional iloc lookup below even when the index is not 0..n-1, and
        # no cell is lost through an intermediate NaN test.
        differs = dataframe_1.values != dataframe_2.values

        difference_dictionary = {}
        for j in range(dataframe_1.shape[1]):
            for i in numpy.flatnonzero(differs[:, j]).tolist():
                difference_dictionary[(i, j)] = dataframe_2.iloc[i, j]
        return difference_dictionary
    
    def get_error_corrected_val(self):
        correction_dict = self.get_dataframes_difference(self.dirty_data, self.corrected_data)
        for key in list(correction_dict):
            if key not in self.error_clean_val:
                del correction_dict[key]
    
        return correction_dict
    
  
    def jaro_winkler_distance(self, s1, s2):
        """
        Compute the Jaro similarity of two strings.

        Despite the method name, the value is a similarity (1.0 for equal
        strings, 0.0 when no characters match) and no Winkler prefix
        adjustment is applied.

        Args:
            s1, s2 (str): Strings to compare.

        Returns:
            float: Similarity between 0.0 and 1.0.
        """
        if s1 == s2:
            return 1.0

        size1, size2 = len(s1), len(s2)

        # Characters only match when their positions are at most this far apart.
        window = floor(max(size1, size2) / 2) - 1

        matched1 = [False] * size1
        matched2 = [False] * size2
        matches = 0

        for pos1, char1 in enumerate(s1):
            low = max(0, pos1 - window)
            high = min(size2, pos1 + window + 1)
            for pos2 in range(low, high):
                # Take the first unclaimed equal character inside the window.
                if not matched2[pos2] and s2[pos2] == char1:
                    matched1[pos1] = True
                    matched2[pos2] = True
                    matches += 1
                    break

        if matches == 0:
            return 0.0

        # Matched characters in their original order; pairs that disagree are
        # half-transpositions.
        sequence1 = [char for char, used in zip(s1, matched1) if used]
        sequence2 = [char for char, used in zip(s2, matched2) if used]
        transpositions = sum(1 for a, b in zip(sequence1, sequence2) if a != b) // 2

        return (matches / size1 + matches / size2 +
                (matches - transpositions) / matches) / 3.0
    

    def jaro_winkler_distance_fuzzy(self, clean, dirty, corrected):
        jw_clean_dirty = self.jaro_winkler_distance(clean, dirty)
        jw_clean_corrected = self.jaro_winkler_distance(clean, corrected)


        return jw_clean_corrected - jw_clean_dirty
    
    
    def get_data_cleaning_evaluation_fuzzy_JW(self, num_metric=False):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ec_tp = 0.0
        output_size = 0.0
        
        for cell in self.error_corrected_val:
            output_size += 1
            if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                ec_tp += 1.0
            elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                metric_score = self.jaro_winkler_distance_fuzzy(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                ec_tp += metric_score
                print([self.error_clean_val[cell] ,  self.error_dirty_val[cell],  self.error_corrected_val[cell], metric_score])
                if metric_score >= 0:
                    pc_r += 1
                else:
                    pc_f += 1 
        
        
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy JW Num Precision": round(ec_p, 3),"Fuzzy JW Num Recall": round(ec_r, 3), "Fuzzy JW Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy JW Precision": round(ec_p, 3),"Fuzzy JW Recall": round(ec_r, 3), "Fuzzy JW F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}

    def get_data_cleaning_evaluation(self):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                ec_tp += 1.0
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        return {"Precision": round(ec_p, 3),"Recall": round(ec_r, 3), "F1": round(ec_f, 3), "Amount of fixed data errors": output_size}
    
    def get_single_outlier_score(self, clean, dirty, corrected):
        """
        Score how far a corrected number moved from the dirty towards the clean one.

        Args:
            clean, dirty, corrected: Values convertible with int().

        Returns:
            1 - |clean - corrected| / |clean - dirty| when that is
            non-negative, otherwise -1 (the correction is further from the
            clean value than the dirty value was). When clean and dirty are
            equal the ratio is undefined: the result is 1 if corrected equals
            clean as well, otherwise -1.
        """
        clean_number = int(clean)
        dirty_error = abs(clean_number - int(dirty))
        corrected_error = abs(clean_number - int(corrected))

        if dirty_error == 0:
            # Nothing was wrong with this number; any change only made it worse.
            return 1 if corrected_error == 0 else -1

        score = 1 - (corrected_error / dirty_error)
        return score if score >= 0 else -1
        
    def get_fuzzy_score_outlier(self, clean, dirty, corrected):
        """
        Average get_single_outlier_score over the digit runs of the three values.

        NOTE(review): a second method named get_fuzzy_score_outlier is defined
        further down in this class body. The later definition replaces this
        one, so calls to self.get_fuzzy_score_outlier do not reach this
        implementation. Decide which version is intended and delete the other.

        NOTE(review): when none of the three values contains a digit, all
        three lists are empty, the length check passes and the final division
        divides by zero.

        Returns:
            0 when the values contain different numbers of digit runs,
            otherwise the mean of the per-run scores.
        """
        # Each run of consecutive digits becomes one token; signs and decimal
        # points are not part of a token.
        clean = re.findall(r'\d+', clean)
        dirty = re.findall(r'\d+', dirty)
        corrected = re.findall(r'\d+', corrected)

        # Tokens are compared pairwise, so all three values need the same token count.
        if len(clean) != len(dirty) or len(dirty) != len(corrected) or len(corrected) != len(clean):
            return 0

        count = 0

        for (o,d,c) in  zip(clean, dirty, corrected):
            count += self.get_single_outlier_score(o, d, c)

        return count/len(clean)
    
    def get_numer_tp(self):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """

        ec_tp = 0.0
        for cell in self.error_corrected_val:
            if cell in self.error_clean_val:
                if cell[1] in self.numer_attr and self.error_corrected_val[cell] != self.error_clean_val[cell]:
                    ec_tp += self.get_fuzzy_score_outlier(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    
        return ec_tp 
    
    
    def get_data_cleaning_evaluation_fuzzy_alt(self, num_metric=False):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """

        ed_tp = 0.0
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                ed_tp += 1.0
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif cell[1] in self.numer_attr and num_metric:
                    ec_tp += self.get_fuzzy_score_outlier(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                elif cell[1] in self.str_attr:
                    ec_tp += self.get_fuzzy_score_string_alt(self.error_corrected_val[cell], self.error_clean_val[cell], self.error_corrected_val[cell])
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        
        if num_metric:
            return {"Fuzzy Alt Num Precision": round(ec_p, 3),"Fuzzy Alt Num Recall": round(ec_r, 3), "Fuzzy Alt Num F1": round(ec_f, 3)}
        else:
            return {"Fuzzy Alt Precision": round(ec_p, 3),"Fuzzy Alt Recall": round(ec_r, 3), "Fuzzy Alt F1": round(ec_f, 3)}
    
    def get_fuzzy_score_string_alt(self, clean, dirty, corrected):
        """
        Score a string correction position by position.

        Only values of equal length are compared. Each position that was wrong
        in the dirty value and is right in the corrected value adds one point;
        each position that was right in the dirty value but was changed
        subtracts one.

        Returns:
            0 when the lengths differ or the net points are not positive,
            otherwise net points divided by the number of wrong positions in
            the dirty value.
        """
        if not (len(clean) == len(corrected) == len(dirty)):
            return 0

        wrong_positions = 0.0
        net_repairs = 0.0
        for position in range(len(clean)):
            clean_char = clean[position]
            dirty_char = dirty[position]
            corrected_char = corrected[position]

            if clean_char == dirty_char:
                # The position was fine; touching it is penalized.
                if corrected_char != dirty_char:
                    net_repairs -= 1
                continue

            wrong_positions += 1
            if clean_char == corrected_char:
                net_repairs += 1

        return net_repairs / wrong_positions if net_repairs > 0 else 0
    
    def get_data_cleaning_evaluation_fuzzy_ME(self, num_metric=False):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    metric_score = self.monge_elkan_distance_fuzzy([self.error_corrected_val[cell]], [self.error_clean_val[cell]], [self.error_dirty_val[cell]])
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
    
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy ME Num Precision": round(ec_p, 3),"Fuzzy ME Num Recall": round(ec_r, 3), "Fuzzy ME Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy ME Precision": round(ec_p, 3),"Fuzzy ME Recall": round(ec_r, 3), "Fuzzy ME F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        
        
        
    
    def monge_elkan(self, bag1, bag2):
        """
        Compute Monge-Elkan similarity measure between two bags (lists).

        The Monge-Elkan similarity measure is a type of Hybrid similarity measure that combine the benefits of
        sequence-based and set-based methods. This can be effective for domains in which more control is needed
        over the similarity measure. It implicitly uses a secondary similarity measure, such as levenshtein to compute
        over all similarity score.

        Args:
            bag1,bag2 (list): Input lists

            sim_func (function): Secondary similarity function. This is expected to be a sequence-based
                similarity measure (defaults to levenshtein)

        Returns:
            Monge-Elkan similarity score (float)

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None


        Examples:
            >>> monge_elkan(['Niall'], ['Neal'])
            0.8049999999999999
            >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.8677218614718616
            >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=needleman_wunsch)
            2.0
            >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=affine)
            2.25
            >>> monge_elkan([''], ['a'])
            0.0
            >>> monge_elkan(['Niall'], ['Nigel'])
            0.7866666666666667

        References:
            * Principles of Data Integration book
        """

        # if exact match return 1.0
        if bag1 == bag2:
            return 1.0
        # if one of the strings is empty return 0
        if (len(bag1) == 0) or (len(bag2) == 0):
            return 0
        # aggregated sum of all the max sim score of all the elements in bag1
        # with elements in bag2
        sum_of_maxes = 0
        for t1 in bag1:
            max_sim = float('-inf')
            for t2 in bag2:
                max_sim = max(max_sim, self.jaro_winkler_distance(t1, t2))
            sum_of_maxes += max_sim
        sim = float(sum_of_maxes) / float(len(bag1))
        return 1 - sim


    def monge_elkan_distance_fuzzy(self, clean, dirty, corrected):
        me_clean_dirty = self.monge_elkan(clean, dirty)
        me_clean_corrected = self.monge_elkan(clean, corrected)
        #print(monge_elkan(clean, dirty))
        #print(monge_elkan(clean, corrected))


        return me_clean_corrected - me_clean_dirty
        
    def get_data_cleaning_evaluation_fuzzy_LD(self, num_metric=False):
        
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ed_tp = 0.0
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                ed_tp += 1.0
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                    
                elif cell[1] in self.numer_attr and num_metric:
                    ec_tp += self.get_fuzzy_score_outlier(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    
                elif cell[1] in self.long_str_attr:
                    metric_score = self.fuzzy_LD_Words(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])       
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
                        
                elif cell[1] in self.short_str_attr:
                    metric_score = self.fuzzy_LD_Char(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
                        
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        
        if num_metric:
            return {"Fuzzy LD Num Precision": round(ec_p, 3),"Fuzzy LD Recall": round(ec_r, 3), "Fuzzy LD F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            return {"Fuzzy LD Precision": round(ec_p, 3),"Fuzzy LD Recall": round(ec_r, 3), "Fuzzy LD F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
    

    def get_data_cleaning_evaluation_fuzzy_LD_Char(self, num_metric=False):
        
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                ec_tp += 1.0
            elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                metric_score = self.fuzzy_LD_Char(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                ec_tp += metric_score
                #print([self.error_clean_val[cell] ,  self.error_dirty_val[cell],  self.error_corrected_val[cell], metric_score])
                if metric_score >= 0:
                    pc_r += 1
                else:
                    pc_f += 1
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Char Num Precision": round(ec_p, 3),"Fuzzy LD Char Num Recall": round(ec_r, 3), "Fuzzy LD Char Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else: 
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Char Precision": round(ec_p, 3),"Fuzzy LD Char Recall": round(ec_r, 3), "Fuzzy LD Char F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
    
    def get_data_cleaning_evaluation_fuzzy_LD_Words(self, num_metric=False):
        
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ed_tp = 0.0
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                ed_tp += 1.0
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    metric_score = self.fuzzy_LD_Words(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])       
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
        
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Words Num Precision": round(ec_p, 3),"Fuzzy LD Words Num Recall": round(ec_r, 3), "Fuzzy LD Words Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Words Precision": round(ec_p, 3),"Fuzzy LD Words Recall": round(ec_r, 3), "Fuzzy LD Words F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}


    def fuzzy_LD_Char(self, clean, dirty, corrected):
        LD_clean_dirty =  self.levenshteinDistanceChar(clean, dirty)
        LD_clean_corrected = self.levenshteinDistanceChar(clean, corrected)
        print([clean, dirty, corrected, LD_clean_dirty, LD_clean_corrected,(LD_clean_corrected-LD_clean_dirty)])

        return (LD_clean_corrected-LD_clean_dirty)
    
    
    def fuzzy_LD_Words(self, clean, dirty, corrected):
        LD_clean_dirty =  self.levenshteinDistanceWords(clean, dirty)
        LD_clean_corrected = self.levenshteinDistanceWords(clean, corrected, (LD_clean_dirty - LD_clean_corrected) / max([len(clean), len(dirty)]))

       
        return (LD_clean_dirty - LD_clean_corrected) / max([len(clean), len(dirty)])
        
    def levenshteinDistanceWords(self, token1, token2):
        """
        Compute the Levenshtein distance between two strings, counted in words.

        Both strings are split on whitespace; inserting, deleting or replacing
        one word costs 1.

        Returns:
            numpy.float64: The edit distance between the two word sequences.
        """
        words1 = token1.split()
        words2 = token2.split()
        row_count = len(words1) + 1
        column_count = len(words2) + 1

        table = numpy.zeros((row_count, column_count))
        # Distance from or to the empty sequence is the number of words.
        table[:, 0] = numpy.arange(row_count)
        table[0, :] = numpy.arange(column_count)

        for row in range(1, row_count):
            for column in range(1, column_count):
                if words1[row - 1] == words2[column - 1]:
                    table[row][column] = table[row - 1][column - 1]
                    continue
                cheapest = min(
                    table[row][column - 1],
                    table[row - 1][column],
                    table[row - 1][column - 1],
                )
                table[row][column] = cheapest + 1

        return table[row_count - 1][column_count - 1]

    def levenshteinDistanceChar(self, token1, token2):
        """
        Compute a normalized character-level Levenshtein similarity.

        Despite the method name, the value is a similarity:
        1 - (edit distance / length of the longer string).

        Args:
            token1, token2 (str): Strings to compare.

        Returns:
            1.0 for identical strings (including two empty strings), 0.0 when
            every character of the longer string has to be edited.
        """
        longest = max([len(token1), len(token2)])
        if longest == 0:
            # Two empty strings are identical; dividing by their length would be 0/0.
            return 1.0

        distances = numpy.zeros((len(token1) + 1, len(token2) + 1))

        # Distance from or to the empty string is the number of characters.
        for t1 in range(len(token1) + 1):
            distances[t1][0] = t1

        for t2 in range(len(token2) + 1):
            distances[0][t2] = t2

        for t1 in range(1, len(token1) + 1):
            for t2 in range(1, len(token2) + 1):
                if (token1[t1-1] == token2[t2-1]):
                    distances[t1][t2] = distances[t1 - 1][t2 - 1]
                else:
                    # Cheapest of insertion, deletion and substitution.
                    distances[t1][t2] = min(
                        distances[t1][t2 - 1],
                        distances[t1 - 1][t2],
                        distances[t1 - 1][t2 - 1],
                    ) + 1

        return 1 - ((distances[len(token1)][len(token2)])/longest)
    
    def get_fuzzy_score_outlier(self, clean, dirty, corrected):
        """
        Score a numeric correction from the digit runs of the three values.

        Each run of consecutive digits is one number; signs and decimal
        points are not part of a number. For each number the ratio
        |clean - corrected| / |clean - dirty| is added when the dirty number
        was wrong, and added a second time when the corrected number is
        closer to the clean one than the dirty number was.

        NOTE(review): as written, a larger remaining error gives a larger
        score and a perfectly corrected number adds nothing, although callers
        add the result to their true-positive score. This looks inverted
        (compare get_single_outlier_score, which uses 1 - ratio) -- confirm
        the intended formula. The scoring is deliberately left unchanged.

        Returns:
            0 when the values contain different numbers of digit runs or no
            digits at all, otherwise the summed ratios divided by the number
            of digit runs.
        """
        clean = re.findall(r'\d+', clean)
        dirty = re.findall(r'\d+', dirty)
        corrected = re.findall(r'\d+', corrected)

        if not (len(clean) == len(dirty) == len(corrected)):
            return 0
        if not clean:
            # No numbers to compare; avoids dividing by zero below.
            return 0

        count = 0

        for (o, d, c) in zip(clean, dirty, corrected):
            dirty_error = abs(int(o) - int(d))
            corrected_error = abs(int(o) - int(c))
            if dirty_error != 0:
                count += corrected_error / dirty_error
            # corrected_error < dirty_error implies dirty_error > 0.
            if corrected_error < dirty_error:
                count += corrected_error / dirty_error

        return count/len(clean)
    
    
    def get_fuzzy_score_semantic_sentence(self, clean, dirty, corrected):
        """
        Measure how much closer the corrected value is to the clean value semantically.

        The three values are embedded with the 'all-MiniLM-L6-v2' sentence
        transformer. Both cosine similarities are rescaled with
        (cos - 0.2) / 0.8 before they are subtracted.

        Returns:
            float: Rescaled similarity of (clean, corrected) minus rescaled
            similarity of (clean, dirty).
        """
        # Loading the model is expensive and this method runs once per
        # partially corrected cell, so the model is created once per instance.
        model = getattr(self, "_sentence_model", None)
        if model is None:
            model = SentenceTransformer('all-MiniLM-L6-v2')
            self._sentence_model = model

        embeddings1 = model.encode(clean, convert_to_tensor=True)
        embeddings2 = model.encode(dirty, convert_to_tensor=True)
        embeddings3 = model.encode(corrected, convert_to_tensor=True)

        cosine_scores_clean_dirty = util.cos_sim(embeddings1, embeddings2)
        cosine_scores_clean_corrected = util.cos_sim(embeddings1, embeddings3)

        return (cosine_scores_clean_corrected[0][0].item() - 0.2) / (0.8) - (cosine_scores_clean_dirty[0][0].item() - 0.2) / (0.8)
    
    def get_semantic_score(self, s1, s2):
        """
        Calculate the rescaled semantic similarity of two values.

        Both values are embedded with the 'all-MiniLM-L6-v2' sentence transformer
        and the cosine similarity x of the embeddings is rescaled with
        (x - 0.2) / 0.8, so that 0.2 maps to 0 and 1.0 maps to 1.

        Parameters:
            s1: first value
            s2: second value

        Returns:
            float, rescaled cosine similarity (negative for similarities below 0.2)
        """
        #creating the sentence transformer is expensive and this method runs several
        #times per scored cell, so the model is created once per object and reused
        model = getattr(self, "_sentence_model", None)
        if model is None:
            model = SentenceTransformer('all-MiniLM-L6-v2')
            self._sentence_model = model

        embeddings1 = model.encode(s1, convert_to_tensor=True)
        embeddings2 = model.encode(s2, convert_to_tensor=True)
        similarity = util.cos_sim(embeddings1, embeddings2)[0][0].item()

        return (similarity - 0.2) / (0.8)
        
        


    def get_data_cleaning_evaluation_fuzzy_semantic_sentences(self, num_metric=False):

        """
        Evaluate the data cleaning process with the sentence-embedding fuzzy metric.

        Exactly repaired cells count as one true positive. Wrongly repaired cells
        of string attributes contribute their (possibly negative) fuzzy semantic
        score instead. If num_metric is set, attributes listed in numer_attr are
        left out of the semantic scoring and self.numer_tp is added to the true
        positives.

        Returns a dictionary with precision, recall and F1 (rounded to 3 digits)
        as well as the amount of right ("PC R") and false ("PC F") partial
        corrections.
        """

        right_partial = 0.0 #partial corrections with a score >= 0
        false_partial = 0.0 #partial corrections with a negative score
        true_positives = 0.0
        output_size = 0.0

        for cell in self.error_corrected_val:
            output_size += 1

            #only cells that are actual data errors can be (partially) correct
            if cell not in self.error_clean_val:
                continue

            corrected_value = self.error_corrected_val[cell]
            clean_value = self.error_clean_val[cell]
            if corrected_value == clean_value:
                true_positives += 1.0
                continue

            #fuzzy scoring is limited to string attributes; with num_metric the
            #numeric attributes are accounted for by numer_tp instead
            attribute = cell[1]
            if attribute not in self.str_attr:
                continue
            if num_metric and attribute in self.numer_attr:
                continue

            metric_score = self.get_fuzzy_score_semantic_sentence(clean_value, self.error_dirty_val[cell], corrected_value)
            true_positives += metric_score
            if metric_score >= 0:
                right_partial += 1
            else:
                false_partial += 1

        if num_metric:
            true_positives += self.numer_tp
            precision_key = "Fuzzy Semantic Sentences Num Precision"
            recall_key = "Fuzzy Semantic Sentences Num Recall"
            f1_key = "Fuzzy Semantic Sentences Num F1"
        else:
            precision_key = "Fuzzy Semantic Sentences Precision"
            recall_key = "Fuzzy Semantic Sentences Recall"
            f1_key = "Fuzzy Semantic Sentences F1"

        precision = 0.0
        if output_size != 0:
            precision = true_positives / output_size

        recall = 0.0
        if len(self.error_clean_val) != 0:
            recall = true_positives / len(self.error_clean_val)

        f1 = 0.0
        if (precision + recall) != 0.0:
            f1 = (2 * precision * recall) / (precision + recall)

        return {precision_key: round(precision, 3), recall_key: round(recall, 3), f1_key: round(f1, 3), "PC R": right_partial, "PC F": false_partial}
        
        
    def get_combined_score_evaluation(self, num_metric=False):
        """
        Evaluate the data cleaning process with the combined string/semantic score.

        Exactly repaired cells count as one true positive. Wrongly repaired cells
        of string attributes contribute their (possibly negative) combined score,
        which is built from the semantic and the averaged string similarity of
        clean/dirty and clean/corrected value. If num_metric is set, attributes
        listed in numer_attr are left out of the scoring; no numeric true
        positives are added in this method.

        Returns a dictionary with precision, recall and F1 (rounded to 3 digits)
        as well as the amount of right ("PC R") and false ("PC F") partial
        corrections.
        """
        right_partial = 0.0 #partial corrections with a score >= 0
        false_partial = 0.0 #partial corrections with a negative score
        true_positives = 0.0
        output_size = 0.0

        for cell in self.error_corrected_val:
            output_size += 1

            #only cells that are actual data errors can be (partially) correct
            if cell not in self.error_clean_val:
                continue

            clean_value = self.error_clean_val[cell]
            corrected_value = self.error_corrected_val[cell]
            if corrected_value == clean_value:
                true_positives += 1.0
                continue

            #fuzzy scoring is limited to string attributes (minus numeric ones if requested)
            attribute = cell[1]
            if attribute not in self.str_attr:
                continue
            if num_metric and attribute in self.numer_attr:
                continue

            dirty_value = self.error_dirty_val[cell]

            #similarities of the dirty value to the clean value ...
            cd_semantic = self.get_semantic_score(clean_value, dirty_value)
            cd_string = self.get_string_score_avg(clean_value, dirty_value)
            #... and of the corrected value to the clean value
            cc_semantic = self.get_semantic_score(clean_value, corrected_value)
            cc_string = self.get_string_score_avg(clean_value, corrected_value)

            combined_score = self.get_combined_score(cd_semantic, cd_string, cc_semantic, cc_string)

            true_positives += combined_score
            if combined_score >= 0:
                right_partial += 1
            else:
                false_partial += 1

        precision = 0.0
        if output_size != 0:
            precision = true_positives / output_size

        recall = 0.0
        if len(self.error_clean_val) != 0:
            recall = true_positives / len(self.error_clean_val)

        f1 = 0.0
        if (precision + recall) != 0.0:
            f1 = (2 * precision * recall) / (precision + recall)

        return {"Combined Precision": round(precision, 3), "Combined Recall": round(recall, 3), "Combined F1": round(f1, 3), "PC R": right_partial, "PC F": false_partial}
                        
                        
    def get_string_score_avg(self, s1, s2):
        """
        Average three string measures of s1 and s2.

        The result is the mean of the character Levenshtein value divided by the
        length of the longer string, the Monge-Elkan value and the Jaro-Winkler
        value.
        """
        #NOTE(review): if levenshteinDistanceChar already returns a length-normalised
        #similarity, dividing by the length again shrinks this term - TODO confirm
        levenshtein_part = self.levenshteinDistanceChar(s1, s2) / max(len(s1), len(s2))
        monge_elkan_part = self.monge_elkan(s1, s2)
        jaro_winkler_part = self.jaro_winkler_distance(s1, s2)

        return (levenshtein_part + monge_elkan_part + jaro_winkler_part) / 3
    
    def get_combined_score(self, cd_semantic, cd_string, cc_semantic, cc_string, threshold=0.7):
        """
        Combine the string and the semantic improvement of a repair into one score.

        The improvement of a dimension (string or semantic) is its clean/corrected
        score minus its clean/dirty score. Every score is classified as high
        (>= threshold) or low (< threshold):

        - only the semantic score changes its class between dirty and corrected
          value -> the semantic improvement is returned
        - only the string score changes its class -> the string improvement is
          returned
        - both or none of them change their class -> the average of both
          improvements is returned

        Parameters:
            cd_semantic: semantic score of clean and dirty value
            cd_string: string score of clean and dirty value
            cc_semantic: semantic score of clean and corrected value
            cc_string: string score of clean and corrected value
            threshold: boundary between low and high scores (default 0.7)

        Returns:
            float, negative if the repair moved the value away from the clean value
        """
        string_score = cc_string - cd_string
        semantic_score = cc_semantic - cd_semantic
        avg_score = (string_score + semantic_score) / 2

        #the dirty value has to be classified by its own scores (cd_*); comparing
        #cc_string for the dirty value selected the wrong case
        semantic_changed = (cd_semantic >= threshold) != (cc_semantic >= threshold)
        string_changed = (cd_string >= threshold) != (cc_string >= threshold)

        if semantic_changed and not string_changed:
            return semantic_score

        if string_changed and not semantic_changed:
            return string_score

        return avg_score
        
        
        




In [2]:
# Run 1 of 5: evaluate the first Baran repair of the hospital dataset.
# The *_attr lists classify the attributes (presumably by column index - TODO confirm).
metric_dict_hospital = dict(
    clean_data_path="../datasets/hospital/clean.csv",
    dirty_data_path="../datasets/hospital/dirty.csv",
    corrected_data_path="../datasets/hospital/baran_repaired1.csv",
    str_attr=[2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 16, 19],
    short_str_attr=[3, 6, 7, 9, 13, 17, 18, 19],
    long_str_attr=[2, 11, 12, 14, 15, 16],
    numer_attr=[17, 18],
)

m_hospital1 = Metrics(metric_dict_hospital)
m_hospital1.print_metrics()


['calhoun', 'calhoxn', 'madison', -0.2857142857142857]
['calhoun', 'caxhoun', 'madison', -0.2857142857142857]
['houston', 'housxon', 'madison', -0.2857142857142857]
['coffee', 'coffxx', 'clarke', -0.2222222222222222]
['elmore', 'elmoxe', 'clarke', -0.22222222222222232]
['elmore', 'elxore', 'clarke', -0.22222222222222232]
['fayette', 'fayexxe', 'madison', -0.3809523809523811]
['etowah', 'etowxh', 'clarke', -0.44444444444444453]
['butler', 'butxer', 'clarke', -0.3333333333333335]
['coffee', 'cxffee', 'clarke', -0.3333333333333335]
['scip-vte-1', 'sxip-vte-1', 'sxip-vtf-1', -0.06666666666666654]
['scip-vte-1', 'scip-vtx-1', 'scip-vtf-1', 0.0]
['calhoun', 'calhoxn', 'madison', 0.8571428571428572, 0.2857142857142857, -0.5714285714285715]
['calhoun', 'caxhoun', 'madison', 0.8571428571428572, 0.2857142857142857, -0.5714285714285715]
['houston', 'housxon', 'madison', 0.8571428571428572, 0.2857142857142857, -0.5714285714285715]
['coffee', 'coffxx', 'clarke', 0.6666666666666667, 0.33333333333333

In [3]:
# Run 2 of 5: evaluate the second Baran repair of the hospital dataset.
# The *_attr lists classify the attributes (presumably by column index - TODO confirm).
metric_dict_hospital = dict(
    clean_data_path="../datasets/hospital/clean.csv",
    dirty_data_path="../datasets/hospital/dirty.csv",
    corrected_data_path="../datasets/hospital/baran_repaired2.csv",
    str_attr=[2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 16, 19],
    short_str_attr=[3, 6, 7, 9, 13, 17, 18, 19],
    long_str_attr=[2, 11, 12, 14, 15, 16],
    numer_attr=[17, 18],
)

m_hospital2 = Metrics(metric_dict_hospital)
m_hospital2.print_metrics()


['no', 'xo', 'yes', -0.6666666666666666]
['no', 'xo', 'yes', -0.6666666666666666]
['no', 'xo', 'yes', 0.5, 0.0, -0.5]
['no', 'xo', 'yes', 0.5, 0.0, -0.5]
['no', 'xo', 'yes', -0.6666666666666666]
['no', 'xo', 'yes', -0.6666666666666666]
['no', 'xo', 'yes', 0.5, 0.0, -0.5]
['no', 'xo', 'yes', 0.5, 0.0, -0.5]
{'Combined Precision': 0.871, 'Combined Recall': 0.502, 'Combined F1': 0.637, 'PC R': 2.0, 'PC F': 0.0}

{'Precision': 0.87, 'Recall': 0.501, 'F1': 0.636, 'Amount of fixed data errors': 293.0}
{'Fuzzy JW Precision': 0.866, 'Fuzzy JW Recall': 0.498, 'Fuzzy JW F1': 0.633, 'PC R': 0.0, 'PC F': 2.0}
{'Fuzzy ME Precision': 0.87, 'Fuzzy ME Recall': 0.501, 'Fuzzy ME F1': 0.636, 'PC R': 2.0, 'PC F': 0.0}
{'Fuzzy LD Char Precision': 0.867, 'Fuzzy LD Char Recall': 0.499, 'Fuzzy LD Char F1': 0.633, 'PC R': 0.0, 'PC F': 2.0}
{'Fuzzy Semantic Sentences Precision': 0.874, 'Fuzzy Semantic Sentences Recall': 0.503, 'Fuzzy Semantic Sentences F1': 0.638, 'PC R': 2.0, 'PC F': 0.0}

{'Fuzzy JW Num Preci

In [4]:
# Run 3 of 5: evaluate the third Baran repair of the hospital dataset.
# The *_attr lists classify the attributes (presumably by column index - TODO confirm).
metric_dict_hospital = dict(
    clean_data_path="../datasets/hospital/clean.csv",
    dirty_data_path="../datasets/hospital/dirty.csv",
    corrected_data_path="../datasets/hospital/baran_repaired3.csv",
    str_attr=[2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 16, 19],
    short_str_attr=[3, 6, 7, 9, 13, 17, 18, 19],
    long_str_attr=[2, 11, 12, 14, 15, 16],
    numer_attr=[17, 18],
)

m_hospital3 = Metrics(metric_dict_hospital)
m_hospital3.print_metrics()


['wedowee', 'wxdowxx', 'birmingham', -0.7142857142857143]
['wedowee', 'wxdowxx', 'birmingham', 0.5714285714285714, 0.0, -0.5714285714285714]
['wedowee', 'wxdowxx', 'birmingham', -0.7142857142857143]
['wedowee', 'wxdowxx', 'birmingham', 0.5714285714285714, 0.0, -0.5714285714285714]
{'Combined Precision': 0.932, 'Combined Recall': 0.456, 'Combined F1': 0.612, 'PC R': 1.0, 'PC F': 0.0}

{'Precision': 0.932, 'Recall': 0.456, 'F1': 0.612, 'Amount of fixed data errors': 249.0}
{'Fuzzy JW Precision': 0.929, 'Fuzzy JW Recall': 0.454, 'Fuzzy JW F1': 0.61, 'PC R': 0.0, 'PC F': 1.0}
{'Fuzzy ME Precision': 0.932, 'Fuzzy ME Recall': 0.456, 'Fuzzy ME F1': 0.612, 'PC R': 1.0, 'PC F': 0.0}
{'Fuzzy LD Char Precision': 0.929, 'Fuzzy LD Char Recall': 0.455, 'Fuzzy LD Char F1': 0.611, 'PC R': 0.0, 'PC F': 1.0}
{'Fuzzy Semantic Sentences Precision': 0.933, 'Fuzzy Semantic Sentences Recall': 0.456, 'Fuzzy Semantic Sentences F1': 0.613, 'PC R': 1.0, 'PC F': 0.0}

{'Fuzzy JW Num Precision': 0.935, 'Fuzzy JW N

In [5]:
# Run 4 of 5: evaluate the fourth Baran repair of the hospital dataset.
# The *_attr lists classify the attributes (presumably by column index - TODO confirm).
metric_dict_hospital = dict(
    clean_data_path="../datasets/hospital/clean.csv",
    dirty_data_path="../datasets/hospital/dirty.csv",
    corrected_data_path="../datasets/hospital/baran_repaired4.csv",
    str_attr=[2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 16, 19],
    short_str_attr=[3, 6, 7, 9, 13, 17, 18, 19],
    long_str_attr=[2, 11, 12, 14, 15, 16],
    numer_attr=[17, 18],
)

m_hospital4 = Metrics(metric_dict_hospital)
m_hospital4.print_metrics()


['2505 u s highway 431 north', '2505xuxsxhighwayx431xnorth', '8000 alabama highway 69', -0.26831863788385535]
['702 n main st', '702xnxmainxst', '849 south three notch street', -0.39865689865689863]
['702 n main st', '702 x maix st', '849 south three notch street', -0.44993894993894995]
['4370 west main street', '4370xwestxmainxstreet', '1108 ross clark circle', -0.3611832611832613]
['1530 u s highway 43', '1530xuxsxhighwayx43', '1256 military street south', -0.36414754835807467]
['1256 military street south', '1256 military street sxuth', '1530 u s highway 43', -0.47885739991003157]
['2505 u s highway 431 north', '2505xuxsxhighwayx431xnorth', '8000 alabama highway 69', 0.8076923076923077, 0.2692307692307693, -0.5384615384615384]
['702 n main st', '702xnxmainxst', '849 south three notch street', 0.7692307692307692, 0.2142857142857143, -0.5549450549450549]
['702 n main st', '702 x maix st', '849 south three notch street', 0.8461538461538461, 0.2142857142857143, -0.6318681318681318]
['43

In [6]:
# Run 5 of 5: evaluate the fifth Baran repair of the hospital dataset.
# The *_attr lists classify the attributes (presumably by column index - TODO confirm).
metric_dict_hospital = dict(
    clean_data_path="../datasets/hospital/clean.csv",
    dirty_data_path="../datasets/hospital/dirty.csv",
    corrected_data_path="../datasets/hospital/baran_repaired5.csv",
    str_attr=[2, 3, 6, 7, 9, 11, 12, 13, 14, 15, 16, 19],
    short_str_attr=[3, 6, 7, 9, 13, 17, 18, 19],
    long_str_attr=[2, 11, 12, 14, 15, 16],
    numer_attr=[17, 18],
)

m_hospital5 = Metrics(metric_dict_hospital)
m_hospital5.print_metrics()


{'Combined Precision': 0.914, 'Combined Recall': 0.462, 'Combined F1': 0.614, 'PC R': 0.0, 'PC F': 0.0}

{'Precision': 0.914, 'Recall': 0.462, 'F1': 0.614, 'Amount of fixed data errors': 257.0}
{'Fuzzy JW Precision': 0.914, 'Fuzzy JW Recall': 0.462, 'Fuzzy JW F1': 0.614, 'PC R': 0.0, 'PC F': 0.0}
{'Fuzzy ME Precision': 0.914, 'Fuzzy ME Recall': 0.462, 'Fuzzy ME F1': 0.614, 'PC R': 0.0, 'PC F': 0.0}
{'Fuzzy LD Char Precision': 0.914, 'Fuzzy LD Char Recall': 0.462, 'Fuzzy LD Char F1': 0.614, 'PC R': 0.0, 'PC F': 0.0}
{'Fuzzy Semantic Sentences Precision': 0.914, 'Fuzzy Semantic Sentences Recall': 0.462, 'Fuzzy Semantic Sentences F1': 0.614, 'PC R': 0.0, 'PC F': 0.0}

{'Fuzzy JW Num Precision': 0.938, 'Fuzzy JW Num Recall': 0.473, 'Fuzzy JW Num F1': 0.629, 'PC R': 0.0, 'PC F': 0.0}
{'Fuzzy ME Num Precision': 0.938, 'Fuzzy ME Num Recall': 0.473, 'Fuzzy ME Num F1': 0.629, 'PC R': 0.0, 'PC F': 0.0}
{'Fuzzy LD Char Num Precision': 0.938, 'Fuzzy LD Char Num Recall': 0.473, 'Fuzzy LD Char Num F

In [7]:
def calc_avg(m_dict_list, n_metrics=3):
    """
    Print the metric dictionary of the first run with its leading entries averaged.

    The first n_metrics entries (by position) are replaced by their mean over all
    runs, rounded to 3 digits. All remaining entries keep the value of the first
    run. Dictionaries with at most one entry are printed unchanged.

    Parameters:
        m_dict_list: list of metric dictionaries, one per run, with the same
            entries in the same order
        n_metrics: amount of leading entries to average (default 3)

    Raises:
        IndexError: if m_dict_list is empty or a later dictionary has fewer
            entries than the averaged positions require
    """
    first_run = m_dict_list[0]
    result = first_run.copy()

    #convert every run only once instead of once per averaged value
    metric_names = list(first_run.keys())
    run_values = [list(m_dict.values()) for m_dict in m_dict_list]

    if len(first_run) > 1:
        #the position selects the metric, not the run, so any amount of runs works
        for position in range(min(n_metrics, len(metric_names))):
            total = 0
            for values in run_values:
                total += values[position]
            result[metric_names[position]] = round(total / len(m_dict_list), 3)

    print(result)
    #return result
    
# Average every reported metric over the five repair runs of the hospital dataset.
hospital_runs = [m_hospital1, m_hospital2, m_hospital3, m_hospital4, m_hospital5]

# Metrics without numeric true positives.
# Currently disabled: fuzzy_metric, fuzzy_alt_metric, fuzzy_ld, fuzzy_ld_words
plain_metric_names = [
    "standard_metric",
    "fuzzy_jw",
    "fuzzy_me",
    "fuzzy_ld_char",
    "fuzzy_semantics_sentences",
]

# Metrics including numeric true positives.
# Currently disabled: fuzzy_num_metric, fuzzy_alt_num_metric, fuzzy_ld_num, fuzzy_ld_words_num
num_metric_names = [
    "fuzzy_jw_num",
    "fuzzy_me_num",
    "fuzzy_ld_char_num",
    "fuzzy_semantics_sentences_num",
]

print("Average of 5 runs")
for metric_name in plain_metric_names:
    calc_avg([getattr(run, metric_name) for run in hospital_runs])

print("")

for metric_name in num_metric_names:
    calc_avg([getattr(run, metric_name) for run in hospital_runs])

Average of 5 runs
{'Precision': 0.89, 'Recall': 0.472, 'F1': 0.616, 'Amount of fixed data errors': 296.0}
{'Fuzzy JW Precision': 0.885, 'Fuzzy JW Recall': 0.469, 'Fuzzy JW F1': 0.613, 'PC R': 1.0, 'PC F': 11.0}
{'Fuzzy ME Precision': 0.891, 'Fuzzy ME Recall': 0.473, 'Fuzzy ME F1': 0.617, 'PC R': 11.0, 'PC F': 1.0}
{'Fuzzy LD Char Precision': 0.882, 'Fuzzy LD Char Recall': 0.468, 'Fuzzy LD Char F1': 0.611, 'PC R': 1.0, 'PC F': 11.0}
{'Fuzzy Semantic Sentences Precision': 0.889, 'Fuzzy Semantic Sentences Recall': 0.472, 'Fuzzy Semantic Sentences F1': 0.615, 'PC R': 5.0, 'PC F': 7.0}

{'Fuzzy JW Num Precision': 0.901, 'Fuzzy JW Num Recall': 0.478, 'Fuzzy JW Num F1': 0.624, 'PC R': 1.0, 'PC F': 11.0}
{'Fuzzy ME Num Precision': 0.907, 'Fuzzy ME Num Recall': 0.481, 'Fuzzy ME Num F1': 0.628, 'PC R': 11.0, 'PC F': 1.0}
{'Fuzzy LD Char Num Precision': 0.898, 'Fuzzy LD Char Num Recall': 0.477, 'Fuzzy LD Char Num F1': 0.622, 'PC R': 1.0, 'PC F': 11.0}
{'Fuzzy Semantic Sentences Num Precision': 0.

In [8]:
# Average the combined string/semantic metric over the five repair runs.
calc_avg([
    run.combined_metric
    for run in (m_hospital1, m_hospital2, m_hospital3, m_hospital4, m_hospital5)
])

{'Combined Precision': 0.888, 'Combined Recall': 0.472, 'Combined F1': 0.615, 'PC R': 5.0, 'PC F': 7.0}
