In [9]:
import pandas
import html
import re
import numpy
import sys

from sentence_transformers import SentenceTransformer, util
from math import floor, ceil
#from gensim.models import KeyedVectors
#from gensim.downloader import load

#model = load('word2vec-google-news-300')

class Metrics:
    def __init__(self, metric_dict):
        #save paths to clean, dirty and corrected dataset
        self.clean_path = metric_dict["clean_data_path"]
        self.dirty_path = metric_dict["dirty_data_path"]
        #self.corrected_path = metric_dict["corrected_data_path"]
        
        #read csv-files of clean, dirty and corrected dataset
        self.clean_data = self.read_csv_dataset(metric_dict["clean_data_path"])
        self.dirty_data = self.read_csv_dataset(metric_dict["dirty_data_path"])
        self.corrected_data = self.read_csv_dataset(metric_dict["corrected_data_path"])
        
        #create dictionaries for errorneous cells
        #save clean, dirty and corrected values of erroneous cells seperately
        self.error_clean_val = self.get_dataframes_difference(self.dirty_data, self.clean_data) #clean values
        self.error_dirty_val = self.get_dataframes_difference(self.clean_data, self.dirty_data) #dirty values
        self.error_corrected_val = self.get_error_corrected_val() #corrected values
        
        #save attribute classification
        self.str_attr = metric_dict["str_attr"]
        self.short_str_attr = metric_dict["short_str_attr"]
        self.long_str_attr = metric_dict["long_str_attr"]
        self.numer_attr = metric_dict["numer_attr"]
        
        #calculate "normal" metrics precision, recall, f1
        self.standard_metric = self.get_data_cleaning_evaluation()
        
        #numeric TP
        self.numer_tp = self.get_numer_tp()
        
        #calculate and save fuzzy metrics
        #self.fuzzy_alt_metric = self.get_data_cleaning_evaluation_fuzzy_alt()
        self.fuzzy_jw = self.get_data_cleaning_evaluation_fuzzy_JW()
        self.fuzzy_me = self.get_data_cleaning_evaluation_fuzzy_ME()
        #self.fuzzy_ld_words = self.get_data_cleaning_evaluation_fuzzy_LD_Words() 
        self.fuzzy_ld_char = self.get_data_cleaning_evaluation_fuzzy_LD_Char() 
        #self.fuzzy_ld = self.get_data_cleaning_evaluation_fuzzy_LD() if self.short_str_attr or self.long_str_attr else {"LD Message": "short or long string attributes not declared"}
        self.fuzzy_semantics_sentences = self.get_data_cleaning_evaluation_fuzzy_semantic_sentences()
        
        #fuzzy metrics combined with outlier metric
        if self.numer_attr:
            #self.fuzzy_alt_num_metric = self.get_data_cleaning_evaluation_fuzzy_alt(True)
            self.fuzzy_jw_num = self.get_data_cleaning_evaluation_fuzzy_JW(True)
            self.fuzzy_me_num = self.get_data_cleaning_evaluation_fuzzy_ME(True)
            #self.fuzzy_ld_words_num = self.get_data_cleaning_evaluation_fuzzy_LD_Words(True)
            self.fuzzy_ld_char_num = self.get_data_cleaning_evaluation_fuzzy_LD_Char(True)
            #self.fuzzy_ld_num = self.get_data_cleaning_evaluation_fuzzy_LD(True) if self.short_str_attr or self.long_str_attr else {"LD Num Message": "short or long string attributes not declared"}
            self.fuzzy_semantics_sentences_num = self.get_data_cleaning_evaluation_fuzzy_semantic_sentences(True)
            
        #average metrics
        self.avg_string_metric = self.get_string_metric_avg()
        self.avg_string_semantic_metric = self.get_string_semantic_metric_avg()
        
        #combined evaluation
        self.combined_metric = self.get_combined_score_evaluation()

        
    def print_metrics(self):
        print(self.combined_metric)
        print("")
        print(self.standard_metric)
        #print(self.fuzzy_alt_metric)
        print(self.fuzzy_jw)
        print(self.fuzzy_me)
        #print(self.fuzzy_ld_words)
        print(self.fuzzy_ld_char)
        #print(self.fuzzy_ld)
        #print(self.fuzzy_semantics_words)
        print(self.fuzzy_semantics_sentences)
        print("")
        
        if self.numer_attr:
            #print(self.fuzzy_alt_num_metric)
            print(self.fuzzy_jw_num)
            print(self.fuzzy_me_num)
            #print(self.fuzzy_ld_words_num)
            print(self.fuzzy_ld_char_num)
            #print(self.fuzzy_ld_num)
            #print(self.fuzzy_semantics_words_num)
            print(self.fuzzy_semantics_sentences_num)
            
        else:
            print({"Num Metrics": "numeric attributes not declared"})

        
    def read_csv_dataset(self, dataset_path):
        """
        This method reads a dataset from a csv file path.
        """
        dataframe = pandas.read_csv(dataset_path, sep=",", header="infer", encoding="utf-8", dtype=str,
                                    keep_default_na=False, low_memory=False).applymap(self.value_normalizer)
        return dataframe
    
    @staticmethod
    def value_normalizer(value):
        """
        This method takes a value and minimally normalizes it.
        """
        value = html.unescape(value)
        value = re.sub("[\t\n ]+", " ", value, re.UNICODE)
        value = value.strip("\t\n ")
        return value
    
    def get_string_metric_avg(self):
        if True:
            ld_words_char_avg_p = list(self.fuzzy_ld_char.values())[0]
            ld_words_char_avg_r = list(self.fuzzy_ld_char.values())[1]
            ld_words_char_avg_f1 = list(self.fuzzy_ld_char.values())[2]
            avg_precision = (list(self.fuzzy_jw.values()))[0] + (list(self.fuzzy_me.values()))[0] +  ld_words_char_avg_p / 3
            avg_recall = (list(self.fuzzy_jw.values()))[1] + (list(self.fuzzy_me.values()))[1] +  ld_words_char_avg_r / 3
            avg_f1 = (list(self.fuzzy_jw.values()))[2] + (list(self.fuzzy_me.values()))[2] +  ld_words_char_avg_f1 / 3
        
        else:
            avg_precision = (list(self.fuzzy_jw.values())[0] + list(self.fuzzy_me.values())[0] + list(self.fuzzy_ld.values())[0])/3
            avg_recall = (list(self.fuzzy_jw.values())[1] + list(self.fuzzy_me.values())[1] + list(self.fuzzy_ld.values())[1])/3
            avg_f1 = (list(self.fuzzy_jw.values())[2] + list(self.fuzzy_me.values())[2] + list(self.fuzzy_ld.values())[0])/3

        return {"Average of String Metrics Precision": round(avg_precision,3), "Average of String Metrics Recall": round(avg_recall,3), "Average of String Metrics F1": round(avg_f1,3)}
        
    def get_string_semantic_metric_avg(self):
        avg_precision = (list(self.avg_string_metric.values())[0] + list(self.fuzzy_semantics_sentences.values())[0]) / 2
        avg_recall = (list(self.avg_string_metric.values())[1] + list(self.fuzzy_semantics_sentences.values())[1]) / 2
        avg_f1 = (list(self.avg_string_metric.values())[2] + list(self.fuzzy_semantics_sentences.values())[2]) / 2
        
        return {"Average of String and Semantics Metrics Precision": round(avg_precision,3), "Average of String and Semantics Metrics Recall": round(avg_recall,3), "Average of String and Semantics Metrics F1": round(avg_f1,3)}
        
        
    
    def get_dataframes_difference(self, dataframe_1, dataframe_2):
        """
        This method compares two dataframes and returns the different cells.
        """
        if dataframe_1.shape != dataframe_2.shape:
            sys.stderr.write("Two compared datasets do not have equal sizes!\n")
        difference_dictionary = {}
        difference_dataframe = dataframe_1.where(dataframe_1.values != dataframe_2.values).notna()
        for j in range(dataframe_1.shape[1]):
            for i in difference_dataframe.index[difference_dataframe.iloc[:, j]].tolist():
                difference_dictionary[(i, j)] = dataframe_2.iloc[i, j]
        return difference_dictionary
    
    def get_error_corrected_val(self):
        correction_dict = self.get_dataframes_difference(self.dirty_data, self.corrected_data)
        for key in list(correction_dict):
            if key not in self.error_clean_val:
                del correction_dict[key]
    
        return correction_dict
    
  
    def jaro_winkler_distance(self, s1, s2):
        """
        Compute Jaro-Winkler distance between two strings.
        """
        # If the s are equal
        if (s1 == s2):
            return 1.0

        # Length of two s
        len1 = len(s1)
        len2 = len(s2)

        # Maximum distance upto which matching
        # is allowed
        max_dist = floor(max(len1, len2) / 2) - 1

        # Count of matches
        match = 0

        # Hash for matches
        hash_s1 = [0] * len(s1)
        hash_s2 = [0] * len(s2)

        # Traverse through the first
        for i in range(len1):

            # Check if there is any matches
            for j in range(max(0, i - max_dist), 
                           min(len2, i + max_dist + 1)):

                # If there is a match
                if (s1[i] == s2[j] and hash_s2[j] == 0):
                    hash_s1[i] = 1
                    hash_s2[j] = 1
                    match += 1
                    break

        # If there is no match
        if (match == 0):
            return 0.0

        # Number of transpositions
        t = 0
        point = 0

        # Count number of occurrences
        # where two characters match but
        # there is a third matched character
        # in between the indices
        for i in range(len1):
            if (hash_s1[i]):

                # Find the next matched character
                # in second
                while (hash_s2[point] == 0):
                    point += 1

                if (s1[i] != s2[point]):
                    t += 1
                point += 1
        t = t//2

        # Return the Jaro Similarity
        return (match/ len1 + match / len2 +
                (match - t) / match)/ 3.0
    

    def jaro_winkler_distance_fuzzy(self, clean, dirty, corrected):
        jw_clean_dirty = self.jaro_winkler_distance(clean, dirty)
        jw_clean_corrected = self.jaro_winkler_distance(clean, corrected)


        return jw_clean_corrected - jw_clean_dirty
    
    
    def get_data_cleaning_evaluation_fuzzy_JW(self, num_metric=False):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ec_tp = 0.0
        output_size = 0.0
        
        for cell in self.error_corrected_val:
            output_size += 1
            if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                ec_tp += 1.0
            elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                metric_score = self.jaro_winkler_distance_fuzzy(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                ec_tp += metric_score
                print([self.error_clean_val[cell] ,  self.error_dirty_val[cell],  self.error_corrected_val[cell], metric_score])
                if metric_score >= 0:
                    pc_r += 1
                else:
                    pc_f += 1 
        
        
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy JW Num Precision": round(ec_p, 3),"Fuzzy JW Num Recall": round(ec_r, 3), "Fuzzy JW Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy JW Precision": round(ec_p, 3),"Fuzzy JW Recall": round(ec_r, 3), "Fuzzy JW F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}

    def get_data_cleaning_evaluation(self):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                ec_tp += 1.0
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        return {"Precision": round(ec_p, 3),"Recall": round(ec_r, 3), "F1": round(ec_f, 3), "Amount of fixed data errors": output_size}
    
    def get_single_outlier_score(self, clean, dirty, corrected):
        score = 1 - (abs(int(clean)-int(corrected)) / abs(int(clean) - int(dirty)))
    
        if score >= 0:
            return score

        else:
            return -1
        
    def get_fuzzy_score_outlier(self, clean, dirty, corrected):
        clean = re.findall(r'\d+', clean)
        dirty = re.findall(r'\d+', dirty)
        corrected = re.findall(r'\d+', corrected)

        if len(clean) != len(dirty) or len(dirty) != len(corrected) or len(corrected) != len(clean):
            return 0

        count = 0

        for (o,d,c) in  zip(clean, dirty, corrected):
            count += self.get_single_outlier_score(o, d, c)

        return count/len(clean)
    
    def get_numer_tp(self):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """

        ec_tp = 0.0
        for cell in self.error_corrected_val:
            if cell in self.error_clean_val:
                if cell[1] in self.numer_attr and self.error_corrected_val[cell] != self.error_clean_val[cell]:
                    ec_tp += self.get_fuzzy_score_outlier(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    
        return ec_tp 
    
    
    def get_data_cleaning_evaluation_fuzzy_alt(self, num_metric=False):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """

        ed_tp = 0.0
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                ed_tp += 1.0
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif cell[1] in self.numer_attr and num_metric:
                    ec_tp += self.get_fuzzy_score_outlier(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                elif cell[1] in self.str_attr:
                    ec_tp += self.get_fuzzy_score_string_alt(self.error_corrected_val[cell], self.error_clean_val[cell], self.error_corrected_val[cell])
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        
        if num_metric:
            return {"Fuzzy Alt Num Precision": round(ec_p, 3),"Fuzzy Alt Num Recall": round(ec_r, 3), "Fuzzy Alt Num F1": round(ec_f, 3)}
        else:
            return {"Fuzzy Alt Precision": round(ec_p, 3),"Fuzzy Alt Recall": round(ec_r, 3), "Fuzzy Alt F1": round(ec_f, 3)}
    
    def get_fuzzy_score_string_alt(self, clean, dirty, corrected ):
        if len(clean) != len(corrected) or len(dirty) != len(clean):
            return 0

        count_w = 0.0
        count_r = 0.0
        for o, c, d in zip(clean, corrected, dirty):
            if o != d:
                count_w += 1
                if o == c:
                    count_r += 1
            else:
                if c != d:
                    count_r -= 1

        if count_r <= 0:
            return 0

        return count_r / count_w
    
    def get_data_cleaning_evaluation_fuzzy_ME(self, num_metric=False):
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    metric_score = self.monge_elkan_distance_fuzzy([self.error_corrected_val[cell]], [self.error_clean_val[cell]], [self.error_dirty_val[cell]])
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
    
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy ME Num Precision": round(ec_p, 3),"Fuzzy ME Num Recall": round(ec_r, 3), "Fuzzy ME Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy ME Precision": round(ec_p, 3),"Fuzzy ME Recall": round(ec_r, 3), "Fuzzy ME F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        
        
        
    
    def monge_elkan(self, bag1, bag2):
        """
        Compute Monge-Elkan similarity measure between two bags (lists).

        The Monge-Elkan similarity measure is a type of Hybrid similarity measure that combine the benefits of
        sequence-based and set-based methods. This can be effective for domains in which more control is needed
        over the similarity measure. It implicitly uses a secondary similarity measure, such as levenshtein to compute
        over all similarity score.

        Args:
            bag1,bag2 (list): Input lists

            sim_func (function): Secondary similarity function. This is expected to be a sequence-based
                similarity measure (defaults to levenshtein)

        Returns:
            Monge-Elkan similarity score (float)

        Raises:
            TypeError : If the inputs are not lists or if one of the inputs is None


        Examples:
            >>> monge_elkan(['Niall'], ['Neal'])
            0.8049999999999999
            >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
            0.8677218614718616
            >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=needleman_wunsch)
            2.0
            >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=affine)
            2.25
            >>> monge_elkan([''], ['a'])
            0.0
            >>> monge_elkan(['Niall'], ['Nigel'])
            0.7866666666666667

        References:
            * Principles of Data Integration book
        """

        # if exact match return 1.0
        if bag1 == bag2:
            return 1.0
        # if one of the strings is empty return 0
        if (len(bag1) == 0) or (len(bag2) == 0):
            return 0
        # aggregated sum of all the max sim score of all the elements in bag1
        # with elements in bag2
        sum_of_maxes = 0
        for t1 in bag1:
            max_sim = float('-inf')
            for t2 in bag2:
                max_sim = max(max_sim, self.jaro_winkler_distance(t1, t2))
            sum_of_maxes += max_sim
        sim = float(sum_of_maxes) / float(len(bag1))
        return sim


    def monge_elkan_distance_fuzzy(self, clean, dirty, corrected):
        me_clean_dirty = self.monge_elkan(clean, dirty)
        me_clean_corrected = self.monge_elkan(clean, corrected)
        #print(monge_elkan(clean, dirty))
        #print(monge_elkan(clean, corrected))


        return me_clean_corrected - me_clean_dirty
        
    def get_data_cleaning_evaluation_fuzzy_LD(self, num_metric=False):
        
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ed_tp = 0.0
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                ed_tp += 1.0
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                    
                elif cell[1] in self.numer_attr and num_metric:
                    ec_tp += self.get_fuzzy_score_outlier(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    
                elif cell[1] in self.long_str_attr:
                    metric_score = self.fuzzy_LD_Words(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])       
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
                        
                elif cell[1] in self.short_str_attr:
                    metric_score = self.fuzzy_LD_Char(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
                        
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        
        if num_metric:
            return {"Fuzzy LD Num Precision": round(ec_p, 3),"Fuzzy LD Recall": round(ec_r, 3), "Fuzzy LD F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            return {"Fuzzy LD Precision": round(ec_p, 3),"Fuzzy LD Recall": round(ec_r, 3), "Fuzzy LD F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
    

    def get_data_cleaning_evaluation_fuzzy_LD_Char(self, num_metric=False):
        
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                ec_tp += 1.0
            elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                metric_score = self.fuzzy_LD_Char(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                ec_tp += metric_score
                #print([self.error_clean_val[cell] ,  self.error_dirty_val[cell],  self.error_corrected_val[cell], metric_score])
                if metric_score >= 0:
                    pc_r += 1
                else:
                    pc_f += 1
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Char Num Precision": round(ec_p, 3),"Fuzzy LD Char Num Recall": round(ec_r, 3), "Fuzzy LD Char Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else: 
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Char Precision": round(ec_p, 3),"Fuzzy LD Char Recall": round(ec_r, 3), "Fuzzy LD Char F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
    
    def get_data_cleaning_evaluation_fuzzy_LD_Words(self, num_metric=False):
        
        """
        This method evaluates data cleaning process using fuzzy metrics
        """
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ed_tp = 0.0
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                ed_tp += 1.0
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    metric_score = self.fuzzy_LD_Words(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])       
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
        
        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Words Num Precision": round(ec_p, 3),"Fuzzy LD Words Num Recall": round(ec_r, 3), "Fuzzy LD Words Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy LD Words Precision": round(ec_p, 3),"Fuzzy LD Words Recall": round(ec_r, 3), "Fuzzy LD Words F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}


    def fuzzy_LD_Char(self, clean, dirty, corrected):
        LD_clean_dirty =  self.levenshteinDistanceChar(clean, dirty)
        LD_clean_corrected = self.levenshteinDistanceChar(clean, corrected)
        print([clean, dirty, corrected, LD_clean_dirty, LD_clean_corrected,(LD_clean_corrected-LD_clean_dirty)])

        return (LD_clean_corrected-LD_clean_dirty)
    
    
    def fuzzy_LD_Words(self, clean, dirty, corrected):
        LD_clean_dirty =  self.levenshteinDistanceWords(clean, dirty)
        LD_clean_corrected = self.levenshteinDistanceWords(clean, corrected, (LD_clean_dirty - LD_clean_corrected) / max([len(clean), len(dirty)]))

       
        return (LD_clean_dirty - LD_clean_corrected) / max([len(clean), len(dirty)])
        
    def levenshteinDistanceWords(self, token1, token2):
        token1 = token1.split()
        token2 = token2.split()
        distances = numpy.zeros((len(token1) + 1, len(token2) + 1))

        for t1 in range(len(token1) + 1):
            distances[t1][0] = t1

        for t2 in range(len(token2) + 1):
            distances[0][t2] = t2

        a = 0
        b = 0
        c = 0

        for t1 in range(1, len(token1) + 1):
            for t2 in range(1, len(token2) + 1):
                if (token1[t1-1] == token2[t2-1]):
                    distances[t1][t2] = distances[t1 - 1][t2 - 1]
                else:
                    a = distances[t1][t2 - 1]
                    b = distances[t1 - 1][t2]
                    c = distances[t1 - 1][t2 - 1]

                    if (a <= b and a <= c):
                        distances[t1][t2] = a + 1
                    elif (b <= a and b <= c):
                        distances[t1][t2] = b + 1
                    else:
                        distances[t1][t2] = c + 1


        return distances[len(token1)][len(token2)]

    def levenshteinDistanceChar(self, token1, token2):
        distances = numpy.zeros((len(token1) + 1, len(token2) + 1))

        for t1 in range(len(token1) + 1):
            distances[t1][0] = t1

        for t2 in range(len(token2) + 1):
            distances[0][t2] = t2

        a = 0
        b = 0
        c = 0

        for t1 in range(1, len(token1) + 1):
            for t2 in range(1, len(token2) + 1):
                if (token1[t1-1] == token2[t2-1]):
                    distances[t1][t2] = distances[t1 - 1][t2 - 1]
                else:
                    a = distances[t1][t2 - 1]
                    b = distances[t1 - 1][t2]
                    c = distances[t1 - 1][t2 - 1]

                    if (a <= b and a <= c):
                        distances[t1][t2] = a + 1
                    elif (b <= a and b <= c):
                        distances[t1][t2] = b + 1
                    else:
                        distances[t1][t2] = c + 1


        return 1 - ((distances[len(token1)][len(token2)])/max([len(token1), len(token2)]))
    
    def get_fuzzy_score_outlier(self, clean, dirty, corrected):
        clean = re.findall(r'\d+', clean)
        dirty = re.findall(r'\d+', dirty)
        corrected = re.findall(r'\d+', corrected)

        if len(clean) != len(dirty) or len(dirty) != len(corrected) or len(corrected) != len(clean):
            return 0

        count = 0

        #for (o,d,c) in  zip(clean, dirty, corrected):
        #    if (int(o) > int(d) and int(d) > int(c)) or (int(o) <int(d) and int(d) < int(c)):
        #        return 0
        #    if abs(int(o) - int(c)) < abs(int(o) - int(d)):
        #        count += abs(int(o)-int(c)) / abs(int(o) - int(d))
        
        for (o,d,c) in  zip(clean, dirty, corrected):
            if abs(int(o) - int(d)) != 0:
                count += abs(int(o)-int(c)) / abs(int(o) - int(d))
            if abs(int(o) - int(c)) < abs(int(o) - int(d)):
                count += abs(int(o)-int(c)) / abs(int(o) - int(d))
       
        return count/len(clean)
    
    
    def get_fuzzy_score_semantic_sentence(self, clean, dirty, corrected):
        model = SentenceTransformer('all-MiniLM-L6-v2')

        embeddings1 = model.encode(clean, convert_to_tensor=True)
        embeddings2 = model.encode(dirty, convert_to_tensor=True)
        embeddings3 = model.encode(corrected, convert_to_tensor=True)

        cosine_scores_clean_dirty = util.cos_sim(embeddings1, embeddings2)
        cosine_scores_clean_corrected = util.cos_sim(embeddings1, embeddings3)


        return (cosine_scores_clean_corrected[0][0].item() - 0.2) / (0.8) - (cosine_scores_clean_dirty[0][0].item() - 0.2) / (0.8)
    
    def get_semantic_score(self, s1, s2):
        model = SentenceTransformer('all-MiniLM-L6-v2')

        embeddings1 = model.encode(s1, convert_to_tensor=True)
        embeddings2 = model.encode(s2, convert_to_tensor=True)
        score = util.cos_sim(embeddings1, embeddings2)
        
        return (score[0][0].item() - 0.2) / (0.8)
        
        


    def get_data_cleaning_evaluation_fuzzy_semantic_sentences(self, num_metric=False):

        """
        This method evaluates data cleaning process using fuzzy metrics
        """

        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    metric_score = self.get_fuzzy_score_semantic_sentence(self.error_clean_val[cell], self.error_dirty_val[cell],  self.error_corrected_val[cell])
                    ec_tp += metric_score
                    if metric_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1

        if num_metric:
            ec_tp += self.numer_tp
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy Semantic Sentences Num Precision": round(ec_p, 3),"Fuzzy Semantic Sentences Num Recall": round(ec_r, 3), "Fuzzy Semantic Sentences Num F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        else:    
            ec_p = 0.0 if output_size == 0 else ec_tp / output_size
            ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
            ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
            return {"Fuzzy Semantic Sentences Precision": round(ec_p, 3),"Fuzzy Semantic Sentences Recall": round(ec_r, 3), "Fuzzy Semantic Sentences F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
        
        
    def get_combined_score_evaluation(self, num_metric=False):
        pc_r = 0.0 #right partial corrections
        pc_f = 0.0 #false partial corrections
        ec_tp = 0.0
        output_size = 0.0
        for cell in self.error_corrected_val:
            output_size += 1
            if cell in self.error_clean_val:
                if self.error_corrected_val[cell] == self.error_clean_val[cell]:
                    ec_tp += 1.0
                elif (num_metric and cell[1] in self.str_attr and cell[1] not in self.numer_attr) or (not num_metric and cell[1] in self.str_attr):
                    
                    clean_dirty_semantic_score = self.get_semantic_score(self.error_clean_val[cell], self.error_dirty_val[cell])
                    clean_dirty_string_score = self.get_string_score_avg(self.error_clean_val[cell], self.error_dirty_val[cell])
                    clean_corrected_semantic_score = self.get_semantic_score(self.error_clean_val[cell], self.error_corrected_val[cell])
                    clean_corrected_string_score = self.get_string_score_avg(self.error_clean_val[cell], self.error_corrected_val[cell])
                    
                    combined_score = self.get_combined_score(clean_dirty_semantic_score, clean_dirty_string_score, clean_corrected_semantic_score, clean_corrected_string_score)
                    
                    ec_tp += combined_score
                    if combined_score >= 0:
                        pc_r += 1
                    else:
                        pc_f += 1
                        
        ec_p = 0.0 if output_size == 0 else ec_tp / output_size
        ec_r = 0.0 if len(self.error_clean_val) == 0 else ec_tp / len(self.error_clean_val)
        ec_f = 0.0 if (ec_p + ec_r) == 0.0 else (2 * ec_p * ec_r) / (ec_p + ec_r)
        return {"Combined Precision": round(ec_p, 3),"Combined Recall": round(ec_r, 3), "Combined F1": round(ec_f, 3), "PC R": pc_r, "PC F": pc_f}
                        
                        
    def get_string_score_avg(self, s1, s2):
        score_avg = ((self.levenshteinDistanceChar(s1, s2) / max([len(s1), len(s2)])) + self.monge_elkan(s1, s2) + self.jaro_winkler_distance(s1, s2)) / 3
        return score_avg
    
    def get_combined_score(self, cd_semantic, cd_string, cc_semantic, cc_string):
        
        threshold = 0.7
        
        string_score = cc_string - cd_string
        semantic_score = cc_semantic - cd_semantic
        avg_score = (string_score + semantic_score) / 2
        
        
        #semantic score high and string score high
        if cd_semantic >= threshold and cc_string >=threshold:
            
            if cc_semantic >= threshold and cc_string >=threshold:
                return avg_score
            
            elif cc_semantic >= threshold and cc_string < threshold:
                return string_score

            elif cc_semantic < threshold and cc_string >= threshold:
                return semantic_score

            elif cc_semantic < threshold and cc_string < threshold:
                return avg_score
        
        
        #semantic score high and string score low
        elif cd_semantic >= threshold and cd_string < threshold:
            
            if cc_semantic >= threshold and cc_string >=threshold:
                return string_score
            
            elif cc_semantic >= threshold and cc_string < threshold:
                return avg_score
                
            elif cc_semantic < threshold and cc_string >= threshold:
                return avg_score 

            elif cc_semantic < threshold and cc_string < threshold:
                return semantic_score
        
        
        
        
        #semantic score low and string score high
        elif cd_semantic < threshold and cd_string >= threshold:
            
            if cc_semantic >= threshold and cc_string >=threshold:
                return semantic_score
            
            elif cc_semantic >= threshold and cc_string < threshold:
                return avg_score 

            elif cc_semantic < threshold and cc_string >= threshold:
                return avg_score 

            elif cc_semantic < threshold and cc_string < threshold:
                return string_score
        
        
        #semantic score low and string score low
        elif cd_semantic < threshold and cd_string < threshold:
            
            if cc_semantic >= threshold and cc_string >=threshold:
                return avg_score
            
            elif cc_semantic >= threshold and cc_string < threshold:
                return semantic_score

            elif cc_semantic < threshold and cc_string >= threshold:
                return string_score

            elif cc_semantic < threshold and cc_string < threshold:
                return avg_score
            
        return avg_score
        
        
        

 
 

In [10]:
metric_dict_flights1 = {
    "clean_data_path": "../datasets/flights/clean.csv",
    "dirty_data_path": "../datasets/flights/dirty.csv",
    "corrected_data_path": "../datasets/flights/baran_repaired1.csv",
    "str_attr": [3, 4, 5, 6],
    "short_str_attr": [],
    "long_str_attr": [],
    "numer_attr": []
    
}

m_flights1 = Metrics(metric_dict_flights1)

m_flights1.print_metrics()

['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '12:00noon', '12:00 p.m.', 0.24814814814814812]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['7:14 p.m.', '', '7:41 p.m.', 0.9629629629629629]
['10:10 a.m.', '10:11 a.m.', '9:59 a.m.', -0.1777777777777777]
['6:08 a.m.', '6:09 a.m.', '5:58 a.m.', -0.07407407407407418]
['2:26 p.m.', '2:43 p.m.', '2:41 p.m.', 0.0]
['7:21 a.m.', '7:22 a.m.', '7:08 a.m.', -0.07407407407407418]
['9:14 a.m.', '', '9:58 a.m.', 0.8518518518518517]
['6:56 a.m.', '6:57 a.m.', '5:59 a.m.', -0.12169312169312174]
['7:05 a.m.', '7:30 a.m.', '7:07 a.m.', 0.0]
['8:51 a.m.', '8:52 a.m.', '8:31 a.m.', 0.0]
['1:32 p.m.', '1:50 p.m. (+8:00)', '1:50 p.m.', 0.12200435729847481]
['8:39 p.m.', '8:40 p.m.', '8:30 p.m.', 0.07407407407407418]
['8:52 a.m.', '', '9:05 a.m.', 0.8518518518518517]
['7:39 a.m.', '7:40 a.m.', '7:14 a.m.', 0.0]
['7:14 p.m.',

{'Combined Precision': 0.92, 'Combined Recall': 0.599, 'Combined F1': 0.725, 'PC R': 198.0, 'PC F': 171.0}

{'Precision': 0.885, 'Recall': 0.576, 'F1': 0.698, 'Amount of fixed data errors': 3204.0}
{'Fuzzy JW Precision': 0.916, 'Fuzzy JW Recall': 0.596, 'Fuzzy JW F1': 0.722, 'PC R': 275.0, 'PC F': 94.0}
{'Fuzzy ME Precision': 0.917, 'Fuzzy ME Recall': 0.597, 'Fuzzy ME F1': 0.723, 'PC R': 307.0, 'PC F': 62.0}
{'Fuzzy LD Char Precision': 0.917, 'Fuzzy LD Char Recall': 0.597, 'Fuzzy LD Char F1': 0.724, 'PC R': 285.0, 'PC F': 84.0}
{'Fuzzy Semantic Sentences Precision': 0.919, 'Fuzzy Semantic Sentences Recall': 0.598, 'Fuzzy Semantic Sentences F1': 0.725, 'PC R': 203.0, 'PC F': 166.0}

{'Num Metrics': 'numeric attributes not declared'}


In [11]:
metric_dict_flights2 = {
    "clean_data_path": "../datasets/flights/clean.csv",
    "dirty_data_path": "../datasets/flights/dirty.csv",
    "corrected_data_path": "../datasets/flights/baran_repaired2.csv",
    "str_attr": [3, 4, 5, 6],
    "short_str_attr": [],
    "long_str_attr": [],
    "numer_attr": []
    
}

m_flights2 = Metrics(metric_dict_flights2)

m_flights2.print_metrics()

['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['6:30 p.m.', '', '6:56 p.m.', 0.8518518518518517]
['9:00 p.m.', '', '9:16 p.m.', 0.8518518518518517]
['7:14 p.m.', '', '7:59 p.m.', 0.8518518518518517]
['8:42 a.m.', '9:00 a.m.', '8:51 a.m.', 0.07407407407407407]
['3:27 p.m.', '3:50 p.m.', '3:28 p.m.', 0.07407407407407418]
['7:29 a.m.', '', '7:56 a.m.', 0.8518518518518517]
['2:04 p.m.', '2:03 p.m.', '1:19 p.m.', -0.14814814814814825]
['4:28 p.m.', '4:44 p.m.', '3:58 p.m.', 0.0]
['4:25 p.m.', '4:28 p.m.', '4:30 p.m.', -0.07407407407407418]
['3:58 p.m.', '', '4:15 p.m.', 0.8518518518518517]
['8:42 a.m.', '9:01 a.m.', '8:51 a.m.', 0.07407407407407407]
['3:27 p.m.', '3:50 p.m.', '3:28 p.m.', 0.07407407407407418]
['12:12 p.m.', '12:13 p.m.', '3:36 p.m.', -0.1777777777777777]
['5:22 p.m.', '5:38 p.m.', '5:08 p.m.', 0.0]
['6:30 p.m.'

{'Combined Precision': 0.962, 'Combined Recall': 0.569, 'Combined F1': 0.715, 'PC R': 280.0, 'PC F': 40.0}

{'Precision': 0.89, 'Recall': 0.526, 'F1': 0.661, 'Amount of fixed data errors': 2909.0}
{'Fuzzy JW Precision': 0.95, 'Fuzzy JW Recall': 0.562, 'Fuzzy JW F1': 0.706, 'PC R': 299.0, 'PC F': 21.0}
{'Fuzzy ME Precision': 0.951, 'Fuzzy ME Recall': 0.562, 'Fuzzy ME F1': 0.707, 'PC R': 300.0, 'PC F': 20.0}
{'Fuzzy LD Char Precision': 0.951, 'Fuzzy LD Char Recall': 0.562, 'Fuzzy LD Char F1': 0.706, 'PC R': 301.0, 'PC F': 19.0}
{'Fuzzy Semantic Sentences Precision': 0.963, 'Fuzzy Semantic Sentences Recall': 0.569, 'Fuzzy Semantic Sentences F1': 0.715, 'PC R': 276.0, 'PC F': 44.0}

{'Num Metrics': 'numeric attributes not declared'}


In [12]:
metric_dict_flights3 = {
    "clean_data_path": "../datasets/flights/clean.csv",
    "dirty_data_path": "../datasets/flights/dirty.csv",
    "corrected_data_path": "../datasets/flights/baran_repaired3.csv",
    "str_attr": [3, 4, 5, 6],
    "short_str_attr": [],
    "long_str_attr": [],
    "numer_attr": []
    
}

m_flights3 = Metrics(metric_dict_flights3)

m_flights3.print_metrics()

['11:25 a.m.', '', '11/30 11:25 a.m.', 0.8083333333333332]
['2:30 p.m.', '', '2:46 p.m.', 0.8518518518518517]
['12:57 p.m.', '', '11/30 12:57 p.m.', 0.8083333333333332]
['1:55 p.m.', '', '11/30 1:55 p.m.', 0.8296296296296296]
['4:16 p.m.', '', '11/30 4:16 p.m.', 0.7925925925925926]
['11:55 p.m.', '', '11/30 11:55 p.m.', 0.8416666666666667]
['7:10 p.m.', '', '7:27 p.m.', 0.8518518518518517]
['1:55 p.m.', '', '11/30 1:55 p.m.', 0.8296296296296296]
['4:16 p.m.', '', '11/30 4:16 p.m.', 0.7925925925925926]
['9:05 a.m.', '', '9:05 a.m. (-00:00)', 0.8333333333333334]
['11:25 a.m.', '', '11/30 11:25 a.m.', 0.8083333333333332]
['11:25 a.m.', '', '11/30 11:25 a.m.', 0.8083333333333332]
['2:30 p.m.', '', '2:46 p.m.', 0.8518518518518517]
['12:57 p.m.', '', '11/30 12:57 p.m.', 0.8083333333333332]
['1:55 p.m.', '', '11/30 1:55 p.m.', 0.8296296296296296]
['4:16 p.m.', '', '11/30 4:16 p.m.', 0.7925925925925926]
['11:55 p.m.', '', '11/30 11:55 p.m.', 0.8416666666666667]
['7:10 p.m.', '', '7:27 p.m.', 0

{'Combined Precision': 0.943, 'Combined Recall': 0.561, 'Combined F1': 0.703, 'PC R': 337.0, 'PC F': 60.0}

{'Precision': 0.864, 'Recall': 0.514, 'F1': 0.645, 'Amount of fixed data errors': 2926.0}
{'Fuzzy JW Precision': 0.931, 'Fuzzy JW Recall': 0.554, 'Fuzzy JW F1': 0.694, 'PC R': 332.0, 'PC F': 65.0}
{'Fuzzy ME Precision': 0.934, 'Fuzzy ME Recall': 0.555, 'Fuzzy ME F1': 0.696, 'PC R': 358.0, 'PC F': 39.0}
{'Fuzzy LD Char Precision': 0.928, 'Fuzzy LD Char Recall': 0.552, 'Fuzzy LD Char F1': 0.692, 'PC R': 354.0, 'PC F': 43.0}
{'Fuzzy Semantic Sentences Precision': 0.944, 'Fuzzy Semantic Sentences Recall': 0.562, 'Fuzzy Semantic Sentences F1': 0.704, 'PC R': 333.0, 'PC F': 64.0}

{'Num Metrics': 'numeric attributes not declared'}


In [13]:
metric_dict_flights4 = {
    "clean_data_path": "../datasets/flights/clean.csv",
    "dirty_data_path": "../datasets/flights/dirty.csv",
    "corrected_data_path": "../datasets/flights/baran_repaired4.csv",
    "str_attr": [3, 4, 5, 6],
    "short_str_attr": [],
    "long_str_attr": [],
    "numer_attr": []
    
}

m_flights4 = Metrics(metric_dict_flights4)

m_flights4.print_metrics()

['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['12:00 a.m.', '', '12:00 p.m.', 0.9333333333333332]
['6:30 p.m.', '', '6:56 p.m.', 0.8518518518518517]
['8:25 p.m.', '', '8:44 p.m.', 0.8518518518518517]
['10:10 a.m.', '10:11 a.m.', '9:59 a.m.', -0.1777777777777777]
['9:14 a.m.', '', '9:57 a.m.', 0.8518518518518517]
['5:08 p.m.', '5:09 p.m.', '4:56 p.m.', -0.12169312169312174]
['7:05 a.m.', '7:30 a.m.', '7:07 a.m.', 0.0]
['1:48 p.m.', '', '2:01 p.m.', 0.8042328042328042]
['2:55 p.m.', '3:04 p.m.', '2:52 p.m.', 0.14814814814814825]
['8:52 a.m.', '', '9:05 a.m.', 0.8518518518518517]
['3:31 p.m.', '', '4:02 p.m.', 0.7777777777777777]
['3:58 p.m.', '', '4:14 p.m.', 0.7777777777777777]
['3:27 p.m.', '3:50 p.m.', '3:28 p.m.', 0.07407407407407418]
['10:10 a.m.', '10:11 a.m.', '9:59 a.m.', -0.1777777777777777]
['6:08 a.m.', '6:09 a.m.', '5:58 a.m.', -0.07407407407407418]
['7:21 a.m.', 

{'Combined Precision': 0.925, 'Combined Recall': 0.576, 'Combined F1': 0.71, 'PC R': 293.0, 'PC F': 117.0}

{'Precision': 0.866, 'Recall': 0.539, 'F1': 0.665, 'Amount of fixed data errors': 3064.0}
{'Fuzzy JW Precision': 0.916, 'Fuzzy JW Recall': 0.57, 'Fuzzy JW F1': 0.703, 'PC R': 323.0, 'PC F': 87.0}
{'Fuzzy ME Precision': 0.919, 'Fuzzy ME Recall': 0.572, 'Fuzzy ME F1': 0.705, 'PC R': 377.0, 'PC F': 33.0}
{'Fuzzy LD Char Precision': 0.916, 'Fuzzy LD Char Recall': 0.57, 'Fuzzy LD Char F1': 0.703, 'PC R': 326.0, 'PC F': 84.0}
{'Fuzzy Semantic Sentences Precision': 0.925, 'Fuzzy Semantic Sentences Recall': 0.576, 'Fuzzy Semantic Sentences F1': 0.71, 'PC R': 286.0, 'PC F': 124.0}

{'Num Metrics': 'numeric attributes not declared'}


In [14]:
m_flights5 = {
    "clean_data_path": "../datasets/flights/clean.csv",
    "dirty_data_path": "../datasets/flights/dirty.csv",
    "corrected_data_path": "../datasets/flights/baran_repaired5.csv",
    "str_attr": [3, 4, 5, 6],
    "short_str_attr": [],
    "long_str_attr": [],
    "numer_attr": []
    
}

m_flights5 = Metrics(m_flights5)

m_flights5.print_metrics()

['11:55 p.m.', '', '12:11 a.m.', 0.7523809523809524]
['8:42 a.m.', '9:00 a.m.', '9:01 a.m.', 0.0]
['3:27 p.m.', '3:50 p.m.', '3:28 p.m.', 0.07407407407407418]
['11:32 a.m.', '', '12:37 p.m.', 0.8250000000000001]
['12:12 p.m.', '12:13 p.m.', '11:54 a.m.', -0.18095238095238086]
['11:56 a.m.', '', '12:06 p.m.', 0.7999999999999999]
['6:02 a.m.', '', '6:17 a.m.', 0.8518518518518517]
['8:40 a.m.', '', '9:01 a.m.', 0.8518518518518517]
['10:20 a.m.', '', '10:37 a.m.', 0.8666666666666667]
['12:41 p.m.', '12:42 p.m.', '12:10 p.m.', 0.0]
['1:48 p.m.', '', '2:01 p.m.', 0.8042328042328042]
['3:01 p.m.', '', '3:17 p.m.', 0.9259259259259259]
['9:16 a.m.', '', '9:43 a.m.', 0.8518518518518517]
['3:27 p.m.', '3:50 p.m.', '3:28 p.m.', 0.07407407407407418]
['12:12 p.m.', '12:13 p.m.', '11:54 a.m.', -0.18095238095238086]
['2:10 p.m.', '2:38 p.m.', '2:04 p.m.', 0.07407407407407418]
['12:41 p.m.', '12:42 p.m.', '12:10 p.m.', 0.0]
['11:55 p.m.', '', '12:11 a.m.', 0.7523809523809524]
['8:42 a.m.', '', '9:01 a.

{'Combined Precision': 0.925, 'Combined Recall': 0.604, 'Combined F1': 0.731, 'PC R': 307.0, 'PC F': 110.0}

{'Precision': 0.87, 'Recall': 0.568, 'F1': 0.687, 'Amount of fixed data errors': 3210.0}
{'Fuzzy JW Precision': 0.922, 'Fuzzy JW Recall': 0.602, 'Fuzzy JW F1': 0.728, 'PC R': 359.0, 'PC F': 58.0}
{'Fuzzy ME Precision': 0.923, 'Fuzzy ME Recall': 0.603, 'Fuzzy ME F1': 0.729, 'PC R': 358.0, 'PC F': 59.0}
{'Fuzzy LD Char Precision': 0.921, 'Fuzzy LD Char Recall': 0.601, 'Fuzzy LD Char F1': 0.727, 'PC R': 352.0, 'PC F': 65.0}
{'Fuzzy Semantic Sentences Precision': 0.928, 'Fuzzy Semantic Sentences Recall': 0.605, 'Fuzzy Semantic Sentences F1': 0.733, 'PC R': 306.0, 'PC F': 111.0}

{'Num Metrics': 'numeric attributes not declared'}


In [15]:
def calc_avg(m_dict_list):
    
    result = m_dict_list[0].copy()
    
    
    for i in range(0,3):
        if len(m_dict_list[i]) > 1:
            v = 0
            for j in range(0, len(m_dict_list)):
                v += list(m_dict_list[j].values())[i]
            result[list(m_dict_list[0].keys())[i]] = round(v/len(m_dict_list),3)

    print(result)
    #return result
    
    
print("Average of 5 runs")
calc_avg([m_flights1.combined_metric, m_flights2.combined_metric, m_flights3.combined_metric, m_flights4.combined_metric, m_flights5.combined_metric])
calc_avg([m_flights1.standard_metric, m_flights2.standard_metric, m_flights3.standard_metric, m_flights4.standard_metric, m_flights5.standard_metric])
#calc_avg([m_flights1.fuzzy_metric, m_flights2.fuzzy_metric, m_flights3.fuzzy_metric, m_flights4.fuzzy_metric, m_flights5.fuzzy_metric])
#calc_avg([m_flights1.fuzzy_alt_metric, m_flights2.fuzzy_alt_metric, m_flights3.fuzzy_alt_metric, m_flights4.fuzzy_alt_metric, m_flights5.fuzzy_alt_metric])
calc_avg([m_flights1.fuzzy_jw, m_flights2.fuzzy_jw, m_flights3.fuzzy_jw, m_flights4.fuzzy_jw, m_flights5.fuzzy_jw])
calc_avg([m_flights1.fuzzy_me, m_flights2.fuzzy_me, m_flights3.fuzzy_me, m_flights4.fuzzy_me, m_flights5.fuzzy_me])
#calc_avg([m_flights1.fuzzy_ld, m_flights2.fuzzy_ld, m_flights3.fuzzy_ld, m_flights4.fuzzy_ld, m_flights5.fuzzy_ld])
#calc_avg([m_flights1.fuzzy_ld_words, m_flights2.fuzzy_ld_words, m_flights3.fuzzy_ld_words, m_flights4.fuzzy_ld_words, m_flights5.fuzzy_ld_words])
calc_avg([m_flights1.fuzzy_ld_char, m_flights2.fuzzy_ld_char, m_flights3.fuzzy_ld_char, m_flights4.fuzzy_ld_char, m_flights5.fuzzy_ld_char])
calc_avg([m_flights1.fuzzy_semantics_sentences, m_flights2.fuzzy_semantics_sentences, m_flights3.fuzzy_semantics_sentences, m_flights4.fuzzy_semantics_sentences, m_flights5.fuzzy_semantics_sentences])

print("")


#calc_avg([m_flights1.fuzzy_num_metric, m_flights2.fuzzy_num_metric, m_flights3.fuzzy_num_metric, m_flights4.fuzzy_num_metric, m_flights5.fuzzy_num_metric])
#calc_avg([m_flights1.fuzzy_alt_num_metric, m_flights2.fuzzy_alt_num_metric, m_flights3.fuzzy_alt_num_metric, m_flights4.fuzzy_alt_num_metric, m_flights5.fuzzy_alt_num_metric])
#calc_avg([m_flights1.fuzzy_jw_num, m_flights2.fuzzy_jw_num, m_flights3.fuzzy_jw_num, m_flights4.fuzzy_jw_num, m_flights5.fuzzy_jw_num])
#calc_avg([m_flights1.fuzzy_me_num, m_flights2.fuzzy_me_num, m_flights3.fuzzy_me_num, m_flights4.fuzzy_me_num, m_flights5.fuzzy_me_num])
#calc_avg([m_flights1.fuzzy_ld_num, m_flights2.fuzzy_ld_num, m_flights3.fuzzy_ld_num, m_flights4.fuzzy_ld_num, m_flights5.fuzzy_ld_num])
#calc_avg([m_flights1.fuzzy_ld_words_num, m_flights2.fuzzy_ld_words_num, m_flights3.fuzzy_ld_words_num, m_flights4.fuzzy_ld_words_num, m_flights5.fuzzy_ld_words_num])
#calc_avg([m_flights1.fuzzy_ld_char_num, m_flights2.fuzzy_ld_char_num, m_flights3.fuzzy_ld_char_num, m_flights4.fuzzy_ld_char_num, m_beers5.fuzzy_ld_char_num])


Average of 5 runs
{'Combined Precision': 0.935, 'Combined Recall': 0.582, 'Combined F1': 0.717, 'PC R': 198.0, 'PC F': 171.0}
{'Precision': 0.875, 'Recall': 0.545, 'F1': 0.671, 'Amount of fixed data errors': 3204.0}
{'Fuzzy JW Precision': 0.927, 'Fuzzy JW Recall': 0.577, 'Fuzzy JW F1': 0.711, 'PC R': 275.0, 'PC F': 94.0}
{'Fuzzy ME Precision': 0.929, 'Fuzzy ME Recall': 0.578, 'Fuzzy ME F1': 0.712, 'PC R': 307.0, 'PC F': 62.0}
{'Fuzzy LD Char Precision': 0.927, 'Fuzzy LD Char Recall': 0.576, 'Fuzzy LD Char F1': 0.71, 'PC R': 285.0, 'PC F': 84.0}
{'Fuzzy Semantic Sentences Precision': 0.936, 'Fuzzy Semantic Sentences Recall': 0.582, 'Fuzzy Semantic Sentences F1': 0.717, 'PC R': 203.0, 'PC F': 166.0}

