In [None]:
import numpy as np
from datetime import datetime, timezone
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
import numpy as np
from tqdm.notebook import tqdm
import plotly.express as px
import plotly.graph_objects as go
from copy import deepcopy
from langdetect import detect
from textblob import TextBlob
import re
import pandas as pd
import pickle
import json
from os import listdir
from os.path import isfile, join

# Merge/Push problem solving

In [None]:
# English stop-word set and WordNet lemmatizer, created once at module level;
# tokenize_food reads both as globals.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def intersection_num(list1, list2):
    """Return how many distinct elements occur in both ``list1`` and ``list2``."""
    shared = set(list1).intersection(list2)
    return len(shared)


def union_num(list1, list2):
    """Return how many distinct elements occur in ``list1``, ``list2`` or both.

    Uses inclusion-exclusion: |A| + |B| - |A & B|.
    """
    distinct_first = len(set(list1))
    distinct_second = len(set(list2))
    return distinct_first + distinct_second - intersection_num(list1, list2)

def intersection_list(list1, list2):
    """Return the distinct elements common to both inputs as a list.

    The result comes from a set, so its order is not guaranteed.
    """
    common = set(list1).intersection(list2)
    return list(common)
def union_list(list1, list2):
    """Return the distinct elements found in either input as a list.

    The result comes from a set, so its order is not guaranteed.
    """
    distinct_first = list(set(list1))
    distinct_second = list(set(list2))
    return list(set(distinct_first + distinct_second))

def p_n(nouns1, nouns2):
    """Noun overlap score: shared distinct nouns divided by all distinct nouns.

    Returns 0 when neither collection contains any element.
    """
    shared_count = intersection_num(nouns1, nouns2)
    total_count = union_num(nouns1, nouns2)
    if total_count != 0:
        return abs(float(shared_count) / total_count)
    return 0

def p_a_v(a1, v1, a2, v2):
    """Smoothed overlap of the adjective+verb vocabularies of two texts.

    The adjectives and verbs of each text are pooled, then the score is
    (shared + 1) / (total + 2), so two texts without any adjectives or
    verbs score 0.5 rather than dividing by zero.
    """
    descriptors_1 = union_list(a1, v1)
    descriptors_2 = union_list(a2, v2)

    numerator = abs(intersection_num(descriptors_1, descriptors_2)) + 1
    denominator = abs(union_num(descriptors_1, descriptors_2)) + 2
    return float(numerator) / denominator

def p_x(p_n, p_a_v):
    """Combine the noun score and the adjective/verb score by multiplication."""
    combined = p_n * p_a_v
    return combined


def tokenize_food(st):
    """Split a food description into lemmatized nouns, adjectives and verbs.

    The text is tokenized with NLTK, tokens present in the module-level
    ``stop_words`` set are dropped (the membership test is case-sensitive),
    and the remaining tokens are POS-tagged. Tokens tagged as noun,
    adjective or verb are lemmatized with the module-level ``lemmatizer``;
    all other tags are ignored because only those three groups are returned.

    Parameters
    ----------
    st : str
        Free-text food name or description.

    Returns
    -------
    tuple of (list, list, list)
        De-duplicated lemmas of the nouns, adjectives and verbs, in that
        order. Each list is built from a set, so element order is not
        guaranteed.
    """
    # POS tag emitted by nltk.pos_tag -> WordNet POS code for the lemmatizer.
    tag_to_wordnet_pos = {
        "NN": "n", "NNS": "n", "NNP": "n", "NNPS": "n",
        "JJ": "a", "JJR": "a", "JJS": "a",
        "VB": "v", "VBD": "v", "VBG": "v", "VBN": "v", "VBP": "v", "VBZ": "v",
    }

    tokens = [w for w in nltk.word_tokenize(st) if w not in stop_words]

    lemmas = {"n": set(), "a": set(), "v": set()}
    for word, tag in nltk.pos_tag(tokens):
        wordnet_pos = tag_to_wordnet_pos.get(tag)
        if wordnet_pos is not None:
            lemmas[wordnet_pos].add(lemmatizer.lemmatize(word, wordnet_pos))

    return list(lemmas["n"]), list(lemmas["a"]), list(lemmas["v"])




def lexical_similarity(s1, s2):
    """Score how lexically similar two texts are.

    Both texts are tokenized with ``tokenize_food``; the result is the noun
    overlap (``p_n``) multiplied by the smoothed adjective/verb overlap
    (``p_a_v``), as combined by ``p_x``.
    """
    first_nouns, first_adjectives, first_verbs = tokenize_food(s1)
    second_nouns, second_adjectives, second_verbs = tokenize_food(s2)

    noun_score = p_n(first_nouns, second_nouns)
    descriptor_score = p_a_v(first_adjectives, first_verbs, second_adjectives, second_verbs)
    return p_x(noun_score, descriptor_score)


def sentences_encoding(sentences):
    """Embed ``sentences`` with the 'bert-base-nli-mean-tokens' SentenceTransformer.

    A new model object is constructed on every call.
    """
    encoder = SentenceTransformer('bert-base-nli-mean-tokens')
    return encoder.encode(sentences)
    
def my_clustering(sentence_embeddings, eps=0.01):
    """Cluster embeddings with DBSCAN (cosine metric, min_samples=2, all cores).

    Returns
    -------
    tuple
        ``(labels, n_labels)`` where ``labels`` is the fitted ``labels_``
        array and ``n_labels`` is the number of distinct label values.
    """
    model = DBSCAN(eps=eps, min_samples=2, metric="cosine", n_jobs=-1)
    model.fit(sentence_embeddings)
    labels = model.labels_

    # NOTE(review): pipeline treats label -1 as "not clustered", so this count
    # includes that pseudo-cluster whenever it is present.
    distinct_labels = set(labels)
    return labels, len(distinct_labels)

# Text that starts and ends with an ASCII letter and has at least one
# character in between; the middle may hold ASCII letters, digits and
# non-word characters (spaces, punctuation). Raw string so that \d and \W are
# regex escapes rather than invalid Python string escapes.
REGEX_ENGLISH = r'^[a-zA-Z]+[a-zA-Z\d\W]+[a-zA-Z]+$'

def return_english_only(l):
    """Return the strings of ``l`` that match ``REGEX_ENGLISH``, in input order."""
    pattern = re.compile(REGEX_ENGLISH)
    return [s for s in l if pattern.match(s)]

def remove_rows_with_non_english_foodname(df, col_name):
    """Return the rows of ``df`` whose ``col_name`` value matches ``REGEX_ENGLISH``.

    Rows with a missing value in ``col_name`` are dropped.
    """
    # na=False: a missing name counts as "no match" instead of yielding NaN in
    # the mask, which boolean indexing would reject.
    is_english = df[col_name].str.contains(REGEX_ENGLISH, na=False)
    return df[is_english]


    
def pipeline(sentences, clust_eps=0.01, sim_measure=0.7, ignore_errors=False, skip_clustering=False, debug=False, chunk_size=50):
    """Find pairs of lexically similar sentences, optionally pre-grouped by DBSCAN.

    Steps: keep only sentences matching ``REGEX_ENGLISH``; unless
    ``skip_clustering`` is set, embed them and cluster the embeddings; then,
    inside every cluster, compare sentences pairwise with
    ``lexical_similarity`` in consecutive chunks of ``chunk_size`` items.
    Sentences labelled -1 by the clustering are not compared.

    Parameters
    ----------
    sentences : iterable of str
    clust_eps : float
        ``eps`` forwarded to ``my_clustering``.
    sim_measure : float
        A pair is a match when its similarity is strictly greater than this.
    ignore_errors : bool
        Accepted for compatibility; not used.
    skip_clustering : bool
        Treat all sentences as a single cluster and skip the embedding step.
    debug : bool
        Also return the cluster dictionary (only when clustering ran).
    chunk_size : int
        Number of consecutive cluster items compared against each other.
        Pairs are only formed inside one chunk.

    Returns
    -------
    ``matches`` when ``skip_clustering`` is True; otherwise
    ``(sentence_embeddings, labels, matches)``, preceded by ``desc_clusters``
    when ``debug`` is True. ``matches`` is a list of ``(sentence, sentence)``
    tuples, each unordered pair reported once.
    """
    print("Filtering for english text only")
    sentences = return_english_only(sentences)
    if not skip_clustering:
        print("Producing embeddings..")
        sentence_embeddings = sentences_encoding(sentences)
        print("Embeddings Done!")
        print("Doing Clustering..")

        labels, len_lbl_set = my_clustering(sentence_embeddings, eps=clust_eps)

        desc_clusters = {}
        for sentence, cluster in zip(sentences, labels):
            desc_clusters.setdefault(cluster, []).append(sentence)

        print(f"Clustering done, {len_lbl_set} clusters found")

        desc_clusters_matches = desc_clusters
        # Label -1 holds the unclustered sentences; empty it so they are not
        # compared. It may be absent when every sentence landed in a cluster.
        if -1 in desc_clusters_matches:
            desc_clusters_matches[-1] = []
    else:
        desc_clusters_matches = {0: sentences}

    print(f"Searching for lexical matches..")

    matches = []
    seen_pairs = set()  # both orientations of every reported pair
    for _label, items in tqdm(desc_clusters_matches.items()):
        print(f"Processing Cluster of {len(items)} items")
        # Start at 0 and step by chunk_size so that short clusters and the
        # final partial chunk are compared as well.
        for start in range(0, len(items), chunk_size):
            items_slice = items[start:start + chunk_size]
            for position, item1 in enumerate(items_slice):
                # The similarity is symmetric, so each unordered pair is scored once.
                for item2 in items_slice[position + 1:]:
                    if item1 == item2 or (item1, item2) in seen_pairs:
                        continue
                    if lexical_similarity(item1, item2) > sim_measure:
                        matches.append((item1, item2))
                        seen_pairs.add((item1, item2))
                        seen_pairs.add((item2, item1))

    print(f"{len(matches)} matches found")
    if not skip_clustering:
        if debug:
            return desc_clusters, sentence_embeddings, labels, matches
        return sentence_embeddings, labels, matches
    else:
        return matches
        


# df1 and df2 are dataframes whose first column is the name of the dish.
# All other columns may only contain ingredients.
# If names_only is True, thresholds suited to shorter (name-only) texts are used.
# If hard_matches is True, stricter thresholds are used.
def find_matches(df1, df2, names_only=False, hard_matches=False, check_duplicates=False, sim_measure=None, clust_eps=None, debug=False):
    """Find lexically similar rows within and between two food tables.

    Every row is turned into one text by concatenating all of its columns
    (``DataFrame.sum(axis=1)``). The texts of both frames are pooled and
    passed to ``pipeline``; each matched pair of texts is then traced back to
    the rows that produced it. The input frames are not modified.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        First column is the dish name; in the result it is called "FoodName".
    names_only, hard_matches : bool
        Select the default similarity threshold (0.4 / 0.5 for names only,
        0.7 / 0.8 otherwise; the higher value when ``hard_matches``).
    check_duplicates : bool
        Also report texts that occur more than once inside df1, inside df2,
        or in both frames.
    sim_measure, clust_eps : float or None
        Override the default similarity threshold / clustering epsilon (0.05).
    debug : bool
        Also return the cluster dictionary produced by ``pipeline``.

    Returns
    -------
    list of dict, or ``(desc_clusters, list of dict)`` when ``debug`` is True.
    Each dict has the keys "df1_name_1", "df1_name_2", "df2_name_1" and
    "df2_name_2"; the values are the (possibly empty) row selections of df1 /
    df2 whose text equals the first / second text of the pair.
    """
    # Rename on copies so the caller's frames keep their column names.
    df1 = df1.rename(columns={df1.columns[0]: "FoodName"})
    df2 = df2.rename(columns={df2.columns[0]: "FoodName"})

    # One text per row. The same series are used below to find the rows
    # behind a matched text, which also works when the frame has more than
    # one column (the text is then longer than the FoodName value alone).
    row_text_1 = df1.sum(axis=1)
    row_text_2 = df2.sum(axis=1)
    sentences_1 = list(row_text_1)
    sentences_2 = list(row_text_2)

    if names_only:
        similarity_measure = 0.5 if hard_matches else 0.4
    else:
        similarity_measure = 0.8 if hard_matches else 0.7
    clustering_epsilon = 0.05

    if sim_measure is not None:
        similarity_measure = sim_measure
    if clust_eps is not None:
        clustering_epsilon = clust_eps

    def rows_with_text(frame, row_text, text):
        # Rows of `frame` whose concatenated text equals `text`.
        return frame.loc[row_text == text]

    def match_record(df1_first, df1_second, df2_first, df2_second):
        return {"df1_name_1": df1_first, "df1_name_2": df1_second,
                "df2_name_1": df2_first, "df2_name_2": df2_second}

    pipeline_output = pipeline(set(sentences_1 + sentences_2), clust_eps=clustering_epsilon,
                               sim_measure=similarity_measure, debug=debug)
    # With clustering enabled, pipeline returns the cluster dict first when
    # debug is set, and the list of matched pairs always comes last.
    desc_clusters = pipeline_output[0] if debug else None
    text_pairs = list(set(pipeline_output[-1]))

    matches = []
    for first_text, second_text in text_pairs:
        matches.append(match_record(
            rows_with_text(df1, row_text_1, first_text),
            rows_with_text(df1, row_text_1, second_text),
            rows_with_text(df2, row_text_2, first_text),
            rows_with_text(df2, row_text_2, second_text),
        ))

    if check_duplicates:
        # Empty selections that keep the frame's columns.
        no_rows_1 = df1.iloc[0:0]
        no_rows_2 = df2.iloc[0:0]

        # Texts repeated inside df1: reported as a df1 self-match.
        for text in set(duplicates(sentences_1)):
            same_rows = rows_with_text(df1, row_text_1, text)
            matches.append(match_record(same_rows, same_rows, no_rows_2, no_rows_2))

        # Texts repeated inside df2: reported as a df2 self-match.
        for text in set(duplicates(sentences_2)):
            same_rows = rows_with_text(df2, row_text_2, text)
            matches.append(match_record(no_rows_1, no_rows_1, same_rows, same_rows))

        # Texts present in both frames: reported as a df1-to-df2 match.
        for text in duplicates(list(set(sentences_1)) + list(set(sentences_2))):
            matches.append(match_record(
                rows_with_text(df1, row_text_1, text), no_rows_1,
                no_rows_2, rows_with_text(df2, row_text_2, text),
            ))

    if debug:
        return desc_clusters, matches
    return matches

def match_types(matches):
    """Sort match records into df1 self-matches, df2 self-matches and cross matches.

    Parameters
    ----------
    matches : list of dict
        Records with the keys "df1_name_1", "df1_name_2", "df2_name_1" and
        "df2_name_2", each holding a DataFrame.

    Returns
    -------
    tuple of three lists of (DataFrame, DataFrame) tuples:
    ``(df1_self_matches, df2_self_matches, df1_to_df2)``. A record can
    contribute to more than one list.
    """
    def both_non_empty(first, second):
        return not first.empty and not second.empty

    within_df1, within_df2, across = [], [], []
    for record in matches:
        df1_first, df1_second = record["df1_name_1"], record["df1_name_2"]
        df2_first, df2_second = record["df2_name_1"], record["df2_name_2"]

        if both_non_empty(df1_first, df1_second):
            within_df1.append((df1_first, df1_second))
        if both_non_empty(df2_first, df2_second):
            within_df2.append((df2_first, df2_second))
        if both_non_empty(df1_first, df2_second):
            across.append((df1_first, df2_second))
        if both_non_empty(df1_second, df2_first):
            across.append((df1_second, df2_first))

    return within_df1, within_df2, across

def show_number_of_matches(df1_self_matches, df2_self_matches, df1_to_df2):
    """Print how many df1 self-matches and df1-to-df2 matches were found.

    ``df2_self_matches`` is accepted but not reported.
    """
    self_count = len(df1_self_matches)
    cross_count = len(df1_to_df2)
    print("Input matches to itself:", self_count)
    print("Input to reference matches:", cross_count)


def print_dict_nicely(d):
    """Print one ``key value`` line per entry, each part left-justified to 15 characters."""
    for key, value in d.items():
        print(f"{str(key):<15} {str(value):<15}")

def duplicates(liststring):
    """Return every repeated occurrence of an item, in input order.

    An item appearing k times contributes k - 1 entries to the result.
    """
    seen = set()
    repeated = []
    for item in liststring:
        if item in seen:
            repeated.append(item)
        else:
            seen.add(item)
    return repeated

def match_column_names(col1, col2, sim_measure=0.4): 
    """Pair up names from ``col1`` and ``col2`` that are equal or lexically similar.

    Two names are paired when they are identical or when
    ``lexical_similarity`` exceeds ``sim_measure``. Returns a list of
    ``(name_from_col1, name_from_col2)`` tuples.
    """
    return [
        (first_name, second_name)
        for first_name in col1
        for second_name in col2
        if first_name == second_name or lexical_similarity(first_name, second_name) > sim_measure
    ]

def compare_numerical_columns_for_match(df1_match, df2_match, matching_columns):
    """Subtract paired column values of two single-match frames.

    For every ``(column1, column2)`` pair the value at index label 0 of
    ``df1_match[column1]`` minus the one of ``df2_match[column2]`` is stored
    under ``column2``. The entry is ``None`` when either value cannot be read
    or is ``None``.
    """
    differences = {}
    for left_col, right_col in matching_columns:
        try:
            left_value = df1_match[left_col][0]
            right_value = df2_match[right_col][0]
        except Exception:
            differences[right_col] = None
            continue
        if left_value is None or right_value is None:
            differences[right_col] = None
        else:
            differences[right_col] = left_value - right_value
    return differences

def compare_matches(matches, matching_columns):
    """Run ``compare_numerical_columns_for_match`` on each ``(left, right)`` match.

    Returns one result dict per match, in input order.
    """
    return [
        compare_numerical_columns_for_match(pair[0], pair[1], matching_columns)
        for pair in matches
    ]
    

def match_clear_name_dirty(clear_name, dirty_names, clear_func):
    """Return the first name in ``dirty_names`` that ``clear_func`` maps to ``clear_name``.

    Returns ``None`` when no name qualifies.
    """
    candidates = (dirty for dirty in dirty_names if clear_func(dirty) == clear_name)
    return next(candidates, None)
    

def match_dirty(clear_matches, dirty_names_1, dirty_names_2, clear_func_1, clear_func_2):
    """Translate pairs of cleaned names back to their original ("dirty") names.

    Each side is resolved with ``match_clear_name_dirty``; a side without an
    original name becomes ``None``.
    """
    return [
        (
            match_clear_name_dirty(first_clear, dirty_names_1, clear_func_1),
            match_clear_name_dirty(second_clear, dirty_names_2, clear_func_2),
        )
        for first_clear, second_clear in clear_matches
    ]


def get_orig_row(name, orig_df, col_name):
    """Return an independent, re-indexed copy of the rows where ``col_name`` equals ``name``."""
    is_wanted = orig_df[col_name] == name
    selected = orig_df[is_wanted].copy(deep=True)
    return selected.reset_index(drop=True)

def bad_matches_to_good_matches(matches, df1, df2, col_name_1, col_name_2):
    """Replace the frames in each match by the full rows of the original tables.

    The top-left cell of each matched frame is used as the lookup key. The
    left side is always looked up in ``df1[col_name_1]``; the right side is
    looked up in ``df2[col_name_2]``, or in ``df1[col_name_1]`` again when
    ``df2`` is ``None`` (self-matches).
    """
    if df2 is None:
        right_df, right_col = df1, col_name_1
    else:
        right_df, right_col = df2, col_name_2

    better_matches = []
    for matchl, matchr in tqdm(matches):
        left_rows = get_orig_row(matchl.iloc[0, 0], df1, col_name_1)
        right_rows = get_orig_row(matchr.iloc[0, 0], right_df, right_col)
        better_matches.append((left_rows, right_rows))

    return better_matches


# Image quality helpers

In [None]:
import skimage.measure 
from skimage import io
import imquality.brisque as brisque
import PIL.Image
from os import listdir
from os.path import isfile, join

def measure_over_images(image_dir):
    """Measure Shannon entropy and BRISQUE quality for every image under ``image_dir``.

    ``image_dir`` is expected to contain one sub-directory per class, each
    holding image files. Images that fail to load or to be scored are
    reported on stdout and skipped entirely, so they contribute to none of
    the statistics.

    Returns
    -------
    tuple
        ``(average_entropy, average_entropy_per_class, image_sizes,
        average_quality, quality_per_class, pictures_with_negative_quality,
        average_sharpness, sharpness_per_class)``.
        The averages are taken over the images measured successfully in this
        call. The ``*_per_class`` dicts map the class directory name to the
        list of per-image values; ``image_sizes`` maps ``str(array.shape)``
        to a count; ``pictures_with_negative_quality`` maps image path to
        score. Sharpness is not computed: the last two values are always
        ``0`` and ``{}``.
    """
    entropy_sum = 0
    average_entropy_per_class = {}

    image_sizes = {}

    quality_sum = 0
    quality_per_class = {}
    pictures_with_negative_quality = {}

    # Placeholders: no sharpness measure is implemented.
    average_sharpness = 0
    sharpness_per_class = {}

    attempted = 0
    measured = 0

    for dd in tqdm(listdir(image_dir)):
        class_dir = join(image_dir, dd)
        for f in tqdm(listdir(class_dir)):
            attempted += 1
            if attempted % 500 == 0:
                # Running totals, not averages.
                print("Entropy: ", entropy_sum)
                print("Quality: ", quality_sum)
                print("Negative quality: ", len(pictures_with_negative_quality))

            img_path = join(class_dir, f)
            try:
                # The context manager closes the file handle after measuring.
                with PIL.Image.open(img_path) as imgpl:
                    img = np.array(imgpl)
                    entropy = skimage.measure.shannon_entropy(img)
                    quality = brisque.score(imgpl)
            except Exception as e:
                # Best effort: report the unreadable/unscorable image and go on.
                print("Error Occured:", str(e))
                continue

            # Accumulate only after every metric succeeded, so all statistics
            # cover the same set of images.
            measured += 1
            average_entropy_per_class.setdefault(dd, []).append(entropy)
            entropy_sum += entropy

            quality_per_class.setdefault(dd, []).append(quality)
            quality_sum += quality
            if quality < 0:
                pictures_with_negative_quality[img_path] = quality

            shape_key = str(img.shape)
            image_sizes[shape_key] = image_sizes.get(shape_key, 0) + 1

        print("Entropy: ", entropy_sum)
        print("Quality: ", quality_sum)
        print("Negative quality: ", len(pictures_with_negative_quality))

    # Average over what was actually measured here; avoid dividing by zero
    # when the directory holds no usable image.
    average_entropy = entropy_sum / measured if measured else 0.0
    average_quality = quality_sum / measured if measured else 0.0

    print("Entropy: ", average_entropy)
    print("Quality: ", average_quality)

    return average_entropy, average_entropy_per_class, image_sizes, \
            average_quality, quality_per_class, pictures_with_negative_quality, \
            average_sharpness, sharpness_per_class

# Calculate Row Statistics

In [None]:
def get_number_of_rows_with_nan(dataframe):
    """Return the number of rows that contain at least one missing value."""
    # Vectorized replacement for a row-by-row Python loop.
    return int(dataframe.isnull().any(axis=1).sum())
def get_cells_with_nan(dataframe):
    """Return the total number of missing cells in ``dataframe``."""
    # Count on the argument itself rather than on an unrelated global frame.
    return int(dataframe.isnull().to_numpy().sum())

def get_column_nan_contribution(dataframe):
    """Count missing cells per column.

    Returns a dict ``{column: count}`` containing only columns with at least
    one missing cell, ordered by ascending count.
    """
    nan_counts = {}
    for _, row in dataframe.iterrows():
        for column, cell in zip(dataframe.columns, row):
            if pd.isnull(cell):
                nan_counts[column] = nan_counts.get(column, 0) + 1
    return dict(sorted(nan_counts.items(), key=lambda pair: pair[1]))


# Time helpers

In [None]:
def remove_tz_from_dataframe(df_in):
    """Return a copy of ``df_in`` whose timestamp columns carry no timezone.

    A column is treated as a timestamp column when at least one of its values
    is a ``pd.Timestamp``. Such columns are converted with ``pd.to_datetime``
    and their timezone is removed with ``tz_localize(None)``. The input frame
    is left untouched.
    """
    df = df_in.copy()
    timestamp_columns = [
        col for col in df.columns
        if any(isinstance(value, pd.Timestamp) for value in df[col])
    ]
    for col in timestamp_columns:
        # infer_datetime_format is deprecated in pandas 2 and no longer needed.
        df[col] = pd.to_datetime(df[col]).dt.tz_localize(None)
    return df

def clean_dates(df_in, column_name):
    """Return a copy of ``df_in`` with ``column_name`` replaced by its age in years.

    The column is parsed with ``pd.to_datetime(errors='coerce')``, so values
    that cannot be parsed become missing. Timezones are then removed from all
    timestamp columns of the frame via ``remove_tz_from_dataframe``, and the
    column is replaced by ``(now - value)`` expressed in 365-day years.

    All rows are measured against one reference instant taken once per call,
    and no temporary column is added to the frame.
    """
    df = df_in.copy(deep=True)
    df[column_name] = pd.to_datetime(df[column_name], errors='coerce')
    df = remove_tz_from_dataframe(df)

    # NOTE(review): this is the naive local wall-clock time, while timezone-
    # aware values only have their timezone dropped; confirm whether a UTC
    # reference is wanted here.
    now = pd.Timestamp(datetime.now())

    year_in_seconds = 365*24*60*60
    age_in_seconds = (now - df[column_name]).dt.total_seconds()
    df[column_name] = age_in_seconds / year_in_seconds
    return df

# Load OpenFoodFacts

In [None]:
# Load the OpenFoodFacts product export; the file is tab-separated and is
# read from the working directory.
openfoodfacts = pd.read_csv('en.openfoodfacts.org.products.tsv', sep='\t')
# Summary statistics, shown as the cell output.
openfoodfacts.describe()

In [None]:
# Drop identifier/bookkeeping columns, then packaging columns, and keep only
# rows that have a product name matching REGEX_ENGLISH.
openfoodfacts_cleaner = openfoodfacts.drop(["url", "code", "created_t", "created_datetime", "last_modified_t"], axis=1)
openfoodfacts_minimal = openfoodfacts_cleaner.drop(["packaging", "packaging_tags", "brands_tags"], axis=1)
openfoodfacts_minimal = openfoodfacts_minimal.dropna(subset=['product_name'])
openfoodfacts_minimal = remove_rows_with_non_english_foodname(openfoodfacts_minimal, 'product_name')

# "last_modified_datetime" becomes the age of the record in years.
openfoodfacts_cleaner_dates = clean_dates(openfoodfacts_minimal, "last_modified_datetime")
openfoodfacts_cleaner_dates = openfoodfacts_cleaner_dates.astype({'product_name':'string'})

# Variant without empty rows and without rows missing more than 7 columns.
openfoodfacts_no_nan = openfoodfacts_minimal.copy(deep=True)
openfoodfacts_no_nan.dropna(how='all', inplace=True)
openfoodfacts_no_nan.dropna(thresh=len(openfoodfacts_no_nan.columns)-7, inplace=True)

# Text used for matching: "<product name> <ingredients>". The two values are
# read by position with .iloc, because x[0] / x[1] on a label-indexed row is
# deprecated positional access.
openfoodfacts_cleaner_dates["food_with_ingredients"] = openfoodfacts_cleaner_dates[["product_name", "ingredients_text"]].apply(lambda x : '{} {}'.format(x.iloc[0],x.iloc[1]), axis=1)


# Load Nutritional Values for Common foods and produce dataset

In [None]:
# Nutritional-values table, read from the working directory.
NVCF = pd.read_csv('nutrition.csv')

# One or more digits followed by any run of digits and dots (e.g. "12", "3.5");
# to_grams uses it to pull the first number out of a cell.
number_regex = re.compile(r"[0-9]+[.0-9]*")



def determine_col_type(df, col_name, by_name=False):
    """Guess the mass unit ("g", "mg" or "mcg") used in a column.

    With ``by_name=True`` the unit is first looked for in the column name
    itself. If the name contains no unit, or ``by_name`` is False, the unit
    mentioned most often in the column's values wins; ties are resolved in
    the order "g", "mg", "mcg", so a column without any unit text is "g".

    Parameters
    ----------
    df : pandas.DataFrame
    col_name : str
    by_name : bool

    Returns
    -------
    str
        One of "g", "mg", "mcg".
    """
    if by_name:
        for unit in ("mcg", "mg", "g"):
            if unit in col_name:
                return unit

    values = df[col_name].astype(str)
    mg_n = values.str.count("mg").sum()
    mcg_n = values.str.count("mcg").sum()
    # Every "mg" and "mcg" also contains a "g"; subtract them to count bare grams.
    g_n = values.str.count("g").sum() - mg_n - mcg_n

    most_frequent = max(g_n, mg_n, mcg_n)
    if most_frequent == g_n:
        return "g"
    if most_frequent == mg_n:
        return "mg"
    return "mcg"
    


def get_column_types(df, columns):
    """Group ``columns`` by the unit that ``determine_col_type`` reports for them.

    Returns a dict mapping unit to the list of column names, in input order.
    """
    columns_by_unit = {}
    for column in columns:
        unit = determine_col_type(df, column)
        columns_by_unit.setdefault(unit, []).append(column)
    return columns_by_unit

def to_grams(x):
    """Convert a cell such as "12 mg" to a number of grams.

    The first number found in ``str(x)`` (via the module-level
    ``number_regex``) is scaled by 1e-6 when the text contains "mcg", by 1e-3
    when it contains "mg", and returned unchanged otherwise. Returns ``None``
    when no number can be extracted.
    """
    try:
        text = str(x)
        amount = float(re.findall(number_regex, text)[0])
    except Exception:
        return None

    if "mcg" in text:
        return amount / 1000000
    if "mg" in text:
        return amount / 1000
    return amount

# NVCF columns that are converted to numbers below. The names are used as
# column keys into NVCF, so they must stay exactly as spelled here.
column_list_NVCF = ["serving_size","calories","total_fat","saturated_fat","cholesterol",
               "sodium","choline","folate","folic_acid","niacin","pantothenic_acid",
               "riboflavin","thiamin","vitamin_a","vitamin_a_rae","carotene_alpha",
               "carotene_beta","cryptoxanthin_beta","lutein_zeaxanthin","lucopene",
               "vitamin_b12","vitamin_b6","vitamin_c","vitamin_d","vitamin_e",
               "tocopherol_alpha","vitamin_k","calcium","copper","irom","magnesium",
               "manganese","phosphorous","potassium","selenium","zink","protein",
               "alanine","arginine","aspartic_acid","cystine","glutamic_acid","glycine",
               "histidine","hydroxyproline","isoleucine","leucine","lysine","methionine",
               "phenylalanine","proline","serine","threonine","tryptophan","tyrosine",
               "valine","carbohydrate","fiber","sugars","fructose","galactose","glucose",
               "lactose","maltose","sucrose","fat","saturated_fatty_acids",
               "monounsaturated_fatty_acids","polyunsaturated_fatty_acids","fatty_acids_total_trans",
               "alcohol","ash","caffeine","theobromine","water"]
# Group the columns by the unit that dominates their raw text values
# (must run before the conversion below strips the unit text).
col_by_type = get_column_types(NVCF, column_list_NVCF)
print(col_by_type)

# Replace every listed column by its numeric value via to_grams
# (cells without a number become None).
for column in column_list_NVCF:
    NVCF[column] = NVCF[column].apply(to_grams)


# Match columns between OFF and NVCF

In [None]:
NVCF_column_names_dirty = [x for x in list(NVCF.columns)]
openfoodfacts_column_names_dirty = [x for x in list(openfoodfacts_no_nan.columns)] 

def clear_func(x):
    """Normalize a column name for lexical matching.

    Applies these replacements in order: "_" and "-" become spaces, "100g"
    and "en" are removed, and the first pass of double spaces is collapsed.
    """
    # NOTE(review): "en" is removed wherever it occurs, including inside
    # words (e.g. "energy" -> "ergy"); confirm that this is intended.
    replacements = (("_", " "), ("-", " "), ("100g", ""), ("en", ""), ("  ", " "))
    cleaned = x
    for old, new in replacements:
        cleaned = cleaned.replace(old, new)
    return cleaned

NVCF_column_names = [clear_func(x) for x in list(NVCF.columns)]
openfoodfacts_column_names = [clear_func(x) for x in list(openfoodfacts_no_nan.columns)] 


col_matches = match_column_names(NVCF_column_names, openfoodfacts_column_names, sim_measure=0.4)

matching_cols = match_dirty(col_matches, NVCF_column_names_dirty, openfoodfacts_column_names_dirty, clear_func, clear_func)

print(matching_cols)

# Change columns in OpenFoodFacts to numbers

In [None]:
OFF_cols_by_type = {"g":[]}
for (columnNVCF, columnOFF) in matching_cols:
    openfoodfacts_cleaner_dates[columnOFF] = openfoodfacts_cleaner_dates[columnOFF].apply(to_grams)
    OFF_cols_by_type["g"].append(columnOFF)

# Match rows between OFF and NVCF

In [None]:
# Match NVCF food names against the OpenFoodFacts "name + ingredients" text.
df_s_1 = NVCF['name'].to_frame()
df_s_2 = openfoodfacts_cleaner_dates["food_with_ingredients"].to_frame()

names_only = True
hard_matches = True

matches = find_matches(df_s_1, df_s_2, names_only=names_only, hard_matches=hard_matches, check_duplicates=True)
# Report the mode that was actually used instead of a fixed "soft" label.
print(f"Names only, matches (is_hard: {hard_matches}):")
df1_self_matches, df2_self_matches, df1_to_df2 = match_types(matches)
show_number_of_matches(df1_self_matches, df2_self_matches, df1_to_df2)

In [None]:
with open(f'df1_self_matches{names_only}{hard_matches}.pickle', 'wb') as f:
    pickle.dump(df1_self_matches, f)
with open(f'df2_self_matches{names_only}{hard_matches}.pickle', 'wb') as f:
    pickle.dump(df2_self_matches, f)
with open(f'df1_to_df2{names_only}{hard_matches}.pickle', 'wb') as f:
    pickle.dump(df1_to_df2, f)

In [None]:
# Reload previously saved match lists; the two flags select the file names.
# NOTE(review): the cell that writes these pickles runs with
# hard_matches = True, whereas this cell reads the hard_matches = False
# files - confirm which run is meant to be loaded.
# pickle.load executes code embedded in the file; only load files you created.
names_only = True
hard_matches = False

with open(f'df1_self_matches{names_only}{hard_matches}.pickle', 'rb') as f:
    df1_self_matches = pickle.load(f)
with open(f'df2_self_matches{names_only}{hard_matches}.pickle', 'rb') as f:
    df2_self_matches = pickle.load(f)
with open(f'df1_to_df2{names_only}{hard_matches}.pickle', 'rb') as f:
    df1_to_df2 = pickle.load(f)

In [None]:
print(len(df1_self_matches), df2_self_matches[0])

# Calculate difference between matched rows

In [None]:
openfoodfacts_cleaner_dates.head(1)

In [None]:
from time import sleep
# Expand the first 7000 OpenFoodFacts self-matches to full rows; df2=None
# makes both sides come from openfoodfacts_cleaner_dates.
good_matches = bad_matches_to_good_matches(df2_self_matches[:7000], openfoodfacts_cleaner_dates, None, 'food_with_ingredients', None)
#print(good_matches)

#("calories","calories")

# Both rows of a pair come from the same table, so each OpenFoodFacts column
# of matching_cols is compared with itself.
numeric_cols = []
a = []
for x in matching_cols:
     a.append((x[1],x[1]))
        
numeric_cols = a


# One dict of per-column differences per match.
comparisons = compare_matches(good_matches, numeric_cols)

# Print the first matches together with their differences; zip stops at the
# shorter of the two sequences.
for i, ((ml, mr), values) in enumerate(zip(df2_self_matches, comparisons)):
    print("For a match between: ")
    print(ml.copy(deep=True).reset_index(drop=True).iloc[0,0])
    print("and")
    print(mr.copy(deep=True).reset_index(drop=True).iloc[0,0])
    print("Difference in columns: ")
    for key, value in values.items():
        print(f"  {key} : {value}")
    print("_____________________________________________")
    if i > 200:
        break

    
# One row per match, one column per compared field.
comparison_df = pd.DataFrame.from_dict(comparisons)


In [None]:
comparison_df.head()

# Number of records comparison

In [None]:
print(len(NVCF.index), len(openfoodfacts.index))

# Number of columns comparison

In [None]:
print(len(NVCF.columns), len(openfoodfacts.columns))

In [None]:
#https://stackoverflow.com/questions/19124601/pretty-print-an-entire-pandas-series-dataframe
def print_full(x):
    """Print a Series/DataFrame without truncating its rows.

    ``display.max_rows`` is raised to ``len(x)`` only for the duration of the
    print; the previous setting is restored afterwards, even if printing
    raises.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)


#https://stackoverflow.com/questions/51070985/find-out-the-percentage-of-missing-values-in-each-column-in-the-given-dataset
def missing_values_table(df):
    """Summarize missing values per column and print a one-line overview.

    Returns a DataFrame indexed by column name with the columns
    'Missing Values' (count), '% of Missing Values' (rounded to one decimal),
    'Data Type' and '% No Missing Values'.
    """
    missing_counts = df.isnull().sum()
    missing_percent = 100 * missing_counts / len(df)

    summary = pd.concat([missing_counts, missing_percent], axis=1)
    summary = summary.rename(columns={0: 'Missing Values', 1: '% of Missing Values'})
    summary['% of Missing Values'] = summary['% of Missing Values'].round(1)
    summary['Data Type'] = df.dtypes
    summary['% No Missing Values'] = 100 - summary['% of Missing Values']

    columns_with_gaps = len(summary[summary['Missing Values'] > 0])
    print(f"Your selected dataframe has {df.shape[1]} columns and {df.shape[0]} Rows.\n"
          f"There are {columns_with_gaps} columns that have missing values.")
    return summary

def get_columns_by_missing_values(dff):
    """Return ``missing_values_table(dff)`` reduced to its two percentage columns."""
    table = missing_values_table(dff)
    return table.drop(columns=["Missing Values", "Data Type"])

def print_dataframe_statistics(dff):
    """Print how many columns fall below / above fixed missing-value percentages."""
    mvc = get_columns_by_missing_values(dff)
    missing_share = mvc["% of Missing Values"]

    for limit in (1, 5, 10, 20, 30, 50):
        below = len(mvc[missing_share < limit])
        print(f"Number of columns with <{limit}% missing values: {below}")

    print("")

    for limit in (80, 95, 99):
        above = len(mvc[missing_share > limit])
        print(f"Number of columns with >{limit}% missing values: {above}")
    print("")
    
    
print("For OpenFoodFacts")
print_dataframe_statistics(openfoodfacts)
print("For NVCF")
print_dataframe_statistics(NVCF)


In [None]:
openfoodfacts_cleaner_dates["last_modified_datetime"].describe()

# Working with Food101 dataset

In [None]:
# Root of the Food101 images: one sub-directory per food class.
FOOD101_IMAGE_DIR = "/mnt/DATA/Courses/Thesis/metric calculation/food101/images/"

# Class names are the directory names with underscores turned into spaces.
food_names = [f.replace("_", " ") for f in listdir(FOOD101_IMAGE_DIR)]

# Full path of every image; join() puts the separator between the class
# directory and the file name.
records = [join(FOOD101_IMAGE_DIR, dd, f) for dd in listdir(FOOD101_IMAGE_DIR) for f in listdir(join(FOOD101_IMAGE_DIR, dd))]

print("Number of food names:", len(food_names))
print("Number of records:", len(records))
print("Food names:", food_names)


# Match Food101 to OpenFoodFacts

In [None]:
df_s_1 = pd.DataFrame({"food_name":food_names})
df_s_2 = openfoodfacts_cleaner_dates["product_name"].to_frame()

names_only = True
hard_matches = False

matches = find_matches(df_s_1, df_s_2, sim_measure= 0.2, names_only=names_only, hard_matches=hard_matches, check_duplicates=True)
print("Names only, soft matches:")
df1_self_matches, df2_self_matches, df1_to_df2 = match_types(matches)
show_number_of_matches(df1_self_matches, df2_self_matches, df1_to_df2)

In [None]:
import json
import pickle

with open(f'food101_self_matches{names_only}{hard_matches}.pickle', 'wb') as f:
    pickle.dump(df1_self_matches, f)
with open(f'food101_to_df2{names_only}{hard_matches}.pickle', 'wb') as f:
    pickle.dump(df1_to_df2, f)

# Measure image parameters for Food101

In [None]:

image_dir = "/mnt/DATA/Courses/Thesis/metric calculation/food101/images/"
measure_over_images(image_dir)


# Load MAFood121


In [None]:

from os import listdir
from os.path import isfile, join

# Root of the MAFood121 images: one sub-directory per food class.
MAFood121_DATA_PATH = "/mnt/DATA/Courses/Thesis/metric calculation/Mafood121/images/"
# Class names are the directory names with underscores turned into spaces.
MAFood121_food_names = [f.replace("_", " ") for f in listdir(MAFood121_DATA_PATH)]

# Full path of every image; join() puts the separator between the class
# directory and the file name.
MAFood121_records = [join(MAFood121_DATA_PATH, dd, f) for dd in listdir(MAFood121_DATA_PATH) for f in listdir(join(MAFood121_DATA_PATH, dd))]

print("Number of food names:", len(MAFood121_food_names))
print("Number of records:", len(MAFood121_records))
print("Food names:", MAFood121_food_names)



# Match MAFood121 to OpenFoodFacts

In [None]:
df_s_1 = pd.DataFrame({"food_name":MAFood121_food_names})
df_s_2 = openfoodfacts_cleaner_dates["product_name"].to_frame()

names_only = True
hard_matches = False

matches = find_matches(df_s_1, df_s_2, sim_measure= 0.1, names_only=names_only, hard_matches=hard_matches, check_duplicates=True)
print("Names only, soft matches:")
df1_self_matches, df2_self_matches, df1_to_df2 = match_types(matches)
show_number_of_matches(df1_self_matches, df2_self_matches, df1_to_df2)

In [None]:
import json
import pickle

with open(f'MAFood121_self_matches{names_only}{hard_matches}.pickle', 'wb') as f:
    pickle.dump(df1_self_matches, f)
with open(f'MAFood121_to_df2{names_only}{hard_matches}.pickle', 'wb') as f:
    pickle.dump(df1_to_df2, f)

# Match MAFood121 to Food101

In [None]:
# Match the MAFood121 class names against the Food101 class names.
df_s_1 = pd.DataFrame({"food_name":MAFood121_food_names})
df_s_2 = pd.DataFrame({"food_name":food_names})

names_only = True
hard_matches = True

matches = find_matches(df_s_1, df_s_2, names_only=names_only, hard_matches=hard_matches, check_duplicates=True)
# Report the mode that was actually used instead of a fixed "soft" label.
print(f"Names only, matches (is_hard: {hard_matches}):")
df1_self_matches, df2_self_matches, df1_to_df2 = match_types(matches)
show_number_of_matches(df1_self_matches, df2_self_matches, df1_to_df2)

print(df1_to_df2)

# Measure Image parameters for MAFood121

In [None]:

image_dir = "/mnt/DATA/Courses/Thesis/metric calculation/Mafood121/images/"
measure_over_images(image_dir)


# Load Food Ingredients and Recipes Dataset with Images From Epicurious Website dataset

In [None]:
# Epicurious recipes: title, ingredients and image-name mapping.
FIRDI_Epicurious = pd.read_csv('/mnt/DATA/Courses/Thesis/metric calculation/Food Ingredients and Recipes Dataset with Images From Epicurious Website/Food Ingredients and Recipe Dataset with Image Name Mapping.csv')

# Text used for matching: "<title> <cleaned ingredients>". The two values are
# read by position with .iloc, because x[0] / x[1] on a label-indexed row is
# deprecated positional access.
FIRDI_Epicurious["food_with_ingredients"] = FIRDI_Epicurious[["Title", "Cleaned_Ingredients"]].apply(lambda x : '{} {}'.format(x.iloc[0],x.iloc[1]), axis=1)
FIRDI_Epicurious.head()


In [None]:
print(len(FIRDI_Epicurious.index))

In [None]:
print("For FIRDI_Epicurious")
print_dataframe_statistics(FIRDI_Epicurious)

# Match FIRDI_Epicurious to OpenFoodFacts

In [None]:
df_s_1 = FIRDI_Epicurious["food_with_ingredients"].to_frame()
df_s_2 = openfoodfacts_cleaner_dates["product_name"].to_frame()

names_only = False
hard_matches = False

matches = find_matches(df_s_1, df_s_2, sim_measure= 0.05, names_only=names_only, hard_matches=hard_matches, check_duplicates=True)
print("Ingredients, soft matches:")
df1_self_matches, df2_self_matches, df1_to_df2 = match_types(matches)
show_number_of_matches(df1_self_matches, df2_self_matches, df1_to_df2)

# Image parameters for FIRDI_Epicurious

In [None]:
img_path = "/mnt/DATA/Courses/Thesis/metric calculation/Food Ingredients and Recipes Dataset with Images From Epicurious Website/Food Images/"
measure_over_images(img_path)


# Sample OFF vs OFF

In [None]:
import random

off_sample = openfoodfacts_cleaner_dates["food_with_ingredients"][:80000].sample(n=20000, random_state=1).to_frame()


prev_sample = ""

def screw_over_sample(x, augment=0.05):
    """Return a perturbed copy of the text ``x``.

    The space-separated words of ``x`` are randomly sampled (all of them, or
    all but a share of about ``augment``, in random order) and followed by
    the words carried over from the previous call. A random share of the
    resulting words is then stored on the function object
    (``screw_over_sample.prev_sample``) for the next call.
    """
    if not hasattr(screw_over_sample, "prev_sample"):
        screw_over_sample.prev_sample = ""

    words = str(x).split(" ")
    keep_count = max(1, min(int(len(words) * (1 - augment)) + 1, len(words)))
    perturbed = random.sample(words, keep_count)
    perturbed += screw_over_sample.prev_sample

    carry_count = max(1, min(int(len(perturbed) * augment) + 1, len(perturbed)))
    screw_over_sample.prev_sample = random.sample(perturbed, carry_count)
    return " ".join(perturbed)

# Perturb every sampled text and match the sample against the full table.
# NOTE(review): off_sample now has two columns ("food_with_ingredients" and
# the perturbed "product_name"), and find_matches builds each row's text from
# all columns - confirm that matching on both columns together is intended.
off_sample["product_name"] = off_sample["food_with_ingredients"].apply(screw_over_sample)
df_s_2 = openfoodfacts_cleaner_dates["food_with_ingredients"].to_frame()

names_only = True
hard_matches = False

matches = find_matches(off_sample, df_s_2, names_only=names_only, hard_matches=hard_matches, check_duplicates=True)
print(f"Ingredients, matches (is_hard: {hard_matches}):")
df1_self_matches, df2_self_matches, df1_to_df2 = match_types(matches)
show_number_of_matches(df1_self_matches, df2_self_matches, df1_to_df2)

