In [1]:
import os
import numpy as np
import pandas as pd
import re
import json
import n2w
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, manhattan_distances
from typing import List
from pprint import pprint
from IPython.display import clear_output, display
from time import sleep

from tqdm import tqdm_notebook as tqdm

In [2]:
def load_names(filename: str, path: str = "csv"):
    """Return the values of the ``name`` column of ``{path}/{filename}.csv``.

    The CSV is read with ``name`` as its index, so the names are taken from
    the index: once a column has been promoted to the index it is no longer
    selectable as ``frame["name"]`` (that lookup raises ``KeyError``).

    Args:
        filename: CSV file name without the ``.csv`` extension.
        path: Directory that contains the file.

    Returns:
        ``np.ndarray`` with one entry per row of the CSV, in file order.
    """
    frame = pd.read_csv(
        "{path}/{filename}.csv".format(path=path, filename=filename),
        index_col="name")
    return np.array(frame.index)

In [3]:
def load_columns(filename: str, path: str = "csv"):
    """Return the column labels of ``{path}/{filename}.csv`` as an array.

    The ``name`` column is used as the index and is therefore not part of
    the returned labels.
    """
    csv_path = "{path}/{filename}.csv".format(path=path, filename=filename)
    table = pd.read_csv(csv_path, index_col="name")
    return np.array(table.columns)

In [4]:
def load_df(filename: str, path: str = "csv"):
    """Read the CSV at ``filename`` into a DataFrame indexed by ``name``.

    ``filename`` is handed to pandas unchanged, so it must be a complete
    path including the extension. ``path`` is accepted but not used.
    """
    table = pd.read_csv(filename, index_col="name")
    return table

In [5]:
# Fat-code table; fat_heuristic reads fats[key]["eng"] for every key it finds.
with open("./fat classifier/fat_codes.json", "r", encoding="UTF8") as f:
    fats = json.load(f)
# Keys ordered longest first (presumably so a long code is tried before any
# shorter code it contains).
fat_keys = sorted(fats.keys(), key=len, reverse=True)

In [6]:
def vitamin_heuristics(name:str)->str:
    """Lower-case ``name`` and glue a vitamin prefix to the word after it.

    ``"vitamina "`` and ``"vit. "`` both become ``"vitamina_"``.
    """
    lowered = name.lower()
    for prefix in ("vitamina ", "vit. "):
        lowered = lowered.replace(prefix, "vitamina_")
    return lowered

def percentage_heuristics(name:str)->str:
    """Spell out every ``%`` sign in ``name`` as the word ``percentuale``."""
    pieces = name.split("%")
    return "percentuale".join(pieces)

def g_mg_heuristics(name:str)->str:
    """Spell out mass units in ``name``.

    Parentheses and pipes are first turned into spaces; then ``mg``,
    a stand-alone `` g `` and ``mcg`` are replaced (in that order) by their
    spelled-out forms, and finally a trailing `` g`` is replaced as well.
    """
    for separator in ("(", ")", "|"):
        name = name.replace(separator, " ")
    substitutions = (
        ("mg", "milligrammi"),
        (" g ", "grammi"),
        ("mcg", "microgrammi"),
    )
    for unit, spelled_out in substitutions:
        name = name.replace(unit, spelled_out)
    return re.sub(" g$", "grammi", name)

def fat_heuristic(name:str)->str:
    """Make fatty-acid labels friendlier to word-based text matching.

    Steps, in order:

    1. Every fat code from the module-level ``fat_keys`` found in ``name`` is
       prefixed with its ``fats[code]["eng"]`` value. The substitution is a
       single regex pass whose alternatives follow the order of ``fat_keys``
       (longest first), so a long code wins over a shorter code it contains
       and already-inserted text is never rewritten a second time.
    2. If the name contains ``:``, a ``Cxx:y`` code is glued with ``_`` to
       the word that follows it, every run of digits is spelled out with
       ``n2w.convert`` and ``:`` becomes ``_``.
    3. ``÷`` is replaced with ``rate``.

    Args:
        name: Nutrient label.

    Returns:
        The rewritten label.
    """
    # Empty keys are ignored: an empty alternative would match everywhere.
    alternatives = [re.escape(key) for key in fat_keys if key]
    if alternatives:
        # str is immutable, so the result of the substitution must be kept.
        name = re.sub(
            "|".join(alternatives),
            lambda found: "{value} {key}".format(
                value=fats[found.group(0)]["eng"], key=found.group(0)),
            name)

    if ":" in name:
        # Accept a lower-case "c" too: vitamin_heuristics lower-cases labels
        # before they reach this function in apply_nutrients_heuristics.
        name = re.sub(r"([Cc]\d+:\d+)\s(\w)", r"\1_\2", name)
        # Convert each whole number once; replacing digit strings one after
        # another with str.replace would also hit digits inside other numbers.
        name = re.sub(r"\d+", lambda digits: n2w.convert(int(digits.group(0))), name)
        name = name.replace(":", "_")

    if "÷" in name:
        name = name.replace("÷", "rate")

    return name

In [7]:
def apply_nutrients_heuristics(A: np.ndarray) -> np.ndarray:
    """Run every nutrient-name heuristic over each label of ``A``.

    Each label goes through ``vitamin_heuristics``, ``percentage_heuristics``,
    ``g_mg_heuristics`` and ``fat_heuristic``, in that order.
    """
    pipeline = (vitamin_heuristics, percentage_heuristics, g_mg_heuristics, fat_heuristic)
    cleaned = []
    for label in A:
        for step in pipeline:
            label = step(label)
        cleaned.append(label)
    return np.array(cleaned)

In [396]:
def uncooked_heuristic(name:str)->str:
    """Strip the words for "raw" (``crud?``) and "fresh" (``fresc...``) from ``name``.

    Only the matched word is removed; surrounding spaces and punctuation stay.
    """
    for pattern in ("crud[a-z]", "fresc[a-z]+"):
        name = re.sub(pattern, "", name)
    return name

def with_heuristic(name:str)->str:
    """Glue ``senza``, ``con`` and ``non`` to the word that follows them.

    ``"con frutta"`` becomes ``"con_frutta"`` so that the qualifier and its
    object end up in a single token. The keyword must start at a word
    boundary, so words that merely end in one of the keywords (``bacon``)
    are left untouched.
    """
    return re.sub(r"\b(senza|con|non) ", r"\1_", name)
    
def cooking_heuristic(name:str)->str:
    """Mark foods prepared with a known cooking procedure as cooked.

    Every word starting with one of the procedure stems is replaced by
    ``"<stem> cotto"``. The whole inflected ending is consumed
    (``[a-z]+``), so endings longer than one letter (``arrostito``) do not
    leave stray letters attached to ``cotto``.
    """
    procedures = ["tostat", "arrost", "affumicat", "fritt"]
    for procedure in procedures:
        name = re.sub(
            procedure + "[a-z]+",
            "{procedure} cotto".format(procedure=procedure),
            name)
    return name

In [397]:
def apply_food_heuristics(A: np.ndarray) ->np.ndarray:
    """Normalise food names for text matching.

    ``uncooked_heuristic``, ``cooking_heuristic`` and ``with_heuristic`` are
    applied one after another, each over the whole collection, and the result
    of every pass is materialised as a NumPy array.
    """
    names = A
    for heuristic in (uncooked_heuristic, cooking_heuristic, with_heuristic):
        names = np.array(list(map(heuristic, names)))
    return names

In [718]:
def match(A: pd.DataFrame, B: pd.DataFrame,
          threshold: float = 0.8) -> pd.DataFrame:
    """Pair every row label of ``A`` with its closest row label of ``B``.

    The distance between two labels is the average of two terms, each scaled
    by its own maximum:

    * the Euclidean distance between the TF-IDF vectors of the labels after
      ``apply_food_heuristics``. Pairs above ``threshold`` are discarded, and
      so is every row and column that contains an exact (zero-distance)
      match;
    * the relative difference ``|mean_a - mean_b| / (mean_a + mean_b)`` of
      the row means computed over the columns shared by ``A`` and ``B``.

    Pairs whose combined distance exceeds ``threshold`` are discarded too.

    Args:
        A: First table, one row per label.
        B: Second table, one row per label.
        threshold: Upper bound for both the text distance and the combined
            distance.

    Returns:
        DataFrame with columns ``First`` (label from ``A``), ``Second``
        (label from ``B``) and ``Values`` (combined distance), in that
        column order, sorted by ascending distance. The frame is empty when
        no pair survives the filters.
    """
    # Callers unpack the rows positionally, so the column order is pinned
    # instead of depending on how pandas orders dict keys.
    result_columns = ["First", "Second", "Values"]

    common_columns = np.intersect1d(A.columns, B.columns)

    # Transposed views: one column per row label of the original tables.
    A_T = A[common_columns].transpose()
    B_T = B[common_columns].transpose()

    Ac = A_T.columns
    Bc = B_T.columns

    Ae = apply_food_heuristics(Ac)
    Be = apply_food_heuristics(Bc)

    # Relative difference between the per-label means.
    means_A = np.nanmean(A_T, axis=0)
    means_B = np.nanmean(B_T, axis=0)
    means_matrix = means_A.reshape(-1, 1) + means_B.reshape(1, -1)
    mean_distance = manhattan_distances(means_A.reshape(-1,1), means_B.reshape(-1,1)) / means_matrix

    # Text distance between the normalised labels.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(np.concatenate([Ae, Be]))
    X = vectorizer.transform(Ae)
    Y = vectorizer.transform(Be)

    tfidf_distances = euclidean_distances(X, Y)
    tfidf_distances /= np.nanmax(tfidf_distances)
    # The masks are computed before any entry is overwritten.
    thr_mask = tfidf_distances > threshold
    zero_row_mask = np.any(tfidf_distances == 0, axis=1)
    zero_column_mask = np.any(tfidf_distances == 0, axis=0)
    tfidf_distances[thr_mask] = np.inf
    tfidf_distances[zero_row_mask] = np.inf
    tfidf_distances[:, zero_column_mask] = np.inf

    distances = (tfidf_distances + mean_distance/np.nanmax(mean_distance))/2
    distances[distances>threshold] = np.inf

    # Drop labels that have no usable candidate left.
    unusable = np.logical_or(np.isnan(distances), np.isinf(distances))
    infinite_rows = np.all(unusable, axis=1)
    infinite_cols = np.all(unusable, axis=0)
    distances = distances[~infinite_rows]
    distances = distances[:, ~infinite_cols]

    # np.nanargmin raises on an empty array; report "no matches" instead.
    if distances.size == 0:
        return pd.DataFrame(columns=result_columns)

    A1 = Ac[~infinite_rows]
    B1 = Bc[~infinite_cols]
    best = np.nanargmin(distances, axis=1)

    df = pd.DataFrame({
        "First": A1,
        "Second": B1[best],
        "Values": np.nanmin(distances, axis=1)
    }, columns=result_columns)

    return df.sort_values("Values")

In [719]:
# The two food tables that get matched and fused below.
yazio = load_df("csv/yazio.csv")
total = load_df("csv/bda_crea_valori_alimentari.csv")

In [720]:
# Candidate pairs: each row label of `total` with its closest label in `yazio`,
# sorted by ascending distance.
match(total, yazio, threshold=0.8)

Unnamed: 0,First,Second,Values
718,stomaco di maiale,stomaco di maiale cotto,0.126300
525,farina d'orzo,farina di orzo,0.140015
748,"popcorn, cotti al microonde",popcorn al microonde,0.163084
115,"trippa di manzo, cruda",trippa di manzo cotta,0.181299
94,"patate, fecola",fecola di patate,0.199801
30,cavolo cappuccio rosso,cavolo cappuccio crudo,0.211954
1028,gamberetti,gamberetti scatola,0.215944
338,pane tostato,pane bianco tostato,0.218656
262,provolone,formaggio provolone,0.223408
529,crauti,crauti scatola,0.229682


In [29]:
# JSON file that replace_index reads and rewrites to keep track of food-name aliases.
food_aliases_path = "food_aliases.json"

In [290]:
# Rejection counters keyed by "<name> <name>" (both orders of a pair); the
# review loop skips a pair once its counter exceeds 2.
skipped = {}

In [698]:
crea_bda = total
vn = yazio
# De-duplicate with dict.fromkeys instead of set(): iteration order of a set
# of strings changes between kernel sessions, which made the row and column
# order of the fused table non-reproducible.
all_columns = list(dict.fromkeys([*crea_bda.columns, *vn.columns]))
# Labels present in both tables, in the order they appear in crea_bda.
common_indices = list(dict.fromkeys(crea_bda.index[crea_bda.index.isin(vn.index)]))

In [699]:
# Labels found in exactly one table, kept in source order; a set difference
# would give an order that changes between kernel sessions.
crea_only_indices = list(dict.fromkeys(crea_bda.index[~crea_bda.index.isin(vn.index)]))
# NOTE: despite its name this holds the labels that exist only in `vn`.
bda_only_indices = list(dict.fromkeys(vn.index[~vn.index.isin(crea_bda.index)]))

In [700]:
# Empty frames that carry the union of the columns of both tables.
fusion_dataframe_a = pd.DataFrame([], columns=all_columns)
fusion_dataframe_b = pd.DataFrame([], columns=all_columns)

In [701]:
# Rows shared by both tables, laid out on the full `all_columns` header.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
fusion_dataframe_a = pd.concat([fusion_dataframe_a, crea_bda.loc[common_indices]])
fusion_dataframe_b = pd.concat([fusion_dataframe_b, vn.loc[common_indices]])

In [703]:
# Stack the two frames along a new leading axis: index 0 is crea_bda, 1 is vn.
fusion_ndarray = np.stack([fusion_dataframe_a, fusion_dataframe_b])

In [704]:
# True for the cells that are NaN in both stacked sources.
nan_mask = np.all(np.isnan(fusion_ndarray.astype(float)), axis=0)

In [705]:
# Put a placeholder 0 where both sources are NaN (presumably so nanmean never
# sees an all-NaN slice); these cells are set back to NaN after averaging.
fusion_ndarray[:, nan_mask] = 0

In [706]:
# Cell-wise mean of the two sources; a NaN in one source is ignored.
mean_fusion = np.nanmean(fusion_ndarray, axis=0)

In [707]:
# Cells that were NaN in both sources go back to NaN.
mean_fusion[nan_mask] = np.nan

In [708]:
# Wrap the averaged values in a frame that reuses the column labels of fusion_dataframe_a.
fusion_common_final = pd.DataFrame(mean_fusion, columns=fusion_dataframe_a.columns)

In [709]:
# Restore the row labels (the rows were selected with `.loc[common_indices]`).
fusion_common_final.index = common_indices

In [713]:
# Add the rows that exist only in crea_bda.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
fusion_common_final_plus_crea = pd.concat([fusion_common_final, crea_bda.loc[crea_only_indices]])

In [714]:
# Add the rows that exist only in vn.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
fusion_common_final_plus_crea_plus_bda = pd.concat([fusion_common_final_plus_crea, vn.loc[bda_only_indices]])

In [715]:
# load_df reads CSVs with index_col="name", so the index is given that name.
fusion_common_final_plus_crea_plus_bda.index.name = "name"

In [717]:
# (rows, columns) of the fused table.
fusion_common_final_plus_crea_plus_bda.shape

(4824, 197)

In [50]:
# NOTE(review): `crea` and `bda` are not defined in any cell visible in this
# notebook; this cell presumably predates the crea_bda / vn naming and will
# fail on a fresh kernel - confirm and remove or update.
crea_common = crea.loc[common_indices]
bda_common = bda.loc[common_indices]

In [None]:
# The original cell was an unfinished subscript (a SyntaxError); preview the
# first rows of the fused table instead.
fusion_common_final_plus_crea_plus_bda.head()

In [None]:
# Cell-wise mean of the two tables; without axis=0 nanmean collapses both
# tables into a single scalar.
fusion_common = np.nanmean([crea_common, bda_common], axis=0)

In [728]:
def replace_header(df, old, new, path):
    """Move column ``old`` to column ``new`` and save the table to ``path``.

    The values of ``old`` are written to ``new`` (appended as the last column,
    or overwriting an existing ``new`` column) and ``old`` is removed. The
    frame passed in is left untouched; when ``old == new`` nothing is changed,
    so the column is not lost.

    Args:
        df: Table to update.
        old: Existing column label.
        new: Replacement column label.
        path: CSV path the resulting table is written to.

    Returns:
        The updated DataFrame.
    """
    if old != new:
        # Work on a copy so the caller's frame does not gain the new column.
        df = df.copy()
        df[new] = df[old]
        df = df.drop(columns=[old])
    df.to_csv(path)
    return df


def replace_index(df, old, new, path, newnew=""):
    """Record ``old``/``new`` as aliases and merge or rename the row in ``df``.

    The alias is stored in the JSON file named by the module-level
    ``food_aliases_path`` (created on first use). Then:

    * if both ``old`` and ``new`` are row labels of ``df``, the two rows are
      replaced by a single row holding their NaN-aware column-wise mean. The
      merged row is labelled ``newnew`` when given, otherwise ``new``;
    * otherwise the row ``old`` is renamed to ``new`` (``newnew`` is not used
      in this case).

    The frame passed in is not modified. The result is written to ``path``.

    Args:
        df: Table indexed by food name.
        old: Label to replace.
        new: Label to keep.
        path: CSV path the resulting table is written to.
        newnew: Optional label for the merged row.

    Returns:
        The updated DataFrame.
    """
    try:
        with open(food_aliases_path, "r") as f:
            food_aliases = json.load(f)
    except FileNotFoundError:
        # No alias file yet: start a new one.
        food_aliases = {}

    if old in food_aliases:
        food_aliases[old].append(new)
    elif new in food_aliases:
        food_aliases[new].append(old)
    else:
        food_aliases[new] = [old]

    with open(food_aliases_path, "w") as f:
        json.dump(food_aliases, f)

    if old in df.index and new in df.index:
        merged_row = np.nanmean(df.loc[[old, new]], axis=0)
        target = newnew or new
        # `old` always goes away; `new` goes away only when the merged row
        # gets a different label, otherwise it is the row that holds the merge.
        stale = [old] if target == new else [old, new]
        df = df.drop(stale)  # new object, the caller's frame is untouched
        df.loc[target] = merged_row
    else:
        df = df.rename(index={old: new})
    df.to_csv(path)
    return df


# Interactive review loop: repeatedly match the two CSV tables, show one
# candidate pair at a time and let the user merge, rename, delete or reject it.
# Every accepted action is written to disk and the tables are reloaded.
# Note the tuple order: df1 is the yazio table, df2 the bda_crea table.
path_df2, path_df1 = "csv/bda_crea_valori_alimentari.csv", "csv/yazio.csv"
done = False
start = 0
while True:
    # Reload from disk so that the previous action is taken into account.
    df1, df2 = load_df(path_df1), load_df(path_df2)
    r = match(df1, df2, threshold=0.8)
    n = len(r)
    # `start` keeps the size of the first match table; it is the denominator
    # of the progress percentage shown to the user.
    if not start:
        start = n

    # One column per food name, restricted to the nutrients both tables have.
    common_columns = np.intersect1d(df1.columns, df2.columns)
    df1t = df1[common_columns].transpose()
    df2t = df2[common_columns].transpose()

    # Count the pairs whose two names differ.
    # NOTE(review): the positional unpacking below relies on match() returning
    # its columns in the order First, Second, Values - confirm for the pandas
    # version in use.
    n = 0
    for first, second, value in r.values:
        if first == second:
            continue
        n += 1

    # Nothing left to review.
    if not n:
        break

    #print("I have found {n} possible matches. ".format(n=n))
    #sleep(0.75)

    for first, second, value in r.values:

        # Skip names that are not columns of the transposed tables.
        if first not in df1t or second not in df2t:
            continue

        done = False
        if first == second:
            n -= 1
            continue

        # Names containing "agnesi" are never proposed.
        if "agnesi" in first:
            n -= 1
            continue
        # Homogeneous constraint: when one of these words appears in only one
        # of the two names, the pair is marked as rejected in `skipped`.
        
        homogeneus = ["pollo","tacchino","struzzo","mais","peperone","coniglio","radicchio","hamburger"] # "maiale" is left out because of "suino", "vitello" because of "manzo"
        
        for kind in homogeneus:
            if (kind in first and kind not in second) or (kind not in first and kind in second):
                keys = "{two} {one}".format(
                    one=first, two=second), "{one} {two}".format(
                        one=first, two=second)
                skipped.setdefault(keys[0], 0)
                skipped.setdefault(keys[1], 0)
                skipped[keys[0]] += 1
                skipped[keys[1]] += 1
                skipped[keys[0]] += 5
                skipped[keys[1]] += 5
        
        # End of the homogeneous constraint.
        # Opposites: a pair whose names carry opposite words (one in each
        # name) is marked as rejected as well.
        opposites = [
            ("crudo","cotto"),
        ]
        
        for first_key, second_key in opposites:
            if (first_key in first and second_key in second) or (first_key in second and second_key in first):
                    
                keys = "{two} {one}".format(
                    one=first, two=second), "{one} {two}".format(
                        one=first, two=second)
                skipped.setdefault(keys[0], 0)
                skipped.setdefault(keys[1], 0)
                skipped[keys[0]] += 1
                skipped[keys[1]] += 1
                skipped[keys[0]] += 5
                skipped[keys[1]] += 5
        # End of the opposites check.
        
        # The pattern list is empty, so this loop currently never runs.
        skip = False
        for name in []:
            if bool(re.findall(name, first)) == bool(re.findall(name, second)):
                n -= 1
                skip = True
                break
        if skip:
            continue

        # Both orders of the pair are used as keys of `skipped`; a counter
        # above 2 means the pair is not proposed.
        keys = "{two} {one}".format(
            one=first, two=second), "{one} {two}".format(
                one=first, two=second)
        if keys[0] in skipped and skipped[keys[0]] > 2 or keys[1] in skipped and skipped[keys[1]] > 2:
            n -= 1
            continue
        # Mean squared difference over the common columns, ignoring NaN;
        # pairs above 300 are skipped without asking.
        mse = np.nanmean((df2t[second] - df1t[first])**2)
        if mse > 300:
            n -= 1
            continue
        # Prompt until the user gives an answer that settles this pair.
        while True:
            clear_output()
            print("Forza! Ne rimangon solo {n}! Sei al {perc:.1f}%!".format(
                n=n, perc=(1 - n / start) * 100))
            print(
                "I found\n\033[1m{first}\033[0m\n\033[1m{second}\033[0m \nShould I merge them?".
                format(first=first, second=second))

            print(
                "Their mse is: {mse:.4f}, with means {mean_1:.4f} and {mean_2:.4f}".
                format(
                    mse=mse,
                    mean_1=np.nanmean(df1t[first]),
                    mean_2=np.nanmean(df2t[second])))
            if mse < 0.1:
                print("\033[1mMSE BASSISSIMOOOO!\033[0m")
            elif mse < 1:
                print("\033[1mMSE BASSO!\033[0m")
            elif mse < 5:
                print("\033[1mMSE INTERESSANTE!\033[0m")
            elif mse > 1000:
                # Not reachable here: pairs with mse > 300 were skipped above.
                print("\033[1mMSE ALTO!\033[0m")
            inp = input("[y/n/d/del/ren]")
            if inp == "y":
                # Merge: "1" relabels `first` in df1 as `second`, "2" relabels
                # `second` in df2 as `first`, "0" goes back to the main prompt,
                # any other text is offered as a custom label for df2.
                while True:
                    h = input("Which header should I use? [1/2/0/header] ")
                    if h == "1":
                        df1 = replace_index(df1, first, second, path_df1)
                        done = True
                        break
                    elif h == "2":
                        df2 = replace_index(df2, second, first, path_df2)
                        done = True
                        break
                    elif h == "0":
                        break
                    elif not h:
                        print("What did you mean? Please retry.")
                    else:
                        choice = input(
                            "Should i use '{header}'? [y/n]".format(header=h))
                        if choice == "y":
                            df2 = replace_index(df2, second, first, path_df2,
                                                h)
                            done = True
                            break
                        elif choice == "n":
                            print("Okay restarting")
                        else:
                            print("What did you mean? Please retry.")
                if done:
                    break
            elif inp == "d":
                # Details: the ten columns on which the two foods differ most.
                data = pd.DataFrame({
                    first: df1t[first],
                    second: df2t[second],
                    "diff": np.abs(df1t[first] - df2t[second])
                })

                display(data.sort_values("diff", ascending=False)[:10])
                input("Press any key to continue")
            elif inp == "n" or not inp:
                # Reject: an empty answer adds 1 to the counters, an explicit
                # "n" adds 6, which is above the skip limit of 2.
                skipped.setdefault(keys[0], 0)
                skipped.setdefault(keys[1], 0)
                skipped[keys[0]] += 1
                skipped[keys[1]] += 1
                if inp == "n":
                    skipped[keys[0]] += 5
                    skipped[keys[1]] += 5
                print("Ok, leaving it be.")
                break
            elif inp == "ren":
                # Rename one of the two foods; only the CSV is updated, the
                # tables are reloaded at the top of the outer loop.
                new_header = input("Enter your new header.")
                while True:
                    h = input("Which one do you want to rename? [1/2]")
                    if h == "1":
                        df1.rename(index={first: new_header}).to_csv(path_df1)
                        done = True
                        break
                    elif h == "2":
                        df2.rename(index={second: new_header}).to_csv(path_df2)
                        done = True
                        break
                    elif h == "0":
                        break
                if done:
                    break
            elif inp == "del":
                # Delete one of the two foods; only the CSV is updated.
                while True:
                    h = input("Which one should I delete? [1/2/0]")
                    if h == "1":
                        df1.drop(first).to_csv(path_df1)
                        done = True
                        break
                    elif h == "2":
                        df2.drop(second).to_csv(path_df2)
                        done = True
                        break
                    elif h == "0":
                        break
                if done:
                    break
            else:
                print("What did you mean? Please retry.")
        # A change was written to disk: restart with freshly loaded tables.
        if done:
            break
    clear_output()
    # A full pass without any change ends the review.
    if not done:
        break

In [None]:
# Load the previously saved matches from match_found.json.
with open("match_found.json", "r") as f:
    match_found = json.load(f)

In [452]:
# Flatten match_found into one list of pairs, skipping empty entries.
actual_matches = [pair for group in match_found if group for pair in group]

In [453]:
# Show the flattened list of matches.
actual_matches

[['albicocche sciroppate', 'albicocche, sciroppate'],
 ['fette biscottate, integrali', 'fette biscottate integrali'],
 ['fette biscottate integrali', 'fette biscottate, integrali'],
 ['albicocche, sciroppate', 'albicocche sciroppate']]