In [1]:
import os
import numpy as np
import pandas as pd
import re
import json
import n2w
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, manhattan_distances
from typing import List
from pprint import pprint

In [2]:
def load_names(filename: str, path: str = "csv"):
    """Return the values of the ``name`` column of ``{path}/{filename}.csv``.

    The CSV is read with ``name`` as its index column, so the names live in
    the frame's index rather than among its columns; looking them up as a
    regular column (``frame["name"]``) raises ``KeyError``.

    Args:
        filename: CSV file name without the ``.csv`` extension.
        path: Directory containing the file.

    Returns:
        A NumPy array with one entry per row of the CSV, in file order.
    """
    frame = pd.read_csv(
        "{path}/{filename}.csv".format(path=path, filename=filename),
        index_col="name")
    return np.array(frame.index)

In [3]:
def load_columns(filename: str, path: str = "csv"):
    """Return the column labels of ``{path}/{filename}.csv`` as a NumPy array.

    The ``name`` column is used as the index and is therefore not part of
    the returned labels.
    """
    csv_path = "{path}/{filename}.csv".format(path=path, filename=filename)
    frame = pd.read_csv(csv_path, index_col="name")
    return np.array(frame.columns)

In [4]:
def load_df(filename: str, path: str = "csv"):
    """Read the CSV at ``filename`` into a DataFrame indexed by ``name``.

    Unlike ``load_names`` / ``load_columns``, ``filename`` is used verbatim
    as the full path to the file; ``path`` is accepted but not used.
    """
    frame = pd.read_csv(filename, index_col="name")
    return frame

In [5]:
# Fat-code dictionary used by fat_heuristic; each entry is looked up as
# fats[key]["eng"].
with open("./fat classifier/fat_codes.json", "r", encoding="UTF8") as f:
    fats = json.load(f)
# Longest keys first, so the more specific codes are tried before shorter ones.
fat_keys = sorted(fats.keys(), key=len, reverse=True)

In [6]:
def vitamin_heuristics(name:str)->str:
    """Lower-case ``name`` and glue vitamin prefixes to what follows.

    Both ``"vitamina "`` and ``"vit. "`` become ``"vitamina_"``.
    """
    lowered = name.lower()
    for prefix in ("vitamina ", "vit. "):
        lowered = lowered.replace(prefix, "vitamina_")
    return lowered

def percentage_heuristics(name:str)->str:
    """Spell out every ``%`` sign in ``name`` as the word ``percentuale``."""
    pieces = name.split("%")
    return "percentuale".join(pieces)

def g_mg_heuristics(name:str)->str:
    """Expand unit abbreviations (mg, g, mcg) in ``name`` into full words.

    Parentheses and pipes are turned into spaces first, so that a unit such
    as ``(g)`` is seen as the space-delimited token `` g ``.
    """
    # Separators become spaces.
    for separator in "()|":
        name = name.replace(separator, " ")

    # Applied in this exact order; note that " g " is replaced together with
    # its surrounding spaces.
    expansions = (
        ("mg", "milligrammi"),
        (" g ", "grammi"),
        ("mcg", "microgrammi"),
    )
    for abbreviation, word in expansions:
        name = name.replace(abbreviation, word)

    # A trailing " g" has no closing space, so it is handled separately.
    return re.sub(" g$", "grammi", name)

def fat_heuristic(name:str)->str:
    """Enrich a nutrient name that refers to a fat with more matchable tokens.

    Three rewrites are applied in order:

    1. Every occurrence of a key of the module-level ``fats`` dictionary is
       prefixed with that entry's ``"eng"`` value (``"<eng> <key>"``).
    2. If the name contains ``:``, the whitespace after a ``C<n>:<m>`` code is
       replaced by ``_``, every run of digits is spelled out with
       ``n2w.convert`` and ``:`` becomes ``_``.
    3. ``÷`` is replaced by the word ``rate``.

    Args:
        name: The nutrient name to rewrite.

    Returns:
        The rewritten name.
    """
    if fat_keys:
        # str.replace returns a new string, so its result must be kept.
        # A single regex pass is used instead of one replace per key so that
        # the key re-inserted by the replacement is not matched again by a
        # shorter key; fat_keys is sorted longest-first, which makes the
        # alternation prefer the most specific code.
        fat_pattern = "|".join(re.escape(key) for key in fat_keys)
        name = re.sub(
            fat_pattern,
            lambda found: "{value} {key}".format(
                value=fats[found.group(0)]["eng"], key=found.group(0)),
            name)

    if ":" in name:
        # NOTE(review): apply_nutrients_heuristics lower-cases names before
        # calling this function, so the upper-case "C" below may never match
        # on that path — confirm.
        name = re.sub(r"(C\d+:\d+)\s(\w)", r"\1_\2", name)
        # Convert each run of digits exactly once; replacing number by number
        # with str.replace would also rewrite digits inside longer numbers
        # (e.g. the "1" inside "18").
        name = re.sub(r"\d+", lambda found: n2w.convert(int(found.group(0))), name)
        name = name.replace(":", "_")

    if "÷" in name:
        name = name.replace("÷", "rate")

    return name

In [7]:
def apply_nutrients_heuristics(A: np.ndarray) -> np.ndarray:
    """Run every name in ``A`` through the nutrient-name heuristics.

    The heuristics are applied in this order: vitamin, percentage, unit
    (g/mg/mcg) and fat. The result is a new array with one rewritten string
    per input element.
    """
    rewritten = []
    for a in A:
        a = vitamin_heuristics(a)
        a = percentage_heuristics(a)
        a = g_mg_heuristics(a)
        a = fat_heuristic(a)
        rewritten.append(a)
    return np.array(rewritten)

In [8]:
def uncooked_heuristic(name:str)->str:
    name = re.sub("crud[a-z]", "", name)
    name = re.sub("fresc[a-z]", "", name)
    return name
    
def cooking_heuristic(name:str)->str:
    """Tag cooked foods by appending the word ``cotto`` to the cooking stem.

    For each known stem found in ``name``, the stem plus its following
    lower-case letter is rewritten as ``"<stem> cotto"``.
    """
    for stem in ("tostat", "arrost", "affumicat", "fritt"):
        if stem not in name:
            continue
        replacement = "{procedure} cotto".format(procedure=stem)
        name = re.sub(stem + "[a-z]", replacement, name)
    return name

In [9]:
def apply_food_heuristics(A: np.ndarray) ->np.ndarray:
    """Run every food name in ``A`` through the food-name heuristics.

    ``uncooked_heuristic`` is applied to the whole array first, then
    ``cooking_heuristic``; each pass produces a fresh array.
    """
    result = A
    for heuristic in (uncooked_heuristic, cooking_heuristic):
        result = np.array(list(map(heuristic, result)))
    return result

In [10]:
def match(A: pd.DataFrame, B: pd.DataFrame,
          threshold: float = 0.8) -> pd.DataFrame:
    """Propose, for row labels of ``A``, the most similar row label of ``B``.

    Only the columns shared by both frames are considered. Two distances are
    combined for every (label of ``A``, label of ``B``) pair:

    * a text distance: Euclidean distance between the TF-IDF vectors of the
      labels (after ``apply_nutrients_heuristics``), divided by its maximum;
    * a numeric distance: ``|mean_a - mean_b| / (mean_a + mean_b)`` over the
      shared columns (NaNs ignored), divided by its maximum.

    Pairs whose text distance exceeds ``threshold`` are discarded, and so is
    every label that has a text distance of exactly zero to some label of the
    other frame. The final score is the average of the two distances; scores
    above ``threshold`` are discarded as well.

    Args:
        A: First frame; its index holds the labels reported as ``First``.
        B: Second frame; its index holds the labels reported as ``Second``.
        threshold: Maximum accepted distance, applied both to the text
            distance and to the combined score.

    Returns:
        A frame with columns ``First``, ``Second`` and ``Values`` (in that
        order), one row per label of ``A`` that has at least one acceptable
        candidate, sorted by ascending ``Values``. The frame is empty when no
        candidate survives.
    """
    # Fixed explicitly: callers unpack the rows positionally as
    # (first, second, value), so the order must not depend on how the
    # installed pandas orders dictionary keys.
    result_columns = ["First", "Second", "Values"]

    common_columns = np.intersect1d(A.columns, B.columns)

    A_common = A[common_columns]
    B_common = B[common_columns]

    A_T = A_common.transpose()
    B_T = B_common.transpose()

    # After the transpose the columns are the original row labels.
    Ac = A_T.columns
    Bc = B_T.columns

    Ae = apply_nutrients_heuristics(Ac)
    Be = apply_nutrients_heuristics(Bc)

    # Relative difference between the per-label means.
    means_A = np.nanmean(A_T, axis=0)
    means_B = np.nanmean(B_T, axis=0)
    means_matrix = (
        np.repeat(means_A.reshape(-1,1), means_B.size, axis=1) +
        np.repeat(means_B.reshape(1, -1), means_A.size, axis=0)
    )
    mean_distance = manhattan_distances(means_A.reshape(-1,1), means_B.reshape(-1,1)) / means_matrix

    # The vocabulary is fitted on both label sets so the vectors are comparable.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(np.concatenate([Ae, Be]))
    X = vectorizer.transform(Ae)
    Y = vectorizer.transform(Be)

    tfidf_distances = euclidean_distances(X, Y)
    tfidf_distances /= np.nanmax(tfidf_distances)
    # All masks are computed before any cell is overwritten with inf.
    thr_mask = tfidf_distances > threshold
    zero_row_mask = np.any(tfidf_distances == 0, axis=1)
    zero_column_mask = np.any(tfidf_distances == 0, axis=0)
    tfidf_distances[thr_mask] = np.inf
    tfidf_distances[zero_row_mask] = np.inf
    tfidf_distances[:, zero_column_mask] = np.inf

    distances = (tfidf_distances + mean_distance / np.nanmax(mean_distance))/2
    distances[distances>threshold] = np.inf

    # Drop labels left without any finite candidate.
    infinite_rows = np.all(np.logical_or(np.isnan(distances), np.isinf(distances)), axis=1)
    infinite_cols = np.all(np.logical_or(np.isnan(distances), np.isinf(distances)), axis=0)
    distances = distances[~infinite_rows]
    distances = distances[:, ~infinite_cols]

    if distances.size == 0:
        # Nothing survived the filters; np.nanargmin would raise on an empty
        # matrix, so report "no candidates" instead.
        return pd.DataFrame(columns=result_columns)

    A1 = Ac[~infinite_rows]
    B1 = Bc[~infinite_cols]
    x_indices, y_indices = np.arange(A1.size), np.nanargmin(distances, axis=1)

    x, y = A1[x_indices], B1[y_indices]

    df = pd.DataFrame({
        "First": x,
        "Second": y,
        "Values": np.nanmin(distances, axis=1)
    }, columns=result_columns)

    df = df.sort_values("Values")

    return df

In [11]:
# The three food-composition tables, each indexed by food name.
crea = load_df("csv/crea.csv")
bda = load_df("csv/bda.csv")
vn = load_df("csv/valori_alimentari.csv")

In [12]:
# Candidate pairs between the bda and crea tables (bda labels come first).
match(A=bda, B=crea, threshold=0.8)

Unnamed: 0,First,Second,Values
416,"ovino, fegato",fegato di ovino,0.115729
224,"bovino, fegato",fegato di bovino,0.129948
376,"bovino, cervello",cervello di bovino,0.149684
7,"bottarga, uova di cefalo muggine","cefalo muggine, uova [bottarga]",0.156553
148,"bovino, cuore",cuore di bovino,0.159789
147,"faraona, petto, senza pelle","faraona, petto, senza pelle, crudo",0.164704
304,"pollo, intero, con pelle",pollo intero con pelle crudo,0.169251
343,"pollo, intero, senza pelle",pollo intero senza pelle crudo,0.173850
217,"orata d'allevamento, filetti","orata fresca d'allevamento, filetti",0.177481
221,"fave, secche, sgusciate",fave secche sgusciate crude,0.178122


In [20]:
# JSON file in which replace_index records the aliases of renamed foods.
# NOTE(review): "foot_aliases" looks like a typo for "food_aliases" — check
# the name of the file on disk before changing it.
food_aliases_path = "foot_aliases.json"

In [23]:
# Decline counter for the interactive merge loop: keys are "<first> <second>"
# strings, values count how often the user declined that pair. Pairs declined
# more than twice are no longer proposed.
skipped = {}

In [None]:
from IPython.display import clear_output
from time import sleep

def replace_header(df, old, new, path):
    """Replace column ``old`` with a column named ``new`` and save the frame.

    The values of ``old`` are stored under ``new`` (appended as the last
    column, or overwriting ``new`` if it already exists), ``old`` is dropped
    and the result is written to ``path`` as CSV.

    Args:
        df: Frame to rename the column in; it is left unmodified.
        old: Label of the column to replace.
        new: Label under which the values are kept.
        path: Destination of the CSV file.

    Returns:
        The new frame.
    """
    # Work on a copy: assigning a column directly would also add it to the
    # caller's frame.
    df = df.copy()
    df[new] = df[old]
    df = df.drop(columns=[old])
    df.to_csv(path)
    return df

def replace_index(df, old, new, path):
    """Rename index label ``old`` to ``new``, record the alias and save.

    The alias file at ``food_aliases_path`` is updated first: the pair is
    appended to the entry of ``old`` if one exists, otherwise to the entry of
    ``new``, which is created when missing. The renamed frame is then written
    to ``path`` as CSV and returned.
    """
    with open(food_aliases_path, "r") as alias_file:
        aliases = json.load(alias_file)

    # An existing entry for `old` takes precedence; otherwise the pair is
    # filed under `new`.
    if old in aliases:
        anchor, alias = old, new
    else:
        anchor, alias = new, old
    aliases.setdefault(anchor, []).append(alias)

    with open(food_aliases_path, "w") as alias_file:
        json.dump(aliases, alias_file)

    renamed = df.rename(index={old: new})
    renamed.to_csv(path)
    return renamed

# Interactive merge session: repeatedly compute the candidate pairs between
# the two tables, ask the user about each one and, as soon as a rename is
# accepted, start over from the files on disk (replace_index saves them).
# The session ends when no candidate is left or a full pass makes no rename.
path_df1, path_df2 = "csv/bda.csv", "csv/crea.csv"
done = False
while True:
    df1, df2 = load_df(path_df1), load_df(path_df2)
    r = match(df1, df2, threshold=0.8)

    common_columns = np.intersect1d(df1.columns, df2.columns)
    df1t = df1[common_columns].transpose()
    df2t = df2[common_columns].transpose()

    # Select the columns by name so the unpacking below does not depend on
    # the column order of the frame returned by match.
    candidates = r[["First", "Second", "Values"]].values

    n = 0
    for first, second, value in candidates:
        if first == second:
            continue
        n += 1

    if not n:
        break

    for first, second, value in candidates:
        done = False
        if first == second:
            continue
        key = "{one} {two}".format(one=first, two=second)
        # Stop proposing pairs the user has already declined several times.
        if key in skipped and skipped[key]>2:
            continue
        while True:
            clear_output()
            print("I found \033[1m{first}\033[0m and \033[1m{second}\033[0m. Should I merge them?".format(
                first=first, second=second))
            print("Their respective mean is: {mean_1:.5f}, {mean_2:.5f}".format(
                mean_1=np.nanmean(df1t[first]),
                mean_2=np.nanmean(df2t[second])
            ))
            inp = input("[y/n]")
            if inp == "y":
                while True:
                    answer = input("Which header should I use? [1/2/n] ").strip().lower()
                    if answer == "n":
                        # Back out to the merge question without renaming.
                        break
                    try:
                        h = int(answer)
                    except ValueError:
                        # Anything else is reported below and asked again.
                        h = None
                    # NOTE(review): option 1 renames `first` to `second` in
                    # the first table (so the second table's label survives)
                    # and option 2 does the opposite — confirm this is what
                    # the prompt is meant to offer.
                    if h == 1:
                        df1 = replace_index(
                            df1, first, second, path_df1)
                        done = True
                        break
                    elif h == 2:
                        df2 = replace_index(
                            df2, second, first, path_df2)
                        done = True
                        break
                    else:
                        print("What did you mean? Please retry.")
                if done:
                    break
            elif inp == "n" or not inp:
                skipped.setdefault(key, 0)
                skipped[key] += 1
                print("Ok, leaving it be.")
                break
            else:
                print("What did you mean? Please retry.")
        if done:
            break
    clear_output()
    if not done:
        break

I found [1mbovino, lessato in gelatina, in scatola[0m and [1mcarne bovina in gelatina, in scatola[0m. Should I merge them?
Their respective mean is: 258.39111, 276.63810
