In [680]:
import os
import numpy as np
import pandas as pd
import re
import json
import n2w
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances, manhattan_distances
from typing import List
from pprint import pprint

In [190]:
def load_names(filename: str, path: str = "csv") -> np.ndarray:
    """Return the values of the ``name`` column of ``{path}/{filename}.csv``.

    Parameters
    ----------
    filename : str
        File name without the ``.csv`` extension.
    path : str, default "csv"
        Directory that contains the file.

    Returns
    -------
    numpy.ndarray
        One entry per data row, in file order.
    """
    frame = pd.read_csv(
        "{path}/{filename}.csv".format(path=path, filename=filename),
        index_col="name")
    # ``index_col="name"`` moves that column into the index, so it is no
    # longer reachable as ``frame["name"]`` (that lookup raises KeyError).
    # The names therefore have to be read back from the index.
    return np.array(frame.index)

In [191]:
def load_columns(filename: str, path: str = "csv"):
    """Return the column labels of ``{path}/{filename}.csv`` as an array.

    The ``name`` column is used as the index while reading, so it is not
    part of the returned labels.
    """
    csv_location = "{path}/{filename}.csv".format(path=path, filename=filename)
    table = pd.read_csv(csv_location, index_col="name")
    return np.array(table.columns)

In [192]:
def load_df(filename: str, path: str = "csv"):
    """Read ``{path}/{filename}.csv`` into a DataFrame indexed by ``name``."""
    csv_location = "{path}/{filename}.csv".format(path=path, filename=filename)
    return pd.read_csv(csv_location, index_col="name")

In [193]:
# Load the fat-code table used by ``fat_heuristic``.
with open("./fat classifier/fat_codes.json", "r", encoding="UTF8") as f:
    fats = json.load(f)

# Keys ordered from longest to shortest (ties keep their file order).
fat_keys = sorted(fats.keys(), key=len, reverse=True)

In [461]:
def vitamin_heuristics(name:str)->str:
    """Lowercase ``name`` and turn the vitamin prefixes into ``vitamina_``.

    Both ``"vitamina "`` and the abbreviation ``"vit. "`` (each including the
    trailing space) are rewritten, so the prefix and the word that follows
    end up joined by an underscore.
    """
    lowered = name.lower()
    for prefix in ("vitamina ", "vit. "):
        lowered = lowered.replace(prefix, "vitamina_")
    return lowered

def percentage_heuristics(name:str)->str:
    """Spell out every ``%`` sign in ``name`` as the word ``percentuale``."""
    pieces = name.split("%")
    return "percentuale".join(pieces)

def g_mg_heuristics(name:str)->str:
    """Spell out the unit abbreviations ``mg``, ``g`` and ``mcg`` in ``name``.

    Parentheses and pipes are first turned into spaces. ``mg`` and ``mcg``
    are replaced wherever they occur as substrings; ``g`` is only replaced
    when it is surrounded by spaces or is preceded by a space at the end of
    the string. Note that the spaces matched around ``g`` are consumed by
    the replacement.
    """
    # "(", ")" and "|" each become a single space.
    spaced = name.translate(str.maketrans("()|", "   "))

    # Applied in this exact order.
    unit_words = (
        ("mg", "milligrammi"),
        (" g ", "grammi"),
        ("mcg", "microgrammi"),
    )
    for abbreviation, word in unit_words:
        spaced = spaced.replace(abbreviation, word)

    return re.sub(" g$", "grammi", spaced)

def fat_heuristic(name:str)->str:
    """Expand fatty-acid notation in a nutrient name.

    Steps, in order:

    1. Every occurrence of a key of the module-level ``fats`` mapping is
       rewritten as ``"<eng> <key>"``, where ``<eng>`` is
       ``fats[key]["eng"]``. When several keys could match at the same
       position the longest one wins.
    2. If the name contains ``:``, a ``C<digits>:<digits>`` code followed by
       one whitespace character and a word character is joined to that word
       with ``_``, every run of digits is replaced by
       ``n2w.convert(int(run))`` (presumably the number in words) and every
       ``:`` becomes ``_``.
    3. ``÷`` is replaced by ``rate``.

    Parameters
    ----------
    name : str
        Nutrient name to rewrite.

    Returns
    -------
    str
        The rewritten name.
    """
    # The original loop called ``name.replace(...)`` without keeping the
    # result, so this step silently did nothing. A single regex pass is used
    # instead of repeated ``str.replace`` so that a short key contained in an
    # already-expanded longer key is not expanded a second time.
    keys = sorted((key for key in fat_keys if key), key=len, reverse=True)
    if keys:
        key_pattern = "|".join(re.escape(key) for key in keys)
        name = re.sub(
            key_pattern,
            lambda found: "{value} {key}".format(
                value=fats[found.group(0)]["eng"], key=found.group(0)),
            name)

    if ":" in name:
        # NOTE(review): the pattern needs an upper-case "C", but
        # apply_nutrients_heuristics lowercases names (via
        # vitamin_heuristics) before calling this function - confirm whether
        # the match should be case-insensitive.
        name = re.sub(r"(C\d+:\d+)\s(\w)", r"\1_\2", name)
        # Convert each digit run exactly once. Replacing the runs one after
        # another with ``str.replace`` corrupts names such as "C1:18", where
        # the "1" inside "18" is rewritten before "18" itself is handled.
        name = re.sub(
            r"\d+",
            lambda digits: n2w.convert(int(digits.group(0))),
            name)
        name = name.replace(":", "_")

    return name.replace("÷", "rate")

In [462]:
def apply_nutrients_heuristics(A: np.ndarray) -> np.ndarray:
    """Run every nutrient-name heuristic over each element of ``A``.

    Each name goes through ``vitamin_heuristics``, ``percentage_heuristics``,
    ``g_mg_heuristics`` and ``fat_heuristic``, in that order. The rewritten
    names are returned as a new array.
    """
    rewritten = []
    for raw_name in A:
        current = vitamin_heuristics(raw_name)
        current = percentage_heuristics(current)
        current = g_mg_heuristics(current)
        current = fat_heuristic(current)
        rewritten.append(current)
    return np.array(rewritten)

In [787]:
def uncooked_heuristic(name:str)->str:
    """Delete every ``crud`` + one lowercase ASCII letter from ``name``.

    This removes words such as ``crudo`` or ``cruda``; surrounding spaces and
    punctuation are left in place.
    """
    raw_marker = re.compile("crud[a-z]")
    return raw_marker.sub("", name)

In [788]:
def apply_food_heuristics(A: np.ndarray) ->np.ndarray:
    """Apply each food-name heuristic, in turn, to every element of ``A``.

    Currently the only heuristic is ``uncooked_heuristic``. The result of
    each pass is a new array that feeds the next pass.
    """
    names = A
    for heuristic in (uncooked_heuristic,):
        names = np.array(list(map(heuristic, names)))
    return names

In [792]:
def match(A: pd.DataFrame, B: pd.DataFrame,
          threshold: float = 0.8) -> pd.DataFrame:
    """Pair column labels of ``A`` with their closest column label in ``B``.

    Labels are cleaned with ``apply_food_heuristics``, embedded with a TF-IDF
    vectorizer fitted on both label sets, and compared by Euclidean distance.
    Distances are scaled by the largest distance found. Only the column
    labels are compared; the values stored in ``A`` and ``B`` are not used.

    A label of ``A`` is left out of the result when

    * it has an exact (distance ``0``) counterpart in ``B``, or
    * none of its scaled distances is ``<= threshold``.

    Parameters
    ----------
    A, B : pandas.DataFrame
        Frames whose column labels are the names to match.
    threshold : float, default 0.8
        Largest scaled distance accepted as a match.

    Returns
    -------
    pandas.DataFrame
        Columns ``First`` (label from ``A``), ``Second`` (closest label from
        ``B``) and ``Values`` (scaled distance), sorted by ``Values``
        ascending.
    """
    Ac = A.columns
    Bc = B.columns

    # Fixed column order: older pandas sorted dict keys alphabetically while
    # newer versions keep insertion order, so spelling the order out keeps
    # the result identical across versions.
    result_columns = ["First", "Second", "Values"]

    # Nothing to compare; the vectorizer cannot be fitted on an empty corpus.
    if Ac.size == 0 or Bc.size == 0:
        return pd.DataFrame(columns=result_columns)

    Ae = apply_food_heuristics(Ac)
    Be = apply_food_heuristics(Bc)

    vectorizer = TfidfVectorizer()
    vectorizer.fit(np.concatenate([Ae, Be]))
    distances = euclidean_distances(
        vectorizer.transform(Ae), vectorizer.transform(Be))

    # Scale into [0, 1]. When every distance is 0 the division would produce
    # NaN everywhere and make ``nanargmin`` fail further down, so skip it.
    largest = np.nanmax(distances)
    if largest > 0:
        distances /= largest

    # Both masks are taken from the scaled distances before any entry is
    # overwritten.
    exact_rows = np.any(distances == 0, axis=1)
    too_far = distances > threshold
    distances[too_far] = np.inf
    distances[exact_rows] = np.inf

    # Keep only the rows that still have at least one finite candidate.
    keep = ~np.all(distances == np.inf, axis=1)
    distances = distances[keep]

    df = pd.DataFrame({
        "First": Ac[keep],
        "Second": Bc[np.nanargmin(distances, axis=1)],
        "Values": np.nanmin(distances, axis=1)
    }, columns=result_columns)

    return df.sort_values("Values")

In [793]:
# Load the three food tables, each indexed by "name".
crea, bda, vn = map(load_df, ("crea", "bda", "valori_alimentari"))

In [794]:
# Foods are rows in the tables, so transpose to expose them as column labels.
match(bda.T, vn.T, threshold=0.8)

Unnamed: 0,First,Second,Values
89,"ovino, cuore","cuore di ovino, crudo",0.174514
395,"patate, fecola",fecola di patate,0.175373
358,"cumino, semi",semi di cumino,0.178083
201,"finocchio, semi",semi di finocchio,0.178083
289,fiocchi di avena,fiocchi d'avena,0.179253
301,burro di arachidi,burro d'arachidi,0.187955
385,"suino, fegato",fegato di suino,0.196228
406,"succo di pompelmo, non zuccherato","succo di arancia e pompelmo, non zuccherato",0.307790
155,"salsiccia di suino e bovino, fresca","salsiccia di suino fresca, cruda",0.315850
425,"orzo, perlato","orzo perlato, cotto",0.322212


In [778]:
# Scan the three loaded tables and print every column label that contains
# "energia", together with the CSV path paired with that table below.
units = []  # not used anywhere in this cell
for df, path in [(crea, "csv/crea.csv"), (bda, "csv/bda.csv"), (vn, "csv/valori_alimentari.csv")]:
    for c in df.columns:
        if "energia" in c:
            print(c, path)
            # Disabled one-off clean-up: strip "(re)" from the label, copy the
            # column under the new label, drop the old one and rewrite the CSV.
            #c_new = c.replace("(re)", "")
            #df[c_new] = df[c]
            #df = df.drop(columns=[c])
            #df.to_csv(path)

In [670]:
# Two 5x1 column vectors that differ only in their second entry.
a = np.arange(1, 6).reshape(-1, 1)
b = np.array([[1], [5], [3], [4], [5]])

In [671]:
# Pairwise Euclidean distances between the rows of ``a`` and the rows of ``b``.
distances = euclidean_distances(X=a, Y=b)

In [712]:
# Stack three copies of one row vector on top of each other.
np.repeat(np.array([[1, 2, 3]]), repeats=3, axis=0)

array([[1, 2, 3],
       [1, 2, 3],
       [1, 2, 3]])