In [1]:
import os
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from typing import List

In [67]:
def load_names(filename:str, path:str="csv"):
    """Read the ``name`` column of ``<path>/<filename>.csv`` into a NumPy array.

    Args:
        filename: CSV file name without the ``.csv`` extension.
        path: Directory that contains the file; defaults to ``"csv"``.

    Returns:
        A NumPy array built from the ``name`` column of the CSV file.
    """
    csv_location = f"{path}/{filename}.csv"
    table = pd.read_csv(csv_location)
    return np.array(table["name"])

In [84]:
def match(A:np.ndarray, B:np.ndarray, threshold:float=0.4)->pd.DataFrame:
    """Pair each string in ``A`` with its closest string in ``B``.

    Both arrays are vectorised with a single TF-IDF vocabulary fitted on
    their concatenation, and the pairwise Euclidean distances between the
    resulting vectors are computed. A pair is only a candidate when its
    distance is at most ``threshold`` times the largest distance in the
    matrix. Entries of ``A`` without any candidate are left out of the result.

    Args:
        A: Array of strings to find matches for.
        B: Array of strings to pick the matches from.
        threshold: Fraction of the maximum pairwise distance above which a
            pair is discarded.

    Returns:
        A DataFrame with one row per matched entry of ``A``. The column
        ``"Matched"`` holds the entry of ``A`` and ``"Original"`` holds the
        nearest entry of ``B``. The frame is empty when ``A`` or ``B`` is
        empty.
    """
    A, B = np.asarray(A), np.asarray(B)
    if A.size == 0 or B.size == 0:
        # Nothing to compare: return an empty frame with the usual columns
        # instead of failing inside the vectorizer / np.max.
        return pd.DataFrame({
            "Original": B[:0],
            "Matched": A[:0]
        })

    vectorizer = TfidfVectorizer()
    vectorizer.fit(np.concatenate([A, B]))
    X = vectorizer.transform(A)
    Y = vectorizer.transform(B)

    distances = euclidean_distances(X, Y)
    # The cut-off is relative: it scales with the largest observed distance.
    distances[distances > np.max(distances) * threshold] = np.inf
    # NOTE(review): this forbids pairing A[i] with B[i]. That looks intended
    # for matching a list against itself; confirm it is wanted when A and B
    # are different lists.
    np.fill_diagonal(distances, np.inf)

    # Keep only the rows of A that still have at least one finite candidate.
    has_candidate = ~np.all(distances == np.inf, axis=1)
    nearest = np.nanargmin(distances[has_candidate], axis=1)

    # Index the B parameter, not a notebook-level variable, so the function
    # works for any pair of inputs and on a fresh kernel.
    return pd.DataFrame({
        "Original": B[nearest],
        "Matched": A[has_candidate]
    })

In [85]:
# Load the two name lists that are matched against each other below.
crea = load_names("crea")
bda = load_names("bda")

In [86]:
# Find, for every crea name, the closest bda name; shown as the cell output.
match(A=crea, B=bda)

Unnamed: 0,Matched,Original
0,"Latte di vacca in polvere, parzialmente scremato","latte di vacca, parzialmente scremato, in polvere"
1,Prugne gialle,prugne gialle
2,Anacardi,anacardi
3,Caciotta romana di pecora,caciotta romana di pecora
4,Fegato di bovino,"bovino, fegato"
5,"Orata fresca d'allevamento, filetti","orata d'allevamento, filetti"
6,Marsala tipico,marsala tipico
7,Fiocchi d'avena,fiocchi di avena
8,Whisky,whisky
9,Tartufo nero,tartufo nero
