In [1]:
import os
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
from typing import List

In [67]:
def load_names(filename:str, path:str="csv"):
    """Read ``<path>/<filename>.csv`` and return its ``name`` column.

    Parameters
    ----------
    filename : str
        File name without the ``.csv`` extension.
    path : str
        Directory that contains the file.

    Returns
    -------
    np.ndarray
        The values of the ``name`` column, in file order.
    """
    csv_location = f"{path}/{filename}.csv"
    table = pd.read_csv(csv_location)
    return np.array(table["name"])

In [93]:
def load_columns(filename:str, path:str="csv"):
    """Return the column labels of ``<path>/<filename>.csv``.

    Only the header is parsed (``nrows=0``), so the cost does not grow
    with the number of data rows in the file.

    Parameters
    ----------
    filename : str
        File name without the ``.csv`` extension.
    path : str
        Directory that contains the file.

    Returns
    -------
    np.ndarray
        The column labels, in file order.
    """
    csv_location = f"{path}/{filename}.csv"
    # nrows=0 yields an empty frame that still carries the header labels.
    header = pd.read_csv(csv_location, nrows=0)
    return np.array(header.columns)

In [111]:
def vitamin_heuristics(name:str)->str:
    """Replace every ``"Vitamina "`` in ``name`` with ``"Vitamina_"``.

    The space after the word is turned into an underscore, so that
    e.g. ``"Vitamina C"`` becomes ``"Vitamina_C"``.
    """
    pieces = name.split("Vitamina ")
    return "Vitamina_".join(pieces)

In [112]:
def apply_heuristics(A:np.ndarray)->np.ndarray:
    """Run ``vitamin_heuristics`` on every element of ``A``.

    Returns a new array holding the rewritten names in the same order;
    ``A`` itself is not modified.
    """
    rewritten = list(map(vitamin_heuristics, A))
    return np.array(rewritten)

In [113]:
def match(A:np.ndarray, B:np.ndarray, threshold:float=0.8)->pd.DataFrame:
    """Pair each name in ``A`` with its closest name in ``B``.

    Both collections are passed through ``apply_heuristics``, embedded with
    a ``TfidfVectorizer`` fitted on their union, and compared by Euclidean
    distance. A pair is kept only if its distance does not exceed
    ``threshold`` times the largest distance observed between any pair.

    Parameters
    ----------
    A : np.ndarray
        One-dimensional collection of names to find matches for.
    B : np.ndarray
        One-dimensional collection of candidate names.
    threshold : float
        Fraction of the maximum observed distance above which a pair is
        discarded.

    Returns
    -------
    pd.DataFrame
        One row per element of ``A`` that has at least one admissible
        candidate. Column ``"Matched"`` holds the element of ``A`` and
        column ``"Original"`` holds the closest element of ``B``.
    """
    A, B = np.asarray(A), np.asarray(B)

    # Nothing to pair: the vectorizer cannot be fitted on an empty corpus.
    if A.size == 0 or B.size == 0:
        return pd.DataFrame({
            "Original":B[:0],
            "Matched":A[:0]
        })

    Ae = apply_heuristics(A)
    Be = apply_heuristics(B)
    vectorizer = TfidfVectorizer()
    vectorizer.fit(np.concatenate([Ae, Be]))
    X = vectorizer.transform(Ae)
    Y = vectorizer.transform(Be)

    distances = euclidean_distances(X, Y)
    distances[distances > np.max(distances) * threshold] = np.inf

    # The diagonal pairs A[i] with B[i]. That is a trivial self-match only
    # when both inputs are the same collection; for two different lists it
    # is an ordinary candidate pair and must stay eligible.
    if A is B or np.array_equal(A, B):
        np.fill_diagonal(distances, np.inf)

    # Drop the rows of A for which every candidate has been ruled out.
    has_candidate = ~np.all(distances == np.inf, axis=1)
    nearest = np.nanargmin(distances[has_candidate], axis=1)

    return pd.DataFrame({
        "Original":B[nearest],
        "Matched":A[has_candidate]
    })

In [114]:
# Column labels of the two CSV tables whose names are matched below.
crea = load_columns("crea")
bda = load_columns("bda")

In [115]:
# Pair every crea column with its closest bda column, using a 0.4 threshold.
match(A=crea, B=bda, threshold=0.4)

Unnamed: 0,Matched,Original
0,Ferro (mg),Ferro | mg
1,Prolina,Prolina | mg
2,Vitamina C (mg),Vitamina C | mg
3,Isoleucina,Isoleucina | mg
4,Leucina,Leucina | mg
5,Magnesio (mg),Magnesio | mg
6,Metionina,Metionina | mg
7,Alcol (g),Alcol | g
8,Acqua (g),Acqua | g
9,Zinco (mg),Zinco | mg
