In [1]:
import os
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from pyscipopt import Model, quicksum
from scipy.sparse import csr_matrix
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
from typing import List, Callable

In [120]:
def vitamins_heuristic(name:str)->str:
    """Return given `name` with applied vitamin heuristic.

    Every occurrence of "vitamin" or "vitamina" followed by a single space and
    a run of lowercase letters/digits is rewritten with an underscore in place
    of the space, e.g. "vitamina b12" -> "vitamina_b12".
        name:str, the name to which apply the heuristic.
    """
    # Raw string literal: "\d" inside a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r"(vitamina?) ([a-z\d]+)", r"\1_\2", name)

def fats_heuristic(name:str)->str:
    """Return given `name` with every colon swapped for an underscore.

    For example "c18:1" becomes "c18_1".
        name:str, the name to which apply the heuristic.
    """
    pieces = name.split(":")
    return "_".join(pieces)

def commons_heuristic(name:str)->str:
    """Return given `name` with applied commons heuristic.

    Removes the literal text "(mse)" and any "total" that is immediately
    followed by at least one word character (e.g. "totali", "totale").
    Surrounding whitespace is left untouched, so a double space may remain.
        name:str, the name to which apply the heuristic.
    """
    # Raw string literals: "\(" and "\w" in plain literals are invalid escape
    # sequences and trigger warnings on recent Python versions.
    drop = r"\(mse\)", r"total[\w]+"
    for pattern in drop:
        name = re.sub(pattern, "", name)
    return name

In [121]:
def columns_heuristics(name):
    """Run `name` through the full chain of column-name heuristics.

    The heuristics are applied in a fixed order: vitamins, then fats, then
    commons; each one receives the output of the previous one.
        name:str, the name to which apply the heuristics.
    """
    pipeline = (vitamins_heuristic, fats_heuristic, commons_heuristic)
    cleaned = name
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [122]:
def tfidf(corpus:List[str], texts:List[np.ndarray], heuristics:Callable[[np.ndarray],np.ndarray]=None):
    """Vectorize each element of `texts` with a tfidf model fitted on `corpus`.

    A single `TfidfVectorizer` is fitted on `corpus`; when `heuristics` is
    given it is first applied to every element of `texts`, then each element
    is transformed separately with the fitted vectorizer.
        corpus:List[str], list of texts to use as baseline for tfidf.
        texts:List[np.ndarray], list of vectors to which apply heuristics and tfidf vectorization element-wise.
        heuristics:Callable[[np.ndarray],np.ndarray], optional function applying heuristics to each element of texts.
    Returns a list with one transformed matrix per element of `texts`.
    """
    vectorizer = TfidfVectorizer()
    vectorizer.fit(corpus)
    prepared = texts if heuristics is None else [heuristics(t) for t in texts]
    return list(map(vectorizer.transform, prepared))

In [123]:
def pairwise_matrix_sum(A, B):
    """Return the NaN-ignoring sum of every row of `A` with every row of `B`.

        A: 2-D array of shape (n, k).
        B: array of shape (m, k).
    Returns an array of shape (n, m, k) whose entry [i, j] is A[i] + B[j],
    with NaN entries counted as zero (the `np.nansum` convention).
    """
    left = np.asarray(A)
    right = np.asarray(B)
    left = left.reshape(left.shape[0], 1, left.shape[1])
    right = right.reshape(1, *right.shape)
    # Broadcast explicitly instead of handing np.nansum a list of differently
    # shaped arrays: building such a ragged list into an array raises
    # ValueError on recent NumPy releases.
    return np.where(np.isnan(left), 0, left) + np.where(np.isnan(right), 0, right)


def pairwise_vector_sum(A, B):
    """Return the NaN-ignoring sum of every element of `A` with every element of `B`.

        A: array with n elements (flattened to a column).
        B: array with m elements (flattened to a row).
    Returns an array of shape (n, m) whose entry [i, j] is A[i] + B[j],
    with NaN entries counted as zero (the `np.nansum` convention).
    """
    left = np.asarray(A)
    right = np.asarray(B)
    column = left.reshape(-1, 1)
    row = right.reshape(1, right.size)
    # Explicit broadcasting replaces np.nansum over a ragged list (an error on
    # recent NumPy) and makes the former single-element special case unnecessary.
    return np.where(np.isnan(column), 0, column) + np.where(np.isnan(row), 0, row)

In [124]:
def weighted_mse(A, B):
    """Return, for every row pair of `A` and `B`, the mean normalized absolute difference.

    For each pair of rows the element-wise value
    |pairwise_matrix_sum(A, -B)| / pairwise_matrix_sum(A, B) is computed and
    averaged over the last axis. Positions whose denominator is NaN or zero
    are excluded from the average.
        A: 2-D array of shape (n, k).
        B: 2-D array of shape (m, k).
    Returns an array of shape (n, m).
    """
    normalization = pairwise_matrix_sum(A, B)
    difference = np.abs(pairwise_matrix_sum(A, -B))
    mask = np.logical_or(np.isnan(normalization), normalization == 0)
    # np.divide with `where=` and no `out=` leaves the skipped positions
    # uninitialized; pre-fill with NaN so np.nanmean really ignores them.
    ratio = np.full(difference.shape, np.nan)
    np.divide(difference, normalization, out=ratio, where=~mask)
    return np.nanmean(ratio, axis=2)

In [125]:
def weighted_mean_differences(A: pd.DataFrame, B: pd.DataFrame) -> np.ndarray:
    """Return weighted mean differences between the columns of `A` and `B`.

    For every column of each frame the NaN-ignoring mean (X for `A`, Y for
    `B`) and the count of non-missing values (N for `A`, M for `B`) are
    computed. Entry [i, j] of the result is
    |(X_i - Y_j) * (N_i - M_j)| / ((X_i + Y_j) + (N_i + M_j)),
    where the sums and differences are built with `pairwise_vector_sum`.
        A:pd.DataFrame, first dataframe (read through `.values`).
        B:pd.DataFrame, second dataframe (read through `.values`).
    Returns an array of shape (A.shape[1], B.shape[1]); all zeros when either
    frame has no columns.
    """
    # The guard must look at BOTH frames; it used to test A twice.
    if not A.shape[1] or not B.shape[1]:
        return np.zeros((A.shape[1], B.shape[1]))
    X, Y = np.nanmean(A.values, axis=0), np.nanmean(B.values, axis=0)
    N, M = np.sum(pd.notna(A.values), axis=0), np.sum(pd.notna(B.values), axis=0)
    return np.abs((pairwise_vector_sum(X, -Y) * pairwise_vector_sum(N, -M)) /
                  (pairwise_vector_sum(X, Y) + pairwise_vector_sum(N, M)))

In [126]:
def columns_to_type_groups(df):
    """Split `df` into its object-dtype columns and its float64 columns.

        df:pd.DataFrame, the dataframe to split.
    Returns a tuple `(strings, floats)` of dataframes holding respectively
    the columns of dtype object and the columns of dtype float64, in their
    original order. Either frame has zero columns when no column of that
    dtype exists; columns of any other dtype are left out.
    """
    # select_dtypes replaces the manual groupby on dtypes and the shared
    # `string = float64 = []` default list.
    strings = df.select_dtypes(include=[object])
    floats = df.select_dtypes(include=["float64"])
    return strings, floats

def matrix_type_argsort(matrix:np.matrix, df:pd.DataFrame)->np.matrix:
    """Return the rows of `matrix` reordered by the dtypes of `df`'s columns.

    The (dtype, position) pairs of `df`'s columns are sorted, and the
    resulting sequence of positions is used to pick the rows of `matrix`.
    """
    dtypes = df.dtypes
    typed_positions = sorted(zip(dtypes, range(len(dtypes))))
    # Transpose the sorted pairs and keep only the positions.
    row_order = list(zip(*typed_positions))[1]
    return matrix[row_order, :]

def matrix_double_type_argsort(floats:np.matrix, strings:np.matrix, df1:pd.DataFrame, df2:pd.DataFrame)->np.matrix:
    """Return `floats` and `strings` merged into one matrix, reordered on both axes.

    A zero matrix whose shape is the sum of the two input shapes is filled
    with `floats` in its top-left corner and `strings` in its bottom-right
    corner (each only when it has no zero-length dimension). Rows are then
    reordered with `matrix_type_argsort` using `df1` and columns using `df2`.
    """
    combined_shape = np.add(floats.shape, strings.shape)
    ground = np.zeros(combined_shape)
    if all(floats.shape):
        n_rows, n_cols = floats.shape[0], floats.shape[1]
        ground[:n_rows, :n_cols] = floats
    if all(strings.shape):
        n_rows, n_cols = strings.shape[0], strings.shape[1]
        ground[-n_rows:, -n_cols:] = strings
    rows_sorted = matrix_type_argsort(ground, df1)
    both_sorted = matrix_type_argsort(rows_sorted.T, df2)
    return both_sorted.T

In [137]:
def filter_mean(matrix):
    if matrix.shape[0]==0 or matrix.shape[1]==0:
        return matrix
    matrix[np.isclose(matrix, np.nanmax(matrix))] = np.nan
    matrix[np.isinf(matrix)] = np.nan
    matrix[matrix >= np.nanmean(matrix)] = np.inf
    matrix[np.isnan(matrix)] = np.inf
    return matrix


def filter_mask(matrix):
    """Return a boolean mask of the finite entries that are minimal in both their row and column.

    An entry is selected when it equals the minimum of its row, equals the
    minimum of its column and is not infinite. A matrix with a zero-length
    first or second dimension is returned as is.
        matrix: 2-D array of distances.
    """
    if not matrix.shape[0] or not matrix.shape[1]:
        return matrix
    row_minimum = np.min(matrix, axis=1).reshape(-1, 1)
    column_minimum = np.min(matrix, axis=0)
    is_row_minimum = matrix == row_minimum
    is_column_minimum = matrix == column_minimum
    is_finite = ~np.isinf(matrix)
    return is_row_minimum & is_column_minimum & is_finite


def drop_nan(df):
    """Return a list with one entry per column of `df`, each with missing values dropped."""
    cleaned = []
    for label in df:
        cleaned.append(df[label].dropna())
    return cleaned


def column_values_mean_tfidf_distance(df1: pd.DataFrame,
                                      df2: pd.DataFrame) -> np.matrix:
    """Return Euclidean distances between the mean tfidf vectors of the columns.

    The non-missing values of every column of both frames form the tfidf
    corpus. Each column is vectorized, its vectors are averaged, and the
    Euclidean distance between every column mean of `df1` and every column
    mean of `df2` is returned. When either frame has no columns, a zero
    matrix of shape (df1.shape[1], df2.shape[1]) is returned instead.
        df1:pd.DataFrame, first string only dataframe.
        df2:pd.DataFrame, second string only dataframe.
    """
    n_first, n_second = df1.shape[1], df2.shape[1]
    if not n_first or not n_second:
        return np.zeros((n_first, n_second))
    values_per_column = []
    for frame in (df1, df2):
        for label in frame.columns:
            values_per_column.append(frame[label].dropna())
    corpus = pd.concat(values_per_column)
    vectors = tfidf(corpus, [*drop_nan(df1), *drop_nan(df2)])
    stacked = np.array([np.mean(v, axis=0) for v in vectors])
    # Collapse the per-column means into one row per column.
    stacked = stacked.reshape(stacked.shape[0], stacked.shape[-1])
    first_means = stacked[:n_first]
    second_means = stacked[n_first:]
    return euclidean_distances(first_means, second_means)


def tfidf_distance(corpus: List[str],
                   A: np.ndarray,
                   B: np.ndarray,
                   heuristics: Callable = None) -> np.matrix:
    """Return Euclidean distances between the tfidf vectors of `A` and of `B`.

    Both vectors are passed through `tfidf` (fitted on `corpus`, with the
    optional `heuristics`), and the pairwise Euclidean distances between the
    two resulting matrices are returned.
        corpus:List[str], list of texts to use as baseline for tfidf.
        A:np.ndarray, vector to which apply heuristics and tfidf vectorization element-wise.
        B:np.ndarray, vector to which apply heuristics and tfidf vectorization element-wise.
        heuristics:Callable[[np.ndarray],np.ndarray], function to apply heuristics to texts.
    """
    vectors_a, vectors_b = tfidf(corpus, [A, B], heuristics)
    return euclidean_distances(vectors_a, vectors_b)


def units_mask(A: np.ndarray, B: np.ndarray):
    """Return a boolean matrix telling which labels of `A` and `B` carry the same unit.

    The unit of a label is the second piece obtained by splitting it on
    " | "; a label without that separator has unit None. Entry [i, j] is True
    when the unit of A[i] equals the unit of B[j] (two None units compare
    equal).
    """
    sep = " | "

    def unit_of(label):
        pieces = label.split(sep)
        if len(pieces) == 1:
            return None
        return pieces[1]

    unit_A = np.array([unit_of(label) for label in A])
    unit_B = np.array([unit_of(label) for label in B])
    return unit_A[:, None] == unit_B


def sub_incidence_matrix(corpus: List[str], A: pd.DataFrame, B: pd.DataFrame,
                         heuristics: Callable[[np.ndarray], np.ndarray]):
    """Return the boolean match matrix between the columns of `A` and of `B`.

    Two columns match when their names are mutual nearest neighbours in tfidf
    distance (after `filter_mean` / `filter_mask`) and `units_mask` reports
    the same unit for both. Only column names are compared; the value-based
    comparison (`weighted_mean_differences`,
    `column_values_mean_tfidf_distance`, `matrix_double_type_argsort`) was
    already disabled and is not applied.
        corpus:List[str], list of texts to use as baseline for tfidf.
        A:pd.DataFrame, first dataframe.
        B:pd.DataFrame, second dataframe.
        heuristics:Callable[[np.ndarray],np.ndarray], function to apply heuristics to column names.
    Returns a boolean array of shape (A.shape[1], B.shape[1]).
    """
    # The per-dtype split of A and B is no longer computed: it only fed the
    # disabled value-based comparison and was thrown away on every call.
    name_distances = tfidf_distance(corpus, A.columns, B.columns, heuristics)
    name_matches = filter_mask(filter_mean(name_distances))
    same_units = units_mask(A.columns, B.columns)
    return np.all([name_matches, same_units], axis=0)

In [138]:
def incidence_matrix(corpus:List[str], dataframes:List[pd.DataFrame], heuristics:Callable):
    """Return the symmetric block matrix of column matches across all `dataframes`.

    The matrix has one row and one column per dataframe column, laid out in
    the order of `dataframes`. Diagonal blocks are identity matrices; the
    block for a pair of distinct dataframes comes from `sub_incidence_matrix`
    and is mirrored (transposed) across the diagonal.
        corpus:List[str], list of texts to use as baseline for tfidf.
        dataframes:List[pd.DataFrame], dataframes whose columns are matched.
        heuristics:Callable, function to apply heuristics to column names.
    """
    widths = [df.shape[1] for df in dataframes]
    # starts[k] is the index of the first column of the k-th dataframe.
    starts = [0]
    for width in widths:
        starts.append(starts[-1] + width)
    total = sum(widths)
    ground = np.zeros((total, total))
    for i, df1 in enumerate(dataframes):
        x = slice(starts[i], starts[i + 1])
        for j in range(i, len(dataframes)):
            y = slice(starts[j], starts[j + 1])
            if i == j:
                block = np.eye(widths[i])
            else:
                block = sub_incidence_matrix(corpus, df1, dataframes[j], heuristics)
            ground[x, y] = block
            ground[y, x] = block.T
    return ground

In [142]:
def matching(dataframes: List[pd.DataFrame], heuristics: Callable):
    """Return the incidence matrix matching the columns of all `dataframes`.

    The tfidf corpus is built from the column names of every dataframe after
    `heuristics` has been applied to them.
        dataframes:List[pd.DataFrame], dataframes whose columns are matched.
        heuristics:Callable, vectorized function applying the name heuristics.
    Returns the matrix produced by `incidence_matrix`.
    """
    column_names = np.array([c for df in dataframes for c in df.columns])
    corpus = heuristics(column_names)
    # Pass the corpus computed here rather than reading a global `columns`,
    # which is not defined when the notebook runs from a fresh kernel.
    return incidence_matrix(corpus, dataframes, heuristics)


def _rename_matched_columns(M, dataframes: List[pd.DataFrame]):
    """Rename matched columns to the name of the first match of their row.

    This is the logic that used to sit, unreachable, after the `return` of
    `matching`; `matching` does not call it. For every non-zero entry
    M[i, j], column j is renamed to the name of the column at the first
    non-zero position of row i. `dataframes` is updated in place: renamed
    frames replace the originals in the list.
        M: incidence matrix as returned by `matching`.
        dataframes:List[pd.DataFrame], the dataframes `M` was built from.
    """
    shapes = [df.shape[1] for df in dataframes]
    # owner[k] is the index of the dataframe that global column k belongs to.
    owner = np.repeat(np.arange(len(dataframes)), shapes)
    for row in M:
        for j, value in enumerate(row):
            if value:
                first = np.nonzero(row)[0][0]
                df1 = dataframes[owner[first]]
                df2 = dataframes[owner[j]]
                old = df2.columns[j - sum(shapes[:owner[j]])]
                new = df1.columns[first - sum(shapes[:owner[first]])]
                if old != new:
                    dataframes[owner[j]] = dataframes[owner[j]].rename(
                        columns={old: new})

In [143]:
# Load every sanitized CSV found under `path` (skipping files whose name
# contains "svizz") into a dataframe indexed by the "name" column.
path = "../sanitized_csv/"
# Keep the directory each file was found in: os.walk also descends into
# sub-directories, and such files cannot be opened relative to `path`.
csv_paths = [
    os.path.join(root, filename)
    for root, _, filenames in os.walk(path)
    for filename in filenames if "svizz" not in filename
]
# File names only, in the same order as `dataframes`.
csvs = [os.path.basename(csv_path) for csv_path in csv_paths]
dataframes = [
    pd.read_csv(csv_path, index_col=["name"])
    for csv_path in csv_paths
]

In [148]:
# Vectorize the per-name heuristics so they can run over whole arrays of
# column names, then build the incidence matrix over all dataframes.
vectorized_heuristics = np.vectorize(columns_heuristics, otypes=[str])
M = matching(dataframes, vectorized_heuristics)

  
  


In [149]:
# Column labels in the same order as the rows/columns of M; defined here so
# the cell does not depend on a `columns` variable left over in the kernel.
columns = np.array([c for df in dataframes for c in df.columns])
# NOTE: dense_A is an alias of M, not a copy, so M is modified in place below.
dense_A = M
for row in dense_A:
    # A row with more than one non-zero entry is a group of matched columns:
    # print the group, then clear its rows and columns so it is reported once.
    if np.sum(row) > 1:
        rows = np.where(row)
        dense_A[rows] = 0
        dense_A[:, rows] = 0
        print(columns[rows])

['acqua | g' 'acqua | g' 'acqua | g' 'acqua | g' 'acqua | g']
['calorie | kcal' 'calorie | kcal']
['fibra alimentare | g' 'fibra alimentare | g'
 'fibra alimentare totale | g']
['proteine totali | g' 'proteine | g' 'proteine | g' 'proteine | g'
 'proteine totali | g' 'proteine | g' 'proteine | g']
['proteine animali | g' 'proteine animali | g']
['proteine vegetali | g' 'proteine vegetali | g']
['amido | g' 'amido (mse) | g' 'amido | g']
['lipidi totali | g' 'lipidi totali | g' 'lipidi totali | g' 'lipidi | g']
['lipidi animali | g' 'lipidi animali | g']
['lipidi vegetali | g' 'lipidi vegetali | g']
['acido oleico | g' 'acido oleico | g']
['monoinsaturi totali | g' 'monoinsaturi totali | g'
 'acidi grassi monoinsaturi totali | g']
['acido linoleico | g' 'acido linoleico | g']
['acido linolenico | g' 'acido linolenico | g']
['altri polinsaturi | g' 'altri acidi grassi polinsaturi | g']
['polinsaturi totali | g' 'polinsaturi totali | g'
 'acidi grassi polinsaturi totali | g']
['colesterol