In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
from scipy.stats import ks_2samp, mannwhitneyu
from tqdm import tqdm_notebook as tqdm
from typing import List, Callable

In [2]:
def _and(*args):
    return np.all(args, axis=0)

def _or(*args):
    return np.any(args, axis=0)

def drop_nan(df):
    """Return the columns of `df` as a list of Series without missing values.

    Numeric columns additionally lose every value that is numerically close
    to zero (`np.isclose(value, 0)`). Non-numeric columns (e.g. strings)
    cannot be compared against zero, so only their missing values are
    dropped instead of raising a TypeError.

        df:pd.DataFrame, the dataframe whose columns are to be cleaned.

    Returns one Series per column, in column order; the surviving values
    keep their original index labels.
    """
    cleaned = []
    # `items()` yields one Series per column, even with repeated column names.
    for _, column in df.items():
        if pd.api.types.is_numeric_dtype(column):
            column = column[~np.isclose(column.values, 0)]
        cleaned.append(column.dropna())
    return cleaned

In [4]:
def matrix_type_argsort(matrix: np.matrix, df: pd.DataFrame) -> np.matrix:
    """Return the rows of `matrix` rearranged from type-sorted to column order.

    The rows of `matrix` are taken to be laid out in the order obtained by
    sorting the columns of `df` by `(dtype, position)`. The returned matrix
    has its i-th row corresponding to the i-th column of `df`.

        matrix:np.matrix, matrix with one row per column of `df`.
        df:pd.DataFrame, dataframe whose column dtypes define the order.

    A dataframe without columns yields a matrix without rows instead of
    raising an IndexError.
    """
    # NOTE(review): the order relies on dtype comparison (float64 sorts before
    # object); dtypes that cannot be compared with each other will still raise.
    ranked = sorted(zip(df.dtypes, range(len(df.dtypes))))
    type_sorted_positions = [position for _, position in ranked]
    # The argsort of a permutation is its inverse: it maps every original
    # column position to the row it occupies in the type-sorted layout.
    inverse = np.argsort(np.asarray(type_sorted_positions, dtype=int))
    return matrix[inverse, :]


def _dtype_positions(df: pd.DataFrame, dtype: np.dtype) -> list:
    """Return the positions of the columns of `df` whose dtype equals `dtype`."""
    return [
        position for position, column_dtype in enumerate(df.dtypes)
        if column_dtype == dtype
    ]


def double_argsort(floats: np.matrix,
                   strings: np.matrix,
                   df1: pd.DataFrame,
                   df2: pd.DataFrame,
                   fill=0) -> np.matrix:
    """Return a (df1 columns x df2 columns) matrix combining both score blocks.

        floats:np.matrix, scores between the float64 columns of `df1` (rows)
            and of `df2` (columns), in column order. Ignored when any of its
            dimensions is zero.
        strings:np.matrix, scores between the object columns of `df1` (rows)
            and of `df2` (columns), in column order. Ignored when any of its
            dimensions is zero.
        df1:pd.DataFrame, dataframe whose columns index the rows.
        df2:pd.DataFrame, dataframe whose columns index the columns.
        fill, value (converted to float) used for every pair of columns
            that is covered by neither block.

    Each score is written at the actual positions of the matching columns,
    so columns of other dtypes (int64, bool, ...) no longer shift the float
    and string blocks onto the wrong columns.
    """
    ground = np.full((df1.shape[1], df2.shape[1]), float(fill))
    if np.all(floats.shape):
        float_dtype = np.dtype("float64")
        # A block smaller than the number of float columns is anchored to the
        # first float columns, mirroring the former top-left placement.
        rows = _dtype_positions(df1, float_dtype)[:floats.shape[0]]
        cols = _dtype_positions(df2, float_dtype)[:floats.shape[1]]
        ground[np.ix_(rows, cols)] = floats
    if np.all(strings.shape):
        object_dtype = np.dtype("object")
        # A block smaller than the number of object columns is anchored to the
        # last object columns, mirroring the former bottom-right placement.
        rows = _dtype_positions(df1, object_dtype)[-strings.shape[0]:]
        cols = _dtype_positions(df2, object_dtype)[-strings.shape[1]:]
        ground[np.ix_(rows, cols)] = strings
    return ground

In [5]:
def data_tfidf_distance(df1: pd.DataFrame, df2: pd.DataFrame,
                        data_tfidf) -> np.matrix:
    """Return cosine distance of dataframes columns mean tfidf vectorization.

        df1:pd.DataFrame, first string only dataframe.
        df2:pd.DataFrame, second string only dataframe.
        data_tfidf, fitted vectorizer exposing `transform(documents)`.

    Returns a (df1 columns x df2 columns) array of cosine distances. When
    either dataframe has no columns, an array of ones with that (empty)
    shape is returned.
    """
    if not df1.shape[1] or not df2.shape[1]:
        return np.ones((df1.shape[1], df2.shape[1]))

    def mean_vectors(df: pd.DataFrame) -> np.ndarray:
        """Return one mean tfidf row per column of `df`."""
        rows = []
        for position in range(df.shape[1]):
            # Only missing values are dropped: the values are strings, so the
            # numeric near-zero filter of `drop_nan` does not apply to them.
            documents = df.iloc[:, position].dropna()
            if not len(documents):
                # A column without values is treated as one empty document,
                # which yields a zero vector rather than a failing transform.
                documents = [""]
            weights = data_tfidf.transform(documents)
            # `mean` may return a (1, n) matrix or a flat array depending on
            # the container returned by the vectorizer; flatten both cases.
            rows.append(np.asarray(weights.mean(axis=0)).ravel())
        return np.vstack(rows)

    return cosine_distances(mean_vectors(df1), mean_vectors(df2))

In [6]:
def string_data_distances(df1: pd.DataFrame, df2: pd.DataFrame, data_tfidf):
    """Return which object columns of `df1` and `df2` have close tfidf content.

    The mean tfidf cosine distance is computed between the object columns of
    the two dataframes and thresholded at 0.75. The resulting mask is then
    laid out over all the columns of `df1` (rows) and `df2` (columns) by
    `double_argsort`, with a fill value of False for the remaining pairs.
    """
    text_columns_1 = df1.select_dtypes(include='object')
    text_columns_2 = df2.select_dtypes(include='object')
    distances = data_tfidf_distance(text_columns_1, text_columns_2, data_tfidf)
    is_close = distances < 0.75
    # No float scores are involved here, hence the empty first block.
    no_float_scores = np.array([])
    return double_argsort(no_float_scores, is_close, df1, df2, fill=False)

In [7]:
def pairwise_test(A: pd.DataFrame, B: pd.DataFrame, test:Callable, minimum=100):
    """Run `test` on every pair of float64 columns of `A` and `B`.

        A:pd.DataFrame, first dataframe.
        B:pd.DataFrame, second dataframe.
        test:Callable, two-sample test called as `test(a, b)`; the element
            at index 1 of its result is used as the score of the pair.
        minimum, a pair is only tested when both cleaned columns hold more
            than this many values; otherwise its score is 0.

    The scores are laid out over all the columns of `A` (rows) and `B`
    (columns) by `double_argsort`, with 0 for the remaining pairs.
    """
    left_columns = drop_nan(A.select_dtypes(include='float64'))
    right_columns = drop_nan(B.select_dtypes(include='float64'))

    def score(a, b):
        # Too few samples on either side: skip the test altogether.
        if a.size > minimum and b.size > minimum:
            return test(a, b)[1]
        return 0

    scores = np.array([
        [score(a, b) for b in right_columns]
        for a in left_columns
    ])
    no_string_scores = np.array([])
    return double_argsort(scores, no_string_scores, A, B, 0)

def pairwise_ks(A: pd.DataFrame, B: pd.DataFrame):
    """Return the `pairwise_test` scores of `A` and `B` using `ks_2samp`."""
    return pairwise_test(A, B, test=ks_2samp)

def pairwise_mw(A: pd.DataFrame, B: pd.DataFrame):
    """Return the `pairwise_test` scores of `A` and `B` using `mannwhitneyu`."""
    return pairwise_test(A, B, test=mannwhitneyu)


In [8]:
def columns_tfidf_cosine(A, B, column_tfidf):
    """Return which column names of `A` and `B` are close in tfidf space.

    The column names of both dataframes are vectorized with `column_tfidf`
    and a pair is flagged when its cosine distance is below 0.5. Rows follow
    the columns of `A`, columns follow the columns of `B`.
    """
    names_a = column_tfidf.transform(A.columns)
    names_b = column_tfidf.transform(B.columns)
    distances = cosine_distances(names_a, names_b)
    return distances < 0.5

In [16]:
def units_mask(A: np.ndarray, B: np.ndarray):
    """Return which pairs of column names carry the same unit.

    The unit of a name is the second " | "-separated field, or None when the
    name holds no separator; two names without a unit therefore compare as
    equal.

        A:np.ndarray, column names indexing the rows of the mask.
        B:np.ndarray, column names indexing the columns of the mask.

    Returns a boolean array of shape (len(A), len(B)).
    """
    sep = " | "

    def units(names) -> np.ndarray:
        fields = [name.split(sep) for name in names]
        # The object dtype is forced so that the comparison below is always
        # element-wise, whatever the mix of None/strings and even when the
        # input is empty (where numpy would otherwise infer a float array).
        return np.array(
            [parts[1] if len(parts) > 1 else None for parts in fields],
            dtype=object)

    unit_A, unit_B = units(A), units(B)
    return unit_A[:, None] == unit_B


def sub_incidence_matrix(column_tfidf, data_tfidf, A: pd.DataFrame,
                         B: pd.DataFrame):
    """Return which columns of `A` (rows) match which columns of `B` (columns).

    A pair of columns is a match when its `pairwise_mw` score is greater
    than 0.1.

    NOTE: `column_tfidf` and `data_tfidf` are currently unused. The other
    criteria that were experimented with (string data distances, column name
    tfidf, KS test, units mask) are disabled.
    """
    p_values = pairwise_mw(A, B)
    return p_values > 0.1


In [17]:
def incidence_matrix(column_corpus:List[str], data_corpus:List[str], dataframes:List[pd.DataFrame]):
    """Return the boolean incidence matrix between the columns of all dataframes.

        column_corpus:List[str], corpus used to fit the column names tfidf.
        data_corpus:List[str], corpus used to fit the data tfidf.
        dataframes:List[pd.DataFrame], the dataframes whose columns are matched.

    The matrix is square with one row and one column per dataframe column,
    dataframes being laid out one after the other. The block of a dataframe
    with itself is the identity; the block of two distinct dataframes comes
    from `sub_incidence_matrix` and is mirrored across the diagonal.
    """
    # starts[k] is the offset of the first column of the k-th dataframe and
    # starts[-1] is the total number of columns.
    starts = [0]
    for df in dataframes:
        starts.append(starts[-1] + df.shape[1])
    total = starts[-1]

    column_tfidf = TfidfVectorizer()
    column_tfidf.fit(column_corpus)
    data_tfidf = TfidfVectorizer()
    data_tfidf.fit(data_corpus)

    ground = np.zeros((total, total))
    for i, df1 in tqdm(enumerate(dataframes), total=len(dataframes)):
        rows = slice(starts[i], starts[i + 1])
        # Only the upper triangle of blocks is computed; each block is also
        # written transposed in the lower triangle.
        for j in range(i, len(dataframes)):
            df2 = dataframes[j]
            if i == j:
                block = np.eye(df1.shape[1])
            else:
                block = sub_incidence_matrix(column_tfidf, data_tfidf, df1, df2)
            cols = slice(starts[j], starts[j + 1])
            ground[rows, cols] = block
            ground[cols, rows] = block.T

    return ground.astype(bool)

In [18]:
def matching(dataframes: List[pd.DataFrame]):
    """Return the incidence matrix between the columns of the given dataframes.

    The column names of all dataframes form the column corpus, and all the
    non-missing values of their object columns form the data corpus; both
    are handed to `incidence_matrix` together with the dataframes.
    """
    column_names = []
    text_values = []
    for df in dataframes:
        column_names.extend(df.columns)
        text_values.append(df.select_dtypes(include='object').values.flatten())
    corpus = np.array(column_names)
    data_corpus = np.concatenate(text_values)
    # Missing values cannot be vectorized, so they are filtered out.
    data_corpus = data_corpus[~pd.isna(data_corpus)]
    return incidence_matrix(corpus, data_corpus, dataframes)

In [19]:
# Root directory that is walked recursively looking for the sanitized CSV files.
path = "../sanitized_csv/"
# Full path of every CSV file below `path`. The directory yielded by `os.walk`
# is kept so that files in sub-folders are read from the right location,
# non-CSV files are skipped, and the paths are sorted so that the order of
# the dataframes (and hence of `columns`) is the same on every run.
csv_paths = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk(path)
    for name in names
    if name.lower().endswith(".csv")
)
# File names only, in the same order as `dataframes`.
csvs = [os.path.basename(csv_path) for csv_path in csv_paths]
dataframes = [
    pd.read_csv(csv_path, index_col=["name"])
    for csv_path in csv_paths
]
# Names of all the columns, dataframe after dataframe: position k matches
# row/column k of the incidence matrix.
columns = np.array([c for df in dataframes for c in df.columns])

In [20]:
# Boolean incidence matrix between the columns of all the loaded dataframes.
M = matching(dataframes)

HBox(children=(IntProgress(value=0, max=9), HTML(value='')))




In [21]:
# Convert the incidence matrix to an `np.matrix`.
m = np.matrix(M)

In [22]:
# Work on a copy: the loop below zeroes entries while it iterates.
M0 = np.copy(m)
for i, row in enumerate(M0):
    # More than one set entry: the row links several columns together.
    if np.sum(row) > 1:
        # Positions of the columns linked by this row.
        rows = np.where(row)
        # Clear the rows and the columns of the whole group, so that the
        # rows visited later no longer see these columns.
        M0[rows] = 0 
        M0[:,rows] = 0 
        # Show the names of the columns in the group.
        print(columns[rows])

['acqua | g' 'acqua | g' 'acqua | g' 'acqua | g' 'acqua | g']
['calorie | kcal' 'calorie | kcal' 'energy kcal'
 'energia, ric con fibra | kcal' 'energia, ricalcolata | kcal'
 'valore calorico | kcal' 'energia | kcal']
['fibra alimentare | g' 'acidi grassi saturi | g' 'istidina | g'
 'monoinsaturi totali | g' 'saturi totali | g'
 'carboidrati solubili (mse) | g' 'fibra alimentare totale | g'
 'lipidi totali | g']
['proteine totali | g' 'protein' 'nutrition score france' 'grassi | g'
 'lipidi totali | g' 'carboidrati disponibili (mse) | g' 'iodio | g'
 'proteine totali | g']
['proteine animali | g' 'proteine : | %' 'proteine | g']
['proteine vegetali | g' 'sugars' 'fibra alimentare | g' 'c18:1 | g'
 'zuccheri solubili | g' 'zuccheri solubili | g']
['glucidi disponibili | g' 'carbohydrates, available' 'grassi | g'
 'leucina | g' 'lisina | g' 'proteine | g']
['glucidi solubili | g' 'fat, total' 'carboidrati disponibili | g'
 'tirosina | g' 'treonina | g' 'selenio | g' 'zuccheri | g'
 'carb