In [1]:
import os
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from pyscipopt import Model, quicksum
from scipy.sparse import csr_matrix
from scipy.stats import ks_2samp
from pprint import pprint
from tqdm import tqdm_notebook as tqdm
from typing import List, Callable

In [2]:
def pairwise_matrix_sum(A, B):
    """Return the NaN-aware sum of every row of `A` with every row of `B`.

    A: 2-D array of shape (n, d).
    B: 2-D array of shape (k, d).

    Returns an array of shape (n, k, d) whose entry [i, j] is A[i] + B[j],
    where NaN entries are treated as zero (``np.nansum`` semantics).
    """
    left = np.asarray(A)
    right = np.asarray(B)
    left = left.reshape(left.shape[0], 1, left.shape[1])
    right = right.reshape(1, *right.shape)
    # Broadcast explicitly: handing two differently shaped arrays to
    # np.nansum([...]) builds a ragged array, which recent NumPy rejects.
    left, right = np.broadcast_arrays(left, right)
    return np.nansum([left, right], axis=0)


def pairwise_vector_sum(A, B):
    """Return the NaN-aware sum of every element of `A` with every element of `B`.

    A: array with n elements.
    B: array with m elements.

    Returns an array of shape (n, m) whose entry [i, j] is A[i] + B[j],
    where NaN entries are treated as zero (``np.nansum`` semantics).
    """
    column = np.asarray(A).reshape(-1, 1)
    row = np.asarray(B).reshape(1, -1)
    # Broadcast explicitly so both operands share the (n, m) shape; stacking
    # arrays of different shapes inside np.nansum([...]) is a ragged-array
    # construction, which recent NumPy rejects.
    column, row = np.broadcast_arrays(column, row)
    return np.nansum([column, row], axis=0)

def _and(*args):
    return np.all(args, axis=0)

In [3]:
def matrix_type_argsort(matrix: np.matrix, df: pd.DataFrame) -> np.matrix:
    """Reorder the rows of `matrix` using the column dtypes of `df`.

    The columns of `df` are ranked by sorting (dtype, position) pairs; the
    rows of `matrix` are then picked with the argsort of the resulting
    positions.
    """
    ranked = sorted(zip(df.dtypes, range(len(df.dtypes))))
    positions = list(zip(*ranked))[1]
    row_order = np.argsort(positions)
    return matrix[row_order, :]


def double_argsort(floats: np.matrix, strings: np.matrix, df1: pd.DataFrame,
                   df2: pd.DataFrame, fill=0) -> np.matrix:
    """Combine two per-type matrices and reorder both axes by dataframe dtypes.

    A (df1 columns x df2 columns) canvas is filled with `fill`; `floats` is
    written into its top-left corner and `strings` into its bottom-right
    corner (each only when it has no zero-length dimension). The rows are then
    reordered with `df1` and the columns with `df2` via `matrix_type_argsort`.
    """
    canvas = np.full((df1.shape[1], df2.shape[1]), float(fill))
    n_float_rows, n_float_cols = floats.shape[0], floats.shape[1]
    n_string_rows, n_string_cols = strings.shape[0], strings.shape[1]
    if np.all(floats.shape):
        canvas[:n_float_rows, :n_float_cols] = floats
    if np.all(strings.shape):
        canvas[-n_string_rows:, -n_string_cols:] = strings
    rows_sorted = matrix_type_argsort(canvas, df1)
    columns_sorted = matrix_type_argsort(rows_sorted.T, df2)
    return columns_sorted.T

In [4]:
def weighted_mean_differences(A: pd.DataFrame, B: pd.DataFrame) -> np.ndarray:
    """Return the normalised weighted mean differences between columns of A and B.

    A: numeric DataFrame (its ``.values`` are used).
    B: numeric DataFrame (its ``.values`` are used).

    For column i of `A` and column j of `B`, with NaN-ignoring means X_i, Y_j
    and non-null counts N_i, M_j, the raw score is
    ``|(X_i - Y_j) * (N_i - M_j) / ((X_i + Y_j) + (N_i + M_j))|``.
    The scores are divided by their maximum. The result has shape
    (A columns, B columns); if either frame has no columns an all-zero
    (possibly empty) matrix of that shape is returned.
    """
    # Guard on BOTH frames: the original checked `A` twice and never `B`.
    if not A.shape[1] or not B.shape[1]:
        return np.zeros((A.shape[1], B.shape[1]))
    X, Y = np.nanmean(A.values, axis=0), np.nanmean(B.values, axis=0)
    N, M = np.sum(pd.notna(A.values), axis=0), np.sum(pd.notna(B.values), axis=0)
    result = np.abs((pairwise_vector_sum(X, -Y) * pairwise_vector_sum(N, -M)) /
                  (pairwise_vector_sum(X, Y) + pairwise_vector_sum(N, M)))
    peak = np.max(result)
    if peak == 0:
        # Every difference is zero: dividing by the maximum would give 0/0.
        return result
    return result / peak

In [5]:
def column_values_mean_tfidf_distance(df1: pd.DataFrame,
                                      df2: pd.DataFrame, data_tfidf) -> np.matrix:
    """Return cosine distances between the mean tf-idf vectors of the columns.

        df1:pd.DataFrame, first string only dataframe.
        df2:pd.DataFrame, second string only dataframe.
        data_tfidf: fitted vectorizer; its `transform` is applied per column.

    Each column is cleaned with `drop_nan`, vectorized, and averaged along
    axis 0. The result has shape (df1 columns, df2 columns); it is all zeros
    when either dataframe has no columns.
    """
    n_left, n_right = df1.shape[1], df2.shape[1]
    if n_left == 0 or n_right == 0:
        return np.zeros((n_left, n_right))

    def mean_vectors(frame):
        # One mean tf-idf row per column, stacked into a 2-D array.
        per_column = [
            np.mean(data_tfidf.transform(values), axis=0)
            for values in drop_nan(frame)
        ]
        return np.array(per_column).squeeze(axis=1)

    left = mean_vectors(df1)
    right = mean_vectors(df2)
    return cosine_distances(left, right)

In [6]:
def mean_variations(df1: pd.DataFrame, df2: pd.DataFrame, data_tfidf):
    """Return a column-by-column distance matrix built from per-type mean scores.

    The 'float64' columns are compared with `weighted_mean_differences` and
    the 'object' columns with `column_values_mean_tfidf_distance`; the two
    matrices are merged by `double_argsort`, with the remaining cells set to
    infinity.
    """
    float_scores = weighted_mean_differences(
        df1.select_dtypes(include='float64'),
        df2.select_dtypes(include='float64'))
    string_scores = column_values_mean_tfidf_distance(
        df1.select_dtypes(include='object'),
        df2.select_dtypes(include='object'),
        data_tfidf)
    return double_argsort(float_scores, string_scores, df1, df2, fill=np.inf)

In [7]:
def pairwise_ks(A: List[np.ndarray], B: List[np.ndarray], min_size: int = 100):
    """Return pairwise two-sample Kolmogorov-Smirnov statistics and p-values.

    A: list of samples (each must expose ``.size``).
    B: list of samples (each must expose ``.size``).
    min_size: a pair is tested only when BOTH samples have more than
        `min_size` elements; otherwise its statistic and p-value are both 0.

    Returns two (len(A), len(B)) matrices that unpack as
    ``statistics, pvalues``. On success they come stacked in one array of
    shape (2, len(A), len(B)). If either list is empty, or if the computation
    raises, the exception (if any) is printed and two NaN-filled matrices are
    returned instead.
    """
    if len(A) and len(B):
        try:
            results = np.array([[
                ks_2samp(a, b)
                if a.size > min_size and b.size > min_size else (0, 0)
                for b in B
            ] for a in A])
            # (len(A), len(B), 2) -> (2, len(A), len(B)) so it unpacks in two.
            return np.moveaxis(results, [0, 1, 2], [1, 2, 0])
        except Exception as e:
            # Deliberate best-effort: report the problem, fall back to NaN.
            print(e)
    return np.full((len(A), len(B)), np.nan), np.full((len(A), len(B)), np.nan)


def drop_nan(df):
    """Return each column of `df` as a Series without falsy or missing values.

    For every column, entries that are zero/falsy (as reported by NumPy's
    ``nonzero``) are removed first, then missing values are dropped. The
    original index labels of the surviving entries are kept.

    Returns a list with one Series per column, in column order.
    """
    cleaned = []
    # `items()` yields every column as a Series, even with repeated labels.
    for _, column in df.items():
        # `Series.nonzero()` is gone from recent pandas; go through NumPy.
        keep = np.flatnonzero(np.asarray(column))
        cleaned.append(column.iloc[keep].dropna())
    return cleaned


def typewise_ks(A: pd.DataFrame, B: pd.DataFrame, tfidf_data) -> np.matrix:
    """Return per-column-pair KS statistics and p-values for two dataframes.

    The 'object' columns are cleaned with `drop_nan` and passed through
    `tfidf_data.transform`; the 'float64' columns are only cleaned. Each
    group is compared with `pairwise_ks`, and the per-type results are merged
    by `double_argsort` (statistics default to 1, p-values to 0).

    Returns a pair ``(statistics, pvalues)``.
    """
    def vectorized_text_columns(frame):
        text_only = frame.select_dtypes(include='object')
        return [tfidf_data.transform(values) for values in drop_nan(text_only)]

    string_A = vectorized_text_columns(A)
    string_B = vectorized_text_columns(B)
    float_A = drop_nan(A.select_dtypes(include='float64'))
    float_B = drop_nan(B.select_dtypes(include='float64'))

    floats_s, floats_p = pairwise_ks(float_A, float_B)
    strings_s, strings_p = pairwise_ks(string_A, string_B)

    statistics = double_argsort(floats_s, strings_s, A, B, 1)
    pvalues = double_argsort(floats_p, strings_p, A, B, 0)
    return statistics, pvalues

In [26]:
def cross_minima(matrix):
    """Return a boolean mask of cells that are minimal in both row and column.

    A cell is selected when it equals the minimum of its row AND the minimum
    of its column AND is not close to the (NaN-ignoring) overall maximum.
    """
    row_min = np.min(matrix, axis=1).reshape(-1, 1)
    column_min = np.min(matrix, axis=0)
    is_row_min = matrix == row_min
    is_column_min = matrix == column_min
    below_peak = ~np.isclose(matrix, np.nanmax(matrix))
    return np.all((is_row_min, is_column_min, below_peak), axis=0)


def cross_maxima(matrix):
    """Return a boolean mask of cells that are maximal in both row and column.

    A cell is selected when it equals the maximum of its row AND the maximum
    of its column AND is not close to the (NaN-ignoring) overall minimum.
    """
    row_max = np.max(matrix, axis=1).reshape(-1, 1)
    column_max = np.max(matrix, axis=0)
    is_row_max = matrix == row_max
    is_column_max = matrix == column_max
    above_floor = ~np.isclose(matrix, np.nanmin(matrix))
    return np.all((is_row_max, is_column_max, above_floor), axis=0)


def units_mask(A: np.ndarray, B: np.ndarray):
    """Return a boolean matrix telling which labels of A and B share a unit.

    Each label is split on " | "; the second piece is taken as its unit, or
    None when the label does not contain the separator. Entry [i, j] of the
    result is the element-wise comparison of the unit of A[i] with the unit
    of B[j].
    """
    sep = " | "

    def unit_of(label):
        pieces = label.split(sep)
        return pieces[1] if len(pieces) != 1 else None

    units_a = np.array([unit_of(label) for label in A])
    units_b = np.array([unit_of(label) for label in B])
    return units_a[:, None] == units_b


def sub_incidence_matrix(column_tfidf, data_tfidf, A: pd.DataFrame,
                         B: pd.DataFrame):
    """Return the boolean column-matching matrix between dataframes A and B.

    Three signals are computed for every (A column, B column) pair:
      * the pair is a cross minimum of the mean-variation matrix,
      * the pair is a cross minimum of the cosine distance between the
        tf-idf vectors of the column names,
      * the KS p-value of the pair is above 1e-3.
    A pair is accepted when more than one signal holds AND `units_mask`
    accepts the two column names.
    """
    _, pvalues = typewise_ks(A, B, data_tfidf)
    name_distances = cosine_distances(
        column_tfidf.transform(A.columns),
        column_tfidf.transform(B.columns))
    names_agree = cross_minima(name_distances)
    means_agree = cross_minima(mean_variations(A, B, data_tfidf))
    distributions_agree = pvalues > 1e-3
    votes = np.sum([means_agree, names_agree, distributions_agree], axis=0)
    return _and(votes > 1, units_mask(A.columns, B.columns))

In [27]:
def incidence_matrix(column_corpus:List[str], data_corpus:List[str], dataframes:List[pd.DataFrame]):
    """Return the symmetric boolean matching matrix over all dataframe columns.

    column_corpus: texts used to fit the column-name tf-idf vectorizer.
    data_corpus: texts used to fit the cell-value tf-idf vectorizer.
    dataframes: the dataframes whose columns are matched pairwise.

    The columns of all dataframes are laid out one dataframe after the other
    on both axes. The diagonal block of each dataframe is the identity; the
    block of every pair of distinct dataframes comes from
    `sub_incidence_matrix` and is mirrored across the diagonal.
    """
    column_tfidf = TfidfVectorizer()
    column_tfidf.fit(column_corpus)
    data_tfidf = TfidfVectorizer()
    data_tfidf.fit(data_corpus)

    widths = [df.shape[1] for df in dataframes]
    # starts[k] is the index of the first column of dataframe k.
    starts = [sum(widths[:k]) for k in range(len(widths))]
    total = sum(widths)
    ground = np.zeros((total, total))

    for i, df1 in tqdm(enumerate(dataframes), total=len(dataframes)):
        rows = slice(starts[i], starts[i] + widths[i])
        for j in range(i, len(dataframes)):
            df2 = dataframes[j]
            if i == j:
                block = np.eye(widths[i])
            else:
                block = sub_incidence_matrix(column_tfidf, data_tfidf, df1, df2)
            cols = slice(starts[j], starts[j] + widths[j])
            ground[rows, cols] = block
            ground[cols, rows] = block.T

    return ground.astype(bool)

In [28]:
def matching(dataframes: List[pd.DataFrame]):
    """Return the boolean column-matching matrix for the given dataframes.

    The column names of all dataframes form the column corpus; the non-missing
    values of all 'object' columns form the data corpus. Both are handed to
    `incidence_matrix` together with the dataframes.
    """
    column_names = np.array(
        [name for df in dataframes for name in df.columns])
    text_cells = [
        df.select_dtypes(include='object').values.flatten()
        for df in dataframes
    ]
    data_corpus = np.concatenate(text_cells)
    data_corpus = data_corpus[pd.notna(data_corpus)]
    return incidence_matrix(column_names, data_corpus, dataframes)

In [29]:
# Load every CSV found under `path` (recursively), skipping files whose name
# contains "svizz". Each file is opened through the directory os.walk found it
# in, so files inside sub-directories are read from their real location.
path = "../sanitized_csv/"
csv_paths = [
    os.path.join(root, filename)
    for root, _, filenames in os.walk(path)
    for filename in filenames if "svizz" not in filename
]
# File names only, kept under the original `csvs` name.
csvs = [os.path.basename(csv_path) for csv_path in csv_paths]
dataframes = [
    pd.read_csv(csv_path, index_col=["name"])
    for csv_path in csv_paths
]
# Flat array of every column label, in dataframe order.
columns = np.array([c for df in dataframes for c in df.columns])

In [30]:
# Boolean matrix over all columns of all dataframes: True where two columns match.
M = matching(dataframes)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

axis -1 is out of bounds for array of dimension 0




axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0
axis -1 is out of bounds for array of dimension 0



In [31]:
# Print each group of matched columns once, working on a copy so M is untouched.
M0 = np.copy(M)
for i, row in enumerate(M0):
    # More than one True in a row means the column matches columns other than itself.
    if np.sum(row) > 1:
        rows = np.where(row)
        # Clear the rows and columns of every member of the group in place;
        # M0 is the array being iterated, so later iterations see these rows
        # as empty and the group is not printed again.
        M0[rows] = 0 
        M0[:,rows] = 0 
        print(columns[rows])

['acqua | g' 'acqua | g' 'acqua | g' 'acqua | g' 'acqua | g']
['calorie | kcal' 'calorie | kcal' 'calorie | kcal'
 'energia, ric con fibra | kcal' 'energia, ricalcolata | kcal'
 'valore calorico | kcal' 'energia | kcal']
['fibra alimentare | g' 'fibra alimentare | g'
 'fibra alimentare totale | g']
['proteine totali | g' 'proteine totali | g' 'proteine | g']
['proteine animali | g' 'proteine animali | g']
['proteine vegetali | g' 'proteine vegetali | g']
['glucidi disponibili | g' 'carboidrati disponibili (mse) | g'
 'carboidrati disponibili | g']
['glucidi solubili | g' 'carboidrati solubili (mse) | g'
 'zuccheri solubili | g']
['lipidi totali | g' 'carboidrati disponibili | g' 'lipidi totali | g'
 'lipidi | g']
['lipidi animali | g' 'lipidi animali | g']
['lipidi vegetali | g' 'lipidi vegetali | g']
['lipidi saturi totali | g' 'acido oleico | g']
['acido oleico | g' 'acidi grassi saturi totali | g']
['monoinsaturi totali | g' 'acidi grassi monoinsaturi totali | g'
 'grassi monoinsatu