In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances, cosine_similarity
from pyscipopt import Model, quicksum
from scipy.sparse import csr_matrix
from tqdm import tqdm_notebook as tqdm

In [2]:
def tfidf_distance(corpus, A, B):
    """Cosine distance between the TF-IDF encodings of two text collections.

    A fresh ``TfidfVectorizer`` is fitted on ``corpus`` on every call, then
    used to encode ``A`` and ``B``.

    Parameters
    ----------
    corpus : iterable of str
        Texts used to learn the vocabulary and the IDF weights.
    A, B : iterable of str
        Texts to compare.

    Returns
    -------
    Result of ``cosine_distances``: one row per item of ``A`` and one column
    per item of ``B``.
    """
    model = TfidfVectorizer()
    model.fit(corpus)
    return cosine_distances(model.transform(A), model.transform(B))

In [3]:
def pairwise_matrix_sum(A, B):
    """Add every row of ``A`` to every row of ``B``, counting NaN as zero.

    Parameters
    ----------
    A : numpy.ndarray, shape (n, k)
    B : numpy.ndarray, shape (m, k)

    Returns
    -------
    numpy.ndarray, shape (n, m, k)
        ``out[i, j]`` is the NaN-aware sum of ``A[i]`` and ``B[j]``.
    """
    # Broadcast both operands to a common shape first. Handing np.nansum a
    # list of differently shaped arrays makes NumPy try to build a ragged
    # array, which is a ValueError on NumPy >= 1.24.
    left, right = np.broadcast_arrays(
        A.reshape(A.shape[0], 1, A.shape[1]),
        B.reshape(1, *B.shape),
    )
    return np.nansum([left, right], axis=0)


def pairwise_vector_sum(A, B):
    """Add every element of ``A`` to every element of ``B``, counting NaN as zero.

    Parameters
    ----------
    A : numpy.ndarray
        Reshaped into a single column, shape (n, 1).
    B : numpy.ndarray
        Anything that broadcasts against that column; a 1-D array of
        length m yields an (n, m) result.

    Returns
    -------
    numpy.ndarray
        ``out[i, j]`` is the NaN-aware sum of ``A[i]`` and ``B[j]``.
    """
    # Broadcast explicitly: a list holding an (n, 1) and an (m,) array is
    # ragged, and NumPy >= 1.24 refuses to turn it into an array.
    column, row = np.broadcast_arrays(A.reshape(-1, 1), B)
    return np.nansum([column, row], axis=0)

In [4]:
def weighted_mse(A, B):
    """Mean normalised absolute difference between every row pair of ``A`` and ``B``.

    For each pair ``(A[i], B[j])`` the per-feature ratio
    ``|A[i] - B[j]| / (A[i] + B[j])`` is averaged over the last axis.
    Despite the name, nothing is squared.

    Features whose normalising sum is NaN or zero are excluded from the
    average. A pair with no usable feature yields NaN (``np.nanmean`` emits
    a RuntimeWarning in that case).

    Parameters
    ----------
    A : numpy.ndarray, shape (n, k)
    B : numpy.ndarray, shape (m, k)

    Returns
    -------
    numpy.ndarray, shape (n, m)
    """
    normalization = pairwise_matrix_sum(A, B)
    difference = np.abs(pairwise_matrix_sum(A, -B))
    mask = np.logical_or(np.isnan(normalization), normalization == 0)
    # With ``where=`` and no ``out=``, np.divide leaves the skipped entries
    # uninitialised; pre-fill with NaN so np.nanmean really ignores them.
    ratio = np.full(normalization.shape, np.nan)
    np.divide(difference, normalization, out=ratio, where=~mask)
    return np.nanmean(ratio, axis=2)

In [5]:
def mean_variation(A, B):
    """Score every column of ``A`` against every column of ``B``.

    The score for a pair of columns is
    ``|(mean_a - mean_b) * (count_a - count_b) / (count_a + count_b)|``,
    where means ignore NaN and counts are the number of non-null entries.

    Parameters
    ----------
    A, B : numpy.ndarray
        2-D arrays; statistics are taken along axis 0 (one value per column).

    Returns
    -------
    Array with one row per column of ``A`` and one column per column of ``B``.
    """
    means_a = np.nanmean(A, axis=0)
    means_b = np.nanmean(B, axis=0)
    counts_a = np.sum(pd.notna(A), axis=0)
    counts_b = np.sum(pd.notna(B), axis=0)

    mean_gap = pairwise_vector_sum(means_a, -means_b)
    count_gap = pairwise_vector_sum(counts_a, -counts_b)
    count_total = pairwise_vector_sum(counts_a, counts_b)

    # NOTE(review): the product is zero whenever both columns have the same
    # number of non-null entries, whatever their means - confirm intended.
    return np.abs((mean_gap * count_gap) / count_total)

In [6]:
def filter_float(df):
    """Return the columns of ``df`` whose dtype name contains ``"float"``.

    All matching columns are kept, in their original order, even when they
    use different float widths (e.g. float32 and float64). A frame without
    any float column yields an empty selection instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        Same rows as ``df``, restricted to the float columns.
    """
    # A positional boolean mask also copes with duplicated column labels.
    is_float = ["float" in dtype.name for dtype in df.dtypes]
    return df.loc[:, is_float]

In [7]:
def filter_mean(matrix):
    """Replace every entry that is NaN or not below the NaN-aware mean with ``inf``.

    The array is modified in place and the same object is returned.
    """
    threshold = np.nanmean(matrix)
    # NaN never satisfies ``>=``, so it needs its own test.
    rejected = (matrix >= threshold) | np.isnan(matrix)
    matrix[rejected] = np.inf
    return matrix

def filter_mask(matrix):
    """Flag entries that are the minimum of both their row and their column.

    Infinite entries are never flagged, even when a whole row or column is
    infinite.

    Returns
    -------
    Boolean array with the same shape as ``matrix``.
    """
    row_minima = np.min(matrix, axis=1).reshape(-1, 1)
    column_minima = np.min(matrix, axis=0)

    is_row_min = matrix == row_minima
    is_column_min = matrix == column_minima
    not_infinite = ~np.isinf(matrix)

    return is_row_min & is_column_min & not_infinite

def distance(corpus, A, B, alpha=1):
    """Boolean match matrix between the float columns of two data frames.

    A pair of columns is flagged only when it passes ``filter_mean`` followed
    by ``filter_mask`` on both criteria: the TF-IDF distance of the column
    names and the ``mean_variation`` of the column values.

    Parameters
    ----------
    corpus : iterable of str
        Texts the TF-IDF model is fitted on.
    A, B : pandas.DataFrame
        Only their float columns (see ``filter_float``) take part.
    alpha : optional
        Accepted but not used by this function.

    Returns
    -------
    Boolean array, one row per float column of ``A`` and one column per float
    column of ``B``.
    """
    numeric_a = filter_float(A)
    numeric_b = filter_float(B)

    name_distances = tfidf_distance(corpus, numeric_a.columns, numeric_b.columns)
    name_match = filter_mask(filter_mean(name_distances))

    value_distances = mean_variation(np.array(numeric_a), np.array(numeric_b))
    value_match = filter_mask(filter_mean(value_distances))

    return name_match & value_match

In [8]:
def incidence_matrix(corpus, dataframes):
    """Assemble the block matrix of column matches across all data frames.

    Block ``(i, j)`` is ``distance(corpus, dataframes[i], dataframes[j])``;
    diagonal blocks are identity matrices sized by the number of float
    columns of the frame.

    Parameters
    ----------
    corpus : iterable of str
        Forwarded to ``distance``.
    dataframes : sequence of pandas.DataFrame

    Returns
    -------
    scipy.sparse.csr_matrix
        Square matrix with one row and one column per float column, frames
        laid out in the order given.
    """
    masks = []
    for i, df1 in enumerate(dataframes):
        block_row = []
        for j, df2 in enumerate(dataframes):
            if i != j:
                block_row.append(distance(corpus, df1, df2))
            else:
                block_row.append(np.eye(filter_float(df1).shape[1]))
        masks.append(block_row)

    total = sum(filter_float(df).shape[1] for df in dataframes)
    count = len(dataframes)
    ground = np.zeros((total, total))

    row_offset = 0
    for i in range(count):
        col_offset = 0
        for j in range(count):
            block = masks[i][j]
            height, width = block.shape
            ground[row_offset:row_offset + height,
                   col_offset:col_offset + width] = block
            col_offset += width
        # Every block in a row shares the same height, so the last one will do.
        row_offset += height

    return csr_matrix(ground)
    

In [9]:
# Root folder holding the sanitized CSV files.
path = "../sanitized_csv/"

# os.walk descends into sub-folders, so each file must be joined with the
# directory it was found in (not with the root). Sorting makes the load
# order, and therefore the row/column order of the results, reproducible.
csv_paths = sorted(
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk(path)
    for filename in filenames
)

# Bare file names, in the same order as `dataframes`.
csvs = [os.path.basename(csv_path) for csv_path in csv_paths]

dataframes = [pd.read_csv(csv_path) for csv_path in csv_paths]

# Every column header of every table; used to fit the TF-IDF model.
corpus = [c for df in dataframes for c in df.columns]

In [10]:
# `reduced` is the set of tables actually compared (currently all of them).
reduced = dataframes

# Sparse block matrix of matches between float columns.
A = incidence_matrix(corpus, reduced)

# Float-column names laid out in the same order as the rows/columns of `A`.
columns = np.array([
    name
    for frame in reduced
    for name in filter_float(frame).columns
])

In [11]:
# Dense copy of the match matrix, so the diagonal can be cleared in place.
dense_A = A.todense()
# Entry (i, i) pairs a column with itself; zero it so only matches between
# distinct columns are listed.
np.fill_diagonal(dense_A, 0)
# One (row, column) index pair per remaining non-zero entry.
r = np.array(np.where(dense_A)).T
# Indexing `columns` with the index pairs yields pairs of column names.
for a, b in columns[r]:
    print("DEUS VULT: {a} == {b}".format(a=a, b=b))

DEUS VULT: acqua | g == acqua | g
DEUS VULT: acqua | g == acqua | g
DEUS VULT: acqua | g == Acqua | g
DEUS VULT: acqua | g == acqua | g
DEUS VULT: acqua | g == acqua | g
DEUS VULT: calorie | kcal == calorie | kcal
DEUS VULT: calorie | kcal == calorie | kcal
DEUS VULT: calorie | kcal == calorie | kcal
DEUS VULT: calorie | kcal == valore calorico | kcal
DEUS VULT: calorie | kcal == energia | kcal
DEUS VULT: fibra alimentare | g == fibra alimentare | g
DEUS VULT: fibra alimentare | g == fibra alimentare | g
DEUS VULT: fibra alimentare | g == Fibra alimentare totale | g
DEUS VULT: proteine totali | g == proteine totali | g
DEUS VULT: proteine animali | g == proteine animali | g
DEUS VULT: proteine vegetali | g == proteine vegetali | g
DEUS VULT: glucidi disponibili | g == glucidi disponibili | g
DEUS VULT: glucidi solubili | g == glucidi solubili | g
DEUS VULT: amido | g == amido | g
DEUS VULT: lipidi totali | g == lipidi totali | g
DEUS VULT: lipidi animali | g == lipidi animali | g
DEUS 