In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from pyscipopt import Model, quicksum
from tqdm import tqdm_notebook as tqdm

In [2]:
def tfidf_distance(corpus, A, B):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(corpus)
    X = vectorizer.transform(A)
    Y = vectorizer.transform(B)

    return euclidean_distances(X, Y)

In [3]:
def pairwise_matrix_sum(A, B):
    return np.nansum(
        [A.reshape(A.shape[0], 1, A.shape[1]),
         B.reshape(1, *B.shape)], axis=0)


def pairwise_vector_sum(A, B):
    return np.nansum([A.reshape(-1, 1), B], axis=0)

In [4]:
def weighted_mse(A, B):
    normalization = pairwise_matrix_sum(A, B)
    difference = np.abs(pairwise_matrix_sum(A, -B))
    mask = np.logical_or(np.isnan(normalization), normalization == 0)
    return np.nanmean(np.divide(difference, normalization, where=~mask), axis=2)

In [5]:
def mean_variation(A, B):
    X, Y = np.nanmean(A, axis=0), np.nanmean(B, axis=0)
    N, M = np.sum(pd.notna(A), axis=0), np.sum(pd.notna(B), axis=0)
    mean_delta = pairwise_vector_sum(X, -Y)
    size_delta = pairwise_vector_sum(N, -M)
    size_sums = pairwise_vector_sum(N, M)
    return np.abs((mean_delta*size_delta)/size_sums)

In [6]:
def filter_float(df):
    return df[[
        value.tolist()
        for key, value in df.columns.to_series().groupby(df.dtypes).groups.items()
        if "float" in key.name
    ][0]]

In [7]:
def tuple_argmin(A, size, axis):
    return np.stack([
        np.arange(size),
        np.nanargmin(A, axis=axis)
    ]).T    

In [59]:
def distance(corpus, A, B, alpha=1):
    float_A = filter_float(A)
    float_B = filter_float(B)
    tfidf = tfidf_distance(corpus, float_A.columns, float_B.columns)
    mean = mean_variation(np.array(float_A), np.array(float_B))
    tmp =  alpha * tfidf / np.nanmax(tfidf) + (1 - alpha) * mean / np.nanmax(mean)
    tmp[np.isnan(tmp)] = np.inf
    return tmp

In [60]:
def distances(dataframes):
    corpus = [c for df in dataframes for c in df.columns]
    return [[
        distance(corpus, df1, df2) if i != j else None for j, df2 in enumerate(dataframes)
        
    ] for i, df1 in enumerate(dataframes)]

$$
\begin{align*}
  \min f_{1} = \sum_{r=1}^M \sum_{\substack{s=1                                                                                                                                                                                                                   \\ s\neq r}}^{M} \sum_{i=1}^{N_{r}} \sum_{j=1}^{N_{s}} c_{risj} e_{risj} \\
  \max f_{2} = \sum_{r=1}^M \sum_{\substack{s=1                                                                                                                                                                                                                   \\ s\neq r}}^{M} \sum_{i=1}^{N_{r}} \sum_{j=1}^{N_{s}} e_{risj}\\
  \underbrace{e_{risj} - e_{sjri} = 0}_{\text{For simmetry.}}                 & \quad \forall r, s \in \{1, \ldots, M\}: r \neq s\; \forall i \in \{1, \ldots, N_r\}\; \forall j \in \{1, \ldots, N_s\}                                                           \\
  \underbrace{e_{risj} + e_{sjtk} - 2e_{ritk} = 0}_{\text{For transitivity.}} & \quad \forall r, s, t \in \{1, \ldots, M\}: r \neq s, r \neq t, t \neq s\; \forall i \in \{1, \ldots, N_r\}\; \forall j \in \{1, \ldots, N_s\}\; \forall k \in \{1, \ldots, N_k\} \\
  e_{risj}                                                                    & \in \{0,1\}                                                                                                                                                                       \\
  c_{risj}                                                                    & \in [0,1]                                                                                                                                                                         \\
\end{align*}
$$

In [30]:
def matching(costs, columns_sizes, alpha):
    model = Model("Multi dimensional matching.")
    M = len(costs)
    print("Adding variables to model")
    e = {(r, i, s, j): model.addVar(
        vtype="B", name="e({r},{i},{s},{j})".format(r=r, s=s, i=i, j=j))
         for r in tqdm(range(M)) for s in range(M) if s != r
         for i in range(columns_sizes[r]) for j in range(columns_sizes[s])}

    print("Adding simmetry constraints")
    for r in tqdm(range(M)):
        for s in range(M):
            if s != r:
                for i in range(columns_sizes[r]):
                    for j in range(columns_sizes[s]):
                        model.addCons(
                            e[r, i, s, j] - e[s, j, r, i] == 0,
                            "Simmetry({r},{i},{s},{j})".format(
                                r=r, s=s, i=i, j=j))

    print("Adding transitivity constraints")
    for r in tqdm(range(M)):
        for s in tqdm(range(M), leave=False):
            for t in tqdm(range(M), leave=False):
                for i in range(columns_sizes[r]):
                    for j in range(columns_sizes[s]):
                        for k in range(columns_sizes[t]):
                            if t == s == r or i==j==k:
                                continue
                            e1 = e2 = e3 = 0
                            if s != r and i!=j:
                                e1 = e[r, i, s, j]
                            if t != s and j!=k:
                                e2 = e[s, j, t, k]
                            if t != r and i != k:
                                e3 = e[r, i, t, k]
                            model.addCons(
                                e1 + e2 - 2 * e3 == 0,
                                "Transitivity({r},{i},{s},{j},{t},{k})".format(
                                    r=r, s=s, t=t, i=i, j=j, k=k))

    print("Adding objective function")
    model.setObjective(
        quicksum((alpha - (1 - alpha) * costs[r][s][i, j]) * e[r, i, s, j]
                 for r in tqdm(range(M)) for s in range(M) if s != r
                 for i in range(columns_sizes[r])
                 for j in range(columns_sizes[s])), "maximize")
    model.data = e
    return model

In [49]:
path = "../sanitized_csv/"
csvs = [
    csv
    for path, dirs, csvs in os.walk(path)
    for csv in csvs
]
dataframes = [
    pd.read_csv("{path}/{csv}".format(path=path, csv=csv))
    for csv in csvs[:3]
]

In [50]:
columns_sizes = [filter_float(df).shape[1] for df in dataframes]

In [61]:
model = matching(distances(dataframes), columns_sizes, 0.5)

Adding variables to model


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


Adding simmetry constraints


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


Adding transitivity constraints


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


Adding objective function


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [62]:
model.optimize()

In [63]:
model.getObjVal()

105.59999999999988

In [64]:
e = model.data
edges = [key for key, edge in e.items()  if model.getVal(edge)==1]
edges

[(0, 0, 1, 0),
 (0, 1, 1, 1),
 (0, 2, 1, 2),
 (0, 3, 1, 3),
 (0, 4, 1, 4),
 (0, 5, 1, 5),
 (0, 6, 1, 6),
 (0, 7, 1, 7),
 (0, 8, 1, 8),
 (0, 9, 1, 9),
 (0, 10, 1, 10),
 (0, 11, 1, 11),
 (0, 12, 1, 12),
 (0, 13, 1, 13),
 (0, 14, 1, 14),
 (0, 15, 1, 15),
 (0, 16, 1, 16),
 (0, 0, 2, 0),
 (0, 1, 2, 1),
 (0, 2, 2, 2),
 (0, 3, 2, 3),
 (0, 4, 2, 4),
 (0, 5, 2, 5),
 (0, 6, 2, 6),
 (0, 7, 2, 7),
 (0, 8, 2, 8),
 (0, 9, 2, 9),
 (0, 10, 2, 10),
 (0, 11, 2, 11),
 (0, 12, 2, 12),
 (0, 13, 2, 13),
 (0, 14, 2, 14),
 (0, 15, 2, 15),
 (0, 16, 2, 16),
 (0, 17, 2, 17),
 (0, 18, 2, 18),
 (0, 19, 2, 19),
 (0, 20, 2, 20),
 (0, 21, 2, 21),
 (0, 22, 2, 22),
 (0, 23, 2, 23),
 (0, 24, 2, 24),
 (0, 25, 2, 25),
 (0, 26, 2, 26),
 (0, 27, 2, 27),
 (0, 28, 2, 28),
 (0, 29, 2, 29),
 (0, 30, 2, 30),
 (0, 31, 2, 31),
 (1, 0, 0, 0),
 (1, 1, 0, 1),
 (1, 2, 0, 2),
 (1, 3, 0, 3),
 (1, 4, 0, 4),
 (1, 5, 0, 5),
 (1, 6, 0, 6),
 (1, 7, 0, 7),
 (1, 8, 0, 8),
 (1, 9, 0, 9),
 (1, 10, 0, 10),
 (1, 11, 0, 11),
 (1, 12, 0, 12),
 (1, 1

In [25]:
0 == 0 == 1

False