In [78]:
import os
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances, cosine_similarity
from pyscipopt import Model, quicksum
from scipy.sparse import csr_matrix
from tqdm import tqdm_notebook as tqdm
from typing import List, Callable

In [2]:
def vitamins_heuristic(name:str)->str:
    """Return given `name` with applied vitamin heuristic.
        name:str, the name to which apply the heuristic.
    """
    return re.sub("(vitamina?) ([a-z\d]+)", r"\1_\2", name)

In [3]:
def columns_heuristics(name):
    """Return given `name` with applied batch of heuristics.
        name:str, the name to which apply the heuristics.
    """
    heuristics = [
        vitamins_heuristic
    ]
    for heuristic in heuristics:
        name = heuristic(name)
    return name

In [81]:
def tfidf(corpus:List[str], texts:List[np.ndarray], heuristics:Callable[[np.ndarray],np.ndarray]=None):
    """Return tfidf vectorization using given `corpus` vector. If given, also applies heuristics element-wise.
        corpus:List[str], list of texts to use as baseline for tfidf.
        texts:List[np.ndarray], list of vectors to which apply heuristics and tfidf vectorization element-wise.
        heuristics:Callable[[np.ndarray],np.ndarray], function to apply heauristics to texts.
    """
    vectorizer = TfidfVectorizer()
    vectorizer.fit(corpus)
    if heuristics is not None:
        texts = [heuristics(t) for t in texts]
    return [vectorizer.transform(t) for t in texts]

In [82]:
def pairwise_matrix_sum(A, B):
    return np.nansum(
        [A.reshape(A.shape[0], 1, A.shape[1]),
         B.reshape(1, *B.shape)], axis=0)


def pairwise_vector_sum(A, B):
    return np.nansum([A.reshape(-1, 1), B.reshape(1, B.size)], axis=0)

In [75]:
def weighted_mse(A, B):
    normalization = pairwise_matrix_sum(A, B)
    difference = np.abs(pairwise_matrix_sum(A, -B))
    mask = np.logical_or(np.isnan(normalization), normalization == 0)
    return np.nanmean(np.divide(difference, normalization, where=~mask), axis=2)

In [7]:
def weighted_mean_differences(A: np.matrix, B: np.matrix) -> np.matrix:
    """Return weighted mean differences of given matrices."""
    X, Y = np.nanmean(A, axis=0), np.nanmean(B, axis=0)
    N, M = np.sum(pd.notna(A), axis=0), np.sum(pd.notna(B), axis=0)
    return np.abs((pairwise_vector_sum(X, -Y) * pairwise_vector_sum(N, -M)) /
                  (pairwise_vector_sum(X, Y) + pairwise_vector_sum(N, M)))

In [96]:
def columns_to_type_groups(df):
    types = df.columns.to_series().groupby(df.dtypes).groups
    string = float64 = []
    if np.dtype('O') in types:
        string = types[np.dtype('O')]
    if np.dtype('float64') in types:
        float64 = types[np.dtype('float64')]
        
    return df[string], df[float64]

def matrix_type_argsort(matrix:np.matrix, df:pd.DataFrame)->np.matrix:
    """Return given `matrix` sorted using given `df` DataFrame types."""
    return np.argsort(matrix, order=list(zip(*sorted(zip(df.dtypes, range(len(df.dtypes))))))[1], axis=0)

def matrix_double_type_argsort(floats:np.matrix, strings:np.matrix, df1:pd.DataFrame, df2:pd.DataFrame)->np.matrix:
    """Return combined matrix sorted in both axis using given dataframes types."""
    ground = np.zeros(np.sum([floats.shape, strings.shape], axis=0))
    ground[slice(floats.shape)] = floats
    ground[slice(-strings.shape)] = strings
    return matrix_type_argsort(matrix_type_argsort(ground, df1).T, df2).T

In [76]:
def filter_mean(matrix):
    matrix[matrix >= np.nanmean(matrix)] = np.inf
    matrix[np.isnan(matrix)] = np.inf
    return matrix


def filter_mask(matrix):
    return np.all(
        [
            matrix == np.min(matrix, axis=1).reshape(-1, 1), matrix == np.min(
                matrix, axis=0), ~np.isinf(matrix)
        ],
        axis=0)


def drop_nan(df):
    return [df[c].drop_nan() for c in df]


def column_values_mean_tfidf_distance(df1: pd.DataFrame,
                                      df2: pd.DataFrame) -> np.matrix:
    """Return cosine distance of dataframes columns mean tfidf vectorization.
        df1:pd.DataFrame, first string only dataframe.
        df2:pd.DataFrame, second string only dataframe.
    """
    mean_vectors = np.mean(
        tfidf(
            np.concatenate([df1.values.flatten(),
                            df2.values.flatten()]),
            [*drop_nan(df1), *drop_nan(df2)]),
        axis=0)
    return cosine_distances(mean_vectors[:df1.shape[1]],
                            mean_vectors[df1.shape[1]:])


def tfidf_distance(corpus: List[str],
                   A: np.ndarray,
                   B: np.ndarray,
                   heuristics: Callable = None) -> np.matrix:
    """Return cosine distance of dataframes columns mean tfidf vectorization.
        corpus:List[str], list of texts to use as baseline for tfidf.
        A:np.ndarray, vector to which apply heuristics and tfidf vectorization element-wise.
        B:np.ndarray, vector to which apply heuristics and tfidf vectorization element-wise.
        heuristics:Callable[[np.ndarray],np.ndarray], function to apply heauristics to texts.
    """
    return cosine_distances(*tfidf(corpus, np.concatenate([A, B])))
    

def sub_incidence_matrix(corpus: List[str], A: pd.DataFrame, B: pd.DataFrame,
                         heuristics: Callable[[np.ndarray], np.ndarray]):
    strings, floats = list(zip(type_filter(A), type_filter(B)))
    return np.all(
        [
            filter_mask(filter_mean(matrix)) for matrix in [
                column_values_mean_tfidf_distance(*strings),
                weighted_mean_differences(*floats),
                tfidf_distance(corpus, A.columns, B.columns, heuristics),
            ]
        ],
        axis=0)

In [69]:
def incidence_matrix(corpus, dataframes, heuristics):
    n = sum([df.shape[1] for df in dataframes])
    ground = np.zeros((n, n))
    old = np.array([0, 0])
    for i, df1 in enumerate(dataframes):
        old[1] = 0
        for j, df2 in enumerate(dataframes[i:], i):
            if i==j:
                matrix = np.eye(df1.shape[1])
            else:
                matrix = sub_incidence_matrix(corpus, df1, df2, heuristics)
            x = slice(old[0], old[0] + df2.shape[1])
            y = slice(old[1], old[1] + df1.shape[1])
            ground[x, y] = matrix
            ground[y, x] = matrix.T
            old[1] += df1.shape[1]
        old[0] += df2.shape[1]

    return csr_matrix(ground)

In [70]:
path = "../sanitized_csv/"
csvs = [
    csv
    for path, dirs, csvs in os.walk(path)
    for csv in csvs
]
dataframes = [
    pd.read_csv("{path}/{csv}".format(path=path, csv=csv), index_col=["name"])
    for csv in csvs
]
corpus = [c for df in dataframes for c in df.columns]

In [95]:
np.sum([dataframes[0].shape,dataframes[0].shape], axis=0)

array([829, 829])

In [71]:
A = incidence_matrix(corpus, dataframes, np.vectorize(columns_heuristics, otypes=[str]))
columns = np.array([
    c for df in dataframes for c in filter_float(df).columns
])

ValueError: Found array with 0 sample(s) (shape=(0, 46)) while a minimum of 1 is required by the normalize function.

In [63]:
dense_A = A.todense()
for i, row in enumerate(dense_A):
    if np.sum(row) > 1:
        print(columns[np.where(row)[1]])
        dense_A[:, i] = 0 
    

NameError: name 'A' is not defined

In [None]:
C = np.random.randint(10, size=(10, ))