In [1]:
import dask
import dask.dataframe as df
import numpy as np
import pandas as pd
from scipy import sparse as sp
from math import log
from collections import Counter
import functools 
import sklearn.metrics as metrics
np.random.seed(0)

In [None]:
trues = np.array([1,2,3,3,2,1])
preds = np.array([4,5,4,4,5,5])
true_classes, true_idx =np.unique(trues, return_inverse=True)
print(true_classes)
print(true_idx)
pred_classes, pred_idx =np.unique(preds, return_inverse=True)
print(metrics.mutual_info_score(trues, preds))
print(pred_classes)
print(pred_idx)
n_classes = true_classes.shape[0]
n_preds = pred_classes.shape[0]

In [None]:
0.4620981203732969


In [2]:
@dask.delayed
def partition_mutual_info_pre_score(true: pd.Series, pred: pd.Series):
    datos = {}
    true_classes, true_idx = np.unique(true, return_inverse=True)
    datos['true_classes'] = true_classes
    datos['true_idx'] = true_idx
    pred_classes, pred_idx = np.unique(pred, return_inverse=True)
    datos['pred_classes'] = pred_classes
    datos['pred_idx'] = pred_idx
    n_classes = true_classes.shape[0]
    n_preds = pred_classes.shape[0]
    datos['n_classes'] = n_classes
    datos['n_preds'] = n_preds
    contingency = sp.coo_matrix((np.ones(true_idx.shape[0]),
                                 (true_idx, pred_idx)),
                                shape=(n_classes, n_preds),
                                dtype=np.int)
    nzx, nzy, nz_val = sp.find(contingency)
    datos['nzx'], datos['nzy'], datos['nz_val'] = nzx, nzy, nz_val
    contingency_sum = contingency.sum()
    datos['contingency_sum'] = contingency_sum
    pi = np.ravel(contingency.sum(axis=1))
    datos['pi'] = pi
    pj = np.ravel(contingency.sum(axis=0))
    datos['pj'] = pj
    return datos

In [3]:
@dask.delayed(nout=2)
def gen_pi_pj(chunks_mi_list: list, true_classes: list, pred_classes: list):
    #pi_dask = [0 for i in range(true_classes_len)]
    pi_dask = np.zeros(len(true_classes))
    pj_dask = np.zeros(len(pred_classes))
    for mi_chunk in chunks_mi_list:
        for index, clase in enumerate(true_classes):
            try:
                index_true_clase = mi_chunk['true_classes'].tolist().index(clase)
                pi_dask[index] = pi_dask[index] + mi_chunk['pi'][mi_chunk['true_classes'].tolist().index(clase)]
            except (IndexError, ValueError):
                None
        for index, clase in enumerate(pred_classes):
            try:
                index_pred_clase = mi_chunk['pred_classes'].tolist().index(clase)
                pj_dask[index] = pj_dask[index] + mi_chunk['pj'][mi_chunk['pred_classes'].tolist().index(clase)]
            except (IndexError, ValueError):
                None
    return (pi_dask, pj_dask)

In [4]:
@dask.delayed(nout=3)
def gen_nzx_nzy_nzval_dask(chunks_mi_list: list, true_classes, pred_classes):
    nzx_dask, nzy_dask, nz_val_dask = np.array([], dtype=np.int64),np.array([], dtype=np.int64),np.array([], dtype=np.int64)
    cross_clusters_list = []
    for mi_chunk in chunks_mi_list:
        true_nzx_np = np.array(list(map(lambda x: mi_chunk['true_classes'][x], mi_chunk['nzx'])))
        true_nzy_np = np.array(list(map(lambda x: mi_chunk['pred_classes'][x], mi_chunk['nzy'])))
        true_nz_val = mi_chunk['nz_val']
        cross_clusters_list.append(Counter(dict(list(zip(zip(true_nzx_np,true_nzy_np),true_nz_val)))))
    cross_clusters = dict(functools.reduce(lambda a,b : a+b,cross_clusters_list))
    for key in cross_clusters.keys():
        nzx_dask = np.append(nzx_dask, true_classes.tolist().index(key[0]))
        nzy_dask = np.append(nzy_dask, pred_classes.tolist().index(key[1]))
        nz_val_dask = np.append(nz_val_dask, cross_clusters[key])
    return (nzx_dask, nzy_dask, nz_val_dask)

In [None]:
@dask.delayed
def mutual_info_score(chunked_mi_list: list, trues: df.Series, preds: df.Series):
    pi, pj = gen_pi_pj(chunked_mi_list, trues, preds)
    nzx,nzy,nz_val = gen_nzx_nzy_nzval_dask(chunked_mi_list)
    contingency_sum = get_contingency_sum(chunked_mi_list)
    log_contingency_nm = np.log(nz_val)
    print(log_contingency_nm)
    contingency_nm = nz_val / contingency_sum
    print(contingency_nm)
    # Don't need to calculate the full outer product, just for non-zeroes
    outer = pi.take(nzx).astype(np.int64) * pj.take(nzy).astype(np.int64)
    print(pi.take(nzx).astype(np.int64))
    print(outer)
    print(pj.take(nzy).astype(np.int64))
    log_outer = get_log_outer(outer,pi,pj)
    mi = (contingency_nm * (log_contingency_nm - log(contingency_sum)) +
          contingency_nm * log_outer)
    return mi.sum()
    

In [5]:
@dask.delayed
def get_log_outer(outer_delayed, pi_delayed, pj_delayed):
    print(outer_delayed)
    print(pi_delayed)
    print(pj_delayed)
    return -np.log(outer_delayed) + np.log(sum(pi_delayed)) + np.log(sum(pj_delayed))

@dask.delayed
def get_mi(contingency_nm_d, log_contingency_nm_d, contingency_sum_d, log_outer_d):
    return (contingency_nm_d * (log_contingency_nm_d - log(contingency_sum_d)) +
          contingency_nm_d * log_outer_d)

@dask.delayed
def get_contingency_sum(chunks_mi_list: list):
    suma = 0
    for mi_chunk in chunks_mi_list:
        suma = suma + mi_chunk['contingency_sum']
    return suma

@dask.delayed
def get_log_contingency_nm(nz_val_delayed):
    return np.log(nz_val_delayed)

@dask.delayed
def get_contingency_nm(contingency_sum_delayed, nz_val_delayed):
    contingency_nm = nz_val_delayed / contingency_sum_delayed
    return contingency_nm

In [6]:
def mutual_information(true: df.DataFrame, pred: df.DataFrame):
    #Mutual information of distributions in format of pd.Series or pd.DataFrame.
    str_trues = true.astype(str).apply(lambda x: ' '.join(x.tolist()), axis=1, meta=('phrase', 'object'))
    str_preds = pred.astype(str).apply(lambda x: ' '.join(x.tolist()), axis=1, meta=('phrase', 'object'))
    true_classes = str_trues.unique()
    pred_classes = str_preds.unique()
    chunked_mi_list = list(map(lambda x: partition_mutual_info_pre_score(x[0],x[1]),list(zip(str_trues.to_delayed(), 
                                                                       str_preds.to_delayed()))))
    pi, pj = gen_pi_pj(chunked_mi_list, true_classes, preds)
    nzx,nzy,nz_val = gen_nzx_nzy_nzval_dask(chunked_mi_list, true_classes, pred_classes)
    contingency_sum = get_contingency_sum(chunked_mi_list)
    log_contingency_nm = get_log_contingency_nm(nz_val)
    contingency_nm = get_contingency_nm(chunked_mi_list, nz_val)
    
    # Don't need to calculate the full outer product, just for non-zeroes
    outer = pi.take(nzx).astype(np.int64) * pj.take(nzy).astype(np.int64)
    log_outer = get_log_outer(outer, pi, pj)
    mi = get_mi(contingency_nm, log_contingency_nm, contingency_sum, log_outer)
    return mi



In [7]:
trues = df.from_pandas(pd.DataFrame([1,2,3,3,2,1]), npartitions=2)
preds = df.from_pandas(pd.DataFrame([4,5,4,4,5,5]), npartitions=2)
mi = mutual_information(trues, preds)

In [8]:
dask.compute(mi)

ValueError: operands could not be broadcast together with shapes (4,) (2,) 

In [None]:
def greedy_bayes(dataset: df.DataFrame, k=0: int, epsilon=0: float):
    """Construct a Bayesian Network (BN) using greedy algorithm.

    Parameters
    ----------
        dataset : DataFrame
            Input dataset, which only contains categorical attributes.
        k : int
            Maximum degree of the constructed BN. If k=0, k is automatically calculated.
        epsilon : float
            Parameter of differential privacy.
    """

    num_tuples, num_attributes = dataset.shape
    if not k:
        k = calculate_k(num_attributes, num_tuples)

    attributes = set(dataset.keys())
    N = []
    V = set()
    V.add(random.choice(attributes))

    print('================== Constructing Bayesian Network ==================')
    for i in range(1, len(attributes)):
        print('Looking for next attribute-parents pair.')
        rest_attributes = attributes - V
        parents_pair_list = []
        mutual_info_list = []
        for child in rest_attributes:
            print('    Considering attribute {}'.format(child))
            for parents in combinations(V, min(k, len(V))):
                parents = list(parents)
                parents_pair_list.append((child, parents))
                # TODO consider to change the computation of MI by combined integers instead of strings.
                mi = mutual_information(dataset[child], dataset[parents])
                mutual_info_list.append(mi)

        if epsilon:
            sampling_distribution = exponential_mechanism(dataset, mutual_info_list, epsilon)
            idx = np.random.choice(list(range(len(mutual_info_list))), p=sampling_distribution)
        else:
            idx = mutual_info_list.index(max(mutual_info_list))

        N.append(parents_pair_list[idx])
        V.add(parents_pair_list[idx][0])

    print('========================= BN constructed =========================')

    return N

In [None]:
np.log