## Show empirically that the map-reduced version of each correlation function is within 0.5% of the non-map-reduced version

# Imports

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as ss
np.random.seed(42)
from collections import Counter
from multiprocessing import Pool, cpu_count

### Base Correlation Functions

In [2]:
def conditional_entropy(x, y):
    """Compute the conditional entropy of x given y: S(x|y).
    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy

    The probabilities are empirical frequencies: every distinct (x, y) pair
    and every distinct y value is counted, and the counts are divided by the
    number of observations in y. The natural logarithm is used.

    Parameters
    ----------
    x : array-like
        A sequence of measurements.
    y : array-like
        A sequence of measurements.

    Returns
    -------
    float
        The total entropy of x given y

    Examples
    --------
    >>> np.random.seed(1)
    >>> x = np.random.randint(0,2, size=10)
    >>> y = np.random.randint(0,2, size=10)
    >>> conditional_entropy(x,y)
    0.606842558824411

    """
    marginal_counts = Counter(y)
    joint_counts = Counter(zip(x, y))
    n_samples = sum(marginal_counts.values())

    # Walk the distinct (x, y) pairs once, collecting the joint count and the
    # count of the pair's y value side by side so both arrays stay aligned.
    joint_freq = []
    marginal_freq = []
    for (_, y_value), pair_count in joint_counts.items():
        joint_freq.append(pair_count)
        marginal_freq.append(marginal_counts[y_value])

    p_joint = np.array(joint_freq) / n_samples
    p_marginal = np.array(marginal_freq) / n_samples

    # S(x|y) = sum over pairs of p(x, y) * log(p(y) / p(x, y))
    return np.sum(p_joint * np.log(p_marginal / p_joint))


def cramers_v(x, y):
    """Calculates Cramer's V statistic for categorical-categorical association.
    Uses the bias correction from Wicher Bergsma, Journal of the Korean Statistical Society 42 (2013): 323-328.
    This is a symmetric coefficient: V(x,y) = V(y,x)
    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    The chi-squared statistic comes from ``scipy.stats.chi2_contingency`` called
    with its defaults, so SciPy's Yates continuity correction is applied to
    2x2 tables. That is why the example below is slightly below 1 even though
    ``y`` is identical to ``x``.

    Parameters
    ----------
    x : array-like
        A sequence of categorical measurements.
    y : array-like
        A sequence of categorical measurements.

    Returns
    -------
    float
        Coefficient in the range [0, 1]. ``nan`` is returned (and a
        ``RuntimeWarning`` is issued) when the bias-corrected statistic is
        undefined: fewer than two observations, a variable with a single
        level, or as many levels as observations.

    Examples
    --------
    >>> np.random.seed(1)
    >>> x = np.random.randint(0, 2, size=100)
    >>> y = x
    >>> cramers_v(x, y)
    0.9795896894087645

    """
    confusion_matrix = pd.crosstab(x, y)
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum().sum()
    r, k = confusion_matrix.shape

    # Bias-corrected table dimensions; they are only defined for n > 1
    # because every correction term divides by (n - 1).
    denominator = 0.0
    if n > 1:
        rcorr = r - ((r - 1) ** 2) / (n - 1)
        kcorr = k - ((k - 1) ** 2) / (n - 1)
        denominator = min(kcorr - 1, rcorr - 1)

    if denominator <= 0:
        # Degenerate table: dividing would produce nan through an opaque
        # numpy "invalid value" warning, so say explicitly what happened.
        import warnings
        warnings.warn(
            "Unable to calculate bias-corrected Cramer's V: the contingency "
            "table is degenerate (too few observations or levels).",
            RuntimeWarning,
        )
        return np.nan

    phi2 = chi2 / n
    # The corrected phi^2 is clipped at zero so the square root stays real.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    return np.sqrt(phi2corr / denominator)


def theils_u(x, y):
    """Compute Theil's U (the uncertainty coefficient) for categorical-categorical association.
    This is the uncertainty of x given y: the fraction of the entropy of x that
    is removed by knowing y. The value is on the range [0,1] - where 0 means y
    provides no information about x, and 1 means y provides full information
    about x.
    This is an asymmetric coefficient: U(x,y) != U(y,x)
    Wikipedia: https://en.wikipedia.org/wiki/Uncertainty_coefficient

    Parameters
    ----------
    x : array-like
        A sequence of categorical measurements.
    y : array-like
        A sequence of categorical measurements.

    Returns
    -------
    float
        Coefficient in the range [0, 1]. When x has zero entropy the
        integer 1 is returned.

    Examples
    --------
    >>> np.random.seed(1)
    >>> x = np.random.randint(0, 2, size=100)
    >>> y = x
    >>> theils_u(x, y)
    1.0

    """
    entropy_x_given_y = conditional_entropy(x, y)

    # Empirical distribution of x, then its entropy S(x).
    level_counts = Counter(x)
    n_samples = sum(level_counts.values())
    marginal = [count / n_samples for count in level_counts.values()]
    entropy_x = ss.entropy(marginal)

    # A variable with zero entropy has nothing left to explain.
    if entropy_x == 0:
        return 1
    return (entropy_x - entropy_x_given_y) / entropy_x


def correlation_ratio(categories, measurements):
    """Calculates the Correlation Ratio (sometimes marked by the greek letter Eta) for categorical-continuous association.
    Answers the question - given a continuous value of a measurement, is it possible to know which category is it
    associated with?
    Value is in the range [0,1], where 0 means a category cannot be determined by a continuous measurement, and 1 means
    a category can be determined with absolute certainty.
    Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio

    Note: the value returned is the ratio of the between-category weighted sum
    of squares to the total sum of squares; no square root is taken (the
    Wikipedia page writes this quantity as eta squared).

    Parameters
    ----------
    categories : array-like
        A sequence of categorical measurements.
    measurements : array-like
        A sequence of continuous measurements. Lists and pandas objects are
        accepted; values are always matched to categories by position.

    Returns
    -------
    float
        Coefficient in the range [0, 1].

    Examples
    --------
    >>> np.random.seed(1)
    >>> categories = np.random.randint(0,2, size=100)
    >>> measurements = np.random.rand(100)
    >>> correlation_ratio(categories, measurements)
    0.042988734885557815

    """
    # Coerce to plain arrays so that selection below is purely positional:
    # a list cannot be indexed with an array, and a pandas Series would look
    # the positions up as index labels.
    measurements = np.asarray(measurements)
    fcat, _ = pd.factorize(np.asarray(categories))
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(cat_num):
        cat_measures = measurements[fcat == i]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    # Overall mean, rebuilt from the per-category means and sizes.
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array))/np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(measurements, y_total_avg), 2))
    # A zero numerator also covers the case where every measurement is equal
    # (zero denominator), so no division by zero is attempted.
    if numerator == 0:
        eta = 0.0
    else:
        eta = numerator/denominator
    return eta

In [21]:
def simul_parallel(func, x, y, n=100):
    """Apply ``func`` to ``n`` consecutive chunks of ``x`` and ``y`` and average the results.

    The inputs are cut into ``n`` contiguous chunks of ``len(x) // n``
    elements each; the last chunk additionally receives whatever remainder is
    left, so every element is used exactly once. The per-chunk results are
    combined with an unweighted ``np.mean``.

    Parameters
    ----------
    func : callable
        Called as ``func(x_chunk, y_chunk)`` for every chunk.
    x : sequence
        First input; must support ``len`` and slicing.
    y : sequence
        Second input, sliced with the same boundaries as ``x``.
    n : int, optional
        Number of chunks (not the chunk size). Defaults to 100.

    Returns
    -------
    float
        Mean of the per-chunk results.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or larger than ``len(x)``, because then at
        least one chunk would be empty.
    """
    n_chunks = int(n)
    total = len(x)
    if not 1 <= n_chunks <= total:
        raise ValueError(
            f"n must be between 1 and len(x)={total}, got {n}"
        )

    chunk_size = total // n_chunks
    res = []
    for i in range(n_chunks):
        beg = i * chunk_size
        # The final chunk runs to the end so the remainder is never dropped.
        end = None if i == n_chunks - 1 else beg + chunk_size
        res.append(func(x[beg:end], y[beg:end]))

    return np.mean(res)

In [41]:
def parallel_trials(func, x, y, Ns):
    # Simulate Results of Chunk step in Dask ACA
    """
    Run simul_parallel once per entry of Ns, spreading the runs over a
    process pool (utilize multiple CPUs). Each run receives:
        * func - a function to apply
        * x - data, array
        * y - data, array
        * n - the number of chunks, taken from Ns
    Inside each run, func is applied to the n chunks of arrays x and y and the
    per-chunk results are averaged.

    Returns a list with one averaged value per entry of Ns, in the same order.

    The arguments are sent to worker processes, so func, x and y must be
    picklable (e.g. func must be defined at module level).
    """
    # Leave one CPU free, but never ask for fewer than one worker:
    # Pool(0) raises ValueError on a single-CPU machine.
    n_workers = max(1, cpu_count() - 1)
    with Pool(n_workers) as pool:
        res = pool.starmap(simul_parallel, [(func, x, y, n) for n in Ns])
    return res

In [42]:
def get_max_diff_from_trials(func, x, y, Ns):
    """Return the largest relative deviation of the chunk-averaged results from the serial result.

    ``func(x, y)`` is evaluated once over the full inputs as the reference.
    ``parallel_trials`` then produces one chunk-averaged value per entry of
    ``Ns``, and the maximum of ``|chunked - serial| / |serial|`` is returned.

    Parameters
    ----------
    func : callable
        Function of two array arguments returning a number.
    x, y : array-like
        Inputs passed to ``func``.
    Ns : iterable of int
        Chunk counts to try.

    Returns
    -------
    float
        Maximum relative deviation as a fraction (multiply by 100 for
        percent). If the serial value is 0 the result is inf or nan.
    """
    func_ser = func(x, y)
    func_par = np.array(parallel_trials(func, x, y, Ns))
    # Divide by the magnitude of the reference so a negative reference cannot
    # flip the sign of the deviations and make max() pick the smallest one.
    rel_diff = np.abs(func_par - func_ser) / np.abs(func_ser)
    return rel_diff.max()

### Make Demo Data

In [43]:
# Two category labels (0 and 1) and one million samples.
cats = np.arange(2)
size = 1000000
# cats_1: category labels drawn at random from cats.
cats_1 = np.random.choice(cats, size=size)
# cats_2: copy of cats_1 in which a label is kept when a fresh uniform draw is
# below 0.9 and is otherwise replaced by the next label (wrapping around).
cats_2 = np.array([x if np.random.uniform()< .9 else (x+1)%len(cats) for x in cats_1])
# conts_1: one standard-normal draw per sample, shifted by +0.5 where the
# cats_1 label is 1 and by -0.5 where it is 0.
conts_1 = np.array([np.random.randn()+.5 if x else np.random.randn()-.5 for x in cats_1])

In [44]:
# Chunk counts to try: 10 values spaced logarithmically between 10**1 and
# 10**3, converted to integers.
Ns = np.logspace(1,3, num=10, dtype=int)

# Cramers V Parallel

#### cramers_v

In [45]:
# Serial reference: Cramer's V computed once over the full arrays.
cramer_ser = cramers_v(cats_1, cats_2)
cramer_ser

0.7998418887727863

In [46]:
# Largest relative deviation of the chunk-averaged Cramer's V from the serial
# value, over every chunk count in Ns.
cramer_max_diff = get_max_diff_from_trials(cramers_v, cats_1, cats_2, Ns)

### Within 0.5% 

In [47]:
# Show the largest relative deviation as a percentage.
print(cramer_max_diff*100)

0.27997057669525616


# Theils_U Parallel

In [34]:
# Serial reference: Theil's U of cats_1 given cats_2 over the full arrays.
theils_ser = theils_u(cats_1, cats_2)
theils_ser

0.5318704426953134

In [35]:
# Largest relative deviation of the chunk-averaged Theil's U from the serial
# value, over every chunk count in Ns.
theils_max_diff = get_max_diff_from_trials(theils_u, cats_1, cats_2, Ns)

### Within 0.5%

In [36]:
# Show the largest relative deviation as a percentage.
print(theils_max_diff*100)

0.21295064331663247


## Correlation Ratio Parallel

In [37]:
# Serial reference: correlation ratio between the cats_1 labels and the
# conts_1 measurements over the full arrays.
eta_ser = correlation_ratio(cats_1, conts_1)
eta_ser

0.20028934064832624

In [38]:
# Largest relative deviation of the chunk-averaged correlation ratio from the
# serial value, over every chunk count in Ns.
eta_max_diff = get_max_diff_from_trials(correlation_ratio, cats_1, conts_1, Ns)

## Within 0.5%

In [39]:
# Show the largest relative deviation as a percentage.
print(eta_max_diff*100)

0.24989565669311598


# Conclusion

Even with small partitions (down to about 1,000 rows per chunk in these trials), the ACA-style aggregation of the correlations yields results within ±0.5% of the single-pass values, which is well within an acceptable tolerance for our purposes.