# Imports

In [1]:
from copy import deepcopy
from dask import delayed
import pandas as pd
import dask.dataframe as dd
from distributed import Client, LocalCluster
from dask.dataframe.core import aca
import scipy.stats as ss
import numpy as np
from collections import Counter
from functools import partial

## Resources

In [20]:
# data_path = '../../../data/flights_data/trip_logs.parquet'
# data_path = '/Users/nathanieldake/development/unsupervised/DSResearchSpikes/010_Column_Correlation/eda_tools/test_table_class.parquet'


In [21]:
# Location of the parquet dataset that is loaded further down with dd.read_parquet.
# NOTE(review): this is an absolute path under one user's home directory; point it
# at a local copy of the file before running the notebook elsewhere.
data_path = '/Users/nathanieldake/development/unsupervised/data/cw_data/Item_Level_Details_Original.parquet'

In [3]:
# Start a local dask cluster with six workers and connect a client to it.
cluster = LocalCluster(n_workers=6)
client = Client(cluster)

In [22]:
client

0,1
Client  Scheduler: tcp://127.0.0.1:56968,Cluster  Workers: 6  Cores: 12  Memory: 17.18 GB


In [23]:
# Names of the supported missing-value strategies (see associations_dask and
# the nan_strategy decorator).
REPLACE = 'replace'
DROP_SAMPLES = 'drop_samples'
DROP_FEATURES = 'drop_features'
SKIP = 'skip'
# Default fill values used by associations_dask when the strategy is REPLACE.
DEFAULT_REPLACE_NUMERIC = 0.0
DEFAULT_REPLACE_NOMINAL = 'MISSING'

## Correlation Func Primitives

In [24]:
def remove_na_rows(x, y):
    """Drop every position where either ``x`` or ``y`` is missing.

    Parameters
    ----------
    x : array-like
        First sequence of measurements.
    y : array-like
        Second sequence of measurements, paired with ``x``.

    Returns
    -------
    tuple of pandas.Series
        The surviving ``x`` and ``y`` values, re-indexed from zero.
    """
    paired = pd.DataFrame({'x': x, 'y': y}).dropna()
    paired = paired.reset_index(drop=True)
    return paired['x'], paired['y']

def nan_strategy(func):
    """Decorator adding an optional ``nan_strategy`` keyword to a pairwise measure.

    The wrapped callable accepts ``(x, y, **kwargs)``. When
    ``kwargs['nan_strategy']`` equals ``DROP_SAMPLES``, positions where either
    input is missing are removed (via ``remove_na_rows``) before ``func`` is
    called; for any other value the inputs are passed through unchanged.
    Keyword arguments are consumed by the wrapper and are not forwarded.

    Parameters
    ----------
    func : callable
        A function of two positional arguments ``(x, y)``.

    Returns
    -------
    callable
        The wrapper, carrying ``func``'s name and docstring.
    """
    # Local import: only `partial` is imported from functools at file level.
    from functools import wraps

    @wraps(func)  # keep the measure's __name__/__doc__ instead of exposing "inner"
    def inner(x, y, **kwargs):
        if kwargs.get('nan_strategy', SKIP) == DROP_SAMPLES:
            x, y = remove_na_rows(x, y)
        return func(x, y)
    return inner

In [25]:
def identify_nominal_columns(df, include=None):
    """Given a dataset, identify categorical columns.

    Parameters:
    -----------
    df : a dataframe exposing ``select_dtypes`` (e.g. a pandas dataframe)
    include : which column types to filter by; ``None`` (the default) means
        ['object', 'category']

    Returns:
    --------
    categorical_columns : a list of categorical columns

    Example:
    --------
    >> df = pd.DataFrame({'col1': ['a', 'b', 'c', 'a'], 'col2': [3, 4, 2, 1]})
    >> identify_nominal_columns(df)
    ['col1']
    """
    # Build the default per call rather than sharing one mutable list
    # object across all invocations.
    if include is None:
        include = ['object', 'category']
    return list(df.select_dtypes(include=include).columns)


In [26]:
def conditional_entropy(x, y):
    """Conditional entropy of x given y, S(x|y), using the natural logarithm.

    Wikipedia: https://en.wikipedia.org/wiki/Conditional_entropy

    Parameters
    ----------
    x : array-like
        A sequence of measurements.
    y : array-like
        A sequence of measurements, paired element-wise with ``x``.

    Returns
    -------
    float
        The total entropy of x given y

    Examples
    --------
    >>> np.random.seed(1)
    >>> x = np.random.randint(0,2, size=10)
    >>> y = np.random.randint(0,2, size=10)
    >>> conditional_entropy(x,y)
    0.606842558824411

    """
    marginal_y = Counter(y)
    joint = Counter(zip(x, y))
    n_obs = sum(marginal_y.values())
    # Joint probability p(x, y) of every observed pair.
    joint_prob = np.array(list(joint.values())) / n_obs
    # Marginal probability p(y) looked up for the y of each observed pair.
    y_prob = np.array([marginal_y[pair[1]] for pair in joint]) / n_obs
    return np.sum(joint_prob * np.log(y_prob / joint_prob))

@nan_strategy
def cramers_v(x, y):
    """Bias-corrected Cramer's V for categorical-categorical association.

    Applies the correction from Bergsma and Wicher, Journal of the Korean
    Statistical Society 42 (2013): 323-328.
    The coefficient is symmetric: V(x,y) = V(y,x)
    Original function taken from: https://stackoverflow.com/a/46498792/5863503
    Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

    Parameters
    ----------
    x : array-like
        A sequence of categorical measurements.
    y : array-like
        A sequence of categorical measurements.

    Returns
    -------
    float
        Coefficient in the range [0, 1].

    Examples
    --------
    >>> np.random.seed(1)
    >>> x = np.random.randint(0, 2, size=100)
    >>> y = x
    >>> cramers_v(x, y)
    0.9795896894087645

    """
    contingency = pd.crosstab(x, y)
    chi2_stat = ss.chi2_contingency(contingency)[0]
    n_obs = contingency.sum().sum()
    n_rows, n_cols = contingency.shape

    # Bias-corrected phi^2, clipped at zero.
    phi2 = chi2_stat / n_obs
    phi2_adjusted = max(0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (n_obs - 1))

    # Bias-corrected table dimensions.
    rows_adjusted = n_rows - ((n_rows - 1) ** 2) / (n_obs - 1)
    cols_adjusted = n_cols - ((n_cols - 1) ** 2) / (n_obs - 1)

    smaller_dim = min((cols_adjusted - 1), (rows_adjusted - 1))
    return np.sqrt(phi2_adjusted / smaller_dim)

@nan_strategy
def theils_u(x, y):
    """Theil's U (uncertainty coefficient) for categorical-categorical association.

    Measures the uncertainty of x given y on the range [0, 1]: 0 means y
    provides no information about x, and 1 means y provides full information
    about x.
    This is an asymmetric coefficient: U(x,y) != U(y,x)
    Wikipedia: https://en.wikipedia.org/wiki/Uncertainty_coefficient

    Parameters
    ----------
    x : array-like
        A sequence of categorical measurements.
    y : array-like
        A sequence of categorical measurements.

    Returns
    -------
    float
        Coefficient in the range [0, 1]. The integer 1 is returned when the
        entropy of x is zero.

    Examples
    --------
    >>> np.random.seed(1)
    >>> x = np.random.randint(0, 2, size=100)
    >>> y = x
    >>> theils_u(x, y)
    1.0

    """
    entropy_x_given_y = conditional_entropy(x, y)

    # Marginal distribution of x and its entropy.
    x_counts = Counter(x)
    n_obs = sum(x_counts.values())
    x_probs = [count / n_obs for count in x_counts.values()]
    entropy_x = ss.entropy(x_probs)

    if entropy_x == 0:
        # x carries no uncertainty at all.
        return 1
    return (entropy_x - entropy_x_given_y) / entropy_x

@nan_strategy
def correlation_ratio(categories, measurements):
    """Calculates the Correlation Ratio (sometimes marked by the greek letter Eta) for categorical-continuous association.
    Answers the question - given a continuous value of a measurement, is it possible to know which category is it
    associated with?
    Value is in the range [0,1], where 0 means a category cannot be determined by a continuous measurement, and 1 means
    a category can be determined with absolute certainty.
    Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio

    Parameters
    ----------
    categories : array-like
        A sequence of categorical measurements.
    measurements : array-like
        A sequence of continuous measurements, paired element-wise with
        ``categories``. Any array-like is accepted (pandas Series, ndarray, ...).

    Returns
    -------
    float
        Between-category sum of squares divided by the total sum of squares,
        or 0.0 when the between-category sum of squares is zero.

    Examples
    --------
    Category means are 1.5 and 3.5 around a grand mean of 2.5, giving 4 / 5:

    >>> categories = np.array(['a', 'a', 'b', 'b'])
    >>> measurements = np.array([1.0, 2.0, 3.0, 4.0])
    >>> float(correlation_ratio(categories, measurements))
    0.8

    """
    # Work on a plain ndarray so positional boolean masks can be used; the
    # previous `.iloc` access only worked when a pandas Series was passed.
    values = np.asarray(measurements)
    fcat, _ = pd.factorize(categories)
    cat_num = np.max(fcat)+1
    y_avg_array = np.zeros(cat_num)
    n_array = np.zeros(cat_num)
    for i in range(cat_num):
        cat_measures = values[fcat == i]
        n_array[i] = len(cat_measures)
        y_avg_array[i] = np.average(cat_measures)
    # Grand mean reconstructed from the per-category means and sizes.
    y_total_avg = np.sum(np.multiply(y_avg_array, n_array))/np.sum(n_array)
    numerator = np.sum(np.multiply(n_array, np.power(np.subtract(y_avg_array, y_total_avg), 2)))
    denominator = np.sum(np.power(np.subtract(values, y_total_avg), 2))
    # NOTE(review): no square root is taken here, so the value is the ratio of
    # sums of squares itself - confirm whether eta or eta squared is intended.
    if numerator == 0:
        eta = 0.0
    else:
        eta = numerator/denominator
    return eta


## Make a symmetrical Theil's U with nested Delayed

In [27]:
def theils_u_symmetrical(x, y, **kwargs):
    """Symmetric variant of Theil's U: the mean of U(x, y) and U(y, x).

    Both directions are wrapped with ``delayed`` and the averaged result is
    computed before returning. ``kwargs`` are forwarded to ``theils_u``.
    """
    forward = delayed(theils_u)(x, y, **kwargs)
    backward = delayed(theils_u)(y, x, **kwargs)
    averaged = delayed(np.mean)([forward, backward])
    return averaged.compute()

## Apply-Concat-Apply function for Dask Distributed

In [28]:
def dask_correlation_aca(corr_func, *args, **kwargs):
    """Run ``corr_func`` through dask's apply-concat-apply (``aca``).

    ``corr_func`` is used as the chunk function and the chunk results are
    aggregated with ``np.mean``. The ``nan_strategy`` keyword (default
    ``'skip'``) is removed from ``kwargs`` and handed to the chunk function;
    every other keyword is passed on to ``aca`` together with ``meta=float``.
    The caller's ``kwargs`` are copied first, so they are never modified.
    """
    aca_kwargs = deepcopy(kwargs)
    strategy = aca_kwargs.pop('nan_strategy', 'skip')
    aca_kwargs['meta'] = float
    return aca(
        args,
        chunk=corr_func,
        aggregate=np.mean,
        chunk_kwargs={'nan_strategy': strategy},
        **aca_kwargs
    )

In [29]:
def dask_pairwise_pearson(df, x_col, y_col, **kwargs):
    """Correlation between two numeric columns of ``df``.

    Parameters
    ----------
    df : dask.dataframe
        The dataframe holding both columns.
    x_col : str
        Name of a numeric column.
    y_col : str
        Name of a numeric column.
    **kwargs
        Accepted for signature compatibility with the other pairwise
        functions; not used.

    Returns
    -------
    float
        Coefficient in the range [-1, 1]. It is taken as the minimum entry of
        the 2x2 correlation matrix of the two columns.
        NOTE(review): for a dask dataframe this is presumably still lazy until
        computed - confirm.
    """
    pair = df[[x_col, y_col]]
    corr_matrix = pair.corr()
    return corr_matrix.values.min()

def dask_pairwise_corr_func(corr_func, df, x_col, y_col, **kwargs):
    """Apply ``corr_func`` to columns ``x_col`` and ``y_col`` of ``df`` via ``dask_correlation_aca``."""
    x_series = df[x_col]
    y_series = df[y_col]
    return dask_correlation_aca(corr_func, x_series, y_series, **kwargs)

## Correlation Getter

In [30]:
class DaskCorrelationGetter():
    """Dispatches a column pair to the association measure matching its types."""

    def __init__(self, use_theils_u=False):
        """Wraps correlation methods for nominal and numeric series.

        Parameters
        ----------
        use_theils_u : bool, default=False
            Whether or not to use a symmetric Theil's U for nominal-only columns
        """
        self.use_theils_u = use_theils_u
        self.corr_funcs = self._initialize_corr_methods()

    def _initialize_corr_methods(self):
        """Build the lookup from '<x_type><y_type>' to the pairwise function."""
        nominal_measure = theils_u_symmetrical if self.use_theils_u else cramers_v
        mixed = partial(dask_pairwise_corr_func, correlation_ratio)
        return {
            'numericnumeric': dask_pairwise_pearson,
            'nominalnumeric': mixed,
            'nominalnominal': partial(dask_pairwise_corr_func, nominal_measure),
            'numericnominal': mixed,
        }

    def get_corr_value(self, df, x_col, y_col, x_type, y_type, **kwargs):
        """Return the association between two columns of ``df``.

        ``x_type`` and ``y_type`` are each 'nominal' or 'numeric'. For a
        numeric/nominal pair the columns are swapped so that the nominal
        column is passed first. ``kwargs`` are forwarded to the selected
        function.
        """
        pair_kind = x_type + y_type
        if pair_kind == 'numericnominal':
            x_col, y_col = y_col, x_col
        handler = self.corr_funcs[pair_kind]
        return handler(df, x_col, y_col, **kwargs)

## Base Associations function

In [31]:
def associations_dask(dataset,
                 nominal_columns='auto',
                 mark_columns=False,
                 theils_u=True,
                 nan_strategy=REPLACE,
                 nan_replace_numeric=DEFAULT_REPLACE_NUMERIC,
                 nan_replace_nominal=DEFAULT_REPLACE_NOMINAL):
    """
    Calculate the correlation/strength-of-association of features in data-set
    with both categorical (eda_tools) and continuous features using:
     * Pearson's R for continuous-continuous cases
     * Correlation Ratio for categorical-continuous cases
     * Cramer's V or Theil's U for categorical-categorical cases
    **Returns:** a DataFrame of the correlation/strength-of-association between
    all features
    **Example:** see `associations_example` under `dython.examples`
    Parameters
    ----------
    dataset : dask.dataframe.DataFrame
        The data-set for which the features' correlation is computed.
        Datetime columns are excluded before anything else happens.
    nominal_columns : string / list / NumPy ndarray
        Names of columns of the data-set which hold categorical values. Can
        also be the string 'all' to state that all columns are categorical,
        'auto' (default) to try to identify nominal columns, or None to state
        none are categorical. Any other string is treated as a single column
        name. Names that are not columns of the (filtered) data-set are ignored.
    mark_columns : Boolean, default = False
        if True, output's columns' names will have a suffix of '(nom)' or
        '(con)' based on their type (eda_tools or continuous), as provided
        by nominal_columns
    theils_u : Boolean, default = True
        In the case of categorical-categorical features, use a symmetrical Theil's U instead
        of Cramer's V. Computation cost is 2x Theils U but will perform better than Cramer's V for higher
        cardinality.
    nan_strategy : string, default = 'replace'
        How to handle missing values: can be either 'drop_samples' to remove
        samples with missing values, 'drop_features' to remove features
        (columns) with missing values, or 'replace' to replace all missing
        values with the nan_replace_value. Missing values are None and np.nan.
        The strategy is also forwarded to every pairwise function.
    nan_replace_numeric : numeric, default = 0.0
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'
    nan_replace_nominal: str, default = "MISSING"
        The value used to replace missing values with. Only applicable when
        nan_strategy is set to 'replace'
    """
    print('WARNING: High Cardinality Nominal types (e.g. identifiers) will increase run-time non-linearly')
    dataset = dataset.select_dtypes(exclude=['datetime'])

    if nan_strategy == DROP_FEATURES:
        # dask's dropna takes neither `axis` nor `inplace`, so count the
        # missing values per column and keep only the complete columns.
        null_counts = dataset.isnull().sum().compute()
        dataset = dataset[[col for col in dataset.columns if null_counts[col] == 0]]

    # Read the column list only after any columns were dropped so that it
    # always matches the data that is actually analysed.
    columns = dataset.columns

    if nominal_columns is None:
        nominal_columns = []
    elif isinstance(nominal_columns, str):
        # Compare against the keywords only for real strings; `==` on a list
        # or ndarray would compare element-wise.
        if nominal_columns == 'all':
            nominal_columns = list(columns)
        elif nominal_columns == 'auto':
            nominal_columns = identify_nominal_columns(dataset)
        else:
            nominal_columns = [nominal_columns]
    nominal_set = set(nominal_columns)
    # Plain lists in dataset column order (a list is needed for the
    # multi-column assignment below).
    nominal_columns = [col for col in columns if col in nominal_set]
    numeric_columns = [col for col in columns if col not in nominal_set]
    col_types = {col: 'nominal' if col in nominal_set else 'numeric' for col in columns}

    if nan_strategy == REPLACE:
        if nominal_columns:
            dataset[nominal_columns] = dataset[nominal_columns].fillna(nan_replace_nominal)
        if numeric_columns:
            dataset[numeric_columns] = dataset[numeric_columns].fillna(nan_replace_numeric)

    corrgttr = DaskCorrelationGetter(use_theils_u=theils_u)
    # Only the upper triangle is evaluated; keys are "<i>.<j>" column positions.
    corr_dict = {}
    for i, col_i in enumerate(columns):
        for j in range(i, len(columns)):
            key = str(i)+'.'+str(j)
            if i == j:
                corr_dict[key] = 1.0
                continue
            col_j = columns[j]
            # Use the preprocessed `dataset` (not a notebook-level global) and
            # forward the strategy so that 'drop_samples' reaches the
            # per-partition functions.
            corr_dict[key] = corrgttr.get_corr_value(
                dataset, col_i, col_j, col_types[col_i], col_types[col_j],
                nan_strategy=nan_strategy)
    # Resolve every pending pairwise result in a single compute call.
    corr_dict = delayed(corr_dict).compute()

    # Float frame pre-filled with NaN, so the result is numeric without
    # relying on fillna to downcast an object-dtype frame.
    corr = pd.DataFrame(np.nan, index=columns, columns=columns, dtype=float)
    for key, val in corr_dict.items():
        i_pos, j_pos = key.split('.')
        col_i = columns[int(i_pos)]
        col_j = columns[int(j_pos)]
        # Mirror each value across the diagonal.
        corr.loc[col_i, col_j] = val
        corr.loc[col_j, col_i] = val

    if mark_columns:
        marked_columns = [
            '{} (nom)'.format(col)
            if col in nominal_set else '{} (con)'.format(col)
            for col in columns
        ]
        corr.columns = marked_columns
        corr.index = marked_columns
    return corr

## Load Test Data

In [41]:
# Open the parquet file at data_path as a dask DataFrame using the pyarrow engine.
df = dd.read_parquet(data_path, engine='pyarrow')

In [42]:
# Use six partitions, matching the six workers the local cluster was started with.
df = df.repartition(npartitions=6)

In [43]:
# Persist the repartitioned data so later computations can reuse it
# (presumably kept in worker memory - confirm against the dask docs).
df = df.persist()

In [44]:
# Drop identifier columns, i.e. every column whose name contains '_id'.
df = df.drop(columns=[col for col in df.columns if '_id' in col])

In [37]:
# Cast the listed columns to str so they are handled as nominal columns.
# NOTE(review): these column names match the flights output shown at the end of
# the notebook rather than the file currently set in data_path, and this cell's
# execution count (37) is older than the load cells - confirm it still applies.
categoricals = ['canceled', 'diverted']
for col in categoricals:
    df[col] = df[col].astype(str)

In [46]:
%%time
# Compute the full association matrix, using symmetric Theil's U for nominal pairs.
corr_df = associations_dask(df, theils_u=True)





KeyboardInterrupt: 

In [47]:
corr_df.head()

Unnamed: 0,dep_delay,taxi_out,taxi_in,arr_delay,scheduled_elapsed_time,air_time,distance,carrier_delay,weather_delay,national_airspace_delay,...,scheduled_dep_time_month,scheduled_arr_time_hour,scheduled_arr_time_day,scheduled_arr_time_month,dep_time_hour,dep_time_day,dep_time_month,arr_time_hour,arr_time_day,arr_time_month
dep_delay,1.0,0.074259,0.018619,0.937776,0.021679,0.018686,0.021222,0.682229,0.273309,0.303782,...,8.39656e-05,0.009078,0.012128,0.000133,0.046526,0.021655,0.003252,0.033388,0.014561,0.001481
taxi_out,0.074259,1.0,0.065574,0.218284,0.099332,0.108592,0.058233,0.050421,0.075015,0.298157,...,0.0001147705,0.004953,0.005731,0.000115,0.008667,0.006553,6.4e-05,0.006057,0.006314,0.00016
taxi_in,0.018619,0.065574,1.0,0.113699,0.13388,0.143779,0.108469,0.010914,0.009537,0.20785,...,2.275306e-07,0.006439,0.002651,1.5e-05,0.009052,0.002959,7e-06,0.005752,0.002632,4.8e-05
arr_delay,0.937776,0.218284,0.113699,1.0,-0.018549,0.012468,-0.013771,0.657476,0.277643,0.414571,...,0.0001888487,0.007292,0.015789,0.000233,0.037791,0.022639,0.002766,0.030917,0.017607,0.001835
scheduled_elapsed_time,0.021679,0.099332,0.13388,-0.018549,1.0,0.955794,0.978159,0.008639,0.007209,0.052575,...,4.679428e-05,0.137275,0.010152,0.001695,0.016381,0.00328,3.8e-05,0.118093,0.009017,0.001326
