# Python implementation of https://arxiv.org/pdf/1909.10140.pdf

In [5]:
from itertools import product
from joblib import Parallel, delayed
import sys

import numpy as np
from scipy.stats import rankdata
import pandas as pd

In [6]:
def XI_coef(xvec, yvec, simple=True, seed=42):
    np.random.seed(seed)

    n = len(xvec)

    # TODO: Fix this clearly inefficient way of computing the r code line:
    # PI <- rank(xvec, ties.method = "random"
    # This is a shameless stackoverflow copy
    pandas_series = pd.Series(xvec)
    PI = pandas_series.sample(frac=1).rank(method='first').reindex_like(pandas_series).values

    fr = rankdata(yvec, method='max') / n
    gr = rankdata(-yvec, method='max') / n

    ord = np.argsort(PI)
    fr = fr[ord]
    A1 = np.sum(np.abs(fr[1:n - 1] - fr[2:n])) / (2.0 * n)
    CU = np.mean(gr * (1.0 - gr))

    xi = 1.0 - A1 / (CU + sys.float_info.epsilon)
    if simple:
        return xi

    return [xi, fr, CU]


def XI_coef_matrix(np_array):
    assert len(np_array.shape) == 2
    _, w = np_array.shape

    return np.array([XI_coef(np_array[:, i], np_array[:, j])
                     for i, j in product(range(w), repeat=2)]).reshape((w, w))

In [13]:
def XI_coef_matrix_parallel(np_array):
    assert len(np_array.shape) == 2
    _, w = np_array.shape

    def process_index(i, j):
        return XI_coef(np_array[:, i], np_array[:, j])

    results = Parallel(n_jobs=-1)(delayed(process_index)(i, j)
                                  for i, j in product(range(w), repeat=2))
    return np.array(results).reshape((w, w))

In [9]:
# A reasonably sized dataframe for Data Science cases
np_array = np.random.randint(0, 100, size=(int(1e4), 70))

In [11]:
%time _ = XI_coef_matrix(np_array)

CPU times: user 13.4 s, sys: 0 ns, total: 13.4 s
Wall time: 13.4 s


In [14]:
%time _ = XI_coef_matrix_parallel(np_array)

CPU times: user 1.18 s, sys: 399 ms, total: 1.58 s
Wall time: 3.29 s


In [15]:
# TEST: a straight line with noise
size = int(1e6)
list_1 = np.array(list(range(0, size)))
list_2 = np.array(list(range(size, 0, -1))) + np.random.random_sample(len(list_1)) * 100

print("x -> y", XI_coef(list_1, list_2))
print("y -> x", XI_coef(list_2, list_1))
print("std coeff matrix", np.corrcoef(list_1, list_2))

x -> y 0.9998990193219999
y -> x 0.9998990433999999
std coeff matrix [[ 1. -1.]
 [-1.  1.]]


In [18]:
# Odd cases:
np_array = np.zeros(shape=(10, 10))
print("XI coeff matrix", XI_coef_matrix(np_array))

np_array = np.random.randint(0, 100, (1, 10))
print("XI coeff matrix", XI_coef_matrix(np_array))

np_array = np.random.randint(0, 100, (1000, 2))
np_array[:, 1] = -np_array[:, 0]
print("XI coeff matrix", XI_coef_matrix(np_array))

XI coeff matrix [[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
XI coeff matrix [[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]
XI coeff matrix [[0.99704564 0.99701953]
 [0.99704564 0.99701953]]


In [19]:
# TEST: a straight line with itself
size = int(1e6)
list_1 = np.array(list(range(0, size)))

print("x -> y", XI_coef(list_1, list_1))
print("std coeff matrix", np.corrcoef(list_1, list_1))

x -> y 0.999997000006
std coeff matrix [[1. 1.]
 [1. 1.]]


In [20]:
# TEST: a sin(x) function
# The std correlation cooeff is close to 0, but this should hopefully some that
# there is a relationship from x -> y, but a bad one from y -> x
list_1 = np.arange(0, 1000, np.pi/8)
list_2 = np.sin(list_1) + np.random.random_sample(len(list_1))/10

print("x -> y", XI_coef(list_1, list_2))
print("y -> x", XI_coef(list_2, list_1))
print("std coeff matrix", np.corrcoef(list_1, list_2))

x -> y 0.6504869275040979
y -> x -0.017841882054651315
std coeff matrix [[ 1.         -0.00328516]
 [-0.00328516  1.        ]]
