In [1]:
import random as rn
import numpy as np
import pandas as pd
import scipy.stats as ss

from cfair.backends import NumpyBackend
from cfair.metrics.kernel.hgr import CategoricalHGR, DoubleKernelHGR

In [2]:
backend = NumpyBackend()

# Cramer's V

In [3]:
def cramers_v(confusion_matrix):
    """ calculate Cramers V statistic for categorial-categorial association.
        uses correction from Bergsma and Wicher,
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    chi2 = ss.chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

# OHE kernel function

In [4]:
def one_hot_encode(x):
        unique_vals = np.unique(x)
        return np.array([[1 if val == xi else 0 for val in unique_vals] for xi in x]).transpose()

# Toy Class

In [5]:
toy_one_hot_hgr = CategoricalHGR(
    backend=backend,
    method='trust-constr',
    maxiter=1000,
    eps=1e-9,
    tol=1e-9,
    use_lstsq=True,
    delta_independent=None
)

## First Example - corr = 0

In [6]:
# Example categorical data
a = np.array(['cat', 'dog', 'cat', 'bird'])
b = np.array(['red', 'red', 'blue', 'green'])

In [7]:

ohe_a = one_hot_encode(a)
ohe_b = one_hot_encode(b)

In [8]:
print(ohe_a)

[[0 0 0 1]
 [1 0 1 0]
 [0 1 0 0]]


In [9]:
print(ohe_b)

[[0 0 1 0]
 [0 0 0 1]
 [1 1 0 0]]


In [10]:
result = toy_one_hot_hgr._compute(a, b)
print(result)

None


In [11]:
corr, alpha, beta = toy_one_hot_hgr._indicator(ohe_a, ohe_b, None, None)
print(corr, alpha, beta)

DEBUG: inside indicator - f_numpy = [[0 0 0 1]
 [1 0 1 0]
 [0 1 0 0]]
DEBUG: inside indicator - g_numpy = [[0 0 1 0]
 [0 0 0 1]
 [1 1 0 0]]
DEBUG: somehow i managed to concatenate them
DEBUG: inside indicator - fg_numpy = [[ 0  0  0  1  0  0 -1  0]
 [ 1  0  1  0  0  0  0 -1]
 [ 0  1  0  0 -1 -1  0  0]]
0.9449111800932964 [0.19385645 0.54457232 0.19385645 0.30321213] [0.22038587 0.22038587 0.24541706 0.3138112 ]


## Second Example - corr = 0

In [12]:
c = np.array(['cat', 'dog', 'cat', 'bird'])
d = np.array(['cat', 'dog', 'cat', 'bird'])

In [13]:
ohe_c = one_hot_encode(c)
ohe_d = one_hot_encode(d)

In [14]:
print(ohe_c)

[[0 0 0 1]
 [1 0 1 0]
 [0 1 0 0]]


In [15]:
print(ohe_d)

[[0 0 0 1]
 [1 0 1 0]
 [0 1 0 0]]


In [16]:
result = toy_one_hot_hgr._compute(c, d)
print(result)

None


In [17]:
corr, alpha, beta = toy_one_hot_hgr._indicator(ohe_c, ohe_d, None, None)
print(corr, alpha, beta)

DEBUG: inside indicator - f_numpy = [[0 0 0 1]
 [1 0 1 0]
 [0 1 0 0]]
DEBUG: inside indicator - g_numpy = [[0 0 0 1]
 [1 0 1 0]
 [0 1 0 0]]
DEBUG: somehow i managed to concatenate them
DEBUG: inside indicator - fg_numpy = [[ 0  0  0  1  0  0  0 -1]
 [ 1  0  1  0 -1  0 -1  0]
 [ 0  1  0  0  0 -1  0  0]]
0.999999998875 [0.26516504 0.26516504 0.26516504 0.26516504] [0.25 0.25 0.25 0.25]


# Polinomial Kernels test

In [18]:
polynomial_kernel = DoubleKernelHGR(
    backend=backend,          
    kernel_a=2, 
    kernel_b=3, 
)

## Joke Experiment

In [19]:
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 4, 3, 2, 1])

result_p = polynomial_kernel._compute(a, b)

DEBUG: inside result - f = [array([1, 2, 3, 4, 5], dtype=int32), array([ 1,  4,  9, 16, 25])]
DEBUG: inside result - f len = 2
DEBUG: inside result - g = [array([5, 4, 3, 2, 1], dtype=int32), array([25, 16,  9,  4,  1]), array([125,  64,  27,   8,   1], dtype=int32)]
DEBUG: inside result - g len = 3
DEBUG: inside result - f after strange things = [[ -2. -10.]
 [ -1.  -7.]
 [  0.  -2.]
 [  1.   5.]
 [  2.  14.]]
DEBUG: inside result - f len = 5
DEBUG: inside result - g after strange things = [[  2.  14.  80.]
 [  1.   5.  19.]
 [  0.  -2. -18.]
 [ -1.  -7. -37.]
 [ -2. -10. -44.]]
DEBUG: inside result - g len = 5
DEBUG: inside indicator - f_numpy = [[ -2. -10.]
 [ -1.  -7.]
 [  0.  -2.]
 [  1.   5.]
 [  2.  14.]]
DEBUG: inside indicator - g_numpy = [[  2.  14.  80.]
 [  1.   5.  19.]
 [  0.  -2. -18.]
 [ -1.  -7. -37.]
 [ -2. -10. -44.]]
DEBUG: somehow i managed to concatenate them
DEBUG: inside indicator - fg_numpy = [[ -2. -10.  -2. -14. -80.]
 [ -1.  -7.  -1.  -5. -19.]
 [  0.  -2.  

In [20]:
print(result_p.value)

0.9999999989999997


In [21]:
print(result_p.alpha)

[-0.98971955 -0.01028045]


In [22]:
print(result_p.beta)

[ 9.90848562e-01 -9.15143279e-03 -5.25302806e-09]


## 1000, all random

In [23]:
num1 = []
num2 = []

for i in range(1000):
    num1.append(rn.randint(0,9))
    num2.append(rn.randint(0,9))

In [24]:
result_1000_pol = polynomial_kernel._compute(num1, num2)

DEBUG: inside result - f = [array([6, 0, 9, 3, 6, 9, 9, 1, 3, 2, 0, 3, 0, 5, 1, 1, 5, 0, 5, 1, 7, 8,
       4, 2, 5, 4, 1, 5, 8, 0, 6, 1, 1, 3, 3, 0, 9, 8, 7, 1, 5, 0, 7, 8,
       5, 7, 2, 3, 4, 2, 9, 6, 3, 0, 5, 4, 2, 6, 4, 6, 2, 8, 4, 4, 2, 8,
       7, 7, 4, 3, 7, 4, 2, 8, 8, 5, 7, 4, 7, 2, 4, 6, 8, 7, 0, 1, 9, 7,
       9, 3, 6, 0, 1, 5, 5, 8, 8, 3, 4, 7, 6, 7, 8, 6, 3, 8, 4, 1, 4, 3,
       6, 6, 4, 7, 2, 2, 1, 8, 0, 4, 5, 1, 4, 7, 5, 4, 5, 3, 1, 2, 8, 9,
       7, 5, 5, 9, 7, 1, 9, 1, 9, 9, 7, 6, 7, 9, 6, 0, 3, 6, 1, 4, 4, 1,
       4, 4, 4, 6, 9, 4, 0, 6, 9, 7, 3, 3, 4, 8, 9, 4, 2, 6, 8, 4, 5, 1,
       7, 0, 4, 9, 5, 8, 1, 9, 0, 7, 8, 8, 1, 0, 6, 9, 1, 2, 6, 0, 0, 0,
       4, 9, 1, 2, 5, 5, 5, 6, 2, 1, 6, 0, 6, 3, 5, 4, 1, 3, 1, 1, 5, 4,
       5, 3, 1, 0, 8, 4, 0, 7, 7, 2, 8, 2, 5, 4, 0, 8, 4, 6, 3, 7, 6, 0,
       0, 5, 4, 3, 2, 9, 3, 7, 2, 1, 3, 4, 6, 6, 1, 9, 7, 8, 3, 4, 6, 2,
       8, 2, 1, 7, 6, 3, 3, 1, 8, 5, 6, 4, 1, 1, 9, 3, 1, 3, 3, 8, 8, 3,
       2, 2, 5, 3, 7, 3

DEBUG: inside result - f after strange things = [[  1.48    7.716]
 [ -4.52  -28.284]
 [  4.48   52.716]
 ...
 [ -0.52  -12.284]
 [  4.48   52.716]
 [  1.48    7.716]]
DEBUG: inside result - f len = 1000
DEBUG: inside result - g after strange things = [[  -4.427  -27.641 -194.657]
 [   0.573   -2.641  -69.657]
 [  -3.427  -26.641 -193.657]
 ...
 [   2.573   21.359  148.343]
 [  -0.427  -11.641 -130.657]
 [  -1.427  -18.641 -167.657]]
DEBUG: inside result - g len = 1000
DEBUG: inside indicator - f_numpy = [[  1.48    7.716]
 [ -4.52  -28.284]
 [  4.48   52.716]
 ...
 [ -0.52  -12.284]
 [  4.48   52.716]
 [  1.48    7.716]]
DEBUG: inside indicator - g_numpy = [[  -4.427  -27.641 -194.657]
 [   0.573   -2.641  -69.657]
 [  -3.427  -26.641 -193.657]
 ...
 [   2.573   21.359  148.343]
 [  -0.427  -11.641 -130.657]
 [  -1.427  -18.641 -167.657]]
DEBUG: somehow i managed to concatenate them
DEBUG: inside indicator - fg_numpy = [[   1.48     7.716    4.427   27.641  194.657]
 [  -4.52   -28.28

In [25]:
print(result_1000_pol.value)

0.04065338574433429


In [26]:
print(result_1000_pol.alpha)

[ 0.90537461 -0.09462539]


In [27]:
print(result_1000_pol.beta)

[-0.81860781  0.17719042 -0.00420177]


# Instantiating my class

In [28]:
my_kernel_one_hot = DoubleKernelHGR(
    backend=backend,          # Or any other appropriate backend
    kernel_a=one_hot_encode, # Custom kernel function for variable a
    kernel_b=one_hot_encode, # Custom kernel function for variable b
)

## Experiment 1 (numbers)

In [29]:
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 4, 2, 2, 1])

# Compute the HGR indicator
result1 = my_kernel_one_hot._compute(a, b)

DEBUG: inside result - f = [[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]
DEBUG: inside result - f len = 5
DEBUG: inside result - g = [[0 0 0 0 1]
 [0 0 1 1 0]
 [0 1 0 0 0]
 [1 0 0 0 0]]
DEBUG: inside result - g len = 4
DEBUG: inside result - f after strange things = [[ 0.8 -0.2 -0.2 -0.2 -0.2]
 [-0.2  0.8 -0.2 -0.2 -0.2]
 [-0.2 -0.2  0.8 -0.2 -0.2]
 [-0.2 -0.2 -0.2  0.8 -0.2]
 [-0.2 -0.2 -0.2 -0.2  0.8]]
DEBUG: inside result - f len = 5
DEBUG: inside result - g after strange things = [[-0.2 -0.4 -0.2  0.8]
 [-0.2 -0.4  0.8 -0.2]
 [-0.2  0.6 -0.2 -0.2]
 [-0.2  0.6 -0.2 -0.2]
 [ 0.8 -0.4 -0.2 -0.2]]
DEBUG: inside result - g len = 5
DEBUG: inside indicator - f_numpy = [[ 0.8 -0.2 -0.2 -0.2 -0.2]
 [-0.2  0.8 -0.2 -0.2 -0.2]
 [-0.2 -0.2  0.8 -0.2 -0.2]
 [-0.2 -0.2 -0.2  0.8 -0.2]
 [-0.2 -0.2 -0.2 -0.2  0.8]]
DEBUG: inside indicator - g_numpy = [[-0.2 -0.4 -0.2  0.8]
 [-0.2 -0.4  0.8 -0.2]
 [-0.2  0.6 -0.2 -0.2]
 [-0.2  0.6 -0.2 -0.2]
 [ 0.8 -0.4 -0.2 -0.2]]
DEBUG: someho

In [30]:
print(result1.value)

0.9999999990000001


In [31]:
print(result1.alpha)

[0.20000184 0.20000837 0.20000051 0.20000051 0.19998878]


In [32]:
print(result1.beta)

[0.24998613 0.25000079 0.25001062 0.25000246]


## Experiment 2 (categorical)

In [33]:
a2 = np.array(['cat', 'dog', 'cat', 'dog'])
b2 = np.array(['sus', 'sis', 'sus', 'ses'])

result2 = my_kernel_one_hot._compute(a2, b2)

DEBUG: inside result - f = [[1 0 1 0]
 [0 1 0 1]]
DEBUG: inside result - f len = 2
DEBUG: inside result - g = [[0 0 0 1]
 [0 1 0 0]
 [1 0 1 0]]
DEBUG: inside result - g len = 3
DEBUG: inside result - f after strange things = [[ 0.5 -0.5]
 [-0.5  0.5]
 [ 0.5 -0.5]
 [-0.5  0.5]]
DEBUG: inside result - f len = 4
DEBUG: inside result - g after strange things = [[-0.25 -0.25  0.5 ]
 [-0.25  0.75 -0.5 ]
 [-0.25 -0.25  0.5 ]
 [ 0.75 -0.25 -0.5 ]]
DEBUG: inside result - g len = 4
DEBUG: inside indicator - f_numpy = [[ 0.5 -0.5]
 [-0.5  0.5]
 [ 0.5 -0.5]
 [-0.5  0.5]]
DEBUG: inside indicator - g_numpy = [[-0.25 -0.25  0.5 ]
 [-0.25  0.75 -0.5 ]
 [-0.25 -0.25  0.5 ]
 [ 0.75 -0.25 -0.5 ]]
DEBUG: somehow i managed to concatenate them
DEBUG: inside indicator - fg_numpy = [[ 0.5  -0.5   0.25  0.25 -0.5 ]
 [-0.5   0.5   0.25 -0.75  0.5 ]
 [ 0.5  -0.5   0.25  0.25 -0.5 ]
 [-0.5   0.5  -0.75  0.25  0.5 ]]


In [34]:
print(result2.value)

0.9999999989999999


In [35]:
print(result2.a)

['cat' 'dog' 'cat' 'dog']


In [36]:
print(result2.b)

['sus' 'sis' 'sus' 'ses']


In [37]:
print(result2.alpha)

[0.49998419 0.50001581]


In [38]:
print(result2.beta)

[0.33334036 0.33334036 0.33331928]


## DEBUG CONCATENATION

In [39]:
f = [ [1, 0],[0, 1],[1, 0],[1, 0] ]

f = np.stack(f, axis=1)
f = f - np.mean(f, axis=0)

print(f)

# g = np.stack(g, axis=1)
# g = g - np.mean(g, axis=0)

# print(g)

[[ 0.5 -0.5  0.5  0.5]
 [-0.5  0.5 -0.5 -0.5]]


# Warriors/Animals experiments

## 1. All Random

In [40]:
warriors = ["Ultramarine", "Salamander", "White Scar", "Space Wolf", "Raven Guard", "Iron Hand", "Imperial Fist", "Blood Angel", "Dark Angel"]
animals = ["cat", "dog", "monke", "horse"]

def sample(a_list):
    return rn.choice(a_list)

ani = []
war = []

for i in range(1000):
    ani.append(sample(animals))
    war.append(sample(warriors))

In [41]:
print(ani)

['cat', 'cat', 'monke', 'dog', 'cat', 'dog', 'horse', 'monke', 'cat', 'monke', 'dog', 'monke', 'horse', 'dog', 'monke', 'monke', 'cat', 'cat', 'dog', 'monke', 'cat', 'cat', 'dog', 'horse', 'cat', 'dog', 'dog', 'monke', 'horse', 'monke', 'horse', 'monke', 'monke', 'horse', 'horse', 'dog', 'dog', 'dog', 'horse', 'horse', 'cat', 'monke', 'monke', 'dog', 'cat', 'monke', 'dog', 'horse', 'cat', 'monke', 'dog', 'dog', 'dog', 'horse', 'horse', 'cat', 'monke', 'dog', 'dog', 'dog', 'monke', 'dog', 'cat', 'cat', 'horse', 'dog', 'horse', 'dog', 'monke', 'cat', 'horse', 'monke', 'dog', 'monke', 'monke', 'horse', 'monke', 'cat', 'dog', 'horse', 'horse', 'cat', 'horse', 'dog', 'horse', 'dog', 'dog', 'cat', 'cat', 'cat', 'monke', 'horse', 'horse', 'horse', 'cat', 'dog', 'dog', 'cat', 'horse', 'cat', 'monke', 'monke', 'horse', 'monke', 'dog', 'horse', 'cat', 'horse', 'horse', 'dog', 'dog', 'monke', 'horse', 'dog', 'dog', 'monke', 'horse', 'cat', 'horse', 'monke', 'dog', 'horse', 'dog', 'dog', 'monke', 

In [42]:
print(war)

['Dark Angel', 'Space Wolf', 'Blood Angel', 'Raven Guard', 'Dark Angel', 'Imperial Fist', 'Salamander', 'Salamander', 'Iron Hand', 'Raven Guard', 'Ultramarine', 'Salamander', 'Blood Angel', 'Raven Guard', 'Raven Guard', 'Iron Hand', 'Salamander', 'Salamander', 'Salamander', 'Raven Guard', 'Salamander', 'Salamander', 'Salamander', 'Imperial Fist', 'Raven Guard', 'Space Wolf', 'Raven Guard', 'Iron Hand', 'Space Wolf', 'Iron Hand', 'Raven Guard', 'Dark Angel', 'Dark Angel', 'Imperial Fist', 'Salamander', 'Salamander', 'White Scar', 'Space Wolf', 'Ultramarine', 'Raven Guard', 'Salamander', 'Space Wolf', 'Dark Angel', 'Space Wolf', 'Salamander', 'White Scar', 'Ultramarine', 'White Scar', 'Iron Hand', 'White Scar', 'Ultramarine', 'Ultramarine', 'Blood Angel', 'Imperial Fist', 'Iron Hand', 'Ultramarine', 'Salamander', 'Ultramarine', 'Blood Angel', 'Blood Angel', 'Salamander', 'White Scar', 'Space Wolf', 'Raven Guard', 'Iron Hand', 'Space Wolf', 'Dark Angel', 'Space Wolf', 'Ultramarine', 'Iron

In [43]:
result_1000 = my_kernel_one_hot._result(war, ani, kernel_a=True, kernel_b=True, a0=np.ones(9), b0=np.ones(4))#_compute(war, ani)

DEBUG: inside result - f = [[0 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 1 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
DEBUG: inside result - f len = 9
DEBUG: inside result - g = [[1 1 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 1]
 [0 0 1 ... 0 0 0]]
DEBUG: inside result - g len = 4
DEBUG: inside result - f after strange things = [[-0.122  0.886 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [ 0.878 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 ...
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]]
DEBUG: inside result - f len = 1000
DEBUG: inside result - g after strange things = [[ 0.755 -0.249 -0.267 -0.239]
 [ 0.755 -0.249 -0.267 -0.239]
 [-0.245 -0.249 -0.267  0.761]
 ...
 [ 0.755 -0.249 -0.267 -0.239]
 [-0.245 -0.249  0.733 -0.239]
 [-0.245 -0.249  0.733 -0.239]]
DEBUG: inside result - g len = 1000
DEBUG: inside indicator

In [44]:
print(result_1000.value)

0.07387263882127482


In [45]:
print(result_1000.alpha)

[0.10451438 0.12069043 0.11561201 0.1050806  0.09424388 0.11639229
 0.11511426 0.11757801 0.11077415]


In [46]:
print(result_1000.beta)

[-0.16320937  0.55861027  0.24244746 -0.0357329 ]


### Confusion Matrix extravaganza All Random

In [47]:
legionAllRand = pd.DataFrame({'warrior': war, 'steed': ani})

In [48]:
legionAllRand

Unnamed: 0,warrior,steed
0,Dark Angel,cat
1,Space Wolf,cat
2,Blood Angel,monke
3,Raven Guard,dog
4,Dark Angel,cat
...,...,...
995,Raven Guard,dog
996,Ultramarine,monke
997,Space Wolf,cat
998,Iron Hand,horse


In [49]:
confusion_matrix = pd.crosstab(legionAllRand['warrior'], legionAllRand['steed'])
print(confusion_matrix)

steed          cat  dog  horse  monke
warrior                              
Blood Angel     32   29     30     31
Dark Angel      27   30     37     20
Imperial Fist   19   24     30     26
Iron Hand       31   32     25     36
Raven Guard     41   27     31     30
Salamander      27   31     26     24
Space Wolf      23   24     27     19
Ultramarine     20   27     25     26
White Scar      25   25     36     27


In [50]:
cramers_v(confusion_matrix.values)

0.0

## 2. Autocorrelation

In [51]:
result_1000_auto = my_kernel_one_hot._result(war, war, kernel_a=True, kernel_b=True, a0=np.ones(9), b0=np.ones(9))#_compute(war, war)

DEBUG: inside result - f = [[0 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 1 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
DEBUG: inside result - f len = 9
DEBUG: inside result - g = [[0 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 1 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
DEBUG: inside result - g len = 9
DEBUG: inside result - f after strange things = [[-0.122  0.886 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [ 0.878 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 ...
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]]
DEBUG: inside result - f len = 1000
DEBUG: inside result - g after strange things = [[-0.122  0.886 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [ 0.878 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 ...
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [-0.12

In [52]:
print(result_1000_auto.value)

0.9999999990000056


In [53]:
print(result_1000_auto.alpha)

[-0.08275266  0.16597529  0.07576188  0.20731096  0.20794159  0.04559711
  0.0821245   0.01697606  0.11555996]


In [54]:
print(result_1000_auto.beta)

[-0.05941883  0.16358428  0.08270125  0.20064478  0.20121019  0.05565629
  0.08840581  0.02999539  0.11838318]


It is VERY HIGH -> as i expected, VERY GOODDE!!

### Confusion Matrix extravaganza Autoc

In [55]:
legionAuto = pd.DataFrame({'warrior1': war, 'warrior2': war})

In [56]:
legionAuto

Unnamed: 0,warrior1,warrior2
0,Dark Angel,Dark Angel
1,Space Wolf,Space Wolf
2,Blood Angel,Blood Angel
3,Raven Guard,Raven Guard
4,Dark Angel,Dark Angel
...,...,...
995,Raven Guard,Raven Guard
996,Ultramarine,Ultramarine
997,Space Wolf,Space Wolf
998,Iron Hand,Iron Hand


In [57]:
confusion_matrix = pd.crosstab(legionAuto['warrior1'], legionAuto['warrior2'])
print(confusion_matrix)

warrior2       Blood Angel  Dark Angel  Imperial Fist  Iron Hand  Raven Guard  \
warrior1                                                                        
Blood Angel            122           0              0          0            0   
Dark Angel               0         114              0          0            0   
Imperial Fist            0           0             99          0            0   
Iron Hand                0           0              0        124            0   
Raven Guard              0           0              0          0          129   
Salamander               0           0              0          0            0   
Space Wolf               0           0              0          0            0   
Ultramarine              0           0              0          0            0   
White Scar               0           0              0          0            0   

warrior2       Salamander  Space Wolf  Ultramarine  White Scar  
warrior1                                   

In [58]:
cramers_v(confusion_matrix.values)

1.0

## 3. "Via di mezzo"

In [59]:
ani2 = []

for warrior in war:
    if warrior == "White Scar":
        print("For the Khan!")
        anim = "horse"
    else:
        anim = sample(animals)
    ani2.append(anim)

For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For the Khan!
For th

In [60]:
print(ani2)

['horse', 'horse', 'horse', 'monke', 'horse', 'horse', 'monke', 'monke', 'monke', 'monke', 'monke', 'cat', 'cat', 'horse', 'cat', 'dog', 'dog', 'monke', 'dog', 'cat', 'cat', 'horse', 'horse', 'monke', 'monke', 'monke', 'monke', 'cat', 'cat', 'horse', 'dog', 'cat', 'cat', 'cat', 'cat', 'monke', 'horse', 'horse', 'horse', 'cat', 'cat', 'dog', 'cat', 'monke', 'monke', 'horse', 'dog', 'horse', 'dog', 'horse', 'cat', 'cat', 'cat', 'dog', 'dog', 'horse', 'dog', 'cat', 'monke', 'monke', 'monke', 'horse', 'horse', 'dog', 'cat', 'horse', 'monke', 'dog', 'dog', 'dog', 'horse', 'monke', 'horse', 'cat', 'dog', 'dog', 'cat', 'cat', 'cat', 'horse', 'cat', 'cat', 'monke', 'monke', 'cat', 'cat', 'cat', 'monke', 'dog', 'monke', 'cat', 'cat', 'horse', 'monke', 'monke', 'cat', 'cat', 'horse', 'cat', 'horse', 'cat', 'horse', 'horse', 'monke', 'horse', 'cat', 'horse', 'cat', 'cat', 'horse', 'horse', 'monke', 'monke', 'dog', 'monke', 'dog', 'horse', 'cat', 'cat', 'horse', 'horse', 'horse', 'horse', 'horse',

### Old way (legacy): using the _compute method

In [61]:
result_1000_via_di_mezzo = my_kernel_one_hot._compute(war, ani2)

DEBUG: inside result - f = [[0 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 1 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
DEBUG: inside result - f len = 9
DEBUG: inside result - g = [[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [0 0 0 ... 1 1 0]]
DEBUG: inside result - g len = 4
DEBUG: inside result - f after strange things = [[-0.122  0.886 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [ 0.878 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 ...
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]]
DEBUG: inside result - f len = 1000
DEBUG: inside result - g after strange things = [[-0.231 -0.209  0.658 -0.218]
 [-0.231 -0.209  0.658 -0.218]
 [-0.231 -0.209  0.658 -0.218]
 ...
 [-0.231 -0.209 -0.342  0.782]
 [-0.231 -0.209 -0.342  0.782]
 [ 0.769 -0.209 -0.342 -0.218]]
DEBUG: inside result - g len = 1000
DEBUG: inside indicator

In [62]:
print(result_1000_via_di_mezzo.value)

0.5027564768986081


In [63]:
print(result_1000_via_di_mezzo.alpha)

[0.11111229 0.11111185 0.11111168 0.11111152 0.11111211 0.11111167
 0.11111125 0.11111138 0.11110625]


In [64]:
print(result_1000_via_di_mezzo.beta)

[0.25000422 0.25000452 0.2499875  0.25000376]


## Calling directly "_result" [very important experiment, discovered a LOT of things]

If we do not initialize a0 and b0, they will NOT change. BUT, if we initialize them to be all ones, well they will indeed change and at the end their value will highlight the most correlated categories. This is awesome!!

In [65]:
result_1000_via_di_mezzo = my_kernel_one_hot._result(war, ani2, kernel_a=True, kernel_b=True, a0=np.ones(9), b0=np.ones(4))

DEBUG: inside result - f = [[0 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 1 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
DEBUG: inside result - f len = 9
DEBUG: inside result - g = [[0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [0 0 0 ... 1 1 0]]
DEBUG: inside result - g len = 4
DEBUG: inside result - f after strange things = [[-0.122  0.886 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [ 0.878 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 ...
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]]
DEBUG: inside result - f len = 1000
DEBUG: inside result - g after strange things = [[-0.231 -0.209  0.658 -0.218]
 [-0.231 -0.209  0.658 -0.218]
 [-0.231 -0.209  0.658 -0.218]
 ...
 [-0.231 -0.209 -0.342  0.782]
 [-0.231 -0.209 -0.342  0.782]
 [ 0.769 -0.209 -0.342 -0.218]]
DEBUG: inside result - g len = 1000
DEBUG: inside indicator

In [66]:
print(result_1000_via_di_mezzo.value)

0.5027564768998772


In [67]:
print(result_1000_via_di_mezzo.alpha)

[ 0.13672536  0.1238617   0.11905881  0.11436622  0.13156068  0.11855992
  0.10652144  0.11019229 -0.03915357]


In [68]:
print(np.unique(result_1000_via_di_mezzo.a))

['Blood Angel' 'Dark Angel' 'Imperial Fist' 'Iron Hand' 'Raven Guard'
 'Salamander' 'Space Wolf' 'Ultramarine' 'White Scar']


As we can see, the highest (or lowest) alfa is the one relative to the 'White Scar', that is the warrior that always chooses the 'horse'! (the more "correlated")

In [69]:
print(result_1000_via_di_mezzo.beta)

[ 0.28128236  0.28925114 -0.16031276  0.26915374]


In [70]:
print(np.unique(result_1000_via_di_mezzo.b))

['cat' 'dog' 'horse' 'monke']


As we can see, the highest (or lowest) beta is the one relative to the 'horse', that is the steed that's always choosen by the 'White Scar'! (the more "correlated")

### Confusion Matrix extravaganza W

In [71]:
legionW = pd.DataFrame({'warrior': war, 'steed': ani2})

In [72]:
legionW

Unnamed: 0,warrior,steed
0,Dark Angel,horse
1,Space Wolf,horse
2,Blood Angel,horse
3,Raven Guard,monke
4,Dark Angel,horse
...,...,...
995,Raven Guard,cat
996,Ultramarine,horse
997,Space Wolf,monke
998,Iron Hand,monke


In [73]:
confusion_matrix = pd.crosstab(legionW['warrior'], legionW['steed'])
print(confusion_matrix)

steed          cat  dog  horse  monke
warrior                              
Blood Angel     35   41     23     23
Dark Angel      27   32     28     27
Imperial Fist   21   17     26     35
Iron Hand       29   34     36     25
Raven Guard     42   30     27     30
Salamander      29   23     29     27
Space Wolf      26   13     30     24
Ultramarine     22   19     30     27
White Scar       0    0    113      0


In [74]:
cramers_v(confusion_matrix.values)

0.2919347375142444

## White Scar - horse and Raven Guard - cat

In [75]:
ani3 = []

for warrior in war:
    if warrior == "White Scar":
        anim = "horse"
    elif warrior == "Raven Guard":
        anim="cat"
    else:
        anim = sample(animals)
    ani3.append(anim)

In [76]:
result_1000_via_di_mezzo_white_raven = my_kernel_one_hot._result(war, ani3, kernel_a=True, kernel_b=True, a0=np.ones(9), b0=np.ones(4))

DEBUG: inside result - f = [[0 0 1 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 1 0 ... 1 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
DEBUG: inside result - f len = 9
DEBUG: inside result - g = [[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 1]
 [0 0 1 ... 0 0 0]
 [1 0 0 ... 1 1 0]]
DEBUG: inside result - g len = 4


DEBUG: inside result - f after strange things = [[-0.122  0.886 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [ 0.878 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 ...
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]]
DEBUG: inside result - f len = 1000
DEBUG: inside result - g after strange things = [[-0.289 -0.164 -0.338  0.791]
 [-0.289  0.836 -0.338 -0.209]
 [-0.289 -0.164  0.662 -0.209]
 ...
 [-0.289 -0.164 -0.338  0.791]
 [-0.289 -0.164 -0.338  0.791]
 [-0.289  0.836 -0.338 -0.209]]
DEBUG: inside result - g len = 1000
DEBUG: inside indicator - f_numpy = [[-0.122  0.886 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [ 0.878 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 ...
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ... -0.093 -0.098 -0.113]
 [-0.122 -0.114 -0.099 ...  0.907 -0.098 -0.113]]
DEBU

In [77]:
print(result_1000_via_di_mezzo_white_raven.value)

0.6398337392141717


(0.6216869820138966 with a0 and b0 None)

In [78]:
print(result_1000_via_di_mezzo_white_raven.alpha)

[ 0.11535118  0.10150546  0.10347363  0.10329035 -0.05231435  0.10189905
  0.11546289  0.1126061   0.19409699]


In [79]:
print(np.unique(result_1000_via_di_mezzo_white_raven.a))

['Blood Angel' 'Dark Angel' 'Imperial Fist' 'Iron Hand' 'Raven Guard'
 'Salamander' 'Space Wolf' 'Ultramarine' 'White Scar']


The 'White Scar' has a big value, as we should expect, and the other "anomalous" value is the 'Raven Guard', that is around 0. They are indeed the two 'correlated' warriors!!

In [80]:
print(result_1000_via_di_mezzo_white_raven.beta)

[-0.13380896  0.23981269  0.3879177   0.23846065]


In [81]:
print(np.unique(result_1000_via_di_mezzo_white_raven.b))

['cat' 'dog' 'horse' 'monke']


Here 'cat' and 'horse' are the anomalous values. Why cat is negative?

In [82]:
print(war.count('Raven Guard'))
print(war.count('White Scar'))
print(war.count('Iron Hand'))

129
113
124


### Confusion Matrix extravaganza WR

In [83]:
legionWR = pd.DataFrame({'warrior': war, 'steed': ani3})

In [84]:
legionWR

Unnamed: 0,warrior,steed
0,Dark Angel,monke
1,Space Wolf,dog
2,Blood Angel,horse
3,Raven Guard,cat
4,Dark Angel,horse
...,...,...
995,Raven Guard,cat
996,Ultramarine,monke
997,Space Wolf,monke
998,Iron Hand,monke


In [85]:
confusion_matrix = pd.crosstab(legionWR['warrior'], legionWR['steed'])
print(confusion_matrix)

steed          cat  dog  horse  monke
warrior                              
Blood Angel     25   19     48     30
Dark Angel      28   23     34     29
Imperial Fist   23   21     29     26
Iron Hand       30   23     39     32
Raven Guard    129    0      0      0
Salamander      25   23     29     31
Space Wolf      14   30     24     25
Ultramarine     15   25     22     36
White Scar       0    0    113      0


In [86]:
cramers_v(confusion_matrix.values)

0.4435287313939479

# 0. Proviamo 10 run diverse - INIZIALIZZANDO a0 e b0

In [87]:
results = []

for i in range(30):

    ani_i = []
    war_i = []
    res_i = {}

    for i in range(1000):
        
        warrior=sample(warriors)

        war_i.append(warrior)

        if warrior == "White Scar":
            anim = "horse"
        elif warrior == "Raven Guard":
            anim="cat"
        else:
            anim = sample(animals)

        ani_i.append(anim)

    result_i = my_kernel_one_hot._result(war_i, ani_i, kernel_a=True, kernel_b=True, a0=np.ones(9), b0=np.ones(4))

    res_i['correlation'] = result_i.value
    res_i['alpha'] = result_i.alpha
    res_i['beta'] = result_i.beta

    results.append(res_i)

DEBUG: inside result - f = [[1 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 1 ... 0 0 1]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
DEBUG: inside result - f len = 9
DEBUG: inside result - g = [[0 0 1 ... 0 1 0]
 [1 1 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]]
DEBUG: inside result - g len = 4
DEBUG: inside result - f after strange things = [[ 0.875 -0.111 -0.12  ... -0.121 -0.091 -0.098]
 [-0.125 -0.111 -0.12  ... -0.121 -0.091 -0.098]
 [-0.125 -0.111  0.88  ... -0.121 -0.091 -0.098]
 ...
 [-0.125  0.889 -0.12  ... -0.121 -0.091 -0.098]
 [-0.125 -0.111 -0.12  ... -0.121 -0.091 -0.098]
 [-0.125 -0.111  0.88  ... -0.121 -0.091 -0.098]]
DEBUG: inside result - f len = 1000
DEBUG: inside result - g after strange things = [[-0.319  0.799 -0.296 -0.184]
 [-0.319  0.799 -0.296 -0.184]
 [ 0.681 -0.201 -0.296 -0.184]
 ...
 [-0.319  0.799 -0.296 -0.184]
 [ 0.681 -0.201 -0.296 -0.184]
 [-0.319 -0.201 -0.296  0.816]]
DEBUG: inside result - g len = 1000
DEBUG: inside indicator

In [88]:
for res in results:
    print(res)

{'correlation': 0.5963489148836054, 'alpha': array([ 0.08960525,  0.09671147,  0.09915804,  0.11509033,  0.24450324,
        0.11069062,  0.1153641 ,  0.10458397, -0.02429299]), 'beta': array([0.49459821, 0.2383111 , 0.02892996, 0.23816073])}
{'correlation': 0.5515874695465922, 'alpha': array([ 0.12953402,  0.12006776,  0.14321841,  0.1107826 , -0.04465897,
        0.12417254,  0.12659003,  0.10671235,  0.09426332]), 'beta': array([-0.25344868,  0.30137803,  0.10860014,  0.33657315])}
{'correlation': 0.4902814051549299, 'alpha': array([ 0.1251578 ,  0.13210605,  0.13364212,  0.13014497, -0.03094299,
        0.1240407 ,  0.11996745,  0.11183664,  0.09216128]), 'beta': array([-0.01509296,  0.41399231,  0.21890806,  0.35200667])}
{'correlation': 0.6060700966129432, 'alpha': array([ 0.09561702,  0.10212803,  0.11751151,  0.10423215, -0.03812062,
        0.11845699,  0.09427515,  0.10946605,  0.22019247]), 'beta': array([0.01294648, 0.2699648 , 0.44495737, 0.27213135])}
{'correlation': 0.60

In [89]:
print(np.unique(result_1000_via_di_mezzo_white_raven.a))

['Blood Angel' 'Dark Angel' 'Imperial Fist' 'Iron Hand' 'Raven Guard'
 'Salamander' 'Space Wolf' 'Ultramarine' 'White Scar']


In [90]:
print(np.unique(result_1000_via_di_mezzo_white_raven.b))

['cat' 'dog' 'horse' 'monke']


The anomalous values are almost always those correlated with the, ehm ehm, CORRELATED entities ('Raven Guard' and 'White Scar' and 'cat' and 'horse')

# 1. Proviamo 10 run diverse - a0 e b0 a None

In [92]:
results_none = []

for i in range(30):

    ani_i = []
    war_i = []
    res_i = {}

    for i in range(1000):
        
        warrior=sample(warriors)

        war_i.append(warrior)

        if warrior == "White Scar":
            anim = "horse"
        elif warrior == "Raven Guard":
            anim="cat"
        else:
            anim = sample(animals)

        ani_i.append(anim)

    result_i = my_kernel_one_hot._result(war_i, ani_i, kernel_a=True, kernel_b=True, a0=None, b0=None)

    res_i['correlation'] = result_i.value
    res_i['alpha'] = result_i.alpha
    res_i['beta'] = result_i.beta

    results_none.append(res_i)

DEBUG: inside result - f = [[0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]]
DEBUG: inside result - f len = 9
DEBUG: inside result - g = [[1 1 0 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 1 1 0]
 [0 0 0 ... 0 0 0]]
DEBUG: inside result - g len = 4
DEBUG: inside result - f after strange things = [[-0.108 -0.112 -0.11  ... -0.11  -0.114 -0.138]
 [-0.108 -0.112 -0.11  ... -0.11  -0.114 -0.138]
 [ 0.892 -0.112 -0.11  ... -0.11  -0.114 -0.138]
 ...
 [-0.108 -0.112 -0.11  ...  0.89  -0.114 -0.138]
 [-0.108 -0.112 -0.11  ... -0.11  -0.114  0.862]
 [-0.108 -0.112 -0.11  ... -0.11  -0.114 -0.138]]
DEBUG: inside result - f len = 1000
DEBUG: inside result - g after strange things = [[ 0.708 -0.181 -0.336 -0.191]
 [ 0.708 -0.181 -0.336 -0.191]
 [-0.292 -0.181  0.664 -0.191]
 ...
 [-0.292 -0.181  0.664 -0.191]
 [-0.292 -0.181  0.664 -0.191]
 [ 0.708 -0.181 -0.336 -0.191]]
DEBUG: inside result - g len = 1000
DEBUG: inside indicator

In [93]:
for res in results_none:
    print(res)

{'correlation': 0.6179666482097272, 'alpha': array([0.11111122, 0.11111127, 0.11111108, 0.11111124, 0.11110697,
       0.11111094, 0.11111087, 0.11111065, 0.11111576]), 'beta': array([0.24999094, 0.24999917, 0.25001073, 0.24999916])}
{'correlation': 0.6132924684656681, 'alpha': array([0.111111  , 0.11111118, 0.11111103, 0.1111109 , 0.11110692,
       0.11111122, 0.11111067, 0.11111124, 0.11111584]), 'beta': array([0.24999052, 0.24999932, 0.2500106 , 0.24999956])}
{'correlation': 0.6046977795057805, 'alpha': array([0.11111096, 0.11111183, 0.11111102, 0.11111139, 0.11111578,
       0.11111073, 0.1111106 , 0.11111101, 0.11110669]), 'beta': array([0.25001066, 0.24999954, 0.24999019, 0.2499996 ])}
{'correlation': 0.5884340929735106, 'alpha': array([0.1111108 , 0.11111144, 0.11111107, 0.11111154, 0.11110689,
       0.11111094, 0.11111111, 0.1111105 , 0.1111157 ]), 'beta': array([0.2499906 , 0.24999939, 0.25001041, 0.24999961])}
{'correlation': 0.6145449034694108, 'alpha': array([0.11111098, 

In [94]:
print(np.unique(result_1000_via_di_mezzo_white_raven.a))

['Blood Angel' 'Dark Angel' 'Imperial Fist' 'Iron Hand' 'Raven Guard'
 'Salamander' 'Space Wolf' 'Ultramarine' 'White Scar']


In [95]:
print(np.unique(result_1000_via_di_mezzo_white_raven.b))

['cat' 'dog' 'horse' 'monke']


The anomalous values are almost always those correlated with the, ehm ehm, CORRELATED entities ('Raven Guard' and 'White Scar' and 'cat' and 'horse')