In [1]:
import numpy as np
import pandas as pd

In [2]:
seattle_data = pd.read_csv('Data/seattle.txt', sep = '\t', header = 0)
price2001 = seattle_data[seattle_data['Year']  == 2001].Price
price2002 = seattle_data[seattle_data['Year']  == 2002].Price

# 1. Pearson correlation 
-- is a statistic that measures **linear** correlation between two variables ([Wikipedia](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient)).

$$r_{x_1,x_2} = \frac{E((x_1 - Ex_1)(x_2 - Ex_2)}{\sqrt{Dx_1 Dx_2}}$$

In [5]:
from scipy.stats import pearsonr

corr, p_val = pearsonr(price2001, price2002)
print(f'Corellarion coeff.: {corr}, p-value: {p_val}.')

Corellarion coeff.: 0.26082935507435634, p-value: 0.06732402552064617.


# 2. Spearman correlation 
-- is a statistic that measures **monotone** correlation between two variables
$$\rho_{x_1,x_2} = 1- \frac{6}{n^3-n} \sum\limits_{i=1}^{n}(rank(X_{1i}) - rank(X_{2i}))$$

In [7]:
from scipy.stats import spearmanr

corr_sp, p_val_sp = pearsonr(price2001, price2002)
print(f'Corellarion coeff.: {corr_sp}, p-value: {p_val_sp}.')

Corellarion coeff.: 0.26082935507435634, p-value: 0.06732402552064617.


**!** More stable to outliers then Pearson

# 3. Matthews correlation 
-- is a statistic that measures correlation between two binary variables

 x_1\x_2  | 0 | 1 
  ------------- | -------------|---------
  0 | a | b
 1 | c  | d

$$mcc_{x_1,x_2}=\frac{ad-bc}{\sqrt{(a+b)(a+c)(b+d)(c+d)}}$$

In [9]:
a = 239
b = 515
c = 203
d = 718

In [10]:
def mcc(a,b,c,d):
    
    numerator = a * d - c * b
    denominator = np.sqrt((a + b) * (a + c) * (b + d) * (c + d))
    
    return numerator / denominator

In [11]:
mcc(a,b,c,d)

0.10900237458678963

Check statistical significance:

In [13]:
from scipy.stats import chi2_contingency

obs = np.array([[239, 203], [515, 718]])
chi2, p, dof, ex = chi2_contingency(obs)

In [15]:
p

1.0558987006638725e-05

# 4. Cramer's V correlation
-- generalization of Matthews correlation to the case of categorical features

$$cram = \frac{\chi^2(x_1^n, x_2^n)}{n(min(k_1, k_2) - 1)}$$

In [16]:
obs_2 = np.array([[197, 111, 33], [382, 685, 331], [110, 342, 333]])

In [18]:
def cramers_corrected_stat(confusion_matrix):
    
    chi2 = chi2_contingency(confusion_matrix)[0]
    n = confusion_matrix.sum()
    k1, k2 = confusion_matrix.shape
    denom = n * (min(k1,k2) - 1)
    
    return np.sqrt(chi2 / denom)

In [19]:
cramers_corrected_stat(obs_2)

0.2412013934500338

Check statistical significance:

In [21]:
chi2, p, dof, ex = chi2_contingency(obs_2)

In [23]:
p

2.4964299580093467e-62