# Correlation

### Pearson's Correlation coefficient  

Assumptions:
- Independent, identically distributed  
- Normally distributed  
- Similar Variance between independent variables  
  
Null hypothesis: Samples are not correlated

In [1]:
from scipy.stats import pearsonr
from sklearn import datasets
import numpy as np
import pandas as pd
np.random.seed(173)

iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
y = iris.target

pcc, p = pearsonr(X[0], X[2])

print(f"Pearson correlation coefficient: {pcc:.3f}\np-value: {p:.3f}")

Pearson correlation coefficient: 0.872
p-value: 0.000


### Spearman’s Rank Correlation  

Assumption: independent, identically distributed  
Null hypthesis: the samples are not correlated.

In [7]:
from scipy.stats import spearmanr

ranking_1 = np.array([1.1, 2.3, 0.3, -2.6, 5, 5])
ranking_2 = np.array([0.1, 2.6, 0.4, -1.9, 4.5, 5.1])

corr, p = spearmanr(ranking_1, ranking_2)

print(f"Correlation: {corr:.3f}\np-value: {p:.4f}")

Correlation: 0.928
p-value: 0.0077


### Kendall's Rank Correlation  

Assumption: independent, identically distributed

In [8]:
from scipy.stats import kendalltau

corr, p = kendalltau(ranking_1, ranking_2)

print(f"Correlation: {corr:.3f}\np-value: {p:.4f}")

Correlation: 0.828
p-value: 0.0217


### Chi^2  
  
Assumption: Independent, n > 25  
Null Hypothesis: the samples are not correlated

In [13]:
from scipy.stats import chisquare

mean_1 = np.mean(ranking_1)
mean_2 = np.mean(ranking_2)


stat, p = chisquare(ranking_1, ranking_2 - mean_2 + mean_1)

print(f"Statistic: {stat:.3f},\np-value: {p:.3f}")

Statistic: 5.858,
p-value: 0.320
