In [1]:
import warnings
# NOTE(review): with no category/module arguments this filter hides every
# warning for the rest of the session, including numerical ones such as
# NumPy's divide-by-zero RuntimeWarning that the correlation helpers below
# can trigger on constant input. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import spearmanr, kendalltau

%matplotlib inline

In [3]:
# Small toy frame used to exercise the correlation / distance helpers.
play_data = pd.DataFrame(
    {
        # evenly spaced values centred on zero
        'x1': [-3, -2, -1, 0, 1, 2, 3],
        # x1 squared: a symmetric, non-monotonic relationship with x1
        'x2': [9, 4, 1, 0, 1, 4, 9],
        # x1 shifted by 4: an exact linear relationship with x1
        'x3': [1, 2, 3, 4, 5, 6, 7],
        # strictly increasing, but not linearly
        'x4': [2, 5, 15, 27, 28, 30, 31],
    }
)

In [6]:
# Show the whole frame; it only has seven rows, so no .head() is needed.
play_data

Unnamed: 0,x1,x2,x3,x4
0,-3,9,1,2
1,-2,4,2,5
2,-1,1,3,15
3,0,0,4,27
4,1,1,5,28
5,2,4,6,30
6,3,9,7,31


$$CORR(\textbf{x}, \textbf{y}) = \frac{\sum\limits_{i=1}^{n}(x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum\limits_{i=1}^{n}(x_i-\bar{x})^2}\sqrt{\sum\limits_{i=1}^{n}(y_i-\bar{y})^2}} $$

where $$\bar{x} = \frac{1}{n}\sum\limits_{i=1}^{n}x_i$$

In [7]:
def pearson_corr(x, y):
    '''
    Pearson correlation: the sum of products of the centred values divided
    by the product of the square roots of their sums of squares.

    INPUT
    x - an array of matching length to array y
    y - an array of matching length to array x
    OUTPUT
    corr - the pearson correlation coefficient for comparing x and y
    '''
    # Centre each vector on its own arithmetic mean.
    x_centred = x - np.sum(x)/len(x)
    y_centred = y - np.sum(y)/len(y)

    # Numerator: how the two centred vectors vary together.
    co_variation = np.sum(x_centred * y_centred)

    # Denominator: product of the two root-sum-of-squares terms.
    x_spread = np.sqrt(np.sum(x_centred**2))
    y_spread = np.sqrt(np.sum(y_centred**2))

    return co_variation / (x_spread * y_spread)

In [11]:
# Compare with a tolerance: both sides are floats produced by different
# routines, so exact `==` could fail on harmless rounding differences.
# The result is bound once so the failure message reports the same value
# that was checked.
pearson_result = pearson_corr(play_data['x1'], play_data['x2'])
assert np.isclose(pearson_result, np.corrcoef(play_data['x1'], play_data['x2'])[0][1]), 'Oops!  The correlation between the first two columns should be 0, but your function returned {}.'.format(pearson_result)

$$SCORR(\textbf{x}, \textbf{y}) = \frac{\sum\limits_{i=1}^{n}(x^{r}_i - \bar{x}^{r})(y^{r}_i - \bar{y}^{r})}{\sqrt{\sum\limits_{i=1}^{n}(x^{r}_i-\bar{x}^{r})^2}\sqrt{\sum\limits_{i=1}^{n}(y^{r}_i-\bar{y}^{r})^2}} $$

where

$$\bar{x}^r = \frac{1}{n}\sum\limits_{i=1}^{n}x^r_i$$

In [12]:
def corr_spearman(x, y):
    '''
    Spearman rank correlation: the Pearson formula applied to the ranks of
    the values (ties receive their average rank) instead of the raw values.

    INPUT
    x - an array (list, numpy array or pandas Series) of matching length to array y
    y - an array (list, numpy array or pandas Series) of matching length to array x
    OUTPUT
    corr - the spearman correlation coefficient for comparing x and y
    '''
    # Wrapping in pd.Series lets plain lists / ndarrays use .rank();
    # an existing Series passes through with its index intact.
    x_ranks = pd.Series(x).rank()
    y_ranks = pd.Series(y).rank()

    mean_x, mean_y = np.sum(x_ranks)/len(x_ranks), np.sum(y_ranks)/len(y_ranks)

    x_diffs = x_ranks - mean_x
    y_diffs = y_ranks - mean_y

    num = np.sum(x_diffs * y_diffs)
    den = np.sqrt(np.sum(x_diffs**2)) * np.sqrt(np.sum(y_diffs**2))

    corr = num / den

    return corr

In [13]:
# Compare with a tolerance: both sides are floats produced by different
# routines, so exact `==` could fail on harmless rounding differences.
# The result is bound once and reused in the message, so a failing check
# reports the value that was actually compared.
spearman_result = corr_spearman(play_data['x1'], play_data['x2'])
assert np.isclose(spearman_result, spearmanr(play_data['x1'], play_data['x2'])[0]), 'Oops!  The correlation between the first two columns should be 0, but your function returned {}.'.format(spearman_result)

In [14]:
def kendalls_tau(x, y):
    '''
    Kendall's tau computed over every unordered pair of observations:
    each pair contributes sign(x_i - x_j) * sign(y_i - y_j), and the total
    is divided by the number of pairs, n*(n-1)/2.

    No tie correction is applied to the denominator, so with tied values
    the result can differ from scipy.stats.kendalltau's default variant.

    INPUT
    x - an array (list, numpy array or pandas Series) of matching length to array y
    y - an array (list, numpy array or pandas Series) of matching length to array x
    OUTPUT
    tau - the kendall's tau for comparing x and y
          (nan when there are fewer than two observations)
    RAISES
    ValueError - if x and y do not have the same length
    '''
    n = len(x)
    if n != len(y):
        raise ValueError('x and y must have the same length, got {} and {}.'.format(n, len(y)))
    if n < 2:
        # There are no pairs to compare, so tau is undefined.
        return np.nan

    # Change each vector to ranked values; going through pd.Series lets
    # lists and ndarrays be ranked too, and np.asarray makes the pairing
    # below purely positional.
    x_ranks = np.asarray(pd.Series(x).rank())
    y_ranks = np.asarray(pd.Series(y).rank())

    # Index pairs (i, j) with i < j, i.e. each unordered pair exactly once.
    first, second = np.triu_indices(n, k=1)

    # +1 for a concordant pair, -1 for a discordant pair, 0 when tied.
    pair_signs = (np.sign(x_ranks[first] - x_ranks[second])
                  * np.sign(y_ranks[first] - y_ranks[second]))

    tau = 2*np.sum(pair_signs)/(n*(n-1))

    return tau

### Euclidean Distance

Euclidean distance can also just be considered as straight-line distance between two vectors.

For two vectors x and y, we can compute this as:

$$ EUC(\textbf{x}, \textbf{y}) = \sqrt{\sum\limits_{i=1}^{n}(x_i - y_i)^2}$$

### Manhattan Distance
Unlike Euclidean distance, Manhattan distance is a 'Manhattan block' (city-block) distance from one vector to another. You can picture it as the distance between two points when you must follow the street grid because you cannot cut through buildings.

Specifically, this distance is computed as:

$$ MANHATTAN(\textbf{x}, \textbf{y}) = \sum\limits_{i=1}^{n}|x_i - y_i|$$

In [15]:
def eucl_dist(x, y):
    '''
    Straight-line distance between two vectors.

    INPUT
    x - an array of matching length to array y
    y - an array of matching length to array x
    OUTPUT
    euc - the euclidean distance between x and y
    '''
    # Element-wise gap between the two vectors.
    gap = x - y

    # np.linalg.norm with default arguments; for a 1-D input this is the
    # square root of the sum of squared entries.
    euc = np.linalg.norm(gap)

    return euc
    
def manhat_dist(x, y):
    '''
    INPUT
    x - an array of matching length to array y
    y - an array of matching length to array x
    OUTPUT
    manhat - the manhattan distance between x and y
    '''  
    return sum(abs(e - s) for s, e in zip(x, y))