# Mahalanobis Distance

## Mahalanobis distance is the distance between two points in a multivariate space, measured relative to the covariance of the variables. It is used in statistical analyses to find outliers that involve several variables.


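## Formula: $D^2(x) = (x - \mu)^{T} \Sigma^{-1} (x - \mu)$, where $\mu$ is the mean vector and $\Sigma$ is the covariance matrix. When $\Sigma$ is the identity matrix this reduces to the squared Euclidean distance, $d(p,q)^2 = (p_1 - q_1)^2 + (p_2 - q_2)^2$.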

In [1]:
# Numerical arrays and linear algebra.
import numpy as np
# NOTE(review): this binds the top-level scipy package to the name `stats`
# (it is not scipy.stats); the alias is not used in the cells shown here.
import scipy as stats
# Chi-square distribution, used further down to compute p-values.
from scipy.stats import chi2

import warnings
# Suppress all warnings emitted after this point.
warnings.filterwarnings("ignore") 

# yfinance is used to fetch data 
import yfinance as yf
# NOTE(review): presumably routes pandas_datareader calls through yfinance;
# confirm this function still exists in the installed yfinance version.
yf.pdr_override()

In [2]:
# Ticker and date window for the price history.
symbol = 'AMD'
start, end = '2018-01-01', '2019-01-01'

# Download the price history for that ticker and window.
dataset = yf.download(symbol, start=start, end=end)

# Preview the first rows and the available columns.
dataset.head()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-01-02,10.42,11.02,10.34,10.98,10.98,44146300
2018-01-03,11.61,12.14,11.36,11.55,11.55,154066700
2018-01-04,12.1,12.43,11.97,12.12,12.12,109503000
2018-01-05,12.19,12.22,11.66,11.88,11.88,63808900
2018-01-08,12.01,12.3,11.85,12.28,12.28,63346000


In [3]:
dataset.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-12-24,16.52,17.219999,16.370001,16.65,16.65,62933100
2018-12-26,16.879999,17.91,16.030001,17.9,17.9,108811800
2018-12-27,17.43,17.74,16.440001,17.49,17.49,111373000
2018-12-28,17.530001,18.309999,17.139999,17.82,17.82,109214400
2018-12-31,18.15,18.51,17.85,18.459999,18.459999,84732200


In [4]:
# Keep only the four price columns; 'Adj Close' and 'Volume' are not used
# as features.
unused_columns = ['Adj Close', 'Volume']
dataset = dataset.drop(columns=unused_columns)
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-02,10.42,11.02,10.34,10.98
2018-01-03,11.61,12.14,11.36,11.55
2018-01-04,12.1,12.43,11.97,12.12
2018-01-05,12.19,12.22,11.66,11.88
2018-01-08,12.01,12.3,11.85,12.28


In [5]:
def mahalanobis_distance(x=None, data=None, cov=None):
    """Return the squared Mahalanobis distance of each row of *x*.

    The distance is measured from the column-wise mean of *data*:
    ``(x - mu) @ inv(cov) @ (x - mu).T`` evaluated row by row. No square
    root is taken, so the result is the squared distance.

    Parameters
    ----------
    x : 2-D array-like (e.g. DataFrame), shape (n_obs, n_features)
        Observations to score. Columns must be in the same order as in
        *data*.
    data : 2-D array-like, shape (n_samples, n_features)
        Reference sample used for the mean and, when *cov* is not given,
        for the covariance matrix.
    cov : 2-D array-like, shape (n_features, n_features), optional
        Covariance matrix to use instead of the one estimated from *data*.

    Returns
    -------
    numpy.ndarray, shape (n_obs,)
        Squared Mahalanobis distance for every row of *x*.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariance matrix is singular.
    """
    x_arr = np.asarray(x, dtype=float)
    data_arr = np.asarray(data, dtype=float)

    # Explicit axis=0 gives the per-feature mean regardless of how the
    # installed pandas/numpy versions treat a reduction with no axis.
    x_mu = x_arr - data_arr.mean(axis=0)

    # Compare against None: truth-testing an array raises ValueError.
    if cov is None:
        cov = np.cov(data_arr.T)
    inv_covmat = np.linalg.inv(np.asarray(cov, dtype=float))

    # Row-wise quadratic form; equals the diagonal of
    # x_mu @ inv_covmat @ x_mu.T without building the n_obs x n_obs matrix.
    left = np.dot(x_mu, inv_covmat)
    return np.sum(left * x_mu, axis=1)

In [6]:
# Score every row of the price table against the table's own mean and
# covariance. Despite its name, `df` holds the array returned by
# mahalanobis_distance (see the printed output), not a DataFrame.
df = mahalanobis_distance(x=dataset, data=dataset)
df

array([ 2.34360202,  2.44314893,  1.00051049,  1.21842069,  1.0076011 ,
        1.1097397 ,  1.16944107,  1.06911884,  1.00390335,  0.85728349,
        0.9411238 ,  1.0507168 ,  1.47890511,  0.93506169,  1.04127015,
        0.88274656,  0.91767493,  0.90369209,  0.64917069,  0.61214883,
        0.90104305,  1.04678643,  1.24920747,  6.04067172,  2.26186731,
        1.07026566,  1.62663777,  5.99515177,  0.96906497,  0.9987026 ,
        1.41960866,  0.86803649,  1.14565132,  1.10774379,  1.18389276,
        0.83843433,  1.14792339,  0.9038267 ,  1.51593834,  1.29103672,
        1.43003373,  0.96934244,  0.87583202,  1.06060443,  8.04445156,
        1.44196559,  1.10803103,  1.08750665,  2.09671055,  1.15534847,
        1.14637643,  1.22763692,  1.47350503,  1.13439172,  1.15168505,
        1.19876804,  1.26982093,  1.79159781,  2.17916227,  1.71617204,
        1.51686826,  2.41694351,  1.80552464,  4.09364976,  1.3772735 ,
        2.16947298,  1.67262181,  1.49514589,  1.60919369,  1.59

In [7]:
# Replace the Date index with a default integer index; drop=True discards
# the old index instead of keeping it as a column.
dataset = dataset.reset_index(drop=True)

In [8]:
dataset.head()

Unnamed: 0,Open,High,Low,Close
0,10.42,11.02,10.34,10.98
1,11.61,12.14,11.36,11.55
2,12.1,12.43,11.97,12.12
3,12.19,12.22,11.66,11.88
4,12.01,12.3,11.85,12.28


In [9]:
# Score only the price columns. Selecting them for `x` as well as `data`
# keeps the cell re-runnable: once the 'mahalanobis' column exists, passing
# the whole frame as `x` would no longer match the 4x4 covariance matrix.
features = ['Open', 'High', 'Low', 'Close']
dataset['mahalanobis'] = mahalanobis_distance(x=dataset[features], data=dataset[features])
dataset.head()

Unnamed: 0,Open,High,Low,Close,mahalanobis
0,10.42,11.02,10.34,10.98,2.343602
1,11.61,12.14,11.36,11.55,2.443149
2,12.1,12.43,11.97,12.12,1.00051
3,12.19,12.22,11.66,11.88,1.218421
4,12.01,12.3,11.85,12.28,1.007601


In [10]:
# Upper-tail chi-square probability of each value in 'mahalanobis', with
# 4 degrees of freedom (one per price column: Open, High, Low, Close).
# chi2.sf is the survival function (1 - cdf) evaluated directly, which avoids
# the precision loss of the subtraction for large distances, where the
# p-values are smallest.
dataset['p'] = chi2.sf(dataset['mahalanobis'], 4)
dataset.head()

Unnamed: 0,Open,High,Low,Close,mahalanobis,p
0,10.42,11.02,10.34,10.98,2.343602,0.672842
1,11.61,12.14,11.36,11.55,2.443149,0.654844
2,12.1,12.43,11.97,12.12,1.00051,0.909719
3,12.19,12.22,11.66,11.88,1.218421,0.875057
4,12.01,12.3,11.85,12.28,1.007601,0.908641
