# Distribution Shifts

+ Consider our stock data. 
+ We are interested in testing for changes in the return distribution of our sample data around the onset of the COVID-19 pandemic.

In [1]:
# Load the dotenv IPython extension and read environment variables from ./.env
# (presumably including FEATURES_DATA, which is read further down - confirm).
%load_ext dotenv
%dotenv ./.env
import sys
# Add the relative ../../05_src directory to the import path so that the
# `logger` module imported below can be resolved.
sys.path.append("../../05_src")
from logger import get_logger
# Logger named after this module/notebook.
_logs = get_logger(__name__)

In [2]:
import dask
# Turn on dask's dataframe query-planning option. It is set here, before
# dask.dataframe is imported on the next line (NOTE(review): the ordering
# looks intentional - confirm against the dask configuration docs).
dask.config.set({'dataframe.query-planning': True})
import dask.dataframe as dd
import pandas as pd
import numpy as np
import os
from glob import glob

In [3]:
# Directory holding the feature parquet files, taken from the environment.
ft_dir = os.getenv("FEATURES_DATA")
if not ft_dir:
    # Without this guard a missing variable surfaces as an opaque
    # "NoneType + str" TypeError when the glob pattern is built.
    raise RuntimeError("FEATURES_DATA is not set; check that ./.env was loaded.")

# glob returns matches in arbitrary order; sorting keeps the row order of the
# loaded frame reproducible across runs and machines.
ft_glob = sorted(glob(os.path.join(ft_dir, '*.parquet')))
if not ft_glob:
    raise FileNotFoundError(f"No parquet files found in {ft_dir!r}.")

# Read every file with dask, materialise the result as a pandas DataFrame,
# and move the index back into ordinary columns.
df = dd.read_parquet(ft_glob).compute().reset_index()

## Data Preparation

+ First, prepare five datasets, each with returns from March of a given year up to (but not including) March of the following year.
+ For each data set, we can compute some descriptive statistics.
+ We observe that there may be some distribution changes.

In [4]:
def _march_to_march(frame, start_year):
    """Return the rows of `frame` dated within one March-to-March window.

    The window is half-open: 'Date' >= March 1 of `start_year` and
    'Date' < March 1 of `start_year + 1`. Bounds are compared as
    'YYYY-03-01' strings.
    """
    window_start = f'{start_year}-03-01'
    window_end = f'{start_year + 1}-03-01'
    in_window = (frame['Date'] >= window_start) & (frame['Date'] < window_end)
    return frame[in_window]


# One frame per window, named after the year in which the window starts.
df_2018 = _march_to_march(df, 2018)
df_2019 = _march_to_march(df, 2019)
df_2020 = _march_to_march(df, 2020)
df_2021 = _march_to_march(df, 2021)
df_2022 = _march_to_march(df, 2022)

In [5]:
# Summary statistics of returns for the window starting 2018-03-01 (ends before 2019-03-01).
df_2018['returns'].describe()

count    121482.000000
mean          0.008521
std           0.324490
min          -0.973106
25%          -0.007965
50%           0.000911
75%           0.009286
max          50.656051
Name: returns, dtype: float64

In [6]:
# Summary statistics of returns for the window starting 2019-03-01 (ends before 2020-03-01).
df_2019['returns'].describe()

count    123327.000000
mean          0.008152
std           0.261946
min          -0.892941
25%          -0.007530
50%           0.001030
75%           0.008966
max          40.907243
Name: returns, dtype: float64

In [7]:
# Summary statistics of returns for the window starting 2020-03-01 (ends before 2021-03-01).
df_2020['returns'].describe()

count    123506.000000
mean          0.011066
std           0.491350
min          -0.725330
25%          -0.012945
50%           0.001303
75%           0.016175
max         136.020301
Name: returns, dtype: float64

In [8]:
# Summary statistics of returns for the window starting 2021-03-01 (ends before 2022-03-01).
df_2021['returns'].describe()

count    124739.000000
mean          0.012777
std           0.723247
min          -0.800295
25%          -0.008837
50%           0.000742
75%           0.010323
max         209.045513
Name: returns, dtype: float64

In [9]:
# Summary statistics of returns for the window starting 2022-03-01 (ends before 2023-03-01).
df_2022['returns'].describe()

count    124010.000000
mean          0.008783
std           0.323070
min          -0.824178
25%          -0.012289
50%           0.000164
75%           0.012654
max          68.399998
Name: returns, dtype: float64

# Kolmogorov-Smirnov Test

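+ The KS test can be accessed via the scipy library: `scipy.stats.kstest`.
+ When both arguments are arrays of observations, this function performs a two-sample test (equivalent to `scipy.stats.ks_2samp`).
+ The null hypothesis is that the two samples are drawn from the same distribution.
+ With samples this large (over 120,000 observations each), even small differences produce tiny p-values, so compare the size of the test statistic as well as the p-value.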

In [10]:
from scipy.stats import kstest

# Two-sample KS test: 2018 window vs. 2019 window, missing returns removed.
returns_2018 = df_2018['returns'].dropna()
returns_2019 = df_2019['returns'].dropna()
kstest(returns_2018, returns_2019)

KstestResult(statistic=0.011870110333109585, pvalue=6.421279579215346e-08, statistic_location=0.017902198410837622, statistic_sign=-1)

In [12]:
# Two-sample KS test: 2019 window vs. 2020 window, missing returns removed.
returns_2019 = df_2019['returns'].dropna()
returns_2020 = df_2020['returns'].dropna()
kstest(returns_2019, returns_2020)

KstestResult(statistic=0.1359222448771007, pvalue=0.0, statistic_location=0.017462597314621187, statistic_sign=1)

In [13]:
# Two-sample KS test: 2020 window vs. 2021 window, missing returns removed.
returns_2020 = df_2020['returns'].dropna()
returns_2021 = df_2021['returns'].dropna()
kstest(returns_2020, returns_2021)

KstestResult(statistic=0.10032186205331117, pvalue=0.0, statistic_location=0.017609724541885585, statistic_sign=-1)

In [15]:
# Two-sample KS test: 2021 window vs. 2022 window, missing returns removed.
returns_2021 = df_2021['returns'].dropna()
returns_2022 = df_2022['returns'].dropna()
kstest(returns_2021, returns_2022)

KstestResult(statistic=0.062022693869437734, pvalue=2.090930605968303e-208, statistic_location=-0.011759943380979188, statistic_sign=-1)