In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
# Read both prepared samples; the "date" column is parsed into datetimes on load.
df_1 = pd.read_csv("prepared_data_sample_1.csv", parse_dates=["date"])
df_2 = pd.read_csv("prepared_data_sample_2.csv", parse_dates=["date"])
# Restrict sample 1 to observations from 2020-02-25 onwards and take an
# independent (deep) copy of the filtered rows.
df_1 = df_1.loc[df_1["date"] >= pd.Timestamp("2020-02-25")].copy()

In [3]:
# Growth-rate variants of the HDAX close: percentage change, log difference
# and absolute difference to the previous row (the first row has no
# predecessor and therefore becomes NaN).
df_1["d_close_pct"] = df_1["hdax_close"].diff() / df_1["hdax_close"].shift(1)
df_1["d_close_log"] = np.log(df_1["hdax_close"]).diff()
df_1["d_close_diff"] = df_1["hdax_close"].diff()

In [4]:
# Growth-rate variants of total cases: percentage change, log difference
# and absolute difference to the previous row (first row becomes NaN).
df_1["d_total_pct"] = df_1["total_cases"].diff() / df_1["total_cases"].shift(1)
df_1["d_total_log"] = np.log(df_1["total_cases"]).diff()
df_1["d_total_diff"] = df_1["total_cases"].diff()

In [5]:
# Growth-rate variants of new cases: percentage change, log difference
# and absolute difference to the previous row (first row becomes NaN).
# NOTE(review): a zero in new_cases would make the pct/log variants inf or NaN;
# confirm the sample from 2020-02-25 onwards contains none.
df_1["d_new_pct"] = df_1["new_cases"].diff() / df_1["new_cases"].shift(1)
df_1["d_new_log"] = np.log(df_1["new_cases"]).diff()
df_1["d_new_diff"] = df_1["new_cases"].diff()

In [6]:
# Growth-rate variants of the containment index: percentage change, log
# difference and absolute difference to the previous row (first row becomes NaN).
df_1["d_containment_pct"] = df_1["containment_index"].diff() / df_1["containment_index"].shift(1)
df_1["d_containment_log"] = np.log(df_1["containment_index"]).diff()
df_1["d_containment_diff"] = df_1["containment_index"].diff()

In [7]:
df_1.head(5)

Unnamed: 0,date,hdax_open,hdax_close,total_cases,new_cases,stringency_index,containment_index,Monday,Tuesday,Wednesday,...,d_close_diff,d_total_pct,d_total_log,d_total_diff,d_new_pct,d_new_log,d_new_diff,d_containment_pct,d_containment_log,d_containment_diff
21,2020-02-25,7315.52,7146.96,17.0,1.0,11.11,16.67,0,1,0,...,,,,,,,,,,
22,2020-02-26,7080.95,7123.72,27.0,10.0,16.67,20.24,0,0,1,...,-23.24,0.588235,0.462624,10.0,9.0,2.302585,9.0,0.214157,0.19405,3.57
23,2020-02-27,7005.96,6890.99,46.0,19.0,16.67,20.24,0,0,0,...,-232.73,0.703704,0.532805,19.0,0.9,0.641854,9.0,0.0,0.0,0.0
24,2020-02-28,6645.63,6630.07,48.0,2.0,19.44,24.4,0,0,0,...,-260.92,0.043478,0.04256,2.0,-0.894737,-2.251292,-17.0,0.205534,0.186922,4.16
25,2020-03-02,6738.28,6613.65,159.0,29.0,25.0,27.98,1,0,0,...,-16.42,2.3125,1.197703,111.0,13.5,2.674149,27.0,0.146721,0.136907,3.58


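1. Wachstumsraten sind problematisch, solange es keine neuen Fälle gibt und die Reihen Nullen enthalten (Division durch null bzw. log(0)). Deswegen beginnt die Stichprobe erst am 25. Februar 2020.
2. Man merkt, dass die Wachstumsraten anfangs sehr hoch sind; die Log-Differenz ist dann nur eine sehr ungenaue Näherung der prozentualen Wachstumsrate (z. B. am 02.03.2020 bei den Total Cases: 2,31 prozentual gegenüber 1,20 als Log-Differenz).
--> Also kein Log-Wachstum für die Covid-Fälle und den Containment-Index verwenden!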

In [8]:
# Drop the raw level columns listed here; the stationarity tests in the visible
# cells only read the derived d_* columns.
# Reassigning instead of inplace=True, together with errors="ignore", keeps the
# cell re-runnable: executing it a second time no longer raises a KeyError for
# columns that are already gone.
df_1 = df_1.drop(columns=["hdax_open", "hdax_close", "total_cases"], errors="ignore")

In [9]:
#ADF-Test um die verschiedenen Zeitreihen auf Stationarität zu prüfen
from statsmodels.tsa.stattools import adfuller


def adf_test(timeseries):
    """Run an Augmented Dickey-Fuller test and print a labelled summary.

    The series is passed to ``adfuller`` with ``autolag="AIC"``. The printed
    summary contains the test statistic, the p-value, the number of lags
    used, the number of observations used and one critical value per
    significance level reported by ``adfuller``.

    Parameters
    ----------
    timeseries : array-like
        Series to test. It is forwarded unchanged; the calling cells pass it
        after ``.dropna()``.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    print("Results of Dickey-Fuller Test:")
    statistic, p_value, used_lags, n_obs, critical_values = adfuller(
        timeseries, autolag="AIC"
    )[:5]
    summary = {
        "Test Statistic": statistic,
        "p-value": p_value,
        "#Lags Used": used_lags,
        "Number of Observations Used": n_obs,
    }
    # One extra row per significance level, in the order adfuller reports them.
    for level, threshold in critical_values.items():
        summary["Critical Value (%s)" % level] = threshold
    print(pd.Series(summary))

    
#KPSS-Test um die verschiedenen Zeitreihen auf Stationarität zu prüfen
from statsmodels.tsa.stattools import kpss


def kpss_test(timeseries):
    """Run a KPSS test and print a labelled summary.

    The series is passed to ``kpss`` with ``regression="c"`` and
    ``nlags="auto"``. The printed summary contains the test statistic, the
    p-value, the number of lags used and one critical value per significance
    level reported by ``kpss``.

    Parameters
    ----------
    timeseries : array-like
        Series to test. It is forwarded unchanged; the calling cells pass it
        after ``.dropna()``.

    Returns
    -------
    None
        The summary is printed, not returned.
    """
    print("Results of KPSS Test:")
    statistic, p_value, used_lags, critical_values = kpss(
        timeseries, regression="c", nlags="auto"
    )[:4]
    summary = {
        "Test Statistic": statistic,
        "p-value": p_value,
        "Lags Used": used_lags,
    }
    # One extra row per significance level, in the order kpss reports them.
    for level, threshold in critical_values.items():
        summary["Critical Value (%s)" % level] = threshold
    print(pd.Series(summary))

# 1. HDAX Wachstumsraten auf Stationarität testen

## 1.1 ADF-Test

In [11]:
adf_test(df_1["d_close_pct"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                 -2.637632
p-value                         0.085466
#Lags Used                      6.000000
Number of Observations Used    79.000000
Critical Value (1%)            -3.515977
Critical Value (5%)            -2.898886
Critical Value (10%)           -2.586694
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can not be rejected. Hence, the series is non-stationary.

In [12]:
adf_test(df_1["d_close_log"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                 -2.613725
p-value                         0.090194
#Lags Used                      6.000000
Number of Observations Used    79.000000
Critical Value (1%)            -3.515977
Critical Value (5%)            -2.898886
Critical Value (10%)           -2.586694
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can not be rejected. Hence, the series is non-stationary.

In [13]:
adf_test(df_1["d_close_diff"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                 -2.509850
p-value                         0.113096
#Lags Used                      6.000000
Number of Observations Used    79.000000
Critical Value (1%)            -3.515977
Critical Value (5%)            -2.898886
Critical Value (10%)           -2.586694
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can not be rejected. Hence, the series is non-stationary.

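--> Laut dem ADF-Test spricht die %-Wachstumsrate des HDAX am stärksten gegen eine Einheitswurzel (kleinster p-Wert, 0,085). Die Nullhypothese der Nichtstationarität lässt sich allerdings nur zum 10%-Niveau verwerfen, nicht zum 5%-Niveau.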

## 1.2 KPSS-Test

In [16]:
kpss_test(df_1["d_close_pct"].dropna())

Results of KPSS Test:
Test Statistic           0.326573
p-value                  0.100000
Lags Used                3.000000
Critical Value (10%)     0.347000
Critical Value (5%)      0.463000
Critical Value (2.5%)    0.574000
Critical Value (1%)      0.739000
dtype: float64


look-up table. The actual p-value is greater than the p-value returned.



Based upon the significance level of 0.05 and the p-value of KPSS test, there is no evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is stationary as per the KPSS test.

--> KPSS indicates stationarity and ADF indicates non-stationarity - The series is trend stationary. Trend needs to be removed to make series strict stationary. The detrended series is checked for stationarity.

In [17]:
kpss_test(df_1["d_close_log"].dropna())

Results of KPSS Test:
Test Statistic           0.345759
p-value                  0.100000
Lags Used                3.000000
Critical Value (10%)     0.347000
Critical Value (5%)      0.463000
Critical Value (2.5%)    0.574000
Critical Value (1%)      0.739000
dtype: float64


look-up table. The actual p-value is greater than the p-value returned.



Based upon the significance level of 0.05 and the p-value of KPSS test, there is no evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is stationary as per the KPSS test.

--> KPSS indicates stationarity and ADF indicates non-stationarity - The series is trend stationary. Trend needs to be removed to make series strict stationary. The detrended series is checked for stationarity.

In [18]:
kpss_test(df_1["d_close_diff"].dropna())

Results of KPSS Test:
Test Statistic           0.390086
p-value                  0.081428
Lags Used                3.000000
Critical Value (10%)     0.347000
Critical Value (5%)      0.463000
Critical Value (2.5%)    0.574000
Critical Value (1%)      0.739000
dtype: float64


Based upon the significance level of 0.05 and the p-value of KPSS test, there is no evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is stationary as per the KPSS test.

### Frage: Die Log-Wachstumsrate scheint trendstationär zu sein. Man könnte nun also eine Differenzierung der Wachstumsrate vornehmen, was allerdings die Interpretation stark erschweren würde. Oder aber man arbeitet einfach mit der Log- bzw. %-Wachstumsrate weiter, obwohl der ADF-Test die Nullhypothese der Nichtstationarität nur zum 10%-Niveau und nicht zum 5%-Niveau verwerfen kann.

# 2. Covid Cases auf Stationarität untersuchen

## 2.1 ADF-Test

In [19]:
adf_test(df_1["d_total_pct"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                 -3.815882
p-value                         0.002749
#Lags Used                     10.000000
Number of Observations Used    75.000000
Critical Value (1%)            -3.520713
Critical Value (5%)            -2.900925
Critical Value (10%)           -2.587781
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can be rejected. Hence, the series is stationary.

In [20]:
adf_test(df_1["d_total_log"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                 -2.656899
p-value                         0.081798
#Lags Used                     10.000000
Number of Observations Used    75.000000
Critical Value (1%)            -3.520713
Critical Value (5%)            -2.900925
Critical Value (10%)           -2.587781
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can not be rejected. Hence, the series is non-stationary.

In [21]:
adf_test(df_1["d_total_diff"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                 -2.035835
p-value                         0.271085
#Lags Used                      7.000000
Number of Observations Used    78.000000
Critical Value (1%)            -3.517114
Critical Value (5%)            -2.899375
Critical Value (10%)           -2.586955
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can not be rejected. Hence, the series is non-stationary.

## 2.2 KPSS-Test

In [22]:
kpss_test(df_1["d_total_pct"].dropna())

Results of KPSS Test:
Test Statistic           1.207636
p-value                  0.010000
Lags Used                4.000000
Critical Value (10%)     0.347000
Critical Value (5%)      0.463000
Critical Value (2.5%)    0.574000
Critical Value (1%)      0.739000
dtype: float64


look-up table. The actual p-value is smaller than the p-value returned.



Based upon the significance level of 0.05 and the p-value of KPSS test, there is evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is non-stationary as per the KPSS test.

In [23]:
kpss_test(df_1["d_total_log"].dropna())

Results of KPSS Test:
Test Statistic           1.13292
p-value                  0.01000
Lags Used                5.00000
Critical Value (10%)     0.34700
Critical Value (5%)      0.46300
Critical Value (2.5%)    0.57400
Critical Value (1%)      0.73900
dtype: float64


look-up table. The actual p-value is smaller than the p-value returned.



Based upon the significance level of 0.05 and the p-value of KPSS test, there is evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is non-stationary as per the KPSS test.

In [31]:
kpss_test(df_1["d_total_diff"].dropna())

Results of KPSS Test:
Test Statistic           0.452969
p-value                  0.054324
Lags Used                5.000000
Critical Value (10%)     0.347000
Critical Value (5%)      0.463000
Critical Value (2.5%)    0.574000
Critical Value (1%)      0.739000
dtype: float64


Based upon the significance level of 0.05 and the p-value of KPSS test, there is no evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is stationary as per the KPSS test.

# 3. New Cases auf Stationarität untersuchen

## 3.1 ADF-Test

In [25]:
adf_test(df_1["d_new_pct"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                -1.095441e+01
p-value                        8.634915e-20
#Lags Used                     0.000000e+00
Number of Observations Used    8.500000e+01
Critical Value (1%)           -3.509736e+00
Critical Value (5%)           -2.896195e+00
Critical Value (10%)          -2.585258e+00
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can be rejected. Hence, the series is stationary.

In [26]:
adf_test(df_1["d_new_log"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                 -3.670153
p-value                         0.004553
#Lags Used                      4.000000
Number of Observations Used    81.000000
Critical Value (1%)            -3.513790
Critical Value (5%)            -2.897943
Critical Value (10%)           -2.586191
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can be rejected. Hence, the series is stationary.

In [27]:
adf_test(df_1["d_new_diff"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                 -1.662478
p-value                         0.450541
#Lags Used                      5.000000
Number of Observations Used    80.000000
Critical Value (1%)            -3.514869
Critical Value (5%)            -2.898409
Critical Value (10%)           -2.586439
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can not be rejected. Hence, the series is non-stationary.

## 3.2 KPSS-Test

In [28]:
kpss_test(df_1["d_new_pct"].dropna())

Results of KPSS Test:
Test Statistic           0.431997
p-value                  0.063363
Lags Used                2.000000
Critical Value (10%)     0.347000
Critical Value (5%)      0.463000
Critical Value (2.5%)    0.574000
Critical Value (1%)      0.739000
dtype: float64


Based upon the significance level of 0.05 and the p-value of KPSS test, there is no evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is stationary as per the KPSS test.

In [29]:
kpss_test(df_1["d_new_log"].dropna())

Results of KPSS Test:
Test Statistic            0.338039
p-value                   0.100000
Lags Used                21.000000
Critical Value (10%)      0.347000
Critical Value (5%)       0.463000
Critical Value (2.5%)     0.574000
Critical Value (1%)       0.739000
dtype: float64


look-up table. The actual p-value is greater than the p-value returned.



Based upon the significance level of 0.05 and the p-value of KPSS test, there is no evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is stationary as per the KPSS test.

In [30]:
kpss_test(df_1["d_new_diff"].dropna())

Results of KPSS Test:
Test Statistic            0.500000
p-value                   0.041667
Lags Used                85.000000
Critical Value (10%)      0.347000
Critical Value (5%)       0.463000
Critical Value (2.5%)     0.574000
Critical Value (1%)       0.739000
dtype: float64


Based upon the significance level of 0.05 and the p-value of KPSS test, there is evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is non-stationary as per the KPSS test.

# 4. Containment Index auf Stationarität prüfen

## 4.1 ADF-Test

In [32]:
adf_test(df_1["d_containment_pct"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                 -3.385961
p-value                         0.011447
#Lags Used                      4.000000
Number of Observations Used    81.000000
Critical Value (1%)            -3.513790
Critical Value (5%)            -2.897943
Critical Value (10%)           -2.586191
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can be rejected. Hence, the series is stationary.

In [33]:
adf_test(df_1["d_containment_diff"].dropna())

Results of Dickey-Fuller Test:
Test Statistic                -8.324862e+00
p-value                        3.492661e-13
#Lags Used                     0.000000e+00
Number of Observations Used    8.500000e+01
Critical Value (1%)           -3.509736e+00
Critical Value (5%)           -2.896195e+00
Critical Value (10%)          -2.585258e+00
dtype: float64


Based upon the significance level of 0.05 and the p-value of ADF test, the null hypothesis can be rejected. Hence, the series is stationary.

## 4.2 KPSS-Test

In [34]:
kpss_test(df_1["d_containment_pct"].dropna())

Results of KPSS Test:
Test Statistic           0.924071
p-value                  0.010000
Lags Used                3.000000
Critical Value (10%)     0.347000
Critical Value (5%)      0.463000
Critical Value (2.5%)    0.574000
Critical Value (1%)      0.739000
dtype: float64


look-up table. The actual p-value is smaller than the p-value returned.



Based upon the significance level of 0.05 and the p-value of KPSS test, there is evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is non-stationary as per the KPSS test.

In [35]:
kpss_test(df_1["d_containment_diff"].dropna())

Results of KPSS Test:
Test Statistic           0.707899
p-value                  0.012827
Lags Used                2.000000
Critical Value (10%)     0.347000
Critical Value (5%)      0.463000
Critical Value (2.5%)    0.574000
Critical Value (1%)      0.739000
dtype: float64


Based upon the significance level of 0.05 and the p-value of KPSS test, there is evidence for rejecting the null hypothesis in favor of the alternative. Hence, the series is non-stationary as per the KPSS test.

--> KPSS indicates non-stationarity and ADF indicates stationarity - The series is difference stationary. Differencing is to be used to make series stationary. The differenced series is checked for stationarity.
