# Create Granger DataFrame

## Imports

In [7]:
# Pandas is an open source data analysis and manipulation tool
import pandas as pd
#adf test
from statsmodels.tsa.stattools import adfuller
#granger causality test
from statsmodels.tsa.stattools import grangercausalitytests

## Load DataFrame

In [5]:
# Load the combined occurrence data, set the correct column datatypes and
# then add the rolling-window "smoothened_*" columns keyword by keyword.
# NOTE: set_correct_datatypes and smoothen_timeseries are defined in the
# "Helper" cell further down; that cell has to be executed before this one.
Occurence_df = (
    pd.read_csv("/Users/jan/Documents/Python_Projects/Bachelorthesis/Analysis/DataFrames/Combined_Dataframe_11042022.csv")
    .pipe(set_correct_datatypes)
    .groupby("KeyWord")
    .apply(smoothen_timeseries)
)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 110380 entries, 0 to 110379
Data columns (total 14 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   KeyWord                                       110380 non-null  object        
 1   date                                          110380 non-null  datetime64[ns]
 2   Occurence_in_News                             110380 non-null  int64         
 3   normalized_Occurence_in_News                  110380 non-null  float64       
 4   Occurence_in_Google                           110380 non-null  int64         
 5   normalized_Occurence_in_Google                110380 non-null  float64       
 6   Occurence_in_Wikipedia                        110380 non-null  int64         
 7   normalized_Occurence_in_Wikipedia             110380 non-null  float64       
 8   smoothened_normalized_Occurence_in_News       105949 n

## Helper

In [13]:
def saveDF_as_CSV(dataframe, filename):
    """Save `dataframe` as `<filename>.csv` in the project's DataFrames folder.

    Parameters
    ----------
    dataframe : object with a ``to_csv`` method (e.g. a pandas DataFrame)
        The data to write; the index is not written.
    filename : str
        File name without the ``.csv`` extension.
    """
    target_directory = "/Users/jan/Documents/Python_Projects/Bachelorthesis/Analysis/DataFrames/"
    target_path = target_directory + filename + ".csv"
    dataframe.to_csv(target_path, index=False)

def smoothen_timeseries(dataframe, window_size=7):
    """Return a copy of `dataframe` with rolling-sum columns added.

    For each of the six occurrence columns (normalized and raw counts for
    News, Google and Wikipedia) a new column ``"smoothened_" + <column name>``
    is added that holds the rolling sum over `window_size` consecutive rows.
    The first ``window_size - 1`` rows of every new column are NaN because the
    window is not yet full.

    The input frame is NOT modified; this matters when the function is used
    with ``groupby(...).apply(...)``, where mutating the passed group is not
    supported by pandas.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the six occurrence columns listed below.
    window_size : int, default 7
        Number of rows in the rolling window.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus the six
        ``smoothened_*`` columns.
    """
    normalized_occurence_column_names = [
        "normalized_Occurence_in_News",
        "normalized_Occurence_in_Google",
        "normalized_Occurence_in_Wikipedia",
    ]
    occurence_column_names = [
        "Occurence_in_News",
        "Occurence_in_Google",
        "Occurence_in_Wikipedia",
    ]

    # Work on a copy so the caller's frame (or groupby group) stays untouched.
    smoothened = dataframe.copy()
    for column_name in normalized_occurence_column_names + occurence_column_names:
        smoothened["smoothened_" + column_name] = (
            dataframe[column_name].rolling(window_size).sum()
        )
    return smoothened

def set_correct_datatypes(dataframe):
    """Convert the ``KeyWord`` column to strings and ``date`` to datetimes.

    The conversion is written back into `dataframe` itself (the passed frame
    is modified) and the same object is returned for convenience.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``KeyWord`` column and a ``date`` column whose values
        are formatted as ``YYYY-MM-DD``.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in, with converted columns.
    """
    # astype returns a new Series, so the result has to be assigned back;
    # calling it without assignment leaves the column unchanged.
    dataframe["KeyWord"] = dataframe["KeyWord"].astype(str)
    dataframe["date"] = pd.to_datetime(dataframe["date"], format='%Y-%m-%d')
    return dataframe



## Create DataFrame

This cell takes the time series of each keyword from the occurrence DataFrame, runs the ADF test and the Granger causality test on it, and saves all results in a DataFrame.

!!! Data is not corrected for stationarity !!!

In [16]:
# Run the ADF test and the Granger causality test (does Wikipedia Granger-cause
# News?) for every keyword and collect one result row per keyword.
# The series are used as they are, without any correction for stationarity.
granger_test_df = Occurence_df[["KeyWord", "date", "Occurence_in_News", "Occurence_in_Wikipedia"]]
granger_test_df = granger_test_df.set_index("date")

# Full daily range between the first and last date of the whole data set;
# used below to add the dates that are missing for a keyword.
date_range = pd.date_range(granger_test_df.index.min(), granger_test_df.index.max(), freq='D')

return_list = []

# grangercausalitytests checks whether the series in the SECOND column
# Granger-causes the series in the FIRST column.
News_grangerCauses_Wikipedia = ['Occurence_in_Wikipedia', 'Occurence_in_News']
Wikipedia_grangerCauses_News = ['Occurence_in_News', 'Occurence_in_Wikipedia']

number_of_lags = 7
test_name = 'ssr_ftest'
# Position of the p-value inside each test's result tuple, and the label used
# for it in the names of the result columns.
p_value_index = 1
p_value_label = "pValue"

grouped_by_keyword = granger_test_df.groupby("KeyWord")
for keyWord, grouped_keyWord in grouped_by_keyword:
    print(keyWord)
    # Handle errors per keyword so that one failing keyword is reported and
    # skipped instead of silently ending the whole run.
    try:
        return_dict = {"keyword_name": keyWord}

        # add missing dates (filled with 0 occurrences)
        grouped_keyWord = grouped_keyWord.copy()
        grouped_keyWord.index = pd.DatetimeIndex(grouped_keyWord.index)
        grouped_keyWord = grouped_keyWord.reindex(date_range, fill_value=0)

        # ADF test on both series
        adf_test_result_wikipedia = adfuller(grouped_keyWord['Occurence_in_Wikipedia'].values)
        adf_test_result_news = adfuller(grouped_keyWord['Occurence_in_News'].values)

        # save ADF test results: statistic, p-value and critical values
        # wikipedia
        return_dict["ADF_Statistics_wiki"] = adf_test_result_wikipedia[0]
        return_dict["ADF_p_value_wiki"] = adf_test_result_wikipedia[1]
        return_dict["ADF_critical_values_wiki"] = adf_test_result_wikipedia[4]
        # news
        return_dict["ADF_Statistics_news"] = adf_test_result_news[0]
        return_dict["ADF_p_value_news"] = adf_test_result_news[1]
        return_dict["ADF_critical_values_news"] = adf_test_result_news[4]

        granger_test_result = grangercausalitytests(
            grouped_keyWord[Wikipedia_grangerCauses_News], maxlag=number_of_lags
        )

        # The result is keyed by lag, starting at 1 (lags are not zero based).
        for lag in range(1, number_of_lags + 1):
            p_value = granger_test_result[lag][0][test_name][p_value_index]
            return_dict[f"{test_name}_{p_value_label}_lag{lag}"] = round(p_value, 4)
    except Exception as error:
        print(f"Error! {keyWord}: {error!r}")
        continue

    return_list.append(return_dict)

df = pd.DataFrame(return_list)

saveDF_as_CSV(df, "Wikipedia_grangerCauses_News___NOT_Stationary_DataFrame")

2G

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=2.0089  , p=0.1585  , df_denom=150, df_num=1
ssr based chi2 test:   chi2=2.0491  , p=0.1523  , df=1
likelihood ratio test: chi2=2.0355  , p=0.1537  , df=1
parameter F test:         F=2.0089  , p=0.1585  , df_denom=150, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=1.0287  , p=0.3600  , df_denom=147, df_num=2
ssr based chi2 test:   chi2=2.1274  , p=0.3452  , df=2
likelihood ratio test: chi2=2.1127  , p=0.3477  , df=2
parameter F test:         F=1.0287  , p=0.3600  , df_denom=147, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=4.4891  , p=0.0048  , df_denom=144, df_num=3
ssr based chi2 test:   chi2=14.1220 , p=0.0027  , df=3
likelihood ratio test: chi2=13.5001 , p=0.0037  , df=3
parameter F test:         F=4.4891  , p=0.0048  , df_denom=144, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=4.5695  , p=0.001

In [8]:
df

Unnamed: 0,keyword_name,ADF_Statistics_wiki,ADF_p_value_wiki,ADF_critical_values_wiki,ADF_Statistics_news,ADF_p_value_news,ADF_critical_values_news,ssr_ftest_pValue_lag1,ssr_ftest_pValue_lag2,ssr_ftest_pValue_lag3,ssr_ftest_pValue_lag4,ssr_ftest_pValue_lag5,ssr_ftest_pValue_lag6,ssr_ftest_pValue_lag7
0,2G,-6.152355,7.509110e-08,"{'1%': -3.4779446621720114, '5%': -2.882415612...",-2.929129,4.206405e-02,"{'1%': -3.4753253063120644, '5%': -2.881274703...",0.1628,0.0447,0.0000,0.0000,0.0001,0.0002,0.0004
1,2G-Regel,-1.781506,3.896814e-01,"{'1%': -3.4744158894942156, '5%': -2.880878382...",-2.630608,8.683474e-02,"{'1%': -3.4756368462466662, '5%': -2.881410446...",0.2824,0.4765,0.6366,0.5006,0.5426,0.4408,0.3383
2,3G,-5.992842,1.729179e-07,"{'1%': -3.4776006742422374, '5%': -2.882265832...",-11.614575,2.466558e-21,"{'1%': -3.473829775724492, '5%': -2.8806228997...",0.9263,0.9745,0.8604,0.0692,0.1051,0.0899,0.1025
3,A,-11.208058,2.159864e-20,"{'1%': -3.473829775724492, '5%': -2.8806228997...",-12.570639,2.007762e-23,"{'1%': -3.473829775724492, '5%': -2.8806228997...",0.0076,0.0377,0.0476,0.0806,0.1077,0.1540,0.1452
4,Ab,-12.506757,2.729811e-23,"{'1%': -3.473829775724492, '5%': -2.8806228997...",-10.305089,3.294816e-18,"{'1%': -3.473829775724492, '5%': -2.8806228997...",0.8998,0.6889,0.3913,0.4013,0.4167,0.5142,0.6270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
735,Ärger,-4.321307,4.079498e-04,"{'1%': -3.474714913481481, '5%': -2.8810087081...",-11.623479,2.353854e-21,"{'1%': -3.473829775724492, '5%': -2.8806228997...",0.7315,0.8551,0.3871,0.4459,0.4721,0.5255,0.7085
736,Ärzte,-4.486036,2.083853e-04,"{'1%': -3.474714913481481, '5%': -2.8810087081...",-4.323685,4.040804e-04,"{'1%': -3.4753253063120644, '5%': -2.881274703...",0.6931,0.6019,0.7662,0.5954,0.4606,0.4579,0.3696
737,Öl,-2.982027,3.660724e-02,"{'1%': -3.476273058920005, '5%': -2.8816876165...",-2.575347,9.820030e-02,"{'1%': -3.4769274060112707, '5%': -2.881972632...",0.0017,0.0129,0.0360,0.0332,0.0051,0.0082,0.0124
738,Österreich,-3.359066,1.243600e-02,"{'1%': -3.4744158894942156, '5%': -2.880878382...",-12.410791,4.348386e-23,"{'1%': -3.473829775724492, '5%': -2.8806228997...",0.5494,0.7045,0.7955,0.8864,0.8307,0.7202,0.7702
