# Create Granger DataFrame

## Imports

In [7]:
# Pandas is an open source data analysis and manipulation tool
import pandas as pd
#adf test
from statsmodels.tsa.stattools import adfuller
#granger causality test
from statsmodels.tsa.stattools import grangercausalitytests

## Load DataFrame

In [5]:
# Load the combined occurrence CSV, normalise the column dtypes and add the
# per-keyword rolling-window columns.
# NOTE: set_correct_datatypes and smoothen_timeseries are defined in the
# "Helper" section further down, so that cell must be executed before this one.
combined_csv_path = "/Users/jan/Documents/Python_Projects/Bachelorthesis/Analysis/DataFrames/Combined_Dataframe_11042022.csv"
Occurence_df = (
    pd.read_csv(combined_csv_path)
    .pipe(set_correct_datatypes)
    .groupby("KeyWord")
    .apply(smoothen_timeseries)
)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 110380 entries, 0 to 110379
Data columns (total 14 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   KeyWord                                       110380 non-null  object        
 1   date                                          110380 non-null  datetime64[ns]
 2   Occurence_in_News                             110380 non-null  int64         
 3   normalized_Occurence_in_News                  110380 non-null  float64       
 4   Occurence_in_Google                           110380 non-null  int64         
 5   normalized_Occurence_in_Google                110380 non-null  float64       
 6   Occurence_in_Wikipedia                        110380 non-null  int64         
 7   normalized_Occurence_in_Wikipedia             110380 non-null  float64       
 8   smoothened_normalized_Occurence_in_News       105949 n

## Helper

In [13]:
#save dataframe as csv
def saveDF_as_CSV(dataframe, filename,
                  directory="/Users/jan/Documents/Python_Projects/Bachelorthesis/Analysis/DataFrames/"):
    """Write ``dataframe`` to ``<directory>/<filename>.csv`` without the index.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to persist.
    filename : str
        File name without the ``.csv`` extension; the extension is appended here.
    directory : str, optional
        Target directory. Defaults to the project's DataFrames folder, so
        existing two-argument calls write to the same location as before.
    """
    # Function-scope import keeps this helper self-contained.
    import os

    dataframe.to_csv(os.path.join(directory, filename + ".csv"), index=False)

def smoothen_timeseries(dataframe, window_size=7):
    """Return a copy of ``dataframe`` with rolling-sum columns added.

    For each of the six occurrence columns (raw and normalized counts for
    News, Google and Wikipedia) a column named ``"smoothened_" + <column>`` is
    added. It holds the sum over a trailing window of ``window_size`` rows, in
    the frame's existing row order. The first ``window_size - 1`` rows of every
    new column are NaN because the window is not yet full.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the six occurrence columns listed below.
    window_size : int, default 7
        Number of rows per rolling window.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left unmodified.
    """
    normalized_Occurence_column_names = ["normalized_Occurence_in_News", "normalized_Occurence_in_Google",
                                         "normalized_Occurence_in_Wikipedia"]
    Occurence_column_names = ["Occurence_in_News", "Occurence_in_Google", "Occurence_in_Wikipedia"]
    Column_names = normalized_Occurence_column_names + Occurence_column_names

    # Write the new columns onto the copy, not onto the caller's frame, so the
    # function is side-effect free (important when used with groupby().apply()).
    smoothed = dataframe.copy()
    for column_name in Column_names:
        # Note: this is a rolling *sum*, not a rolling mean.
        smoothed["smoothened_" + column_name] = dataframe[column_name].rolling(window_size).sum()
    return smoothed

def set_correct_datatypes(dataframe):
    """Cast ``KeyWord`` to ``str`` and parse ``date`` as ``YYYY-MM-DD`` datetimes.

    The frame is modified in place and also returned, so both
    ``set_correct_datatypes(df)`` and ``df = set_correct_datatypes(df)`` work.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns ``KeyWord`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        The same frame object with both columns converted.
    """
    # astype returns a new Series; the result must be assigned back, otherwise
    # the conversion is silently discarded.
    dataframe["KeyWord"] = dataframe["KeyWord"].astype(str)
    dataframe["date"] = pd.to_datetime(dataframe["date"], format='%Y-%m-%d')
    return dataframe



## Create DataFrame

This cell takes the time series of each keyword from the Occurence DataFrame, performs the ADF test and the Granger causality test on it, and saves all results in a DataFrame.

!!! Data is not corrected for stationarity !!!

In [38]:
# Keep only the columns needed for the News -> Wikipedia Granger test and index by date.
granger_test_df = Occurence_df[["KeyWord", "date", "Occurence_in_News", "Occurence_in_Wikipedia"]]
granger_test_df = granger_test_df.set_index("date")

# Daily calendar from the first to the last date in the data; every keyword is
# reindexed onto it so that days without a row are filled with 0.
date_range = pd.date_range(granger_test_df.index.min(), granger_test_df.index.max(), freq='D')

return_list = []

# grangercausalitytests tests whether the SECOND column Granger-causes the FIRST.
News_grangerCauses_Wikipedia = ['Occurence_in_Wikipedia', 'Occurence_in_News']
Wikipedia_grangerCauses_News = ['Occurence_in_News', 'Occurence_in_Wikipedia']

grouped_by_keyword = granger_test_df.groupby("KeyWord")

not_testable_keywords = []

# Loop-invariant test settings.
number_of_lags = 7
test_name = 'ssr_ftest'
# Position of the p-value inside each test's result tuple, and the label used
# for it in the output column names.
column = 1
column_label = "pValue"

for keyWord, grouped_keyWord in grouped_by_keyword:
    print(keyWord)
    try:
        return_dict = {"keyword_name": keyWord}

        # Add missing dates. Only the two numeric series are reindexed so the
        # zero fill value is not written into the KeyWord column.
        timeseries = grouped_keyWord[News_grangerCauses_Wikipedia].copy()
        timeseries.index = pd.DatetimeIndex(timeseries.index)
        timeseries = timeseries.reindex(date_range, fill_value=0)

        # ADF test (the data itself is NOT corrected for stationarity).
        adf_test_result_wikipedia = adfuller(timeseries['Occurence_in_Wikipedia'].values)
        adf_test_result_news = adfuller(timeseries['Occurence_in_News'].values)

        # Save ADF test results: [0] statistic, [1] p-value, [4] critical values.
        # wikipedia
        return_dict["ADF_Statistics_wiki"] = adf_test_result_wikipedia[0]
        return_dict["ADF_p_value_wiki"] = adf_test_result_wikipedia[1]
        return_dict["ADF_critical_values_wiki"] = adf_test_result_wikipedia[4]
        # news
        return_dict["ADF_Statistics_news"] = adf_test_result_news[0]
        return_dict["ADF_p_value_news"] = adf_test_result_news[1]
        return_dict["ADF_critical_values_news"] = adf_test_result_news[4]

        granger_test_result = grangercausalitytests(timeseries[News_grangerCauses_Wikipedia],
                                                    maxlag=number_of_lags)

        # The result dict is keyed by lag, and lags are not zero based (1..number_of_lags).
        for lag in range(1, number_of_lags + 1):
            p_value = round(granger_test_result[lag][0][test_name][column], 4)
            return_dict[f"{test_name}_{column_label}_lag{lag}"] = p_value

        return_list.append(return_dict)

    except Exception as error:
        # Best effort: record keywords whose series cannot be tested and go on.
        # Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt
        # working, and printing the error keeps the failure reason visible.
        print(f"Error! {keyWord}: {error!r}")
        not_testable_keywords.append(keyWord)

df = pd.DataFrame(return_list)

saveDF_as_CSV(df, "News_grangerCauses_Wikipedia___NOT_Stationary_DataFrame")

2G

Granger Causality
number of lags (no zero) 1
ssr based F test:         F=1.9675  , p=0.1628  , df_denom=150, df_num=1
ssr based chi2 test:   chi2=2.0068  , p=0.1566  , df=1
likelihood ratio test: chi2=1.9938  , p=0.1579  , df=1
parameter F test:         F=1.9675  , p=0.1628  , df_denom=150, df_num=1

Granger Causality
number of lags (no zero) 2
ssr based F test:         F=3.1741  , p=0.0447  , df_denom=147, df_num=2
ssr based chi2 test:   chi2=6.5640  , p=0.0376  , df=2
likelihood ratio test: chi2=6.4263  , p=0.0402  , df=2
parameter F test:         F=3.1741  , p=0.0447  , df_denom=147, df_num=2

Granger Causality
number of lags (no zero) 3
ssr based F test:         F=10.4425 , p=0.0000  , df_denom=144, df_num=3
ssr based chi2 test:   chi2=32.8504 , p=0.0000  , df=3
likelihood ratio test: chi2=29.7232 , p=0.0000  , df=3
parameter F test:         F=10.4425 , p=0.0000  , df_denom=144, df_num=3

Granger Causality
number of lags (no zero) 4
ssr based F test:         F=10.4953 , p=0.000