# Dynamic Panel regressions

## Section 1: Load modules

In [None]:
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from joblib import dump
import os
from joblib import load

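Define the sentiment measure: run exactly one of the following cells (move the desired cell to the end, or execute only that cell), because each cell overwrites `sent_choice_pos` and `sent_choice_neg`.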

In [None]:
# Filtered sentiment ratio.
# sent_choice_pos: prefix of the score columns to load.
# sent_choice_neg: prefix of columns to skip ("x" is a placeholder).
sent_choice_pos = "Sent_ratio_filt_"
sent_choice_neg = "x"

In [None]:
# Positive confidence in sentiment ("Y" is a placeholder exclusion prefix)
sent_choice_pos = "Sent_conf_pos_"
sent_choice_neg = "Y"

In [None]:
# Filtered average sentiment ("Y" is a placeholder exclusion prefix)
sent_choice_pos = "Sent_avg_filt_"
sent_choice_neg = "Y"

In [None]:
# Filtered positive sentiment ("Y" is a placeholder exclusion prefix)
sent_choice_pos = "Sent_pos_filt_"
sent_choice_neg = "Y"

In [None]:
# Average confidence, negative variant (columns prefixed "Sent_avg_conf_neg_")
# NOTE(review): confirm the exact meaning of this measure against the scores file.
sent_choice_pos = "Sent_avg_conf_neg_"
sent_choice_neg = "Y"

In [None]:
# Positive confidence in sentiment.
# NOTE(review): this cell used to be labelled "negative confidence" although it
# selects the "Sent_conf_pos_" columns; the negative variant uses "Sent_conf_neg_".
sent_choice_pos = "Sent_conf_pos_"
sent_choice_neg = "Y"

In [None]:
# Unfiltered sentiment ratio; the filtered columns share the prefix and are
# therefore excluded explicitly.
# NOTE(review): columns starting with "Sent_ratio_conf_" also match the
# "Sent_ratio_" prefix and are NOT excluded here - confirm whether they should be.
sent_choice_pos = "Sent_ratio_"
sent_choice_neg = "Sent_ratio_filt"

In [None]:
# Negative confidence in sentiment ("Y" is a placeholder exclusion prefix)
sent_choice_pos = "Sent_conf_neg_"
sent_choice_neg = "Y"

In [None]:
# Define which sentiment score should be loaded

# Unfiltered negative sentiment; the filtered columns share the prefix and are
# therefore excluded explicitly.
sent_choice_pos = "Sent_neg_"
sent_choice_neg = "Sent_neg_filt"

In [None]:
# Filtered negative sentiment ("Y" is a placeholder exclusion prefix)
sent_choice_pos = "Sent_neg_filt_"
sent_choice_neg = "Y"

In [None]:
# Unfiltered average sentiment; the filtered columns share the prefix and are
# therefore excluded explicitly.
# NOTE(review): columns starting with "Sent_avg_conf_" also match the
# "Sent_avg_" prefix and are NOT excluded here - confirm whether they should be.
sent_choice_pos = "Sent_avg_"
sent_choice_neg = "Sent_avg_filt_"

In [None]:
# News volume (columns prefixed "News_vol_"); "y" is a placeholder exclusion prefix.
# NOTE(review): this cell used to be labelled "average confidence in sentiment".
sent_choice_pos = "News_vol_"
sent_choice_neg = "y"

In [None]:
# Average confidence in sentiment; the exclusion prefix "Sent_avg_conf_n"
# skips the "Sent_avg_conf_neg_" columns that share the selected prefix.
sent_choice_pos = "Sent_avg_conf_"
sent_choice_neg = "Sent_avg_conf_n"

In [None]:
# Ratio of sentiment confidence (columns prefixed "Sent_ratio_conf_")
sent_choice_pos = "Sent_ratio_conf_"
sent_choice_neg = "Sent_ratio_filt"

In [None]:
# Absolute difference in sentiment confidence ("Y" is a placeholder exclusion prefix)
sent_choice_pos = "Sent_conf_abs_"
sent_choice_neg = "Y"

## Section 1: Transform prices to returns

### Section 1.1: Load data, exploratory data analysis

In [None]:
# Load the price table from Excel; the "Date" column becomes the index and
# every remaining column is one price series (e.g. 'AAPL.O', 'PG', '.SPX').
df_prices = pd.read_excel("../01_Data/01_Eikon/2_Prices/2_Full Stock Prices data/2_WRDS_SP 500 Full stock price.xlsx", index_col="Date")
df_prices.head()

In [None]:
# Plot the price history of three sample series, stacked in a 3x1 subplot grid
for subplot_code, ticker in zip((311, 312, 313), ('AAPL.O', 'PG', '.SPX')):
    plt.subplot(subplot_code)
    df_prices[ticker].plot(figsize=(16, 6), legend=True)

In [None]:
# Calculate simple period-over-period returns for every column.
# pct_change() forward-fills missing prices before differencing in the pandas
# versions this notebook was written for (that default is deprecated in recent
# pandas - NOTE(review): confirm behaviour on the installed version).
# dropna(how="all") removes rows in which every column is NaN.
df_returns=df_prices.pct_change().dropna(how="all")

# Store to excel 
#df_returns.to_excel("../01_Data/10_Modelling/50_Daily_firm_specific_returns.xlsx")

df_returns

In [None]:
# Unconditional mean of the '.SPX' returns for all dates before 2019-09-01
spx_before_cutoff = df_returns.loc[df_returns.index < "2019-09-01", '.SPX']
spx_mean = spx_before_cutoff.mean()

print(spx_mean*100)            # mean return in %
mean_in_bps = spx_mean*10000   # mean return in basis points
print(mean_in_bps)
print(1.8/mean_in_bps)         # 1.8 expressed as a multiple of that mean


In [None]:
# Mean of the '.SPX' returns in %, restricted to dates after 2019-12-31 and before 2020-03-01
print(df_returns[(df_returns.index >"2019-12-31") & (df_returns.index <"2020-03-01")]['.SPX'].mean()*100)

In [None]:
# Mean of the '.SPX' returns in %, restricted to dates after 2020-02-29
print(df_returns[(df_returns.index >"2020-02-29")]['.SPX'].mean()*100)

In [None]:
# Create a new dataframe that contains excess returns over the S&P 500 (market returns).
# The '.SPX' column is subtracted from every column in one index-aligned,
# vectorised step; inserting ~500 columns one at a time into an empty frame
# fragments the DataFrame and is needlessly slow.
df_excess_returns = df_returns.sub(df_returns[".SPX"], axis=0)

# Drop the column holding the S&P returns themselves (market minus market)
df_excess_returns = df_excess_returns.drop(columns=[".SPX"])

df_excess_returns

# Store to excel 
#df_excess_returns.to_excel("../01_Data/10_Modelling/50_Daily_firm_specific_excess_returns.xlsx")

## Section 2: Data transformation 

### Section 2.1: Transform return data to panel

In [None]:
temp=df_excess_returns.reset_index()
temp

In [None]:
# Transform return data to panel data
df_ex_ret_transf = df_excess_returns.reset_index().melt(id_vars='Date',var_name = 'Company', value_name = 'Excess_returns')
df_ex_ret_transf

In [None]:
# Remove rows for companies with NaN Excess_returns (days not in index)
df_ex_ret_transf= df_ex_ret_transf[df_ex_ret_transf['Excess_returns'].notna()]
df_ex_ret_transf

In [None]:
# Set Company and date as multiindex
df_ex_ret_transf = df_ex_ret_transf.set_index(['Company', 'Date'])
df_ex_ret_transf

In [None]:
# Lagged excess returns: for each company (index level 0) shift the return
# series by 1..5 rows and store the result as ER_L1 ... ER_L5
returns_by_company = df_ex_ret_transf.groupby(level=0)['Excess_returns']
for i in range(1, 6):
    df_ex_ret_transf[f'ER_L{i}'] = returns_by_company.shift(i)
df_ex_ret_transf

In [None]:
df_ex_ret_transf=df_ex_ret_transf.reset_index()

In [None]:
# Convert Date to the right format
df_ex_ret_transf.Date=pd.to_datetime(df_ex_ret_transf["Date"]).dt.date
df_ex_ret_transf

### Section 2.2: Transform sentiment data to panel data

In [None]:
# Load sentiment scores.
# Only columns whose name starts with "Date" or with the selected prefix
# (sent_choice_pos) are read; columns starting with sent_choice_neg are skipped,
# which keeps out other measures that share the selected prefix.
df_sp500_scores=pd.read_excel("../01_Data/10_Modelling/32_word2vec_Sentiment Analysis_Semeval_Daily_firm_specific_sentiment_scores.xlsx",\
                              usecols=lambda x: x.startswith(("Date",sent_choice_pos)) and not x.startswith(sent_choice_neg))
df_sp500_scores

In [None]:
# Transform sentiment data to panel data
df_sp500_scores = df_sp500_scores.melt(id_vars='Date',var_name = 'Company', value_name = sent_choice_pos)
df_sp500_scores

In [None]:
# remove Sentiment Descr from company label
df_sp500_scores["Company"]=df_sp500_scores['Company'].str.replace(sent_choice_pos, '')
df_sp500_scores

In [None]:
df_sp500_scores.iloc[:,2]

In [None]:
df_sp500_scores.describe()

In [None]:
df_sp500_scores[sent_choice_pos].plot(figsize=(15,5))

In [None]:
# identify highly negative score 
#df_sp500_scores[df_sp500_scores[sent_choice_pos]>100]["Companies"]

In [None]:
# consider winsorizing the data

"""from scipy.stats.mstats import winsorize

# test winsorising the top 5 % 
df_main[sent_choice_pos]= winsorize(df_main[sent_choice_pos], limits=[0.0001, 0.1],nan_policy="omit")
df_main[sent_choice_pos].plot(figsize=(15,5))"""

In [None]:
# Remove missing values (days without news)
df_sp500_scores=df_sp500_scores.dropna()

In [None]:
# Convert Date to the right format
df_sp500_scores.Date=pd.to_datetime(df_sp500_scores["Date"]).dt.date
df_sp500_scores

In [None]:
""" Laggs through weekends

# Set Company and date as multiindex
df_sp500_scores = df_sp500_scores.set_index(['Company', 'Date'])
df_sp500_scores

# Create lagged returns for each companies' returns
for i in range(1,6):
    df_sp500_scores[sent_choice_pos+'_L'+str(i)] = df_sp500_scores.groupby(level=0)[sent_choice_pos].shift(i)
df_sp500_scores

df_sp500_scores.to_excel("test_score.xlsx")

"""

### Section 2.3: Merge Panel data

In [None]:
# Earlier variant, kept for reference: full outer join on the indexes
#df_main = pd.merge(df_sp500_scores, df_excess_returns, left_index=True, right_index=True, how="outer")

# Left join of the sentiment scores onto the return panel: every
# (Date, Company) row of the returns is kept, and rows without a matching
# score get NaN in the sentiment column.
df_main = pd.merge(df_ex_ret_transf,df_sp500_scores, on=["Date","Company"], how="left")
df_main

In [None]:
# Companies analysed by Ahmad. The filter itself is currently disabled
# (commented out below), so list_comp is defined but not applied here.
list_comp=["AAPL","BA","CVX","F", "GE","HD","HPE","IBM","INTC","JNJ","MRK","MSFT","PFE","VZ", "WMT"]
#df_main=df_main[df_main.Company.isin(list_comp)]
# Dell is not an index constituent anymore

In [None]:
# Set Company and date as multiindex so that lags can be taken per company
df_main = df_main.set_index(['Company', 'Date'])
df_main

# Create lagged SENTIMENT values per company: the selected score shifted by
# 1..5 rows, stored as <sent_choice_pos>L_1 ... <sent_choice_pos>L_5
for i in range(1,6):
    df_main[sent_choice_pos+'L_'+str(i)] = df_main.groupby(level=0)[sent_choice_pos].shift(i)
df_main

# Back to a flat frame with Company and Date as ordinary columns
df_main = df_main.reset_index()

In [None]:
# Check that data is stacked
df_main.iloc[33:340]

In [None]:
#df_main=df_main[df_main.Company!="MCO"]

In [None]:
# store to excel
#df_main.to_excel("../01_Data/10_Modelling/50_Stata_Panel_Dataframe.xlsx")

In [None]:
# Interpolate missing values linearly
#df.interpolate(method="linear", axis=0).ffill().bfill()

### Section 2.4: Set up control variables / dummies

In [None]:
# Weekday abbreviations keyed by pandas' dayofweek code (0=Monday, 6=Sunday)
dict_days = dict(enumerate(["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]))

# Insert control variable for the day of the week of each observation
weekday_codes = pd.to_datetime(df_main.Date).dt.dayofweek
df_main["Weekday"] = weekday_codes.map(dict_days)
df_main

In [None]:
# Monday dummy: 1 for observations falling on a Monday, 0 otherwise
df_main["Monday"] = (df_main["Weekday"] == "Mon").astype("int64")

In [None]:
""""# Insert control variable for January (Tetlock 2007), 
# outcommented as no January in training set 
df_main["January"]=[1 if x==1 else 0 for x in pd.to_datetime(df_main.Date).dt.month]
df_main"""

In [None]:
# Remove weekday column
df_main.drop(columns=["Weekday"],inplace=True)
df_main

In [None]:
# Create Company Dummies (FE Regression)
# Keep a plain-text copy of the label first: get_dummies replaces the
# "Company" column with one indicator column per company.
df_main["Companies"] = df_main["Company"]

# The dummy dtype is pinned to uint8 (the default of older pandas). Since
# pandas 2.0 the default is bool, and a frame mixing float and bool columns is
# converted to an object array, which statsmodels' OLS rejects.
df_main = pd.get_dummies(df_main, columns=["Company"], drop_first=True, dtype=np.uint8)
df_main

In [None]:
# Set index back to Date
df_main= df_main.set_index("Date")
df_main

In [None]:
# Export to csv
#df_main.to_csv("../01_Data/10_Modelling/50_Stata_Panel_Dataframe_incl_dummies.csv")

## Section 3: Dynamic Panel regressions

#### Section 3.1: Train-Test Split

In [None]:
"""from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(df_main, test_size=0.2, random_state=7, shuffle=False)
print(f"{train_set.shape[0]} train and {test_set.shape[0]} test instances")
"""
# use 80% / 20% roughly -> use time series split 

In [None]:
# Split dataset into train and test sets (chronological split, no shuffling).
# The index may hold datetime.date objects (Date was converted with `.dt.date`),
# which cannot be ordered against plain strings, so the dates are converted to
# Timestamps once and all windows are expressed against two named boundaries.
obs_dates = pd.to_datetime(df_main.index)
test_start = pd.Timestamp("2019-10-01")     # first out-of-sample day
corona_start = pd.Timestamp("2020-03-01")   # first day of the corona window

train_set = df_main[obs_dates < test_start]
test_set_pre_corona = df_main[(obs_dates >= test_start) & (obs_dates < corona_start)]
# Inclusive lower bound: previously 2020-03-01 itself belonged to neither the
# pre-corona nor the corona window.
test_set_corona = df_main[obs_dates >= corona_start]
test_set_full = df_main[obs_dates >= test_start]


#test_set_pre_corona=df_main[((df_main.index > "2019-09-30") & (df_main.index < "2020-01-01"))| \
#                            ((df_main.index > '2020-01-31')& (df_main.index < "2020-03-01"))]

#test_set_pre_corona=df_main[(df_main.index > "2019-09-30")&(df_main.index < '2020-01-01')]
#test_set_corona=df_main[df_main.index > '2019-12-31']

In [None]:
train_set.sort_index(ascending=False)

In [None]:
# to excel to check if datasets are filtered correctly
#train_set["Companies"].to_excel("test.xlsx")
#test_set_pre_corona["Companies"].to_excel("test2.xlsx")

#### Section 3.2  Data cleaning and transformation

To build a VAR model, the data needs to be stationary.
We therefore first test for stationarity using the Augmented Dickey-Fuller (ADF) test and the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test.

The test will only be applied on the training data and if necessary any transformations will be applied to both sets.

First, we will test the raw data.

In [None]:
# replace missing data by 0
#train_set=train_set.fillna(0)
train_set

In [None]:
# Define columns
list_columns=["Excess_returns",sent_choice_pos]

In [None]:
from statsmodels.tsa.stattools import adfuller, kpss

# Stationarity check for each company's time series of the selected sentiment measure.

# Transform the table back to one column per company. `pivot` expects column
# labels for `index`; passing the Index object itself only worked by accident
# in old pandas versions. Leaving `index` unset uses the existing (date) index.
train_set_test = train_set.pivot(columns="Companies", values=sent_choice_pos)

# Missing scores are treated as 0 for the tests; fill once, not once per test call.
train_set_filled = train_set_test.fillna(0)

list_results = []
for company in train_set_filled.columns:
    series = train_set_filled[company]
    adf_pval = adfuller(series)[1]   # second element of the result is the p-value
    kpss_pval = kpss(series)[1]      # second element of the result is the p-value
    list_results.append(
        {
            'Company': company,
            'ADF': adf_pval,
            'KPSS': kpss_pval,
        }
    )

df_results = pd.DataFrame(list_results)
df_results
df_results

In [None]:
# Define function to create stationarity flag
def cond_check(df):
    """Return 1 if a series counts as stationary, else 0.

    `df` is one row of the results table with the p-values `ADF` and `KPSS`.
    The series is flagged stationary only when the ADF p-value is below 0.05
    and the KPSS p-value is above 0.05.
    """
    if not (df.ADF < 0.05):
        return 0
    if not (df.KPSS > 0.05):
        return 0
    return 1
    
# Flag every company (1 = stationary, 0 = not stationary)
df_results["stationary_flag"] = df_results.apply(cond_check, axis=1)

# Report how many series failed the criterion
n_failed = df_results.loc[df_results.stationary_flag == 0, "stationary_flag"].count()
print(f"Non-stationary Sentiment Timeseries: {n_failed}")

In [None]:
# Companies whose series failed the stationarity criterion (first five shown)
list_drop = df_results.loc[df_results["stationary_flag"] == 0, "Company"]
list_drop[:5]

In [None]:
train_set.shape

In [None]:
# drop companies that failed the stationary criterion from dataframe
train_set=train_set[~train_set.Companies.isin(list_drop)]
train_set.shape

In [None]:
from statsmodels.tsa.stattools import adfuller, kpss

# Stationarity check for each company's time series of excess returns.

# Transform the table back to one column per company. `pivot` expects column
# labels for `index`; passing the Index object itself only worked by accident
# in old pandas versions. Leaving `index` unset uses the existing (date) index.
train_set_test = train_set.pivot(columns="Companies", values="Excess_returns")

# Missing returns are treated as 0 for the tests; fill once, not once per test call.
train_set_filled = train_set_test.fillna(0)

list_results = []
for company in train_set_filled.columns:
    series = train_set_filled[company]
    adf_pval = adfuller(series)[1]   # second element of the result is the p-value
    kpss_pval = kpss(series)[1]      # second element of the result is the p-value
    list_results.append(
        {
            'Company': company,
            'ADF': adf_pval,
            'KPSS': kpss_pval,
        }
    )

df_results = pd.DataFrame(list_results)
#df_results
#df_results

In [None]:
# Flag every company (1 = stationary, 0 = not stationary) and report the failures
df_results["stationary_flag"] = df_results.apply(cond_check, axis=1)
n_failed = df_results.loc[df_results.stationary_flag == 0, "stationary_flag"].count()
print(f"Non-stationary Return Timeseries: {n_failed}")

In [None]:
# show companies that fail stationary criterium
list_drop=df_results[df_results["stationary_flag"]==0]["Company"]
list_drop[:5]

In [None]:
train_set.shape

Returns should be stationary, as they are already differenced prices. However, some return series fail the tests; the corresponding companies are dropped as well.

In [None]:
# drop companies that failed the stationary criterion from dataframe
train_set=train_set[~train_set.Companies.isin(list_drop)]
train_set.shape

In [None]:
train_set.index.value_counts()

In [None]:
# the dropping of the companies that failed the stationary criterion is performed later 
# in the out of sample performance checks

In [None]:
#df_results.to_excel("1_Results/Stationarity/Stationarity_Test_"+sent_choice_pos+"_results.xlsx")

In [None]:
#train_set[sent_choice_pos].plot(figsize=(16, 4))

Stationarise for avg, abs only by log or differencing.

#### Section 3.2.1  Data Differencing
A log transformation is not possible because the data contain zeros.
Thus, **differencing** is used.

In [None]:
len(train_set['Companies'].unique())

Both tests indicate that all the three series become stationary after first-differencing: the ADF test rejects the null of unit root, and the KPSS test fails to reject the null of stationarity, at the 0.05 significance level. Thus, there is no need to stationarize the data.

#### Drop company column

In [None]:
# drop companies from training datasets
train_set = train_set.drop(['Companies'],axis=1)

#### Section 3.3 Determine the order of the model

In [None]:
#from statsmodels.tsa.vector_ar.var_model import VAR

We can use the `select_order` method to determine the best order: taking the maximum number of lags, the method will build VAR models for each number of lags and output the values of IC for each.

The optimal number of lags are indicated with an asterisk in each column.

In [None]:
#results = VAR(train_set[list_columns]).select_order(maxlags=5)
#results.summary()

All information criteria suggest 5 lags. Let's go for the lag of 5.

### 3.4 Estimate a Dynamic Panel Regression

### 3.4.1 Dynamic Panel Regression for Excess Returns

In [None]:
import statsmodels.api as sm

# Rows lacking the current return or any of its five lags cannot be used
required_return_cols = ["ER_L1","ER_L2","ER_L3","ER_L4","ER_L5","Excess_returns"]
train_set = train_set.dropna(subset=required_return_cols)

# Remaining gaps (the sentiment columns) are set to 0
train_set = train_set.fillna(0)

In [None]:
# Transform the test data the same way as the training data:
# drop rows lacking the current return or any of its five lags, then set the
# remaining gaps to 0.
# The frames are re-assigned instead of modified in place, because they are
# slices of df_main and in-place edits on slices trigger SettingWithCopyWarning.
required_return_cols = ["ER_L1","ER_L2","ER_L3","ER_L4","ER_L5","Excess_returns"]

test_set_pre_corona = test_set_pre_corona.dropna(subset=required_return_cols).fillna(0)

# same for corona testset
test_set_corona = test_set_corona.dropna(subset=required_return_cols).fillna(0)

# same for full testset
test_set_full = test_set_full.dropna(subset=required_return_cols).fillna(0)

In [None]:
train_set

In [None]:
#x.January.sum()

In [None]:
# Excess returns regression: y is multiplied by 100 so that the coefficients
# are scaled by 100 as well (as Ahmad) / same effect as expressing returns in %.
# x holds every remaining column except the dependent variable and the
# contemporaneous sentiment score.
# NOTE(review): x contains no constant column and the company dummies were
# created with drop_first=True; sm.OLS does not add an intercept on its own,
# so the omitted company's effect is fixed at zero - confirm this is intended.
y=train_set["Excess_returns"]*100
x=train_set.drop(['Excess_returns',sent_choice_pos], axis=1)

In [None]:
x.iloc[:,5:10]

In [None]:
# Standard Regression
# take simplest approach if there is no difference
model_ER_plain=sm.OLS(y, x).fit()

# Print the model's summary
print(model_ER_plain.summary())

In [None]:
# Save company coefficients to excel.
# Column 0 holds the coefficients, column 1 the t-values.
dummy_prefix = "Company_"
df_dummy_coef = pd.concat((model_ER_plain.params, model_ER_plain.tvalues), axis=1)

# Select the company dummies by name rather than by a hard-coded row offset, so
# the selection stays correct if regressors are added or removed.
is_company_dummy = df_dummy_coef.index.str.startswith(dummy_prefix)
df_dummy_coef = df_dummy_coef[is_company_dummy]

# Strip the dummy prefix so the index holds the bare company label
df_dummy_coef.index = df_dummy_coef.index.str[len(dummy_prefix):]
df_dummy_coef = df_dummy_coef.sort_values(by=1)

# The renamed copy is only used for the export; df_dummy_coef keeps columns 0/1
df_dummy_coef.rename(columns={0: 'beta', 1: 't'}).to_excel('ER_reg_Summary_results.xls', sheet_name='sheet1')
df_dummy_coef

In [None]:
# Load the news-coverage quantile of each company and attach it to the
# company coefficients (coefficient index matched against the "Company" column)
df_decils=pd.read_excel("RIC and News Quantiles.xlsx",index_col="Unnamed: 0")
df_analyse=df_dummy_coef.merge(df_decils, how="left", left_index=True, right_on="Company")
# Columns 0 / 1 are the coefficient and its t-value
df_analyse.rename(columns={0:"Coeff", 1:"t-stat"}, inplace=True)
# Drop rows with any missing value (e.g. companies without a quantile)
df_analyse.dropna(inplace=True)
df_analyse.head()

In [None]:
(df_analyse.News_Quantile.unique())

In [None]:
from scipy import stats

# Matrix of p-values from two-sample t-tests comparing the company
# coefficients of every ordered pair of news-coverage quantiles
quantile_levels = np.sort(df_analyse.News_Quantile.unique())

df_res = pd.DataFrame()
for c1 in quantile_levels:
    coeff_c1 = df_analyse.loc[df_analyse.News_Quantile == c1, "Coeff"]
    for c2 in quantile_levels:
        coeff_c2 = df_analyse.loc[df_analyse.News_Quantile == c2, "Coeff"]
        # element 1 of the test result is the p-value
        df_res.loc[c1, c2] = stats.ttest_ind(coeff_c1, coeff_c2)[1]
df_res

In [None]:
df_res.to_excel("1_Results/Table of p_vals by media coverage.xlsx")

In [None]:
df_industry=pd.read_excel("RIC and Industry Sector 2.xlsx")
df_analyse=df_analyse.merge(df_industry,how="left",left_on="Company",right_on="RIC")
df_analyse.head()

In [None]:
table_cross=pd.pivot_table(df_analyse, values=['Company'], index=['ICB Sector'],columns=['News_Quantile'],\
               aggfunc="count")
table_cross.to_excel("Counts industries by coverage.xlsx")
table_cross



In [None]:
# Matrix of p-values from two-sample t-tests comparing the company
# coefficients of every ordered pair of ICB sectors.
# Companies without a sector (NaN after the left merge) are left out: a NaN
# mixed with string labels cannot be sorted and would never match `==` anyway.
sector_levels = np.sort(df_analyse["ICB Sector"].dropna().unique())

df_res = pd.DataFrame()
for a in sector_levels:
    coeff_a = df_analyse.loc[df_analyse["ICB Sector"] == a, "Coeff"]
    for b in sector_levels:
        coeff_b = df_analyse.loc[df_analyse["ICB Sector"] == b, "Coeff"]
        # element 1 of the test result is the p-value
        df_res.loc[a, b] = stats.ttest_ind(coeff_a, coeff_b)[1]
df_res

In [None]:
df_res.to_excel("1_Results/Table of p_vals by industry.xlsx")

In [None]:
# Report t-stats: same pairwise sector comparison, storing the t statistic.
# Companies without a sector (NaN after the left merge) are left out: a NaN
# mixed with string labels cannot be sorted and would never match `==` anyway.
sector_levels = np.sort(df_analyse["ICB Sector"].dropna().unique())

df_res = pd.DataFrame()
for a in sector_levels:
    coeff_a = df_analyse.loc[df_analyse["ICB Sector"] == a, "Coeff"]
    for b in sector_levels:
        coeff_b = df_analyse.loc[df_analyse["ICB Sector"] == b, "Coeff"]
        # element 0 of the test result is the t statistic
        df_res.loc[a, b] = stats.ttest_ind(coeff_a, coeff_b)[0]
df_res

In [None]:
# Average the numeric columns per news-coverage quantile. numeric_only=True
# skips the text columns explicitly; pandas >= 2.0 raises on them instead of
# silently dropping them.
df_analyse = df_analyse.groupby("News_Quantile").mean(numeric_only=True)

In [None]:
df_analyse.to_excel("Coefficients by media coverage.xlsx")

In [None]:
# Conduct Regression analysis with Newey and West Standard errors 
#- robust to heteroskedasticity and autocorrelation up to five lags
#model_ER = sm.OLS(y, x).fit()
model_ER=sm.OLS(y, x).fit(cov_type='nw-panel',cov_kwds={'maxlags':5,'time':train_set.index})

# Print the model's summary
print(model_ER.summary())

In [None]:
#print(model_ER.summary2().as_text())

In [None]:
#model_ER.summary(fit)$coefficients[,4]

In [None]:
# produces same results
#model_ER2=model_ER.get_robustcov_results(cov_type='hac-panel',time=train_set.index,maxlags=5)
#print(model_ER2.summary())

In [None]:
# Conduct Regression analysis with Huber-White std errors 
#- robust to heteroskedasticity
model_ER_HW=sm.OLS(y, x).fit(cov_type='HC0')

# Print the model's summary
print(model_ER_HW.summary())

### 3.4.2 Dynamic Panel Regression for News Sentiment

In [None]:
# Sentiment regression
y_sent=train_set[sent_choice_pos]*100
x_sent=train_set.drop(['Excess_returns',sent_choice_pos], axis=1)

In [None]:
# Conduct Regression analysis 
#model_sent = sm.OLS(y_sent, x_sent).fit()
model_sent_plain=sm.OLS(y_sent, x_sent).fit()

# Print the model's summary
print(model_sent_plain.summary())

In [None]:
# Conduct Regression analysis 
#model_sent = sm.OLS(y_sent, x_sent).fit()
model_sent=sm.OLS(y_sent, x_sent).fit(cov_type='nw-panel',cov_kwds={'maxlags':5,'time':train_set.index})

# Print the model's summary
print(model_sent.summary())

In [None]:
# Conduct Regression analysis 
#model_sent = sm.OLS(y_sent, x_sent).fit()
model_sent_HW=sm.OLS(y_sent, x_sent).fit(cov_type='HC0')

# Print the model's summary
print(model_sent_HW.summary())

In [None]:
#https://github.com/mwburke/stargazer/blob/master/examples.ipynb
from stargazer.stargazer import Stargazer, LineLocation

stargazer = Stargazer([model_ER_plain, model_sent_plain,model_ER,model_sent,model_ER_HW,model_sent_HW])
stargazer.title(sent_choice_pos)
stargazer.custom_columns(['Excess returns', 'Sentiment',\
                          'NW: Excess returns', 'NW: Sentiment',\
                          'HW: Excess returns', 'HW: Sentiment'], [1, 1,1,1,1,1])
stargazer.show_model_numbers(False)
#stargazer.significant_digits(2)
list_relevant=['ER_L1', 'ER_L2', 'ER_L3', 'ER_L4', 'ER_L5',sent_choice_pos+'L_1',sent_choice_pos+'L_2',\
              sent_choice_pos+'L_3',sent_choice_pos+'L_4',sent_choice_pos+'L_5',"Monday"]
stargazer.covariate_order(list_relevant)

#stargazer.rename_covariates({'Age': 'Oldness'})

stargazer.show_degrees_of_freedom(False)
#stargazer.add_custom_notes(['First note', 'Second note'])
stargazer.add_line('Company dummies', ['Yes', 'Yes','Yes','Yes','Yes','Yes'])
#stargazer.add_line('Preferred', ['No', 'Yes'], LineLocation.FOOTER_TOP)

#stargazer.significance_levels([0.1, 0.05, 0.07])
#stargazer.append_notes(False)

stargazer

In [None]:
from io import StringIO

# Assign the table data to a Pandas dataframe.
# read_html expects a path, URL or file-like object; passing the HTML text
# directly is deprecated in recent pandas, so it is wrapped in StringIO.
table = pd.read_html(StringIO(stargazer.render_html()))
# Keep the first parsed table without its first two rows
table = table[0][2:]

In [None]:
table

In [None]:
df_table=pd.DataFrame(table)
df_table.to_excel("1_Results/Train_Regression results_"+sent_choice_pos+".xlsx")

In [None]:
# Refit the excess-returns regression on the unscaled dependent variable
# (no factor 100), overwriting model_ER_plain; presumably so that later
# predictions are in the same units as the raw returns - confirm.
y=train_set["Excess_returns"]
x=train_set.drop(['Excess_returns',sent_choice_pos], axis=1)

# Standard Regression
# take simplest approach if there is no difference
model_ER_plain=sm.OLS(y, x).fit()

# Print the model's summary
print(model_ER_plain.summary())

# Sentiment regression on the unscaled sentiment score, same regressors
y_sent=train_set[sent_choice_pos]
x_sent=train_set.drop(['Excess_returns',sent_choice_pos], axis=1)


# Conduct Regression analysis 
#model_sent = sm.OLS(y_sent, x_sent).fit()
model_sent_plain=sm.OLS(y_sent, x_sent).fit()

# Print the model's summary
print(model_sent_plain.summary())

# 4. Evaluate the model on test data

A rolling procedure (forecast only the next day, retrain the model, then forecast again) takes too long to run.
Thus, we decided to conduct the out-of-sample testing with the basic model only.

### Calculate accuracy measures

In [None]:
def get_mda_orig(y, yhat):
    """Share of steps on which `y` and `yhat` move in the same direction.

    Both inputs are first-differenced and reduced to the sign of each change;
    the result is the number of positions where the two signs agree, divided
    by the number of changes in `y`. Modelled on Mean Directional Accuracy:
    https://www.wikiwand.com/en/Mean_Directional_Accuracy
    """
    actual_direction = np.sign(np.diff(y))
    predicted_direction = np.sign(np.diff(yhat))

    n_agreements = np.sum(actual_direction == predicted_direction)
    return n_agreements/actual_direction.shape[0]


from sklearn.metrics import mean_squared_error

**Baseline**

Pre Corona

In [None]:
# Turn the index of the two sub-period test sets back into an ordinary column.
# NOTE(review): test_set_full is not reset here - confirm this is intended.
test_set_pre_corona = test_set_pre_corona.reset_index()
test_set_corona = test_set_corona.reset_index()
test_set_pre_corona.head()

In [None]:
# Use the mean training-period return of each company as baseline forecast.
# The training window is rebuilt from df_main because the cleaned train_set
# no longer carries the 'Companies' label column.
# The dates are converted to Timestamps before comparing: the index may hold
# datetime.date objects, which cannot be ordered against plain strings.
obs_dates = pd.to_datetime(df_main.index)
train_set = df_main[obs_dates < pd.Timestamp("2019-10-01")]
# NOTE(review): this rebuilt train_set contains every company in df_main,
# including those removed earlier by the stationarity screening - confirm
# whether they should be excluded here as well.

# calculate mean by company
comp_mean = train_set.groupby('Companies')["Excess_returns"].mean()
comp_mean = comp_mean.reset_index().rename(columns={"Excess_returns": "mean_return"})

comp_mean

In [None]:
# Attach each company's training-period mean return ("mean_return") to the
# pre-corona test set. The inner join keeps only companies present in comp_mean.
# NOTE(review): comp_mean is computed from all companies in df_main, so this
# join does not by itself remove the companies that failed the stationarity
# tests - confirm.
test_set_pre_corona=test_set_pre_corona.merge(comp_mean, how="inner", left_on="Companies",right_on="Companies")

# same for the corona test set
test_set_corona=test_set_corona.merge(comp_mean, how="inner", left_on="Companies",right_on="Companies")

# same for the full test set; merging on columns discards the existing index
# and returns a fresh integer index
test_set_full=test_set_full.merge(comp_mean, how="inner", left_on="Companies",right_on="Companies")

In [None]:
test_set_pre_corona.head()

In [None]:
test_set_corona.head()

In [None]:
# double-check calculation
#test_set_pre_corona.loc["A"]["Excess_returns"].mean()

In [None]:
# find predom sign in training set
np.sign(train_set.Excess_returns).sum()

np.sign() returns -1, 0 or 1 depending on the sign of each value, so the sum of these values reveals the predominant sign. As the calculated sum is positive, we conclude that the predominant sign is positive.

In [None]:
# show only positive
np.sign(train_set.Excess_returns)[np.sign(train_set.Excess_returns)>0]

In [None]:
# show only negative
np.sign(train_set.Excess_returns)[np.sign(train_set.Excess_returns)<0]

In [None]:
# double check computation
#test_set_pre_corona.loc["A"]["return_sign"].sum()

In [None]:
test_set_pre_corona["Excess_returns"]

## Calculate RMSE / MDA per company and average score for the full test set by industry

In [None]:
# Load the company -> industry sector lookup that is joined to the test set below
df_industry=pd.read_excel("RIC and Industry Sector 2.xlsx")
df_industry.head()

In [None]:
test_set_full.shape

In [None]:
#join data on test set
test_set_full=test_set_full.merge(df_industry,how="left",left_on="Companies",right_on="RIC")
test_set_full.shape

In [None]:
test_set_full.Companies

In [None]:
test_set_full.drop_duplicates(subset="Companies")["ICB Sector"].value_counts()

In [None]:
test_set_full[test_set_full["ICB Sector"].isnull()]["Companies"].value_counts()

In [None]:
df_industry_results=pd.DataFrame()
df_industry_results["Industry"]=test_set_full["ICB Sector"].unique()
df_industry_results

In [None]:
# Return MDA of three baseline forecasts, one score per ICB sector.
# The scores are built with comprehensions so that no loop variable named `x`
# overwrites the regression design matrix `x` in the notebook namespace.
sector_labels = test_set_full["ICB Sector"].unique()
# one sub-frame per sector, in the same order as sector_labels
sector_frames = [test_set_full[test_set_full["ICB Sector"] == label] for label in sector_labels]

# mean baseline: forecast = training-period mean return of the company
list_scores = [get_mda_orig(frame["Excess_returns"], frame["mean_return"])
               for frame in sector_frames]
df_industry_results["MDA_mean"] = list_scores

# persistence baseline: forecast = previous observation (ER_L1)
list_scores = [get_mda_orig(frame["Excess_returns"], frame["ER_L1"])
               for frame in sector_frames]
df_industry_results["MDA_pers"] = list_scores

# positive baseline: a strictly increasing dummy forecast, so that every
# predicted change has a positive sign
list_scores = [get_mda_orig(frame["Excess_returns"], np.arange(frame["mean_return"].shape[0]))
               for frame in sector_frames]
df_industry_results["MDA_pos"] = list_scores
df_industry_results

In [None]:
# RMSE of the baseline forecasts, one score per ICB sector.
# The scores are built with comprehensions so that no loop variable named `x`
# overwrites the regression design matrix `x` in the notebook namespace.
sector_labels = test_set_full["ICB Sector"].unique()
# one sub-frame per sector, in the same order as sector_labels
sector_frames = [test_set_full[test_set_full["ICB Sector"] == label] for label in sector_labels]

# mean baseline
list_scores = [np.sqrt(mean_squared_error(frame["Excess_returns"], frame["mean_return"]))
               for frame in sector_frames]
df_industry_results["RMSE_mean"] = list_scores

# persistence baseline
list_scores = [np.sqrt(mean_squared_error(frame["Excess_returns"], frame["ER_L1"]))
               for frame in sector_frames]
df_industry_results["RMSE_pers"] = list_scores
df_industry_results

In [None]:
test_set_full.head()

In [None]:
# split dependent and independent variables
x_test_full=test_set_full.drop(['Excess_returns',\
                                       sent_choice_pos,"mean_return","RIC","ICB Sector",'Companies'], axis=1)

y_test_full=test_set_full["Excess_returns"]

In [None]:
# predict all values straight
y_hat_full = model_ER_plain.predict(x_test_full)
y_hat_full

In [None]:
#include in orgi datafram
test_set_full["y_hat"]=y_hat_full

In [None]:
# MDA of the regression forecast (y_hat), one score per ICB sector.
# A comprehension is used so that no loop variable named `x` overwrites the
# regression design matrix `x` in the notebook namespace.
sector_labels = test_set_full["ICB Sector"].unique()
sector_frames = [test_set_full[test_set_full["ICB Sector"] == label] for label in sector_labels]

list_scores = [get_mda_orig(frame["Excess_returns"], frame["y_hat"])
               for frame in sector_frames]

df_industry_results["MDA_dynp"] = list_scores
#df_industry_results.head()

In [None]:
# RMSE of the regression forecast (y_hat), one score per ICB sector.
# A comprehension is used so that no loop variable named `x` overwrites the
# regression design matrix `x` in the notebook namespace.
sector_labels = test_set_full["ICB Sector"].unique()
sector_frames = [test_set_full[test_set_full["ICB Sector"] == label] for label in sector_labels]

list_scores = [np.sqrt(mean_squared_error(frame["Excess_returns"], frame["y_hat"]))
               for frame in sector_frames]

df_industry_results["RMSE_dynp"] = list_scores

In [None]:
df_industry_results

In [None]:
df_industry_results.to_excel("1_Results/OOS_Industry performance.xlsx")

## Calculate RMSE / MDA per company and average score for the full test set by media coverage

In [None]:
# Load the company -> news-coverage quantile lookup that is joined to the test set below
df_decils=pd.read_excel("RIC and News Quantiles.xlsx",index_col="Unnamed: 0")
df_decils.head()

In [None]:
test_set_full.shape

In [None]:
#join data on test set
test_set_full=test_set_full.merge(df_decils, how="left", left_on="Companies", right_on="Company")
test_set_full.shape

In [None]:
test_set_full[test_set_full["News_Quantile"].isnull()]["Companies"].value_counts()

In [None]:
test_set_full.drop_duplicates(subset="Company")["News_Quantile"].value_counts()
test_set_full.dropna(subset=["News_Quantile"], inplace=True)

In [None]:
df_decils_results=pd.DataFrame()
df_decils_results["News_Quantile"]=test_set_full["News_Quantile"].unique()

In [None]:
# Return MDA of three baseline forecasts, one score per news-coverage quantile.
# The scores are built with comprehensions so that no loop variable named `x`
# overwrites the regression design matrix `x` in the notebook namespace.
quantile_labels = test_set_full["News_Quantile"].unique()
# one sub-frame per quantile, in the same order as quantile_labels
quantile_frames = [test_set_full[test_set_full["News_Quantile"] == label] for label in quantile_labels]

# mean baseline: forecast = training-period mean return of the company
list_scores = [get_mda_orig(frame["Excess_returns"], frame["mean_return"])
               for frame in quantile_frames]
df_decils_results["MDA_mean"] = list_scores

# persistence baseline: forecast = previous observation (ER_L1)
list_scores = [get_mda_orig(frame["Excess_returns"], frame["ER_L1"])
               for frame in quantile_frames]
df_decils_results["MDA_pers"] = list_scores

# positive baseline: a strictly increasing dummy forecast, so that every
# predicted change has a positive sign
list_scores = [get_mda_orig(frame["Excess_returns"], np.arange(frame["mean_return"].shape[0]))
               for frame in quantile_frames]
df_decils_results["MDA_pos"] = list_scores
df_decils_results

In [None]:
test_set_full["News_Quantile"].isna().sum()

In [None]:
# RMSE of the baseline forecasts, one score per news-coverage quantile.
# The scores are built with comprehensions so that no loop variable named `x`
# overwrites the regression design matrix `x` in the notebook namespace.
quantile_labels = test_set_full["News_Quantile"].unique()
# one sub-frame per quantile, in the same order as quantile_labels
quantile_frames = [test_set_full[test_set_full["News_Quantile"] == label] for label in quantile_labels]

# mean baseline
list_scores = [np.sqrt(mean_squared_error(frame["Excess_returns"], frame["mean_return"]))
               for frame in quantile_frames]
df_decils_results["RMSE_mean"] = list_scores

# persistence baseline
list_scores = [np.sqrt(mean_squared_error(frame["Excess_returns"], frame["ER_L1"]))
               for frame in quantile_frames]
df_decils_results["RMSE_pers"] = list_scores
df_decils_results

In [None]:
test_set_full.head()

In [None]:
# split dependent and independent variables
x_test_full=test_set_full.drop(['Excess_returns',\
                                       sent_choice_pos,"mean_return","RIC","ICB Sector",'Companies',"Company","News_Quantile","y_hat"], axis=1)

y_test_full=test_set_full["Excess_returns"]

In [None]:
# predict all values straight
y_hat_full = model_ER_plain.predict(x_test_full)
y_hat_full

In [None]:
# include in orginal dataframe
test_set_full["y_hat"]=y_hat_full

In [None]:
# MDA of the regression forecast (y_hat), one score per news-coverage quantile.
# A comprehension is used so that no loop variable named `x` overwrites the
# regression design matrix `x` in the notebook namespace.
quantile_labels = test_set_full["News_Quantile"].unique()
quantile_frames = [test_set_full[test_set_full["News_Quantile"] == label] for label in quantile_labels]

list_scores = [get_mda_orig(frame["Excess_returns"], frame["y_hat"])
               for frame in quantile_frames]

df_decils_results["MDA_dynp"] = list_scores
#df_industry_results.head()

In [None]:
# RMSE of the model predictions ("y_hat"), computed per news quantile
list_scores = []
for quantile in test_set_full["News_Quantile"].unique():
    quantile_rows = test_set_full[test_set_full["News_Quantile"] == quantile]
    mse = mean_squared_error(quantile_rows["Excess_returns"], quantile_rows["y_hat"])
    list_scores.append(np.sqrt(mse))

df_decils_results["RMSE_dynp"] = list_scores

In [None]:
# Display the per-quantile result table
df_decils_results

In [None]:
# Write the per-quantile result table to an Excel file in the 1_Results folder
df_decils_results.to_excel("1_Results/OOS_News coverage performance.xlsx")

# Calculate RMSE / MDA per company and average score for the full test set

In [None]:
# MDA of the three baselines on the full test set.
# Each score is computed per company first and then averaged over companies.

# One sub-frame per company, in the order returned by unique()
company_frames = [
    test_set_full[test_set_full["Companies"] == company]
    for company in test_set_full["Companies"].unique()
]

# Mean baseline
list_scores = [
    get_mda_orig(frame["Excess_returns"], frame["mean_return"])
    for frame in company_frames
]
mean_bl_mda = np.mean(list_scores)
print(mean_bl_mda)

# Persistence baseline
list_scores = [
    get_mda_orig(frame["Excess_returns"], frame["ER_L1"])
    for frame in company_frames
]
per_bl_mda = np.mean(list_scores)
print(per_bl_mda)

# Positive ("always up") baseline: a strictly increasing sequence as forecast
list_scores = [
    get_mda_orig(frame["Excess_returns"], np.arange(frame["mean_return"].shape[0]))
    for frame in company_frames
]
up_bl_mda = np.mean(list_scores)
print(up_bl_mda)

In [None]:
# RMSE of the mean and persistence baselines on the full test set,
# computed per company and then averaged over companies.

company_frames = [
    test_set_full[test_set_full["Companies"] == company]
    for company in test_set_full["Companies"].unique()
]

# Mean baseline
list_scores = [
    np.sqrt(mean_squared_error(frame["Excess_returns"], frame["mean_return"]))
    for frame in company_frames
]
mean_bl_rmse = np.mean(list_scores)
print(mean_bl_rmse)

# Persistence baseline
list_scores = [
    np.sqrt(mean_squared_error(frame["Excess_returns"], frame["ER_L1"]))
    for frame in company_frames
]
per_bl_rmse = np.mean(list_scores)
print(per_bl_rmse)

# Calculate RMSE / MDA per company and average score by Corona split

In [None]:
# Pre-Corona MDA of the mean and persistence baselines,
# computed per company and then averaged over companies.

company_frames = [
    test_set_pre_corona[test_set_pre_corona["Companies"] == company]
    for company in test_set_pre_corona["Companies"].unique()
]

# Mean baseline
list_scores = [
    get_mda_orig(frame["Excess_returns"], frame["mean_return"])
    for frame in company_frames
]
pc_mean_bl_mda = np.mean(list_scores)

# Persistence baseline
list_scores = [
    get_mda_orig(frame["Excess_returns"], frame["ER_L1"])
    for frame in company_frames
]
pc_per_bl_mda = np.mean(list_scores)

In [None]:
# Display the pre-Corona MDA of the persistence baseline
pc_per_bl_mda

In [None]:
# Pre-Corona RMSE of the mean and persistence baselines,
# computed per company and then averaged over companies.

company_frames = [
    test_set_pre_corona[test_set_pre_corona["Companies"] == company]
    for company in test_set_pre_corona["Companies"].unique()
]

# Mean baseline
list_scores = [
    np.sqrt(mean_squared_error(frame["Excess_returns"], frame["mean_return"]))
    for frame in company_frames
]
pc_mean_bl_rmse = np.mean(list_scores)

# Persistence baseline
list_scores = [
    np.sqrt(mean_squared_error(frame["Excess_returns"], frame["ER_L1"]))
    for frame in company_frames
]
pc_per_bl_rmse = np.mean(list_scores)

In [None]:
# Display the pre-Corona RMSE of the mean baseline
pc_mean_bl_rmse

In [None]:
# Report the pre-Corona baseline scores.
# The values are the per-company averages computed in the cells above.
pre_corona_report = (
    ("Pre corona Mean baseline", pc_mean_bl_rmse, pc_mean_bl_mda),
    ("Pre corona persistence baseline", pc_per_bl_rmse, pc_per_bl_mda),
)
for headline, rmse_value, mda_value in pre_corona_report:
    print(headline)
    print(f"RMSE: {rmse_value}")
    print(f"MDA: {mda_value}")

In [None]:
# check if computation is correct pc_mean_bl_mda
#test_set_pre_corona[["Excess_returns","mean_return","ER_L1"]].to_excel("MDA Performance analysis.xlsx")

In [None]:
# MDA of the "always positive" baseline, computed per company and averaged.
# The forecast is a strictly increasing sequence (np.arange) per company.

# Pre Corona
list_scores = []
for company in test_set_pre_corona["Companies"].unique():
    company_rows = test_set_pre_corona[test_set_pre_corona["Companies"] == company]
    rising_forecast = np.arange(company_rows["mean_return"].shape[0])
    list_scores.append(get_mda_orig(company_rows["Excess_returns"], rising_forecast))

pc_up_bl_mda = np.mean(list_scores)
print("pre corona")
print(pc_up_bl_mda)

# During Corona
list_scores = []
for company in test_set_corona["Companies"].unique():
    company_rows = test_set_corona[test_set_corona["Companies"] == company]
    rising_forecast = np.arange(company_rows["mean_return"].shape[0])
    list_scores.append(get_mda_orig(company_rows["Excess_returns"], rising_forecast))

dc_up_bl_mda = np.mean(list_scores)
print("corona")
print(dc_up_bl_mda)

In [None]:
# During-Corona MDA of the mean and persistence baselines,
# computed per company and then averaged over companies.

company_frames = [
    test_set_corona[test_set_corona["Companies"] == company]
    for company in test_set_corona["Companies"].unique()
]

# Mean baseline
list_scores = [
    get_mda_orig(frame["Excess_returns"], frame["mean_return"])
    for frame in company_frames
]
dc_mean_bl_mda = np.mean(list_scores)

# Persistence baseline
list_scores = [
    get_mda_orig(frame["Excess_returns"], frame["ER_L1"])
    for frame in company_frames
]
dc_per_bl_mda = np.mean(list_scores)

In [None]:
# During-Corona RMSE of the mean and persistence baselines,
# computed per company and then averaged over companies.

company_frames = [
    test_set_corona[test_set_corona["Companies"] == company]
    for company in test_set_corona["Companies"].unique()
]

# Mean baseline
list_scores = [
    np.sqrt(mean_squared_error(frame["Excess_returns"], frame["mean_return"]))
    for frame in company_frames
]
dc_mean_bl_rmse = np.mean(list_scores)

# Persistence baseline
list_scores = [
    np.sqrt(mean_squared_error(frame["Excess_returns"], frame["ER_L1"]))
    for frame in company_frames
]
dc_per_bl_rmse = np.mean(list_scores)

In [None]:
# Report the during-Corona baseline scores.
# The values are the per-company averages computed in the cells above.
corona_report = (
    ("Corona Mean baseline", dc_mean_bl_rmse, dc_mean_bl_mda),
    ("Corona persistence baseline", dc_per_bl_rmse, dc_per_bl_mda),
)
for headline, rmse_value, mda_value in corona_report:
    print(headline)
    print(f"RMSE: {rmse_value}")
    print(f"MDA: {mda_value}")

In [None]:
# Preview the first rows of the pre-Corona test set
test_set_pre_corona.head()

In [None]:
# Use the 'Date' column as index for both test sets
# (the preceding reset_index() calls are disabled)
#test_set_pre_corona = test_set_pre_corona.reset_index()
test_set_pre_corona = test_set_pre_corona.set_index(['Date'])

#test_set_corona = test_set_corona.reset_index()
test_set_corona = test_set_corona.set_index(['Date'])

# Dropping 'Companies' is disabled: the column is still read by the
# per-company scoring loops further down
#test_set_pre_corona = test_set_pre_corona.drop(['Companies'],axis=1)
#test_set_corona = test_set_corona.drop(['Companies'],axis=1)

**Use model to make predictions**

In [None]:
# Separate the regressors (x) from the target (y) for both periods.
# The same columns are excluded from the feature matrix in each period.
excluded_columns = ['Excess_returns', sent_choice_pos, "mean_return", 'Companies']

x_test_pre_c = test_set_pre_corona.drop(columns=excluded_columns)
x_test_c = test_set_corona.drop(columns=excluded_columns)

y_test_pre_c = test_set_pre_corona["Excess_returns"]
y_test_c = test_set_corona["Excess_returns"]

In [None]:
# Preview the pre-Corona feature matrix
x_test_pre_c.head()

In [None]:
# Predict the target for both periods (pre-Corona first, then during Corona)
y_hat_pre_c, y_hat_c = (
    model_ER_plain.predict(features) for features in (x_test_pre_c, x_test_c)
)

# Only the last expression of a cell is displayed
y_hat_c

In [None]:
# Store the predictions in the original dataframes as column "y_hat"
# and preview the during-Corona test set
test_set_pre_corona["y_hat"]=y_hat_pre_c
test_set_corona["y_hat"]=y_hat_c
test_set_corona.head()

In [None]:
# Display the pre-Corona target values
y_test_pre_c

In [None]:
# Display the shape of the pre-Corona predictions
y_hat_pre_c.shape

In [None]:
# RMSE and MDA of the model predictions before Corona,
# computed per company and then averaged over companies.
print("Pre Corona:")

company_frames = [
    test_set_pre_corona[test_set_pre_corona["Companies"] == company]
    for company in test_set_pre_corona["Companies"].unique()
]

list_scores = [
    np.sqrt(mean_squared_error(frame["Excess_returns"], frame["y_hat"]))
    for frame in company_frames
]
pc_var_rmse = np.mean(list_scores)
print(f"RMSE: {pc_var_rmse}")

# list_scores keeps the per-company MDA values after this cell;
# the following cell builds a dataframe from it.
list_scores = [
    get_mda_orig(frame["Excess_returns"], frame["y_hat"])
    for frame in company_frames
]
pc_var_mda = np.mean(list_scores)
print(f"MDA: {pc_var_mda}")

In [None]:
# Collect the per-company scores held in list_scores together with the company
# names and show the ten highest-scoring companies
df_analyse = pd.DataFrame(list_scores)
df_analyse["Company"] = test_set_pre_corona.Companies.unique()
df_analyse.sort_values(by=0, ascending=False).head(10)

In [None]:
# Load the workbook "RIC and News Quantiles.xlsx" and attach its columns to
# the scores by company name; the score column 0 is renamed to "MDA_score"
df_decils = pd.read_excel("RIC and News Quantiles.xlsx", index_col="Unnamed: 0")
df_analyse = (
    df_analyse
    .merge(df_decils, how="left", left_on="Company", right_on="Company")
    .rename(columns={0: "MDA_score"})
)
df_analyse

In [None]:
# Histogram (10 bins) of the per-company MDA scores, one panel per news quantile
df_analyse.MDA_score.hist(by=df_analyse["News_Quantile"], bins=10, figsize=(20,20))

In [None]:
# Summary statistics per news quantile.
# NOTE(review): the aggregation runs over every non-grouping column of
# df_analyse; if non-numeric columns such as "Company" are still present,
# "mean"/"std" may fail on recent pandas versions - confirm.
df_analyse.groupby("News_Quantile").agg(("mean","median","min","max","std"))

In [None]:
# RMSE and MDA of the model predictions during Corona,
# computed per company and then averaged over companies.
print("During Corona:")

company_frames = [
    test_set_corona[test_set_corona["Companies"] == company]
    for company in test_set_corona["Companies"].unique()
]

list_scores = [
    np.sqrt(mean_squared_error(frame["Excess_returns"], frame["y_hat"]))
    for frame in company_frames
]
dc_var_rmse = np.mean(list_scores)
print(f"RMSE: {dc_var_rmse}")

list_scores = [
    get_mda_orig(frame["Excess_returns"], frame["y_hat"])
    for frame in company_frames
]
dc_var_mda = np.mean(list_scores)
print(f"MDA: {dc_var_mda}")

# Summary

### RMSE  results

In [None]:
# RMSE summary: one row per method, one column per period
rmse_table = pd.DataFrame(
    {
        'Pre Corona': [pc_mean_bl_rmse, pc_per_bl_rmse, pc_var_rmse],
        'During Corona': [dc_mean_bl_rmse, dc_per_bl_rmse, dc_var_rmse],
    },
    index=pd.Index(
        ['Mean baseline', 'Persistence baseline', 'Dynamic Panel'],
        name="method",
    ),
)

# Relative change against each baseline, stored as a fraction (ratio minus one)
pre_rmse = rmse_table['Pre Corona']
during_rmse = rmse_table['During Corona']
rmse_table['Pre Corona -  % Change on Mean baseline'] = pre_rmse.div(pc_mean_bl_rmse).sub(1)
rmse_table['Pre Corona -  % Change on Persistence baseline'] = pre_rmse.div(pc_per_bl_rmse).sub(1)
rmse_table['During Corona - % Change on Mean baseline'] = during_rmse.div(dc_mean_bl_rmse).sub(1)
rmse_table['During Corona - % Change on Persistence baseline'] = during_rmse.div(dc_per_bl_rmse).sub(1)

rmse_table

Very similar performance

### MDA results

In [None]:
# MDA summary: one row per method, one column per period
mda_table = pd.DataFrame(
    {
        'Pre Corona': [pc_up_bl_mda, pc_per_bl_mda, pc_var_mda],
        'During Corona': [dc_up_bl_mda, dc_per_bl_mda, dc_var_mda],
    },
    index=pd.Index(
        ['Up baseline', 'Persistence baseline', 'Dynamic Panel'],
        name="method",
    ),
)

# Change against each baseline. Although the labels say "% Change", the values
# are absolute differences between MDA scores, not ratios.
pre_mda = mda_table['Pre Corona']
during_mda = mda_table['During Corona']
mda_table['Pre Corona -  % Change on Up baseline'] = pre_mda.sub(pc_up_bl_mda)
mda_table['Pre Corona -  % Change on Persistence baseline'] = pre_mda.sub(pc_per_bl_mda)
mda_table['During Corona - % Change on Up baseline'] = during_mda.sub(dc_up_bl_mda)
mda_table['During Corona - % Change on Persistence baseline'] = during_mda.sub(dc_per_bl_mda)

mda_table

Huge increase in MDA performance.

In [None]:
# Collect both summary tables for export (RMSE first, then MDA)
list_results=[rmse_table,mda_table]

In [None]:
from pandas import ExcelWriter
# from pandas.io.parsers import ExcelWriter

def save_xls(list_dfs, xls_path):
    """Write every DataFrame in *list_dfs* to its own sheet of one workbook.

    Sheets are named ``sheet0``, ``sheet1``, ... in the order of *list_dfs*.

    Args:
        list_dfs: Iterable of pandas DataFrames to export.
        xls_path: Path of the Excel workbook to create.
    """
    # The context manager saves and closes the workbook on exit. The explicit
    # writer.save() call was dropped: it is redundant here and
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with ExcelWriter(xls_path) as writer:
        for n, df in enumerate(list_dfs):
            # sheet_name is passed by keyword; positional use is deprecated
            df.to_excel(writer, sheet_name=f"sheet{n}")

In [None]:
# Export the tables in list_results to one workbook;
# the file name carries the selected sentiment measure (sent_choice_pos)
save_xls(list_results,"1_Results/Test_Regression results_"+sent_choice_pos+".xlsx")