In [1]:
%matplotlib notebook

import pandas as pd
import numpy as np
from pandas import DataFrame
from pandas import Series
from pandas import concat
from pandas import read_csv
from scipy import stats
from matplotlib import pyplot
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from statsmodels.graphics.tsaplots import plot_acf
from sklearn.metrics import mean_absolute_error as mae
from statsmodels.tsa.stattools import adfuller
import statsmodels.api as sm
import math
from math import sqrt
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import os
import codecs
# import seaborn as sns

from matplotlib import rc
rc('text', usetex=True)

## Load financial data, add close price and volume

In [2]:
# Download Daily Data:
bakka_daily = pd.read_excel(os.getcwd() + '\\daily\\Bakkafrost_04.01.2016-15.07.2022.xlsx')
grieg_daily = pd.read_excel(os.getcwd() + '\\daily\\Grieg_04.01.2016-15.07.2022.xlsx')
leroy_daily = pd.read_excel(os.getcwd() + '\\daily\\Leroy_04.01.2016-15.07.2022.xlsx')
mowi_daily  = pd.read_excel(os.getcwd() + '\\daily\\MOWI_04.01.2016-15.07.2022.xlsx')
salmar_daily = pd.read_excel(os.getcwd() + '\\daily\\SalMar_04.01.2016-15.07.2022.xlsx')

In [3]:
def df(df):
    col = ["Exchange Date", "Close", "Volume"]
    df = df.loc[:,col]
    return df

In [4]:
def put_together(bakka, grieg, leroy, mowi, salmar):
    date = bakka['Exchange Date']
    together = []
    for i in (bakka, grieg, leroy, mowi, salmar):
        together.append(df(i)) # a list of 5 list
    closed_prices = pd.concat([date, together[0]['Close'], together[1]['Close'], 
                               together[2]['Close'], together[3]['Close'], together[4]['Close']], axis = 1)
    volumes = pd.concat([date, together[0]['Volume'], together[1]['Volume'], 
                               together[2]['Volume'], together[3]['Volume'], together[4]['Volume']], axis = 1)
    companies = ("bakka", "grieg", "leroy", "mowi", "salmar")
    companies_names = [i+"_close" for i in companies]
    companies_volumes = [i+"_volume" for i in companies]
    cls = closed_prices['Close']
    vlms = volumes['Volume']
    cls.columns = companies_names
    vlms.columns = companies_volumes
    weight = 0.2
    weighted = []
    weighted_volumes = []
    
    for j in range(cls.shape[1]):
        weighted.append((cls.iloc[:,j])*weight)
        
    for j in range(vlms.shape[1]):
        weighted_volumes.append((vlms.iloc[:,j])*weight)
        
    weighted = pd.DataFrame(weighted)
    weighted = weighted.transpose()
    weight_names = [i+'_weight' for i in companies]
    weighted.columns = weight_names
    weighted['equally_weighted_index'] = weighted.sum(axis = 1)
    
    weighted_volumes = pd.DataFrame(weighted_volumes)
    weighted_volumes = weighted_volumes.transpose()
    weighted_volumes_names = [i+'_weight_volume' for i in companies]
    weighted_volumes.columns = weighted_volumes_names
    weighted_volumes['equally_weighted_volume'] = weighted_volumes.sum(axis = 1)

    all_prices = pd.concat([date, cls, weighted, vlms, weighted_volumes], axis = 1)
    # The index is normalized so that its logarithm equals one before the first observation (01:2016) 
    all_prices['norm_index'] =(all_prices['equally_weighted_index']/all_prices['equally_weighted_index'][0])
    all_prices['log_ret'] = np.log(all_prices['norm_index']/all_prices['norm_index'].shift(1))
    all_prices.fillna(0, inplace = True)
    all_prices['react_label'] = np.where(all_prices['log_ret']>=0, 'Up', 'Down')
    all_prices['year'] = pd.DatetimeIndex(all_prices['Exchange Date']).year
    all_prices['month'] = pd.DatetimeIndex(all_prices['Exchange Date']).month
    all_prices['day'] = pd.DatetimeIndex(all_prices['Exchange Date']).day
    return (all_prices)

In [5]:
daily_spi_volume = put_together(bakka_daily, grieg_daily, leroy_daily, mowi_daily, salmar_daily)

## Deal with holidays, weekends etc.

In [6]:
path = os.path.realpath(__name__) # path
drt = os.path.dirname(path)       # directory

In [7]:
articles_import = drt + '/articles_sorted.csv'
ph_import = drt + '/public_holidays_2016-2022.07.csv'

In [8]:
# Import daily SPI data:
fin_dt = daily_spi_volume
# Pre-processed articles:
infos = pd.read_csv(articles_import, encoding = 'utf-8-sig', sep=';')
# Import 'Public Holidays' data:
ph = pd.read_csv(ph_import)

In [9]:
# get the columns we need
cols = ['day', 'month','year','log_ret', 'react_label', 'equally_weighted_index', 'equally_weighted_volume'] 
# data frame
fin_df = fin_dt.loc[:,cols] 
# remove, index column, word_count
articles = infos.iloc[:,2:9]                            

In [10]:
print(len(fin_df), len(articles))

1640 6082


In [11]:
# Download the packages
from datetime import datetime
from datetime import date

In [12]:
# First, make the date series:
def get_weekends(info):
    info['date'] = pd.to_datetime({"year": info.year, "month": info.month, "day": info.day})
    info['weekday'] = info['date'].dt.dayofweek
    info['weekend']= info['weekday'] > 4
    # Filter for ONLY the weekends:
    info_weekends = info[info.weekend == True]
    # Get the dates of the Weekends:
    weekends_dates = pd.to_datetime({"year": info_weekends.year, "month": info_weekends.month, "day": info_weekends.day}).drop_duplicates(keep='first')
    return (info, info.weekend.value_counts(), weekends_dates)

In [13]:
articles_daily, trues_articles, weekends_dates_articles = get_weekends(articles) # 75 dates are weekends
fin_df, trues_fin, weekends_dates_daily = get_weekends(fin_df)                   # No weekends
ph, ph_trues, ph_weekends = get_weekends(ph)                                     # Get the weekends to delete them

In [14]:
# Delete Weekends from public holidays:
ph = ph[(ph.weekend != True)]

In [15]:
# Change Fridays after 14:20 GMT to Mondays:
# len(articles_daily.loc[(articles_daily['weekday'] == 4) & (articles_daily['hour'] >=14) & (articles_daily['minute'] >=20)]) # 142
articles_daily.loc[(articles_daily['weekday'] == 4) & (articles_daily['hour'] >=14) & (articles_daily['minute'] >=20), 'impact'] = articles_daily['date'] + pd.to_timedelta(3, unit='D')

# Change Weekends articles to Mondays:
# len(articles_daily.loc[(articles_daily['weekday'] == 5) | (articles_daily['weekday'] == 6)]) # 83
articles_daily.loc[(articles_daily['weekday'] == 5), 'impact'] = articles_daily['date'] + pd.to_timedelta(2, unit='D') # Saturday
articles_daily.loc[(articles_daily['weekday'] == 6), 'impact'] = articles_daily['date'] + pd.to_timedelta(1, unit='D') # Sunday

# Now, all articles published on Monday, Tuesday, Wednesday, and Thursday after 14:20 GMT
# have an impact on next trading day's returns:
articles_daily.loc[(articles_daily['hour'] >= 14) & (articles_daily['minute'] >= 20) & (articles_daily['impact'].isnull()), 'impact'] = articles_daily['date'] + pd.to_timedelta(1, unit='D')

# Also update the remaining as having an impact on the same day returns:
articles_daily.loc[(articles_daily['impact'].isnull()), 'impact'] = articles_daily['date']

In [16]:
# First, lets check how many of the public holidays are in the 'impact' dates:
articles_daily['ph'] = articles_daily.impact.isin(ph.date).astype(int)
len(articles_daily.loc[articles_daily['ph']==1]) # 157

157

In [17]:
# Separate the half trading public holidays
ph_half = ph.loc[ph['trading'] == "half"]
# Check if any of them are in the 'impact' column:
articles_daily['ph_half'] = articles_daily.impact.isin(ph_half.date).astype(int)

# We can say that if ph_half == 1, and the time the article was published is before 11:00 GMT then the impact is the date.
articles_daily.loc[(articles_daily['ph_half']==1) & (articles_daily['hour'] <= 10), 'ph'] = 0
# Delete the 'ph_half' column:
articles_daily = articles_daily.iloc[: , :-1]

In [18]:
while True:
    articles_daily['weekday_imp'] = articles_daily['impact'].dt.dayofweek #YES

    # Change them to Monday:
    articles_daily.loc[(articles_daily['ph'] == 1) & (articles_daily['weekday_imp'] == 4), 'impact'] = articles_daily['impact'] + pd.to_timedelta(3, unit='D')
        
    # Check again if those Mondays are also PH:
    articles_daily['ph'] = articles_daily.impact.isin(ph.date).astype(int)
    
    # Change the weekdays again:
    articles_daily['weekday_imp'] = articles_daily['impact'].dt.dayofweek
    
    # Change remaining dates:
    articles_daily.loc[articles_daily['ph'] == 1, 'impact'] = articles_daily['impact'] + pd.to_timedelta(1, unit='D')
    articles_daily['ph'] = articles_daily.impact.isin(ph.date).astype(int)
    
    # Check again how many there are that are Public Holidays:
    if len(articles_daily.loc[articles_daily['ph']== 1]) == 0: # we want this to become 0
        print("DONE")
        break
    else:
        print(f"{len(articles_daily.loc[articles_daily['ph']== 1])} articles left on public holidays")

58 articles left on public holidays
24 articles left on public holidays
DONE


In [19]:
# Delete columns that are not necessary:
articles_daily = articles_daily.drop(["weekday", "weekend", "ph"], axis = 1)

In [20]:
# Get only the dates and compare them with the daily SPI returns:
daily_articles_dates = pd.DataFrame(pd.to_datetime(articles_daily['impact']).drop_duplicates(keep='first'))

# change column name
daily_articles_dates = daily_articles_dates.rename(columns={"impact":"dates"})

In [21]:
# Now we will check which dates in the daily articles also exist in the daily returns:
fin_df['common'] = fin_df['date'].isin(daily_articles_dates.dates).astype(int)

# Drop unecessary columns:
fin_df = fin_df.drop(["weekday", "weekend"], axis = 1)

In [22]:
fin_df.head()

Unnamed: 0,day,month,year,log_ret,react_label,equally_weighted_index,equally_weighted_volume,date,common
0,4,1,2016,0.0,Up,117.806131,423161.300443,2016-01-04,0
1,5,1,2016,-0.013838,Down,116.187156,556681.308522,2016-01-05,0
2,6,1,2016,0.026477,Up,119.304562,463869.709177,2016-01-06,0
3,7,1,2016,-0.019801,Down,116.96542,605717.138036,2016-01-07,0
4,8,1,2016,-0.000171,Down,116.945399,572520.24364,2016-01-08,0


In [23]:
# Save:
fin_df.to_csv('daily/daily_common_forecasting.csv', index = False)

## Data for components: extended sentiment reversed for competitors and topics

In [24]:
# Set the paths
daily_topics_import = drt.replace('\\finance data', '') + '\\analysis\\analysis_topics' + '/daily_topics_forecasting.csv'
# Extended sentiment
daily_sentiment_path = drt.replace('\\finance data', '') + '\\analysis' + '/sentiment_daily_extend_comp_rev.xlsx'
daily_sentiment = pd.read_excel(daily_sentiment_path)

In [25]:
# Import daily SPI data:
daily_returns = fin_df
# Topics data:
daily_topics = pd.read_csv(daily_topics_import)
# Rename the 'dates_day' column to 'date':
daily_sentiment.rename(columns = {'dates_day':'date'}, inplace = True)

In [26]:
daily_topics['date'] = pd.to_datetime(daily_topics[['year', 'month', 'day']])
daily_topics = daily_topics.drop(['year', 'month', 'day'], axis=1)
col = daily_topics.pop('date')  # Pop the 'date' column
daily_topics.insert(0, 'date', col)  # Insert 'date' column at the first position (0 index)
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])

In [27]:
# Merge together:
daily_topics = pd.merge(daily_returns, daily_topics, on=['date'], how='left')
daily_topics = pd.merge(daily_topics, daily_sentiment, on=['date'], how='outer')
daily_topics = daily_topics.drop(["common", "react_label", "day", "month", 'year'], axis = 1)

In [28]:
daily_topics = daily_topics.loc[7:,:].reset_index(drop = True)

In [29]:
daily_topics = daily_topics.fillna(method='ffill')

In [30]:
daily_topics = daily_topics.drop(['equally_weighted_index', 'equally_weighted_volume'], axis=1)

In [31]:
daily_topics = daily_topics[daily_topics['date'] <= '2022-07-11']

In [32]:
# Save the DF:
daily_topics.to_csv('daily/daily_topics_extend_comp_rev_forecasting.csv', index = False)

## Data for components: LM sentiment and topics

In [33]:
# Set the paths 
daily_topics_import = drt.replace('\\finance data', '') + '\\analysis\\analysis_topics' + '/daily_topics_forecasting.csv'
# LM sentiment
daily_sentiment_path = drt.replace('\\finance data', '') + '\\analysis' + '/sentiment_daily_LM.xlsx'
daily_sentiment = pd.read_excel(daily_sentiment_path)

In [34]:
# Import daily SPI data:
daily_returns = fin_df
# Topics data:
daily_topics = pd.read_csv(daily_topics_import)
# Rename the 'dates_day' column to 'date':
daily_sentiment.rename(columns = {'dates_day':'date'}, inplace = True)

In [35]:
daily_topics['date'] = pd.to_datetime(daily_topics[['year', 'month', 'day']])
daily_topics = daily_topics.drop(['year', 'month', 'day'], axis=1)
col = daily_topics.pop('date')  # Pop the 'date' column
daily_topics.insert(0, 'date', col)  # Insert 'date' column at the first position (0 index)
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])

In [36]:
# Merge together:
daily_topics = pd.merge(daily_returns, daily_topics, on=['date'], how='left')
daily_topics = pd.merge(daily_topics, daily_sentiment, on=['date'], how='outer')
daily_topics = daily_topics.drop(["common", "react_label", "day", "month", 'year'], axis = 1)

In [37]:
daily_topics = daily_topics.loc[7:,:].reset_index(drop = True)

In [38]:
daily_topics = daily_topics.fillna(method='ffill')

In [39]:
daily_topics = daily_topics.drop(['equally_weighted_index', 'equally_weighted_volume'], axis=1)

In [40]:
daily_topics = daily_topics[daily_topics['date'] <= '2022-07-11']

In [41]:
# Save the DF:
daily_topics.to_csv('daily/daily_topics_LM_forecasting.csv', index = False)

## Combine all the data for forecasting into one df

In [42]:
# Set the paths
daily_sentiment_path = drt.replace('\\finance data', '') + '\\analysis' + '/sentiment_daily_LM.xlsx'
daily_sentiment = pd.read_excel(daily_sentiment_path)
daily_sentiment_extended_path = drt.replace('\\finance data', '') + '\\analysis' + '/sentiment_daily_extend_comp_rev.xlsx'
daily_sentiment_extended = pd.read_excel(daily_sentiment_extended_path)
daily_components_sent_extend_import = drt.replace('\\finance data', '') + '\\analysis\\VAR' + '/components_sent_extend_forecast.csv'
daily_components_sent_import = drt.replace('\\finance data', '') + '\\analysis\\VAR' + '/components_sent_forecast.csv'

In [43]:
# Import daily SPI data:
daily_returns = fin_df
# Rename the 'dates_day' column to 'date':
daily_sentiment.rename(columns = {'dates_day':'date'}, inplace = True)
daily_sentiment_extended.rename(columns = {'dates_day':'date'}, inplace = True)
# Components
daily_components_sent_extend = pd.read_csv(daily_components_sent_extend_import)
daily_components_sent = pd.read_csv(daily_components_sent_import)

In [44]:
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
daily_sentiment_extended['date'] = pd.to_datetime(daily_sentiment_extended['date'])
daily_components_sent_extend['date'] = pd.to_datetime(daily_components_sent_extend['date'])
daily_components_sent['date'] = pd.to_datetime(daily_components_sent['date'])

In [45]:
# Merge together:
daily_topics = pd.merge(daily_returns, daily_sentiment, on=['date'], how='left')
daily_topics = pd.merge(daily_topics, daily_sentiment_extended, on=['date'], how='outer')
daily_topics = pd.merge(daily_topics, daily_components_sent_extend, on=['date'], how='outer')
daily_topics = pd.merge(daily_topics, daily_components_sent, on=['date'], how='outer')
daily_topics = daily_topics.drop(["common", "react_label", "day", "month", 'year'], axis = 1)

In [46]:
daily_topics = daily_topics.loc[7:,:].reset_index(drop = True)

In [47]:
daily_topics = daily_topics.fillna(method='ffill')

In [48]:
daily_topics = daily_topics[daily_topics['date'] <= '2022-07-11']

In [49]:
# Save the DF:
path_to_file = os.getcwd().replace('\\finance data', '\\analysis\\forecasting')
daily_topics.to_csv(path_to_file + '\\data_for_forecasting.csv', index = False)