# 1. Scraping

## 1.1 Crawling CNBC to get the article links

In [None]:
def get_links_to_scrape(start_year, end_year):

    """
    This function takes a starting year and an ending year and crawls through
    CNBC's SiteMap structure to find all links to articles that are written
    in the given time period.
    """

    # Import of the required packages
    import requests
    from bs4 import BeautifulSoup, SoupStrainer

    # First, we define every month and year that should be scraped.
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    years = [year for year in range(start_year, end_year+1)]

    # In order to scrape the links faster, we make use of SoupStrainer
    # to only parse the relevant part of the site: ALl the article links.
    strainer = SoupStrainer('a', class_ = 'SiteMapArticleList-link')

    # We instantiate an empty list that we can store all the links in.
    article_links = []

    # For every day, in every month in every year, construct a link
    # that should be scraped. We do this, because CNBC's link structure
    # in their archive is given af .../site-map/YEAR/MONTH/DAY_OF_MONTH.
    for year in years:
        for month in months:
            for day in range(1,32):
                link = f'https://www.cnbc.com/site-map/{year}/{month}/{day}/'

                # Take the generated link and load it in through requests,
                # then take the contents and convert it to a string.
                page = str(requests.get(link).content)

                # CNBC load in their CSS directly into the HTML, with usually
                # would make it very heavy to scrape. We can fix this, simply
                # by cutting remove the entire <head></head> part of the html.
                page = page[str(page).find('body'):]

                # Now, pass the remaining content through BeautifulSoup's parser
                # and apply the strainer. This means it will only parse link
                # elements with the class of SiteMapArticleList-Link and it
                # will only search through body-section of the document.
                soup = BeautifulSoup(page, parse_only = strainer)

                # Now we make use of the find_all function in BeautifulSoup
                # that simply generates a list of codeblocks that match the
                # query. This means we get a list of links, and we store the
                # href (actual links) in our instantiated list from before.
                links = soup.find_all('a', class_='SiteMapArticleList-link')
                if links != []:
                    for a_link in links:
                        article_links.append(a_link['href'])

    # Return the final list of article links that we want to scrape.
    return article_links

In [None]:
article_links = get_links_to_scrape(2006, 2020)

## 1.2 Scrape articles from link list

In [None]:
def scrape_cnbc_articles(list_of_links):

    """
    This function take a list of article links and attempts to scrape
    them according to CNBC's HTML structure. It will return a pandas
    DataFrame with the scraped data.
    """

    # Import of the required packages
    import requests
    from bs4 import BeautifulSoup, SoupStrainer
    import pandas as pd

    # First, we instantiate an empty list, that we can append the scraped
    # data to. Then we instantiate an index, which enables us to follow the
    # scrapers progress, and lastly, we instantiate a request Session, which
    # minimizes our load time per request by little bit, but that time will
    # accumulate for all our links and should have quite an impact.
    df = []
    index = 0
    request = requests.Session()

    # For every link in the list, get the source code and add 1 to the index.
    for link in list_of_links:
        page = request.get(link)
        index += 1

        # If the pages HTTP request Code is 200 (Meaning "I loaded the site
        # correctly, and the server sent me the data correctly"), then
        # attempt to scrape the site.
        if page.status_code == 200:
            try:

                # First, we convert the page content to a string, so we can
                # remove most of the irrelevant HTML. We remove everything
                # before a box with the class "MainContent"
                page = str(page.content)
                page = page[page.find('<div id="MainContent"'):]

                # Now we parse the remaining content of the page into
                # BeautifulSoup and try to extract the text of:
                soup_link = BeautifulSoup(page)

                # The Header-1 tag with a class of 'ArticleHeader-headline'
                title = soup_link.find('h1', class_='ArticleHeader-headline').get_text()

                # The div (box) tag with a class of 'ArticleBody-articleBody'
                article = soup_link.find('div', class_='ArticleBody-articleBody').get_text()

                # The date generated from the link. The link will always
                # contain the date if it is an article. We save it as
                # DD/MM/YYYY.
                date = f'{link[29:31]}-{link[26:28]}-{link[21:25]}'

                # The link that have a class of 'ArticleHeader-eyebrow',
                # which is their article topic.
                topic = soup_link.find('a', class_='ArticleHeader-eyebrow').get_text()

                # If they are all successfully gathered, we append it all
                # into our list called df.
                df.append([title, topic, date, article, link])

                # If successful, print the progress as well as the link.
                print(f'({index}/{len(list_of_links)}) : {link}')
            except:
                # If we get a status code 200, but somehow some of the elements
                # wasn't there, then skip the entire article. This ensures that
                # we get a dataset without missing variables.
                print(f'({index}/{len(list_of_links)}) : Skipped')
        else:
            # If we didn't get a status code 200 (Meaning something went wrong
            #  in the loading of the page), then skip the article.
            print(f'({index}/{len(list_of_links)}) : Skipped')

    # Lastly, we return a dataframe that contains all of our scraped articles.
    return pd.DataFrame(df)

In [None]:
df = scrape_cnbc_articles(article_links)

## 1.3 Collecting the datasets, if collected over multiple times

In [None]:
def collect_dataset(datasets_path):

    """
    This function takes all the files in a folder and tries to concatenate them
    into one dataframe. This is of course only possible if your datasets is
    the only csv-files in the folder and if they have the same data-structure.
    We made this, because we scraped the links over several sessions.
    """

    # Import neccesary packages
    import pandas as pd

    # List all of our datasets in the folder
    datasets = [ds for ds in datasets_path if '.csv' in ds]

    # Load all the datasets in and append it to a list called 'frames'. We can
    # use this list to concatenate all the dataframes according to the Pandas
    # documentation.
    frames = []
    for dataset in datasets:
        df = pd.read_csv(dataset, error_bad_lines=False, index_col=False)
        df = df[df.columns[-5:]]
        df.columns = ['Title', 'Topic', 'Date', 'Content', 'Link']
        frames.append(df)

    # Return a concatenated dataframe of all the dataframes.
    return pd.concat(frames)

In [None]:
import os
df = collect_dataset(os.listdir())

# 2. Preprocessing & Cleaning

In [None]:
df['Content'] = df['Content'].astype('str')
df['Title'] = df['Title'].astype('str')

In [None]:
def remove_clutter(text):

    """
    This function takes a string and removes what we consider as clutter
    in our data. This includes things such as special unicode characters
    and video timestamps.
    """

    # Importing neccesary packages
    import re

    # Trying to remove special unicode characters. Our regular expression finds
    # any substrings that starts with a \x followed by two char/num combinations
    text = re.sub(r'\\x[A-Za-z0-9_]{2}', '', text)

    # Trying to remove video annotation. We are using a regular expression,
    # to find any pattern that matches the word video followed by a
    # timestamp (length of video). We believe this is just clutter in
    # our data as well.
    text = re.sub(r'VIDEO([0-9]|[0-9]{2}):[0-9]{4}:[0-9]{2}', ' ', text)

    # Trying to remove image references. Whenever an article contains an
    # image, the page returns a string representation of the image as the
    # source "Getty Images". We remove this representation, as it brings
    # no value to the analysis.
    text = text.replace('Getty Images', '')

    # We now remove commas, apostrophes, and double spaces. We introduce
    # double spaces in the line above, however this could mess up our
    # tokenization, so we simply convert any doublespaces to single spaces.
    # We remove apostrophes after n's to normalize contracted words like
    # wasn't, couldn't etc. Some of these words are already normalized
    # since some of these apostrophes already have been remove by the regex
    # unicode decluttering.
    text = re.sub(r',','', text)
    text = re.sub(r"n'",'n', text)
    text = re.sub(r'  ',' ', text)

    # Finally, we return the decluttered text.
    return text

In [None]:
def cleaning(df,column):

    """
    This function takes a dataframe and a column name and cleans the entire
    column for clutter (using remove_clutter function), then pass it through
    spaCy to further clean and tokenize. It returns the original dataframe,
    but the given column is now cleared of clutter and it contains two
    additional columns (Tokens, cleaned_text) as well.
    """

    # Import neccesary packages
    import spacy
    nlp = spacy.load('en_core_web_sm')
    import pandas as pd

    # First we instantiate a list that we can append all processed tokens in.
    # This makes it possible for us to append it to the dataframe at a later
    # stage.
    tokens = []

    # Now, we apply our remove_clutter function to the chosen column in the
    # dataframe. This runs the remove_clutter function for every entry in
    # the column.
    df[column].apply(remove_clutter)

    # Define an variable to count the progress of our cleaning.
    index = 0

    # Now, we iterate over all entries (articles in our case) in the column
    # and create a nlp object for each, which we can work with.
    for article in nlp.pipe(df[column], disable=['parser']):

        # Now, we store all tokens that pass our requirements in a list for each
        # article. That means that each article will have their own
        # list of tokens.
        article_tok = [token.lemma_.lower() for token in article if _
            token.is_alpha _
            and not token.is_stop _
            and token.pos_ in ['NOUN', 'PROPN', 'ADJ', 'ADV', 'VERB'] _
            and token.ent_type_ not in ['PERSON', 'MONEY', 'PERCENT', 'LOC', 'DATE', 'TIME', 'QUANTITY', 'ORDINAL'] _
            and len(token)>1]

        # Now, we append said list of tokens for each article in our tokens list.
        tokens.append(article_tok)

        # When each article is processed, we increase the index by one and print
        # the progress. This allows us to keep track of how far it is in the
        # cleaning process. When you are dealing with many thousands of
        # articles, it might take a while, so this feature is quite nice.
        index += 1
        print(f'Processed {index}/{len(df[column])}')

    # When all cleaned articles are appended to our tokens list, we simply
    # add the list as a column in the original dataframe.
    df['tokens'] = tokens

    # Lastly, we reconstruct all the articles from the tokens, simply by joining
    # all the tokens in each article_tok list. We achieve this by a simple
    # combination of map & lambda functions.
    df['clean_articles'] = df['tokens'].map(lambda row: " ".join(row))

    # Returning the df that contains cleaned data and new columns.
    return df

In [None]:
df = cleaning(df, 'Content')

# 3. Attempting to classify topics

In [None]:
# Output topics to construct final_topic_list manually
pd.DataFrame(df.Topic.unique()).to_csv('unique_topics.csv')

In [None]:
# Read topic mapping
topic_list = pd.read_csv('final_topic_list.csv', sep = ";")
topic_list_clean = pd.DataFrame.dropna(topic_list)

In [None]:
# Map Topics to articles
predetermined = []
index = 0
for topic in df["Topic"]:
    index += 1
    if topic in list(topic_list_clean["Topic"]):
        predetermined.append(topic_list_clean[topic_list_clean["Topic"] == topic]["Predetermined topic"].to_numpy()[0])
    else:
        predetermined.append("Other")
    print(f'{index}')
df['final_topic'] = predetermined

In [None]:
# Splitting to train and predict
df_labelled = df[df['final_topic'] != 'Other']
df_predict = df[df['final_topic'] == 'Other']

In [None]:
# Split training into train and test and construct LSA
from sklearn.model_selection import train_test_split

r_state = 123

x_train, x_test, y_train, y_test = train_test_split(
    df_labelled['tokens'], df_labelled['final_topic'], test_size=0.25, random_state=r_state)

from gensim import models, corpora
import ast

data_processed = x_train.to_numpy()
data_conversion = []
for line in data_processed:
    line = ast.literal_eval(line)
    data_conversion.append(line)
data_processed = data_conversion

dictionary = corpora.Dictionary(data_processed)
corpus = [dictionary.doc2bow(line) for line in data_processed]
tfidf = models.TfidfModel(corpus, smartirs='ntc')
lsa_model = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=50)

In [None]:
# Evaluate XGBoost and conclude that it fails...
from xgboost import XGBClassifier

xgb_model = XGBClassifier(random_state = r_state)
xgb_model.fit(x_train, y_train)
xgb_prediction = xgb_model.predict(x_test)
print(classification_report(y_test,xgb_prediction))

## 3.1 Filtering articles instead

In [None]:
df = df[df['final_topic'] != 'Other']

# 4. Collecting datasets

## 4.1 Collecting articles

In [None]:
def rolling_articles(start_date, end_date, df, start_range, end_range):

    """
    Just like the rolling returns, this function concatenates all the articles
    into a dataframe consisting of dates as rows. Intially, we played with the
    thought of concatenating the articles rolling with a weeks lag, which is
    why it supprts end and start range, however this demanded too much
    compututional power for our time scope, which is why we simply used
    concatenated them daily.
    """

    # Importing neccesary packages
    from datetime import timedelta, datetime
    import ast

    # Generating a list of dates that we can use to filter the articles from.
    date_list = [start_date + timedelta(days=x) for x in range(0,int((end_date - start_date).days)+1)]

    # Converting all date-strings in date column to actual date objects
    df['Date'] = pd.to_datetime(df['Date']).dt.date

    # Generate new dataframe and instantiating a count variable that we can use
    # to display the progress whilst running it.
    date_index = []
    count = 0

    # For every date in our generated list of dates, find all the articles that
    # lies within the range. Then take their tokens (because of re-import,
    # these were actually a string) and convert to a string objects without
    # list characters. Also, if any date has more than 30 articles,
    # just take the 30 first articles. We integrated the last condition because
    # of diminishing marginal benefit compared to the extra compututional effort.
    for date in date_list:

        # Here we get the articles
        articles = df[(df['Date'] <= date + timedelta(days=end_range)) & (df['Date'] >= date + timedelta(days=start_range))].head(30*(1+end_range-start_range))
        count += len(articles)
        processed = ""
        for article in articles['tokens']:
            try:
                # Here we attempt to remove the list characters. I didn't matter
                # if we passed our articles as tokens or strings to gensim's
                # Word2Vec algo, so we chose as string for the ease of it.
                article = str(article).replace("[", "").replace("]", "").replace(",", "").replace("'","")
                processed += article
                processed += " "
            except:
                pass
        # Now, append the date and the related articles to our date_index list,
        # which we can turn into a dataframe, once it is returned.
        date_index.append([date, processed])
        print(f'{date}: {count}')

    # Finally, return the date_index
    return date_index

In [None]:
from datetime import date

sdate = date(2006, 11, 27)
edate = date(2020, 11, 30)

x = rolling_articles(sdate, edate, df, 0, 0)

## 4.2 Calculating returns

In [None]:
bb = pd.read_csv('bb_prices.csv')

In [None]:
def calculate_returns(prices, interval):

    """
    This function takes a dataset with prices for different securities over
    time, where each row is a point in time and each column is a security.
    It then calculates the returns for each security for a given interval
    for at each date. It returns a dataset with dates as rows and securities as
    columns, with returns for the given interval as values.
    """

    # Importing neccesary packages
    import pandas as pd

    # Converting all date-strings in date column to actual date objects. We can
    # use these at a later stage to match returns to news articles.
    prices['Dates'] = pd.to_datetime(prices['Dates']).dt.date

    # Now we instantiate a new list to store our returns in.
    date_index = []

    # For every entry in the prices dataframe, try to fetch the current prices
    # and the prices 'interval' periods in the future. If successful, get the
    # return and append it to a list called 'returns'
    for i in range(0,len(prices)):
        try:
            # Getting the current date of the entry
            date = prices.iloc[i,0]

            # Getting the prices for said date
            prices_at_date = prices.iloc[i,1:]

            # Getting the prices 'interval' periods in the future
            prices_at_future_date = prices.iloc[i+interval,1:]

            # Attempt to calculate the returns between the two periods.
            return_at_date = list(prices_at_future_date / prices_at_date)

            # Create a list called returns that contains the date. We can then
            # append the returns in this list as well.
            returns = [date]
            for sector in return_at_date:
                # For every column (sector) in our returns data, append it to
                # the returns list.
                returns.append(sector)

            # Now, we can take the returns for each date and append it to our
            # date_index list, which will make up our final dataframe in the end.
            date_index.append(returns)
        except:
            # If we can't calculate the returns, simply pass the date.
            pass

    # Now, convert date_index to a dataframe and return the dataframe.
    df = pd.DataFrame(date_index, columns = prices.columns)
    return df

In [None]:
y = calculate_returns(bb, 7)

## 4.3 Collect dataset

In [None]:
x = x.set_index('Date')
y = y.set_index('Dates')
df = pd.concat([y, x.reindex(y.index)], axis = 1).dropna()

# 5. Neural Network Modelling

In [1]:
from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
import pandas as pd

In [2]:
df = pd.read_csv('https://www.dropbox.com/s/llxun0a85lpf6e3/df_final.csv?dl=1')
df['Dates'] = pd.to_datetime(df['Dates']).dt.date
df.head()

Unnamed: 0,Dates,SPTR INDEX,SPTRINFT INDEX,SPTRENRS INDEX,SPTRFINL INDEX,SPTRHLTH INDEX,SPTRINDU INDEX,SPTRCOND INDEX,SPTRUTIL INDEX,SPTRMATR INDEX,SPTRCONS INDEX,SPTRTELS INDEX,SPTRRLST INDEX,tokens
0,2006-12-12,1.010224,1.002885,1.004642,1.016601,1.014123,1.0169,1.010797,1.002999,1.016288,1.006769,0.993621,0.983808,imagine disney nbc universal team desperate co...
1,2006-12-13,1.007461,1.001175,0.982023,1.017135,1.015294,1.022835,1.006014,0.995535,1.009593,1.005985,0.992353,0.999058,amc jump public market movie theater company a...
2,2006-12-14,0.995271,0.983649,0.959095,1.006377,1.00768,1.009989,0.991821,0.990393,0.988344,1.002454,0.997374,0.985457,red sox look go sign japanese pitcher report d...
3,2006-12-15,0.989004,0.974092,0.9629,0.998287,0.99786,0.996399,0.9942,0.990149,0.974067,0.998513,0.988887,0.983588,official start award season nomination good hi...
4,2006-12-18,0.992207,0.980559,0.98943,0.993734,0.997623,0.993896,0.995171,0.997737,0.983931,0.998484,0.993293,0.982202,merck win vioxx case get victory federal case ...


In [3]:
# list of tokens in list of articles(in a day) - We train the gensim model on all data prior to 2011
from datetime import date
token_list = list([token.split(" ") for token in df[df['Dates'] <= date(2010, 12, 31)]['tokens']])
size = 100

In [4]:
# Training gensim word2vec
import gensim

model = gensim.models.Word2Vec(sentences = token_list, size = size, window = 5, workers = 4, min_count = 20)
# Vocab size:
words = list(model.wv.vocab)
print('vocabulary size: %d' % len(words))

vocabulary size: 7311


In [5]:
# save the model
filename = 'article_embeddings.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [6]:
# Load in the model
import os
import numpy as np

embeddings_index= {}
f = open(os.path.join('', 'article_embeddings.txt'), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [7]:
from keras.preprocessing.text import Tokenizer
from datetime import date

# vectorise the text samples into a 2D integer tensor with the data for token list before 2011
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_list)

# Now convert all the tokens to sequences
token_list = list([token.split(" ") for token in df['tokens']])
sequences = tokenizer.texts_to_sequences(token_list)

In [8]:
# pad sequences to make input length constant
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Get average of list
def Average(lst): 
    return sum(lst) / len(lst) 

max_length = int(Average([len(doc) for doc in token_list]))
articles_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
print('Shape of article tensor:', articles_pad.shape)

Found 54770 unique tokens.
Shape of article tensor: (3468, 4901)


In [9]:
# Constructing the embedding matrix
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, size))

for word, i in word_index.items():
    if i > vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [10]:
# Import neccesary packages for NN model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import LSTM
from keras.initializers import Constant

In [11]:
# define Untrainable model
model = Sequential()
embedding_layer = Embedding(vocab_size,
                           size,
                           embeddings_initializer=Constant(embedding_matrix),
                           mask_zero = True,
                           input_length=None,
                           trainable=False)
# Add embedding layer
model.add(embedding_layer)

# Add a LSTM layer with 50 internal units.
model.add(LSTM(50, return_sequences=True, input_shape=(100,12), dropout = 0.2))
model.add(LSTM(50, return_sequences=True, input_shape=(100,12), dropout = 0.2))
model.add(LSTM(50, dropout = 0.2))
# Add a Dense layer with 12 units.
model.add(Dense(12))
# Add compiler with XXX
model.compile(optimizer = 'adam', loss = 'mean_squared_error')
# Print summary of model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         5477100   
_________________________________________________________________
lstm (LSTM)                  (None, None, 50)          30200     
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 50)          20200     
_________________________________________________________________
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 12)                612       
Total params: 5,548,312
Trainable params: 71,212
Non-trainable params: 5,477,100
_________________________________________________________________
None


In [12]:
# Scale our Y
from sklearn.preprocessing import StandardScaler

y = df.iloc[:,1:13]
y = y - 1

scaler = StandardScaler()
scaler.fit(y)
y = scaler.transform(y)
y = pd.DataFrame(y)

In [13]:
def walk_forward_validation(model, epochs, x, y, step_size, train_steps, val_window):

    """
    This function takes a model, specifically our neural net with multiple
    LSTM-layers, the desired number of epochs, x data, y data, desired step
    size, desired number of steps that should be trained per round, and the
    desired validation window. It then trains a model through a Walk Forward
    Validation method that stores MSE-scores for both training and validation
    steps over time. It returns our backtesting predicted y, our trained y's
    and our MSE-scores.
    """

    # First, we import the required packages. We only have one dependency which
    # we uses to calculate the mean squared error each period
    from sklearn.metrics import mean_squared_error

    # Now we instantiate a couple of things. First we define how many records
    # that we have - We use this to loop through our data. Then we define the
    # initial training size, which gives us the point in time where we
    # should start over test. Then we instantiate three empty lists that we
    # later will use to store our results.
    n_records = len(x)
    n_init_train = step_size * train_steps
    train_pred = []
    val_pred = []
    mse_scores = []

    # This for loop goes from the starting point in time (as defined above)
    # to the end of our data and step through the data, enabling us to make the
    # walk forward validation. Our current point in time, i, will jump by the
    # step size each iteration.
    for i in range(n_init_train, n_records, step_size):

        # We know that the starting point for the training data, must be the
        # current point in time minus the training period.
        train_from = i-n_init_train

        # We need to train it to the current point in time.
        train_to = i

        # We then need to validate starting from tomorrow relative to
        # the current point in time.
        test_from = i+1
        # And validate the desired window in the future relative to the
        # point in time
        test_to = i+val_window

        # Now we can split our data at this point in time
        x_train, x_test = x[train_from:train_to], x[test_from:test_to]
        y_train, y_test = y[train_from:train_to], y[test_from:test_to]

        # And then use the data to train the model
        print(f'Train from {i-n_init_train} to {i} and validate for {i+1} to {i+val_window}')
        model.fit(x_train, y_train, epochs=epochs, verbose=1)

        # Here, we can store the training phase's historical predictions of seen y.
        y_train_pred = model.predict(x_train)
        for y_train_day in y_train_pred:
            train_pred.append(y_train_day.tolist())

        # Here, we store the validation phase's future predictions of unseen y.
        y_pred = model.predict(x_test)
        for y_test_day in y_pred:
            val_pred.append(y_test_day.tolist())

        # Here, we calculate MSE for both and append it to our MSE-scores list.
        train_mse = mean_squared_error(y_train,y_train_pred)
        val_mse = mean_squared_error(y_test,y_pred)
        mse_scores.append([train_mse, val_mse])

        print(f'     train: {train_mse} \nvalidation: {val_mse} \n')

    # Lastly, we return the training predictions, the actual validation
    # predictions as well as the observed MSE-scores.
    return train_pred, val_pred, mse_scores

In [17]:
# Epoch Hyper parameter tuning
epoch_tuning_performance = []
for epoch in range(1,11):
    train_pred, val_pred, validation_metrics = walk_forward_validation(model = model, epochs = epoch, x = pd.DataFrame(articles_pad[970:]), y = y[970:], step_size = 60, train_steps = 3, val_window = 60)
    validation_metrics_total = pd.DataFrame(validation_metrics).mean(axis=0)
    mean_train_mse = validation_metrics_total.iloc[0]
    mean_val_mse = validation_metrics_total.iloc[1]
    epoch_tuning_performance.append([mean_train_mse, mean_val_mse])

Train from 0 to 180 and validate for 181 to 240
     train: 1.0366643882529083 
validation: 1.0932809439798452 

Train from 60 to 240 and validate for 241 to 300
     train: 1.2221448387646097 
validation: 0.2751915217980937 

Train from 120 to 300 and validate for 301 to 360
     train: 1.1164253230069383 
validation: 0.6169644231006765 

Train from 180 to 360 and validate for 361 to 420

KeyboardInterrupt: ignored

In [18]:
# Using plotly.express to display epoch tuning
import plotly.express as px
import pandas as pd

df = pd.DataFrame(epoch_tuning_performance)
fig = px.line(df, x='epochs', y=["train","validation"], color_discrete_sequence=['cornflowerblue', 'indigo'])
fig.update_xaxes(title_text='Epochs', showgrid=False)
fig.update_yaxes(title_text='MSE')
fig.show()

ValueError: ignored

In [14]:
# Train model and get MSE & Predicted values
train_pred, val_pred, validation_metrics = walk_forward_validation(model = model, epochs = 7, x = pd.DataFrame(articles_pad[970:]), y = y[970:], step_size = 60, train_steps = 3, val_window = 60)
validation_metrics_total = pd.DataFrame(validation_metrics).mean(axis=0)
mean_train_mse = validation_metrics_total.iloc[0]
mean_val_mse = validation_metrics_total.iloc[1]

Train from 0 to 180 and validate for 181 to 240
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.6942882596549129 
validation: 1.3387487139963603 

Train from 60 to 240 and validate for 241 to 300
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.432780366696452 
validation: 0.4921329955089613 

Train from 120 to 300 and validate for 301 to 360
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.29094088271567325 
validation: 1.2590004770169247 

Train from 180 to 360 and validate for 361 to 420
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.2189237224841379 
validation: 0.6691865915182179 

Train from 240 to 420 and validate for 421 to 480
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.1777991811341131 
validation: 0.6406233376535638 

Train from 300 to 480 and validate for 481 to 540
Epoch 1/7
Epoch 2/7
Epoch 3/

In [15]:
pd.DataFrame(scaler.inverse_transform(train_pred)+1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.002378,1.002619,1.002351,1.003180,1.004984,1.001935,1.003995,1.003169,0.999011,1.002974,1.004817,1.002959
1,1.008032,1.006587,1.008192,1.010795,1.011593,1.011804,1.011275,1.010059,1.009192,1.006125,1.008965,1.014159
2,1.009558,1.010869,1.020395,1.005061,1.007466,1.005868,1.009217,1.007681,1.010284,1.005065,1.005884,1.010713
3,1.008305,1.010248,1.016456,1.004312,1.009305,1.008463,1.010186,1.005562,1.011795,1.006100,1.008183,1.014799
4,0.996029,0.997010,1.006602,0.980299,0.996965,0.995938,0.998330,1.004756,1.002105,0.997065,1.001291,0.998593
...,...,...,...,...,...,...,...,...,...,...,...,...
7015,1.003488,1.007053,0.992608,0.997444,1.000962,1.001356,0.996913,1.007907,1.004697,1.006221,1.002968,1.001685
7016,0.973314,0.977563,0.951799,0.958609,0.981944,0.964594,0.977606,0.969724,0.967360,0.980519,0.972557,0.961496
7017,0.946851,0.957664,0.911454,0.928922,0.960423,0.933270,0.953728,0.948397,0.936850,0.968745,0.958439,0.951526
7018,0.960045,0.966446,0.935860,0.944467,0.963984,0.949808,0.960699,0.964137,0.953775,0.974391,0.959989,0.948994


In [16]:
pd.DataFrame(scaler.inverse_transform(val_pred)+1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.982655,0.980964,0.964470,0.968158,0.981726,0.970010,0.977613,0.988501,0.967699,0.990554,0.984038,0.963558
1,1.006547,1.005892,1.005766,1.003910,1.006647,1.006164,1.006361,1.005348,1.005193,1.005213,1.005562,1.010853
2,0.996456,0.996402,0.994518,0.990775,0.999925,0.993281,0.997079,1.000049,0.993463,0.999046,1.000784,0.995080
3,1.011838,1.017436,1.039585,1.006729,1.008846,1.019985,1.023047,1.017100,1.026120,1.007812,1.006350,1.024439
4,1.006947,1.008611,1.021895,0.999836,1.006143,1.008186,1.009583,1.006771,1.013405,1.003679,1.007898,1.011972
...,...,...,...,...,...,...,...,...,...,...,...,...
2274,1.003678,1.002942,0.987572,0.999635,0.999745,1.001557,0.995198,1.011229,1.002504,1.003168,1.000657,1.003118
2275,1.008835,1.014644,1.012891,1.010606,1.009232,1.011363,1.007803,1.004850,1.014110,1.005812,1.010313,1.005826
2276,1.019246,1.019723,1.029361,1.025424,1.016335,1.033427,1.018821,1.024115,1.035583,1.010959,1.021961,1.023622
2277,1.011083,1.016498,1.014141,1.004401,1.005415,1.012908,1.012813,1.009868,1.012058,1.005009,1.006567,1.010932


In [17]:
pd.DataFrame(validation_metrics)

Unnamed: 0,0,1
0,0.694288,1.338749
1,0.43278,0.492133
2,0.290941,1.259
3,0.218924,0.669187
4,0.177799,0.640623
5,0.175398,0.46033
6,0.14106,0.558835
7,0.163898,0.621087
8,0.189551,0.537819
9,0.191965,0.634416


In [18]:
# define Trainable model
model = Sequential()
embedding_layer = Embedding(vocab_size,
                           size,
                           embeddings_initializer=Constant(embedding_matrix),
                           mask_zero = True,
                           input_length=None,
                           trainable=True)
# Add embedding layer
model.add(embedding_layer)

# Add a LSTM layer with 50 internal units.
model.add(LSTM(50, return_sequences=True, input_shape=(100,12), dropout = 0.2))
model.add(LSTM(50, return_sequences=True, input_shape=(100,12), dropout = 0.2))
model.add(LSTM(50, dropout = 0.2))
# Add a Dense layer with 12 units.
model.add(Dense(12))
# Add compiler with XXX
model.compile(optimizer = 'adam', loss = 'mean_squared_error')
# Print summary of model
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         5477100   
_________________________________________________________________
lstm_3 (LSTM)                (None, None, 50)          30200     
_________________________________________________________________
lstm_4 (LSTM)                (None, None, 50)          20200     
_________________________________________________________________
lstm_5 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_1 (Dense)              (None, 12)                612       
Total params: 5,548,312
Trainable params: 5,548,312
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
# Train model and get MSE & Predicted values
train_pred_trained, val_pred_trained, validation_metrics_trained = walk_forward_validation(model = model, epochs = 7, x = pd.DataFrame(articles_pad[970:]), y = y[970:], step_size = 60, train_steps = 3, val_window = 60)
validation_metrics_trained_total = pd.DataFrame(validation_metrics_trained).mean(axis=0)
mean_train_trained_mse = validation_metrics_trained_total.iloc[0]
mean_val_trained_mse = validation_metrics_trained_total.iloc[1]

Train from 0 to 180 and validate for 181 to 240
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.6890397686291699 
validation: 1.4540047475432443 

Train from 60 to 240 and validate for 241 to 300
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.3190679527712455 
validation: 1.071912761662376 

Train from 120 to 300 and validate for 301 to 360
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.24110993966007563 
validation: 1.1555547810982063 

Train from 180 to 360 and validate for 361 to 420
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.17761496277705402 
validation: 0.5753965595960432 

Train from 240 to 420 and validate for 421 to 480
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
     train: 0.145711178903783 
validation: 0.6411284320881154 

Train from 300 to 480 and validate for 481 to 540
Epoch 1/7
Epoch 2/7
Epoch 3/

In [20]:
pd.DataFrame(scaler.inverse_transform(train_pred_trained)+1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1.008906,1.008663,1.016516,1.006302,1.006992,1.009500,1.010035,1.004776,1.010345,1.004994,1.005722,1.011072
1,1.015726,1.012708,1.030993,1.019245,1.013907,1.024698,1.017327,1.015534,1.018949,1.010103,1.013376,1.026368
2,1.010427,1.011111,1.015287,1.008230,1.006276,1.013577,1.010613,1.008247,1.009113,1.007270,1.009284,1.013095
3,1.003794,1.001518,1.007417,0.998763,1.003612,1.001927,1.003831,1.001268,1.002545,1.003609,1.003189,1.004831
4,0.988154,0.989676,0.986491,0.984382,0.991009,0.979820,0.987943,0.997553,0.979347,0.997385,0.996468,0.991854
...,...,...,...,...,...,...,...,...,...,...,...,...
7015,1.002259,1.007308,0.982462,0.989313,1.008395,0.998179,1.002035,1.007547,1.000570,1.009927,1.004589,1.001759
7016,0.990869,0.998825,0.971029,0.975947,0.987920,0.980295,0.993379,0.986257,0.985383,0.992853,0.990931,0.980423
7017,0.984941,0.992160,0.953221,0.963255,0.989227,0.972298,0.988260,0.993118,0.971921,0.994564,0.990175,0.988682
7018,0.960933,0.962267,0.913165,0.941112,0.969350,0.938529,0.960103,0.967008,0.946122,0.978518,0.960568,0.956065


In [21]:
pd.DataFrame(scaler.inverse_transform(val_pred_trained)+1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.980340,0.978696,0.977930,0.964980,0.981613,0.966929,0.976902,0.984380,0.978468,0.987711,0.983910,0.976516
1,1.007134,1.005375,1.012762,1.010246,1.005893,1.008425,1.007045,1.011490,1.003330,1.009841,1.010821,1.014881
2,0.985539,0.989130,0.980321,0.978383,0.989299,0.978776,0.986874,0.996835,0.982431,0.995107,0.993456,0.982913
3,1.005230,1.012054,1.013859,1.009503,1.006733,1.017796,1.013596,1.020197,1.008158,1.005533,1.009081,1.016441
4,0.995514,0.994851,0.993424,0.989105,0.995862,0.991546,0.994622,1.002550,0.990351,1.001488,0.999153,0.993511
...,...,...,...,...,...,...,...,...,...,...,...,...
2274,1.008601,1.014552,0.994254,1.001706,1.013179,1.004730,1.008902,1.008404,1.007071,1.013157,1.003748,1.008474
2275,0.979647,0.987842,0.955669,0.963329,0.982458,0.970807,0.980951,0.975655,0.967841,0.989195,0.982011,0.967332
2276,1.006228,1.013326,0.990102,0.993171,1.005824,0.999496,1.006580,1.008291,1.006303,1.010767,1.007494,1.001519
2277,1.020187,1.022118,1.005721,1.018025,1.017046,1.018635,1.019288,1.015648,1.019752,1.013630,1.017734,1.016327


In [22]:
pd.DataFrame(validation_metrics_trained)

Unnamed: 0,0,1
0,0.68904,1.454005
1,0.319068,1.071913
2,0.24111,1.155555
3,0.177615,0.575397
4,0.145711,0.641128
5,0.135061,0.380855
6,0.111592,0.575036
7,0.134223,0.498876
8,0.138715,0.412231
9,0.112647,0.521387


In [23]:
import plotly.express as px
# import untrainable validation metrics
#un_val = pd.read_csv('validation_metrics_trainable_false.csv', names=['quarters', 'train_un', 'validation_un'], header=0)
un_val = pd.DataFrame(validation_metrics)
un_val.columns = ['train_un', 'validation_un']
# import trainable validation metrics
#tr_val = pd.read_csv('validation_metrics_train.csv', names=['train', 'validation'], header=0, usecols=[1,2])
tr_val = pd.DataFrame(validation_metrics_trained)
tr_val.columns = ['train', 'validation']

# The two dataframes are merged in order to be able to plot training and validation together
val_metrics = pd.DataFrame(pd.concat([un_val, tr_val], axis=1))
val_metrics = val_metrics.reset_index(drop=False)

# Plot untrained and trained validation sets

fig4 = px.line(val_metrics, x='index', y=['validation_un', 'validation'], color_discrete_sequence=['cornflowerblue', 'indigo'])
fig4.update_xaxes(title_text='Quarters', showgrid=False)
fig4.update_yaxes(title_text='MSE')
fig4.show()

In [24]:
val_metrics

Unnamed: 0,index,train_un,validation_un,train,validation
0,0,0.694288,1.338749,0.68904,1.454005
1,1,0.43278,0.492133,0.319068,1.071913
2,2,0.290941,1.259,0.24111,1.155555
3,3,0.218924,0.669187,0.177615,0.575397
4,4,0.177799,0.640623,0.145711,0.641128
5,5,0.175398,0.46033,0.135061,0.380855
6,6,0.14106,0.558835,0.111592,0.575036
7,7,0.163898,0.621087,0.134223,0.498876
8,8,0.189551,0.537819,0.138715,0.412231
9,9,0.191965,0.634416,0.112647,0.521387
