# 1. Scraping

## 1.1 Crawling CNBC to get the article links

In [None]:
def get_links_to_scrape(start_year, end_year):
    """Collect article URLs from CNBC's daily site-map pages.

    Requests one site-map page per calendar day, from January of
    ``start_year`` through December of ``end_year`` (both inclusive), and
    gathers the ``href`` of every ``<a class="SiteMapArticleList-link">``
    element found on those pages.

    Args:
        start_year: First year to crawl.
        end_year: Last year to crawl (inclusive).

    Returns:
        A list of ``href`` strings in crawl order; duplicates are kept.

    Raises:
        requests.RequestException: If a request fails or times out.
    """
    import calendar

    import requests
    from bs4 import BeautifulSoup, SoupStrainer

    # Month names are spelled out because they are part of the URL and must
    # not depend on the process locale.
    months = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December']

    strainer = SoupStrainer('a', class_='SiteMapArticleList-link')
    article_links = []

    # A single session reuses the connection across the many requests.
    with requests.Session() as session:
        for year in range(start_year, end_year + 1):
            for month_number, month in enumerate(months, start=1):
                # Only request days that exist in this month and year.
                days_in_month = calendar.monthrange(year, month_number)[1]
                for day in range(1, days_in_month + 1):
                    link = f'https://www.cnbc.com/site-map/{year}/{month}/{day}/'
                    html = session.get(link, timeout=30).text

                    # Skip the document head when a body marker exists; a
                    # missing marker must not truncate the page.
                    body_start = html.find('body')
                    if body_start != -1:
                        html = html[body_start:]

                    # An explicit parser keeps results independent of which
                    # optional parsers happen to be installed.
                    soup = BeautifulSoup(html, 'html.parser', parse_only=strainer)
                    for anchor in soup.find_all('a', class_='SiteMapArticleList-link'):
                        href = anchor.get('href')
                        if href is not None:
                            article_links.append(href)

    return article_links

In [None]:
# Crawl the CNBC site map for the years 2006 through 2020 (inclusive) and keep
# every article link that was found.
article_links = get_links_to_scrape(2006, 2020)

## 1.2 Scrape articles from link list

In [None]:
def scrape_cnbc_articles(list_of_links):
    """Download CNBC articles and extract title, topic, date and body text.

    Each link is fetched in turn. A link is skipped (and reported as
    ``Skipped``) when the request fails, the response status is not 200, or
    the page lacks the headline, body or topic element.

    Args:
        list_of_links: Article URLs. The date is sliced out of fixed character
            positions of the URL, which assumes the form
            ``https://www.cnbc.com/YYYY/MM/DD/...`` - TODO confirm that every
            link follows this layout.

    Returns:
        A ``pandas.DataFrame`` with one row per scraped article and five
        unnamed columns, in order: title, topic, date (``DD-MM-YYYY``),
        article text, link.
    """
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd

    rows = []
    total = len(list_of_links)

    with requests.Session() as session:
        for index, link in enumerate(list_of_links, start=1):
            try:
                response = session.get(link, timeout=30)
            except requests.RequestException:
                # One unreachable article must not abort the whole scrape.
                print(f'({index}/{total}) : Skipped')
                continue

            if response.status_code != 200:
                print(f'({index}/{total}) : Skipped')
                continue

            # Decode the payload instead of taking str() of the bytes object,
            # which would store the bytes repr (b'...' with \x.. and \n
            # escape sequences) as the article text.
            html = response.content.decode('utf-8', errors='replace')
            content_start = html.find('<div id="MainContent"')
            if content_start != -1:
                html = html[content_start:]

            soup_link = BeautifulSoup(html, 'html.parser')
            headline = soup_link.find('h1', class_='ArticleHeader-headline')
            body = soup_link.find('div', class_='ArticleBody-articleBody')
            eyebrow = soup_link.find('a', class_='ArticleHeader-eyebrow')

            if headline is None or body is None or eyebrow is None:
                print(f'({index}/{total}) : Skipped')
                continue

            date = f'{link[29:31]}-{link[26:28]}-{link[21:25]}'
            rows.append([headline.get_text(), eyebrow.get_text(), date, body.get_text(), link])
            print(f'({index}/{total}) : {link}')

    return pd.DataFrame(rows)

In [None]:
# Scrape every collected link; the result has one row per successfully parsed article.
df = scrape_cnbc_articles(article_links)

## 1.3 Combining the datasets, if the scrape was run in several batches

In [None]:
def collect_dataset(datasets_path):
    """Combine several scraped CSV files into one DataFrame.

    Only entries whose name contains ``.csv`` are read. From each file the
    last five columns are kept and renamed to ``Title``, ``Topic``, ``Date``,
    ``Content`` and ``Link``. Malformed lines are skipped.

    Args:
        datasets_path: Iterable of file names/paths (for example the output
            of ``os.listdir()``).

    Returns:
        The concatenation of all files, in input order, with the original
        per-file row labels. An empty DataFrame with the five columns is
        returned when no CSV file is present.
    """
    import pandas as pd

    column_names = ['Title', 'Topic', 'Date', 'Content', 'Link']

    frames = []
    for dataset in datasets_path:
        if '.csv' not in dataset:
            continue
        try:
            # pandas >= 1.3 spelling of "skip malformed lines".
            frame = pd.read_csv(dataset, on_bad_lines='skip', index_col=False)
        except TypeError:
            # Older pandas only knows the legacy keyword.
            frame = pd.read_csv(dataset, error_bad_lines=False, index_col=False)
        frame = frame[frame.columns[-5:]]
        frame.columns = column_names
        frames.append(frame)

    if not frames:
        # pd.concat raises on an empty list; return a well-formed empty frame.
        return pd.DataFrame(columns=column_names)

    return pd.concat(frames)

In [None]:
import os
# Combine every CSV in the current working directory.
# NOTE(review): any other CSV in this directory is picked up as well - confirm
# that the directory only holds scrape outputs when this runs.
df = collect_dataset(os.listdir())

# 2. Preprocessing & Cleaning

In [None]:
# Force both free-text columns to plain strings so that later text processing
# never receives a non-string value.
for text_column in ('Content', 'Title'):
    df[text_column] = df[text_column].astype('str')

In [None]:
def remove_clutter(text):
    """Strip scraping artefacts and light punctuation from an article string.

    The substitutions run in a fixed order:
      1. literal ``\\xNN``-style escape sequences are removed,
      2. ``VIDEO<d>:<dddd>:<dd>`` markers become a single space,
      3. the phrase ``Getty Images`` is removed,
      4. commas are removed,
      5. an apostrophe directly after ``n`` is dropped (``n'`` -> ``n``),
      6. each pair of spaces becomes one space (a single, non-repeating pass).

    Args:
        text: The raw article text.

    Returns:
        The cleaned text.
    """
    import re

    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'\\x[A-Za-z0-9_]{2}', ''),
        (r'VIDEO([0-9]|[0-9]{2}):[0-9]{4}:[0-9]{2}', ' '),
        (r'Getty Images', ''),
        (r',', ''),
        (r"n'", 'n'),
        (r'  ', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned

In [None]:
def cleaning(df, column):
    """Tokenise and lemmatise a text column with spaCy.

    Every text is first passed through ``remove_clutter`` and then through the
    ``en_core_web_sm`` pipeline (parser disabled). A token is kept when it is
    alphabetic, not a stop word, longer than one character, tagged as a noun,
    proper noun, adjective, adverb or verb, and not part of a person, money,
    percent, location, date, time, quantity or ordinal entity. Kept tokens
    are stored as lower-cased lemmas.

    Args:
        df: DataFrame holding the texts. It is modified in place.
        column: Name of the column in ``df`` that contains the texts.

    Returns:
        The same ``df`` with two added columns: ``tokens`` (list of lemmas per
        row) and ``clean_articles`` (the lemmas joined by single spaces).
    """
    import spacy

    nlp = spacy.load('en_core_web_sm')

    kept_pos = {'NOUN', 'PROPN', 'ADJ', 'ADV', 'VERB'}
    dropped_entities = {'PERSON', 'MONEY', 'PERCENT', 'LOC', 'DATE', 'TIME', 'QUANTITY', 'ORDINAL'}

    # Keep the cleaned texts: apply() returns a new Series and leaves the
    # column itself untouched.
    texts = df[column].apply(remove_clutter)
    total = len(texts)

    tokens = []
    for index, article in enumerate(nlp.pipe(texts, disable=['parser']), start=1):
        tokens.append([
            token.lemma_.lower()
            for token in article
            if token.is_alpha
            and not token.is_stop
            and token.pos_ in kept_pos
            and token.ent_type_ not in dropped_entities
            and len(token) > 1
        ])
        print(f'Processed {index}/{total}')

    df['tokens'] = tokens
    df['clean_articles'] = df['tokens'].map(" ".join)

    return df

In [None]:
# Tokenise the article bodies; adds the 'tokens' and 'clean_articles' columns.
df = cleaning(df, 'Content')

# 3. Attempting to classify topics

In [None]:
# pandas is only imported inside the functions above, so bring it into module
# scope for this and the following cells.
import pandas as pd

# Write the distinct raw topic labels to disk; final_topic_list.csv is then
# built by hand from this file.
unique_topics = pd.DataFrame(df.Topic.unique())
unique_topics.to_csv('unique_topics.csv')

In [None]:
# Load the hand-made topic mapping (semicolon separated) and drop every row
# that has a missing value.
topic_list = pd.read_csv('final_topic_list.csv', sep=";")
topic_list_clean = topic_list.dropna()

In [None]:
# Map each article's raw topic to its predetermined topic; topics that are
# missing from the mapping become "Other".
# The lookup table is built once, so each article costs one dictionary access
# instead of a scan over the whole mapping.
topic_to_label = {}
for raw_topic, label in zip(topic_list_clean["Topic"], topic_list_clean["Predetermined topic"]):
    # setdefault keeps the first row when a topic is listed more than once.
    topic_to_label.setdefault(raw_topic, label)

predetermined = []
index = 0
for index, topic in enumerate(df["Topic"], start=1):
    predetermined.append(topic_to_label.get(topic, "Other"))
    print(f'{index}')
df['final_topic'] = predetermined

In [None]:
# Articles with a mapped topic form the labelled set; everything tagged
# "Other" is what the classifier would have to predict.
is_other = df['final_topic'] == 'Other'
df_labelled = df[~is_other]
df_predict = df[is_other]

In [None]:
# Split the labelled articles into train and test sets, then fit an LSA model
# (TF-IDF followed by a 50-topic LSI projection) on the training tokens.
from sklearn.model_selection import train_test_split

r_state = 123

x_train, x_test, y_train, y_test = train_test_split(
    df_labelled['tokens'], df_labelled['final_topic'], test_size=0.25, random_state=r_state)

from gensim import models, corpora
import ast

# The token column holds real lists when it was produced in this session and
# their string representation when it was read back from a CSV file; only the
# latter needs parsing.
data_processed = [
    line if isinstance(line, list) else ast.literal_eval(line)
    for line in x_train.to_numpy()
]

dictionary = corpora.Dictionary(data_processed)
corpus = [dictionary.doc2bow(line) for line in data_processed]
tfidf = models.TfidfModel(corpus, smartirs='ntc')
lsa_model = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=50)

In [None]:
# Evaluate XGBoost and conclude that it fails...
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

# NOTE(review): x_train / x_test are the raw 'tokens' column from the split
# above, not the LSA features built from it - confirm whether the classifier
# was meant to be trained on the lsa_model projection instead.
xgb_model = XGBClassifier(random_state=r_state)
xgb_model.fit(x_train, y_train)
xgb_prediction = xgb_model.predict(x_test)
print(classification_report(y_test, xgb_prediction))

## 3.1 Filtering articles instead

In [None]:
# Keep only the articles whose topic was mapped to a predetermined topic.
df = df[df['final_topic'] != 'Other']

# 4. Collecting datasets

## 4.1 Collecting articles

In [None]:
def rolling_articles(start_date, end_date, df, start_range, end_range):
    """Concatenate the article tokens that fall in a window around each date.

    For every calendar day from ``start_date`` to ``end_date`` (inclusive) the
    articles dated between ``day + start_range`` and ``day + end_range``
    (inclusive) are selected, capped at ``30 * (1 + end_range - start_range)``
    rows, and their tokens are joined into one space-separated string in which
    every article is followed by a single space.

    Args:
        start_date: First ``datetime.date`` to produce a row for.
        end_date: Last ``datetime.date`` to produce a row for.
        df: DataFrame with a ``Date`` column (day-first strings such as
            ``DD-MM-YYYY``, or date objects) and a ``tokens`` column (token
            lists or their string representation). It is not modified.
        start_range: Offset in days of the window start relative to each date.
        end_range: Offset in days of the window end relative to each date.

    Returns:
        A list of ``[date, text]`` pairs, one per calendar day; ``text`` is an
        empty string when no article falls in the window.
    """
    from datetime import timedelta

    import pandas as pd

    n_days = (end_date - start_date).days + 1
    date_list = [start_date + timedelta(days=offset) for offset in range(n_days)]

    # The scraper stores dates day-first, so they must be parsed that way;
    # the default month-first guess would swap day and month for any day <= 12.
    article_dates = pd.to_datetime(df['Date'], dayfirst=True).dt.date

    # Removes the list punctuation from "['a', 'b']" so that only "a b" is left.
    strip_list_syntax = str.maketrans('', '', "[],'")
    row_cap = 30 * (1 + end_range - start_range)

    date_index = []
    count = 0

    for date in date_list:
        window_start = date + timedelta(days=start_range)
        window_end = date + timedelta(days=end_range)
        in_window = (article_dates <= window_end) & (article_dates >= window_start)
        # A positional mask avoids label alignment on a non-unique index.
        articles = df[in_window.to_numpy()].head(row_cap)
        count += len(articles)

        processed = "".join(
            str(article).translate(strip_list_syntax) + " "
            for article in articles['tokens']
        )

        date_index.append([date, processed])
        print(f'{date}: {count}')

    return date_index

In [None]:
from datetime import date

# First and last calendar day (both inclusive) to build article text for.
sdate = date(2006, 11, 27)
edate = date(2020, 11, 30)

# Offsets 0 and 0: each date only receives the articles dated on that same day.
x = rolling_articles(sdate, edate, df, 0, 0)

## 4.2 Calculating returns

In [None]:
# Load the price table; calculate_returns reads a 'Dates' column from it and
# treats the first column as the date and the remaining columns as prices.
bb = pd.read_csv('bb_prices.csv')

In [None]:
def calculate_returns(prices, interval):
    """Compute gross returns over a fixed number of rows.

    For every row ``i`` that has a partner row ``i + interval`` inside the
    table, the result holds ``price[i + interval] / price[i]`` for every price
    column. Rows without such a partner are left out.

    Args:
        prices: DataFrame with a ``Dates`` column. The first column is used as
            the date and all remaining columns as prices. It is not modified.
        interval: Row offset between the two prices. It counts rows, not
            calendar days; a negative value compares with an earlier row.

    Returns:
        A DataFrame with the same columns as ``prices``: the date in the first
        column and the price ratios in the others.
    """
    import pandas as pd

    # Work on a copy so the caller's 'Dates' column keeps its original values.
    prices = prices.copy()
    prices['Dates'] = pd.to_datetime(prices['Dates']).dt.date

    values = prices.iloc[:, 1:]
    n_rows = len(prices)
    # Only rows whose partner row exists; this also stops a negative interval
    # from wrapping around to the end of the table.
    first_row = max(0, -interval)
    last_row = min(n_rows, n_rows - interval)

    date_index = []
    for i in range(first_row, last_row):
        date = prices.iloc[i, 0]
        try:
            return_at_date = list(values.iloc[i + interval] / values.iloc[i])
        except (TypeError, ZeroDivisionError):
            # Best effort: a row with non-numeric or unusable prices is skipped.
            continue
        date_index.append([date, *return_at_date])
        print(f'{date}')

    return pd.DataFrame(date_index, columns=prices.columns)

In [None]:
# Gross return per price column, measured against the price 7 rows further down the table.
y = calculate_returns(bb, 7)

## 4.3 Collect dataset

In [None]:
import pandas as pd

# rolling_articles returns a plain list of [date, text] pairs, so it has to be
# turned into a frame before it can be indexed by date.
if not isinstance(x, pd.DataFrame):
    x = pd.DataFrame(x, columns=['Date', 'tokens'])
x = x.set_index('Date')
y = y.set_index('Dates')

# Align the article text with the return dates and drop dates missing on either
# side. reset_index turns 'Dates' back into the first column, which is what the
# modelling cells read (df['Dates'] and the positional slice of return columns).
df = pd.concat([y, x.reindex(y.index)], axis=1).dropna().reset_index()

# 5. Neural Network Modelling

In [None]:
# Make sure 'Dates' holds plain date objects so it can be compared with
# datetime.date values, then preview the first rows.
df['Dates'] = pd.to_datetime(df['Dates']).dt.date
df.head()

In [None]:
# One token list per day, restricted to days up to 2010-12-31: the Word2Vec
# model is trained on the data prior to 2011 only.
from datetime import date

training_cutoff = date(2010, 12, 31)
training_rows = df[df['Dates'] <= training_cutoff]
# split() without an argument drops the empty token that split(" ") produces
# when a day's string ends with a separator space.
token_list = [tokens.split() for tokens in training_rows['tokens']]
# Dimensionality of the word vectors.
size = 100

In [None]:
# Training gensim word2vec
import inspect

import gensim

# gensim 4 renamed the ``size`` argument to ``vector_size``; pass whichever
# name the installed version accepts.
w2v_parameters = inspect.signature(gensim.models.Word2Vec.__init__).parameters
size_keyword = 'vector_size' if 'vector_size' in w2v_parameters else 'size'

model = gensim.models.Word2Vec(
    sentences=token_list,
    window=5,
    workers=4,
    min_count=20,
    **{size_keyword: size},
)

# Vocab size (gensim 4 exposes the vocabulary as ``key_to_index`` instead of ``vocab``):
vocabulary = model.wv.key_to_index if hasattr(model.wv, 'key_to_index') else model.wv.vocab
words = list(vocabulary)
print('vocabulary size: %d' % len(words))

In [None]:
# Save the trained word vectors in the plain-text word2vec format
# (binary=False), so they can be read back line by line.
filename = 'article_embeddings.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [None]:
# Load the saved word vectors into a {word: vector} dictionary.
import os
import numpy as np

embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open(os.path.join('', 'article_embeddings.txt'), encoding='utf-8') as f:
    for line_number, line in enumerate(f):
        values = line.split()
        # The text format starts with a "<vocabulary size> <dimensions>"
        # header line, which is not a word vector.
        if line_number == 0 and len(values) == 2:
            continue
        if len(values) < 2:
            continue
        word = values[0]
        # Store numbers rather than the raw strings read from the file.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
from keras.preprocessing.text import Tokenizer
from datetime import date

# Build the word index from the pre-2011 token lists only, so that it matches
# the vocabulary the word vectors were trained on.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_list)

# Now convert the tokens of every day to integer sequences.
# split() without an argument avoids the empty token that split(" ") yields
# for a string ending in a separator space.
token_list = [tokens.split() for tokens in df['tokens']]
sequences = tokenizer.texts_to_sequences(token_list)

In [None]:
# Pad sequences to make the input length constant.
from keras.preprocessing.sequence import pad_sequences

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))


def Average(lst):
    """Return the arithmetic mean of a non-empty list of numbers."""
    return sum(lst) / len(lst)


# The common length is the mean document length: longer documents are
# truncated to it and shorter ones are zero-padded at the end.
max_length = int(Average([len(doc) for doc in token_list]))
articles_pad = pad_sequences(sequences, maxlen=max_length, padding='post')
print('Shape of article tensor:', articles_pad.shape)

In [None]:
# Construct the embedding matrix: row i holds the vector of the word with
# tokenizer index i. Row 0 and words without a trained vector stay all-zero.
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, size))

for word, i in word_index.items():
    # Valid rows are 0 .. vocab_size - 1, so an index equal to vocab_size is
    # already out of range.
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Explicit cast: the loaded vectors may still be stored as strings.
        embedding_matrix[i] = np.asarray(embedding_vector, dtype=float)

In [None]:
# Import the necessary packages for the NN model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import LSTM
from keras.initializers import Constant

In [None]:
# Define the model with frozen embeddings: the embedding layer starts from the
# pre-trained word vectors and is not updated during training.
embedding_layer = Embedding(vocab_size,
                            size,
                            embeddings_initializer=Constant(embedding_matrix),
                            mask_zero=True,
                            input_length=None,
                            trainable=False)

model = Sequential()
# Add embedding layer
model.add(embedding_layer)

# Three stacked LSTM layers with 50 internal units each. Only the layers that
# feed another LSTM return full sequences. No input_shape is given here: each
# layer takes its input shape from the layer before it.
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, dropout=0.2))
# Add a Dense layer with 12 units (one output per return column).
model.add(Dense(12))
# Compile with the Adam optimizer and mean squared error loss.
model.compile(optimizer='adam', loss='mean_squared_error')
# summary() prints the table itself; wrapping it in print() would add a stray "None".
model.summary()

In [None]:
# Scale our Y: take the 12 return columns (positions 1 to 12), turn the gross
# returns into net returns by subtracting 1, then standardise each column.
# NOTE(review): the scaler is fitted on all rows, including those later used
# for validation - confirm this is intended.
from sklearn.preprocessing import StandardScaler

net_returns = df.iloc[:, 1:13] - 1

scaler = StandardScaler()
y = pd.DataFrame(scaler.fit_transform(net_returns))

In [None]:
# Define walk_forward function for our model training
def walk_forward_validation(model, epochs, x, y, step_size, train_steps, val_window):
    """Train and validate ``model`` on a window that rolls forward through the data.

    The training window always spans ``step_size * train_steps`` records. At
    every step the model is fitted on that window, evaluated on the
    ``val_window`` records that directly follow it, and then the window moves
    forward by ``step_size`` records. The same ``model`` object is fitted at
    every step.

    Args:
        model: Object exposing ``fit(x, y, epochs=..., verbose=...)`` and
            ``predict(x)``.
        epochs: Number of epochs passed to every ``fit`` call.
        x: Row-sliceable inputs (``x[a:b]`` selects records ``a`` to ``b - 1``).
        y: Row-sliceable targets aligned with ``x``.
        step_size: Number of records the window advances per step.
        train_steps: Length of the training window, in multiples of ``step_size``.
        val_window: Number of records in each validation window; the last
            window may be shorter when the data runs out.

    Returns:
        A tuple ``(train_pred, val_pred, mse_scores)``: the predictions on
        every training window and on every validation window (as lists, in
        step order), and one ``[train_mse, validation_mse]`` pair per step.
    """
    from sklearn.metrics import mean_squared_error

    n_records = len(x)
    n_init_train = step_size * train_steps
    train_pred = []
    val_pred = []
    mse_scores = []

    for i in range(n_init_train, n_records, step_size):
        train_from, train_to = i - n_init_train, i
        # Slices exclude their end, so training covers records up to i - 1 and
        # validation has to start at i; starting at i + 1 would leave record i
        # out of both sets and shorten the window by one.
        test_from, test_to = i, i + val_window

        x_train, x_test = x[train_from:train_to], x[test_from:test_to]
        y_train, y_test = y[train_from:train_to], y[test_from:test_to]

        print(f'Train from {i-n_init_train} to {i} and validate for {i+1} to {i+val_window}')
        model.fit(x_train, y_train, epochs=epochs, verbose=1)

        y_train_pred = model.predict(x_train)
        train_pred.extend(row.tolist() for row in y_train_pred)

        y_pred = model.predict(x_test)
        val_pred.extend(row.tolist() for row in y_pred)

        train_mse = mean_squared_error(y_train, y_train_pred)
        val_mse = mean_squared_error(y_test, y_pred)
        mse_scores.append([train_mse, val_mse])

        print(f'     train: {train_mse} \nvalidation: {val_mse} \n')

    return train_pred, val_pred, mse_scores

In [None]:
# Epoch hyper-parameter tuning: run the walk-forward validation once for each
# epoch count from 1 to 10 and record the mean train / validation MSE.
# NOTE(review): the same `model` object is passed in on every pass and is not
# rebuilt in between - confirm whether a fresh model per setting was intended.
epoch_tuning_performance = []
for epoch in range(1, 11):
    train_pred, val_pred, validation_metrics = walk_forward_validation(
        model=model,
        epochs=epoch,
        x=pd.DataFrame(articles_pad[970:]),
        y=y[970:],
        step_size=60,
        train_steps=3,
        val_window=60,
    )
    validation_metrics_total = pd.DataFrame(validation_metrics).mean(axis=0)
    mean_train_mse, mean_val_mse = validation_metrics_total.iloc[0], validation_metrics_total.iloc[1]
    epoch_tuning_performance.append([mean_train_mse, mean_val_mse])

In [None]:
# Using plotly.express to display epoch tuning
import plotly.express as px
import pandas as pd

# Each entry of epoch_tuning_performance is a [train MSE, validation MSE] pair;
# entry k belongs to epoch count k + 1. The columns are named explicitly
# because the plot refers to them by name, and a separate variable is used so
# the dataset held in `df` is not overwritten.
epoch_tuning_df = pd.DataFrame(epoch_tuning_performance, columns=['train', 'validation'])
epoch_tuning_df.insert(0, 'epochs', range(1, len(epoch_tuning_df) + 1))

fig = px.line(epoch_tuning_df, x='epochs', y=["train","validation"], color_discrete_sequence=['cornflowerblue', 'indigo'])
fig.update_xaxes(title_text='Epochs', showgrid=False)
fig.update_yaxes(title_text='MSE')
fig.show()

In [None]:
# Train model and get MSE & predicted values.
# `epoch` is whatever value an earlier cell left in that variable.
train_pred, val_pred, validation_metrics = walk_forward_validation(
    model=model,
    epochs=epoch,
    x=pd.DataFrame(articles_pad[970:]),
    y=y[970:],
    step_size=60,
    train_steps=3,
    val_window=60,
)
# Column 0 holds the train MSE per step, column 1 the validation MSE.
validation_metrics_total = pd.DataFrame(validation_metrics).mean(axis=0)
mean_train_mse, mean_val_mse = validation_metrics_total.iloc[0], validation_metrics_total.iloc[1]

In [None]:
# Undo the standardisation and add 1 back, so the training-window predictions
# are shown on the same scale as the returns before they were scaled.
pd.DataFrame(scaler.inverse_transform(train_pred)+1)

In [None]:
# Undo the standardisation and add 1 back, so the validation-window
# predictions are shown on the same scale as the returns before they were scaled.
pd.DataFrame(scaler.inverse_transform(val_pred)+1)

In [None]:
# One row per walk-forward step: column 0 is the train MSE, column 1 the validation MSE.
pd.DataFrame(validation_metrics)

In [None]:
# Define the model with trainable embeddings: the embedding layer starts from
# the pre-trained word vectors and is updated during training.
embedding_layer = Embedding(vocab_size,
                            size,
                            embeddings_initializer=Constant(embedding_matrix),
                            mask_zero=True,
                            input_length=None,
                            trainable=True)

model = Sequential()
# Add embedding layer
model.add(embedding_layer)

# Three stacked LSTM layers with 50 internal units each. Only the layers that
# feed another LSTM return full sequences. No input_shape is given here: each
# layer takes its input shape from the layer before it.
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, return_sequences=True, dropout=0.2))
model.add(LSTM(50, dropout=0.2))
# Add a Dense layer with 12 units (one output per return column).
model.add(Dense(12))
# Compile with the Adam optimizer and mean squared error loss.
model.compile(optimizer='adam', loss='mean_squared_error')
# summary() prints the table itself; wrapping it in print() would add a stray "None".
model.summary()

In [None]:
# Train the model with trainable embeddings and get MSE & predicted values.
# `epoch` is whatever value an earlier cell left in that variable.
train_pred_trained, val_pred_trained, validation_metrics_trained = walk_forward_validation(
    model=model,
    epochs=epoch,
    x=pd.DataFrame(articles_pad[970:]),
    y=y[970:],
    step_size=60,
    train_steps=3,
    val_window=60,
)
# Column 0 holds the train MSE per step, column 1 the validation MSE.
validation_metrics_trained_total = pd.DataFrame(validation_metrics_trained).mean(axis=0)
mean_train_trained_mse, mean_val_trained_mse = (
    validation_metrics_trained_total.iloc[0],
    validation_metrics_trained_total.iloc[1],
)

In [None]:
# Training-window predictions of the trainable-embedding model, with the
# standardisation undone and 1 added back.
pd.DataFrame(scaler.inverse_transform(train_pred_trained)+1)

In [None]:
# Validation-window predictions of the trainable-embedding model, with the
# standardisation undone and 1 added back.
pd.DataFrame(scaler.inverse_transform(val_pred_trained)+1)

In [None]:
# One row per walk-forward step: column 0 is the train MSE, column 1 the validation MSE.
pd.DataFrame(validation_metrics_trained)