<a href="https://colab.research.google.com/github/KingYuanSun/Individual-Projects-NLP/blob/master/Sentiment_Analysis_StockTrendPrediction_Main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

BERT with Sentiment analysis¶
In this notebook I used a pretrained version of BERT avaliable as a huggingface transformed to classify the sentiment of news articles about Bitcoin and Tesla, and applied an LSTM to predict the stock returns

Sources
https://towardsdatascience.com/fine-tuning-bert-with-keras-and-tf-module-ed24ea91cff2

Bert for dummies: https://towardsdatascience.com/bert-for-dummies-step-by-step-tutorial-fb90890ffe03
Bert for long texts: https://medium.com/@armandj.olivares/using-bert-for-classifying-documents-with-long-texts-5c3e7b04573d
googles notebook: https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb
strong.io: https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b
strong.io notebook : https://github.com/strongio/keras-bert
https://keras.io/layers/writing-your-own-keras-layers/

In [289]:
!pip install utils
!pip install bert-for-tf2
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install nltk
!pip install yfinance
!pip install news-please
!pip install google
!pip install transformers



In [290]:
##utils.py

import pandas as pd, numpy as np
try:
	import bert
except:
	print("bert-for-tf2 not installed")

# transforms sentences to ids, masks and segment ids prepared to feed bert
def convert_sentence_to_features(sentence, tokenizer, max_seq_len):
    tokens = ['[CLS]']
    tokens.extend(tokenizer.tokenize(sentence))
    if len(tokens) > max_seq_len-1:
        tokens = tokens[:max_seq_len-1]
    tokens.append('[SEP]')
    
    segment_ids = [0] * len(tokens)
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)

    #Zero Mask till seq_length
    zero_mask = [0] * (max_seq_len-len(tokens))
    input_ids.extend(zero_mask)
    input_mask.extend(zero_mask)
    segment_ids.extend(zero_mask)
    
    return input_ids, input_mask, segment_ids

def convert_sentences_to_features(sentences, tokenizer, max_seq_len=200):
    all_input_ids = []
    all_input_mask = []
    all_segment_ids = []
    
    for sentence in sentences:
        input_ids, input_mask, segment_ids = convert_sentence_to_features(sentence, tokenizer, max_seq_len)
        all_input_ids.append(input_ids)
        all_input_mask.append(input_mask)
        all_segment_ids.append(segment_ids)
    
    return all_input_ids, all_input_mask, all_segment_ids

def generate_data_for_tokenizer(split_text,target_series):
    labels_list = []
    dates = []
    for date, arrays in split_text.itertuples():
        dates.extend([date]* len(arrays))
    for date in dates:
        labels_list.append(target_series.loc[date])
    
    split_text_flat = split_text.values.flatten()
    sentence_list = [sentence for array in split_text_flat for sentence in array]
    
    labels = pd.DataFrame(labels_list, index = dates)
    sentences  = pd.DataFrame(sentence_list, index = dates)
    return sentences, labels

# given an input text and a set of keywords, returns the top_n_terms which contain any of the keywords by frequency of appearance.
def find_new_token_with_custom_keywords(array_of_text, custom_keywords, top_n_terms, extra_tokens):
    
    def contains_keyword(word,keywords):
        for k in keywords:
            if word.find(k) >= 0:
                return True
        return False
    
    def count_frequency(my_list): 
        freq = {} 
        for item in my_list: 
            if (item in freq): 
                freq[item] += 1
            else: 
                freq[item] = 1
        return freq
    
    raw_text = "".join(array_of_text).replace(".com","-com").replace(".", "").replace(",", "").replace("\n", " ").replace("-com",".com")
    raw_words = raw_text.split(" ")
    matches = []
    for word in raw_words:
        if contains_keyword(word.lower(),custom_keywords):
            matches.append(word.lower())
    
    matches_count = count_frequency(matches)
    #sorts the counts
    #matches_dict = {k: v for k, v in sorted(matches_count.items(), key=lambda item: item[1], reverse = True)}
    # selects top n words from the list
    #new_tokens = list(matches_dict)[:top_n_terms]  + extra_tokens
    import operator
    sorted_x = sorted(matches_count.items(), key=operator.itemgetter(1), reverse = True)
    new_tokens = [ tup[0] for tup in sorted_x[:top_n_terms]]  + extra_tokens
    
    print("New tokens to be added: ",new_tokens)
    return new_tokens

# creates bert tokenizer
def create_tokenizer(vocab_file='vocab.txt', do_lower_case=True):
    return bert.bert_tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

# appends extra tokens to the vocab of the tokenizer
def add_new_tokens(new_vocab, tokenizer):
    for i in range(len(new_vocab)):
        new_key = new_vocab[i]
        old_key = "[unused{}]".format(i)
        value = tokenizer.vocab.pop(old_key)
        tokenizer.vocab[new_key] = value
    return tokenizer

# transforms bet output in one continuous series removing the padding
def bert_output_to_one_time_series_per_day(bert_inputs, bert_output, sentences):
    n_sentences = sentences.groupby(sentences.index).count()
    n_tokens = bert_inputs["input_mask"][:].sum(axis = 1)
    mask_out = [bert_output[1][counter,:length,:] for length, counter in zip(n_tokens,range(len(n_tokens)))]
    
    articles_per_day = []
    acc = 0
    for n in n_sentences.values:
        n = n[0]
        concat_articles = np.array(mask_out[acc:acc + n])
        flattened = []
        for sentence in concat_articles:
            for token in sentence:
                flattened.append(token)
        flattened = np.array(flattened)
        #flattened = np.array([token for token in sentence for sentence in concat_articles])
        articles_per_day.append(flattened)
        acc += n
    return np.array(articles_per_day)

# prepares_labels
def label_transformer(prices, mode = "returns", shift = 5, index = None, standarized = False):
    prices = pd.DataFrame(prices)
    prices.columns = ["today"]
    if index is not None:
        prices = prices[index]
    prices["tomorrow"] = prices.shift(1)
    prices["returns"] = prices["today"].pct_change()
    prices["diff"] = prices["today"] - prices["tomorrow"]
    def standard(df):
        return (df - df.mean())/df.std()
    output = prices[mode].shift(shift).dropna()
    return output if not standarized else standard(output)

# computes and returns intersection among series
def series_intersection(a,b):
    a.index = pd.DatetimeIndex(a.index)
    b.index = pd.DatetimeIndex(b.index)
    intersection = pd.DatetimeIndex([value for value in a.index if value in b.index])
    return a.loc[intersection], b.loc[intersection]



def rolling_window_bert_2nd_dim(a, window):
    shape = (a.shape[0] - window + 1, window, a.shape[1])
    strides = (a.strides[0], a.strides[1]*a.shape[1], a.strides[1])
    #print(shape, strides)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)


def dummy():
	return "dah"

In [291]:
from tensorflow import keras
import numpy as np
import time



class Classifier_INCEPTION:

    def __init__(self, output_directory, input_shape, nb_classes, verbose=False, build=True, batch_size=64,
                 nb_filters=32, use_residual=True, use_bottleneck=True, depth=6, kernel_size=41, nb_epochs=1500):

        self.output_directory = output_directory

        self.nb_filters = nb_filters
        self.use_residual = use_residual
        self.use_bottleneck = use_bottleneck
        self.depth = depth
        self.kernel_size = kernel_size - 1
        self.callbacks = None
        self.batch_size = batch_size
        self.bottleneck_size = 32
        self.nb_epochs = nb_epochs

        if build == True:
            self.model = self.build_model(input_shape, nb_classes)
            if (verbose == True):
                self.model.summary()
            self.verbose = verbose
            self.model.save_weights(self.output_directory + 'model_init.hdf5')

    def _inception_module(self, input_tensor, stride=1, activation='linear'):

        if self.use_bottleneck and int(input_tensor.shape[-1]) > 1:
            input_inception = keras.layers.Conv1D(filters=self.bottleneck_size, kernel_size=1,
                                                  padding='same', activation=activation, use_bias=False)(input_tensor)
        else:
            input_inception = input_tensor

        # kernel_size_s = [3, 5, 8, 11, 17]
        kernel_size_s = [self.kernel_size // (2 ** i) for i in range(3)]

        conv_list = []

        for i in range(len(kernel_size_s)):
            conv_list.append(keras.layers.Conv1D(filters=self.nb_filters, kernel_size=kernel_size_s[i],
                                                 strides=stride, padding='same', activation=activation, use_bias=False)(
                input_inception))

        max_pool_1 = keras.layers.MaxPool1D(pool_size=3, strides=stride, padding='same')(input_tensor)

        conv_6 = keras.layers.Conv1D(filters=self.nb_filters, kernel_size=1,
                                     padding='same', activation=activation, use_bias=False)(max_pool_1)

        conv_list.append(conv_6)

        x = keras.layers.Concatenate(axis=2)(conv_list)
        x = keras.layers.BatchNormalization()(x)
        x = keras.layers.Activation(activation='relu')(x)
        return x

    def _shortcut_layer(self, input_tensor, out_tensor):
        shortcut_y = keras.layers.Conv1D(filters=int(out_tensor.shape[-1]), kernel_size=1,
                                         padding='same', use_bias=False)(input_tensor)
        shortcut_y = keras.layers.BatchNormalization()(shortcut_y)

        x = keras.layers.Add()([shortcut_y, out_tensor])
        x = keras.layers.Activation('relu')(x)
        return x

    def build_layer_structure(self, input_tensor):
        x = input_tensor
        input_res = input_layer

        for d in range(self.depth):

            x = self._inception_module(x)

            if self.use_residual and d % 3 == 2:
                x = self._shortcut_layer(input_tensor, x)
                input_res = x

        gap_layer = keras.layers.GlobalAveragePooling1D()(x)


        return  gap_layer


    def build_model(self, input_shape, nb_classes):
        input_layer = keras.layers.Input(input_shape)

        x = input_layer
        input_res = input_layer

        for d in range(self.depth):

            x = self._inception_module(x)

            if self.use_residual and d % 3 == 2:
                x = self._shortcut_layer(input_res, x)
                input_res = x

        gap_layer = keras.layers.GlobalAveragePooling1D()(x)

        output_layer = keras.layers.Dense(nb_classes, activation='softmax')(gap_layer)

        model = keras.models.Model(inputs=input_layer, outputs=output_layer)

        model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(),
                      metrics=['accuracy'])

        reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.5, patience=50,
                                                      min_lr=0.0001)

        file_path = self.output_directory + 'best_model.hdf5'

        model_checkpoint = keras.callbacks.ModelCheckpoint(filepath=file_path, monitor='loss',
                                                           save_best_only=True)

        self.callbacks = [reduce_lr, model_checkpoint]

        return model

    def fit(self, x_train, y_train, x_val, y_val, y_true, plot_test_acc=False):

        # x_val and y_val are only used to monitor the test loss and NOT for training

        if self.batch_size is None:
            mini_batch_size = int(min(x_train.shape[0] / 10, 16))
        else:
            mini_batch_size = self.batch_size

        start_time = time.time()

        if plot_test_acc:

            hist = self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=self.nb_epochs,
                                  verbose=self.verbose, validation_data=(x_val, y_val), callbacks=self.callbacks)
        else:

            hist = self.model.fit(x_train, y_train, batch_size=mini_batch_size, epochs=self.nb_epochs,
                                  verbose=self.verbose, callbacks=self.callbacks)

        duration = time.time() - start_time

        self.model.save(self.output_directory + 'last_model.hdf5')

        y_pred = self.predict(x_val, y_true, x_train, y_train, y_val)

        # save predictions
        np.save(self.output_directory + 'y_pred.npy', y_pred)

        # convert the predicted from binary to integer
        y_pred = np.argmax(y_pred, axis=1)

        keras.backend.clear_session()

        return hist

    def predict(self, x_test, y_true, x_train, y_train, y_test):
        start_time = time.time()
        model_path = self.output_directory + 'best_model.hdf5'
        model = keras.models.load_model(model_path)
        y_pred = model.predict(x_test, batch_size=self.batch_size)
        return y_pred

In [292]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')
# !ls "/content/drive/My Drive"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [293]:
import tensorflow as tf
import pandas as pd, numpy as np
from matplotlib import pyplot as plt
from utils import *

import importlib
import utils
importlib.reload(utils)
from utils import *

import yfinance as yf

start = "2019-01-01"
end   = "2020-03-20"

bitcoin = False
if not bitcoin:   
    stocks = ["TSLA"]
    keywords = {"TSLA": ["Tesla", "Elon Musk"]}
else:    
    stocks = ["BTC-USD"]
    keywords = {"BTC-USD" : ["Bitcoin", "Cryptocurrency"] }

df_financial = yf.download(stocks, 
                     #period = "1Y",
                      start= start, 
                      end= end, 
                      progress=False)
prices = df_financial["Close"]
prices = pd.DataFrame(data = prices, index = pd.date_range(start,end)).fillna(method = "bfill")

df_bitcoin = pd.read_csv("/content/drive/My Drive/ColabData/articles_Bitcoin-Cryptocurrency_start=2019-01-01_end=2020-12-31.csv", index_col = 0)
df_tesla = pd.read_csv("/content/drive/My Drive/ColabData/articles_Tesla-Elon_Musk_start=2019-01-01_end=2020-12-31.csv", index_col = 0)
df = df_bitcoin if bitcoin else df_tesla

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, AutoModelForSequenceClassification, TFBertForSequenceClassification 

tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")

Split sentences

In [294]:
main_text = False
if main_text:
    # main_text
    text = df.maintext #pd.DataFrame(df.maintext.values, index = df["date_google"]).dropna()
    text.index = df["date_google"]
else:    
    # titles and descriptions
    titles = df.title
    titles.index = df["date_google"]
    descriptions = df.description
    descriptions.index = df["date_google"]
    text = pd.concat([titles,descriptions]).sort_index().dropna()
    
print(text.values[0])

import nltk.data
nltk.download('punkt')
split_sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

split_sentences = text.apply(split_sentence_tokenizer.tokenize)
split_sentences = split_sentences.groupby(split_sentences.index).sum()
split_sentences = pd.DataFrame(split_sentences)

df_ret = label_transformer(prices.copy(), shift = 1)

split_sentences, df_ret = series_intersection(split_sentences, df_ret)
raw_sentences, raw_labels = generate_data_for_tokenizer(split_sentences,df_ret)

lengths = raw_sentences.apply(lambda x: len(x[0].split()), axis = 1)
sentences = raw_sentences[(lengths > 10) & (lengths < 120)] #filter short and long sentences
labels = raw_labels[(lengths > 10) & (lengths < 120)]

keywords_bitcoin = ["crypto", "BTC", "bitcoin", "blockchain"]
keywords_tesla =   ["tesla", "Elon", "Musk", "TSLA", "Tesla"]
keys = keywords_bitcoin if bitcoin else keywords_tesla
raw_text = "".join(text.values).replace(".com","-com").replace(".", "").replace(",", "").replace("\n", " ").replace("-com",".com")
new_tokens = find_new_token_with_custom_keywords(raw_text, keys , 6, [])
# new_tokens = new_tokens.append('tesla\'s')

print (new_tokens)
print (sentences[0])
print (sentences[0][19])

print (sentences.dtypes)

res = [sub.replace('Tesla\'s', 'tesla').replace('Tesla’s', 'tesla').replace('tesla\'s', 'tesla').replace('tesla’s', 'tesla').replace('Teslas', 'tesla').replace('teslaelon', 'tesla').replace('musktesla', 'tesla') for sub in sentences[0]] 

# tesla’s Teslas

sentences = pd.Series(res) 
print (sentences[19])
print (sentences[103])
# sentences = []
# sentences.append(res)
# print (sentences[0][19])
# print (res[19])

Conor McGregor compared himself to Elon Musk in fiery exchange with Floyd Mayweather's manager
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
New tokens to be added:  ['tesla', "tesla's", 'tesla’s', 'teslas', 'teslaelon', 'musktesla']
['tesla', "tesla's", 'tesla’s', 'teslas', 'teslaelon', 'musktesla']
2019-01-03    Tesla CEO Elon Musk doubled down on criticism ...
2019-01-03    It’s a counterintuitive result to mention toda...
2019-01-03    But over a century of research on work environ...
2019-01-03    Elon Musk tweeted that Singapore has been unwe...
2019-01-03    From a Japanese mathematician to Elon Musk, he...
                                    ...                        
2020-03-19    Tesla Cybertruck Gigafactory: Oklahoma pulls o...
2020-03-19    NYC mayor asks Elon Musk to manufacture ventil...
2020-03-19    Elon Musk downplays coronavirus as Tesla facto...
2020-03-19    Elon Musk took to Twitter yesterday to clai

In [295]:
#model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
model = TFAutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment", from_pt=True)
tokenizer.add_tokens(new_tokens)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

All the weights of TFBertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


5

In [296]:
example = "Bitcoin futures are trading below the cryptocurrency's spot price"
print(tokenizer.tokenize(example))
example_ids = tokenizer.encode(example)
print(example_ids)
output = model.predict([example_ids]) 
print("    1 star     2 stars     3 stars     4 stars     5 stars")
print(output[0])

['bit', '##co', '##in', 'futures', 'are', 'trading', 'below', 'the', 'cry', '##pt', '##oc', '##urre', '##ncy', "'", 's', 'spot', 'price']
[101, 16464, 10805, 10262, 42272, 10320, 34948, 16934, 10103, 29917, 15903, 20731, 46642, 19771, 112, 161, 24311, 16993, 102]
    1 star     2 stars     3 stars     4 stars     5 stars
[[ 1.1589032   0.6475529   0.23367804 -0.6871393  -1.1347752 ]]


In [297]:
example = "I am so disappointed with this product"
print(tokenizer.tokenize(example))
example_ids = tokenizer.encode(example)
print(example_ids)
output = model.predict([example_ids])
print("    1 star     2 stars    3 stars    4 stars    5 stars")
print(output[0])

['i', 'am', 'so', 'disa', '##ppo', '##inted', 'with', 'this', 'product']
[101, 151, 10345, 10297, 31021, 54894, 83912, 10171, 10372, 20058, 102]
    1 star     2 stars    3 stars    4 stars    5 stars
[[ 3.3838353  2.8609188  0.6469802 -2.6131783 -3.5148158]]


In [298]:
example = "Elon Musk's 'Blastar' would be a perfect addition to tesla Easter Eggs"


print(tokenizer.tokenize(example))
example_ids = tokenizer.encode(example)
print(example_ids)
output = model.predict([example_ids])
print("    1 star     2 stars    3 stars    4 stars    5 stars")
print(output[0])

['elo', '##n', 'mus', '##k', "'", 's', "'", 'blast', '##ar', "'", 'would', 'be', 'a', 'perfect', 'addition', 'to', 'tesla', 'easter', 'eggs']
[101, 21834, 10115, 23139, 10167, 112, 161, 112, 47732, 10370, 112, 11008, 10346, 143, 23021, 15000, 10114, 51571, 58776, 48540, 102]
    1 star     2 stars    3 stars    4 stars    5 stars
[[-1.48236    -0.895975    0.41832617  1.1279131   0.57270813]]


In [299]:
sentences = sentences.apply(lambda x : tokenizer.tokenize(x))

In [300]:
encoded_sentences = sentences.apply(lambda x : tokenizer.encode(x))
print (sentences)
print (encoded_sentences)
print (encoded_sentences[0])
print (encoded_sentences[1])
encoded_sentences.shape

0       [tesla, ceo, elo, ##n, mus, ##k, double, ##d, ...
1       [it, [UNK], s, a, counter, ##int, ##uit, ##ive...
2       [but, over, a, century, of, research, on, work...
3       [elo, ##n, mus, ##k, twee, ##ted, that, singap...
4       [from, a, japanese, math, ##ema, ##tician, to,...
                              ...                        
3181    [tesla, cyber, ##tru, ##ck, gi, ##ga, ##fa, ##...
3182    [nyc, mayor, asks, elo, ##n, mus, ##k, to, man...
3183    [elo, ##n, mus, ##k, down, ##play, ##s, corona...
3184    [elo, ##n, mus, ##k, took, to, twitter, yester...
3185    [the, need, is, clear, ,, let, ', s, hope, he,...
Length: 3186, dtype: object
0       [101, 51571, 23693, 21834, 10115, 23139, 10167...
1       [101, 10197, 100, 161, 143, 32964, 16790, 1754...
2       [101, 10502, 10323, 143, 11516, 10108, 11865, ...
3       [101, 21834, 10115, 23139, 10167, 12915, 11894...
4       [101, 10195, 143, 14201, 33508, 17291, 95605, ...
                              ...           

(3186,)

In [301]:
# if we have "tesla's", we failed

# print (encoded_sentences[0])
# print (model.predict([encoded_sentences[0]]))
# print (model.predict([encoded_sentences[0]])[0])

# print (sentences[0][3])
# print (sentences[0][4])
# print (sentences[0][5])
# print (sentences[0][6])
# print (sentences[0][7])
# print (sentences[0][9])
# print (sentences[0][10])
# print (sentences[0][16])
# print (sentences[0][17])
# print (sentences[0][18])
# print (sentences[0][19])
# print (sentences[0][20])

# print (sentences[0][18])
# print (encoded_sentences[18])
# print (model.predict([encoded_sentences[18]]))
# print (model.predict([encoded_sentences[18]])[0])

print (sentences[10])
print (encoded_sentences[10])
print (model.predict(encoded_sentences[10]))
print (model.predict([encoded_sentences[10]])[0])

print (sentences[19])
print (encoded_sentences[19])
print (model.predict([encoded_sentences[19]]))
print (model.predict([encoded_sentences[19]])[0])

# for i in range(1, len(encoded_sentences)): 

#     print (i)
#     print (sentences[i])
#     print(encoded_sentences[i]) 
#     output = model.predict([encoded_sentences[i]])[0]
#     print (output)




['investors', 'appeared', 'to', 'be', 'focusing', 'more', 'on', 'potential', 'road', '##bl', '##ock', '##s', ',', 'including', 'a', 'flood', 'of', 'new', 'competition', 'and', 'the', 'phase', '-', 'out', 'of', 'the', 'federal', 'tax', 'credits', 'that', 'offs', '##et', 'the', 'cost', 'of', 'tesla', 'battery', '-', 'electric', 'vehicles', '.']
[101, 67446, 14889, 10114, 10346, 67493, 10772, 10125, 21570, 11925, 31809, 21906, 10107, 117, 11371, 143, 40241, 10108, 10246, 14262, 10110, 10103, 17324, 118, 10871, 10108, 10103, 12501, 22389, 33896, 10203, 47117, 10337, 10103, 18153, 10108, 51571, 34794, 118, 15988, 25327, 119, 102]
(array([[ 2.10614756e-01,  2.56466150e-01,  1.39126316e-01,
        -2.67221451e-01, -1.65504664e-01],
       [-4.38403189e-01,  5.69914579e-01,  7.56413877e-01,
         2.80110668e-02, -6.12575054e-01],
       [ 1.57692879e-01,  2.46165216e-01,  1.76105902e-01,
        -2.54177243e-01, -1.54445529e-01],
       [ 8.45648944e-02,  2.18766451e-01,  1.93011418e-01,
 

In [302]:
predictions = encoded_sentences.apply(lambda x : model.predict([x])[0])


In [303]:
predictions.shape


(3186,)

In [304]:
avg_predictions = predictions.groupby(predictions.index).apply(np.mean)
avg_predictions_array = np.array([a for a in avg_predictions.values])
shape = avg_predictions_array.shape
avg_predictions = pd.DataFrame(avg_predictions_array.reshape((shape[0], shape[2])), index = avg_predictions.index)
pd.to_pickle(avg_predictions, "sentiment_predictions_tesla_with_tokens")

In [305]:
avg_predictions.shape

(3186, 5)

In [306]:
from tensorflow.keras.layers import LSTM, Dense, Input, Flatten
from tensorflow.keras.layers import TimeDistributed, Dropout, BatchNormalization, MaxPooling1D
from tensorflow.keras.models import Sequential

def diff2p(d, start):
    prices = []
    cum_diff = start
    for diff in d:
        prices.append(diff + cum_diff)
        cum_diff += diff
    return prices

def r2p(d, start = 100):
    return start * (1 + d).cumprod()

def rolling_window(a, window, step_size):
    shape = a.shape[:-1] + (a.shape[-1] - window + 1 - step_size + 1, window)
    strides = a.strides + (a.strides[-1] * step_size,)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def rolling_window_bert_2nd_dim(a, window):
    shape = (a.shape[0] - window + 1, window, a.shape[1])
    strides = (a.strides[0], a.strides[1]*a.shape[1], a.strides[1])
    #print(shape, strides)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def lstm_model(n_steps):
    n_features = 5
    model = Sequential()
    model.add(LSTM(100, input_shape = (n_steps, n_features), return_sequences = True))
    model.add(TimeDistributed(Dense(20, activation='elu')))
    model.add(Flatten())
    model.add(Dense(1, activation='elu'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    print(model.summary())
    return model

def dense_model():
    n_features = 5
    model = Sequential()
    #model.add(CuDNNLSTM(200, input_shape=(None, n_features), return_sequences= True))
    model.add(Input((n_features)))
    model.add(Dense(200))
    model.add(Dropout(0.4))
    model.add(Dense(50, activation='tanh'))
    model.add(Dense(1, activation='tanh'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    print(model.summary())
    return model



In [307]:
def destandarize(data, mean, std):
    return data*std + mean

def standarize(data):
    return (data - data.mean())/data.std(), data.mean(), data.std()

def lstm_train_and_predict(X_train, X_test, y_train, y_test, window, standarize_output = False, mode = "returns", start = 100, epochs = 10):
    
    y_test_orig = y_test.values
    y_train_orig = y_train.values
    
    if standarize_output:
        y_test, y_test_mean, y_test_std = standarize(y_test)
        y_train, y_train_mean, y_train_std = standarize(y_train)
    
    X_train_roll = rolling_window_bert_2nd_dim(X_train.values, window)
    X_test_roll = rolling_window_bert_2nd_dim(X_test.values, window)
    y_train_roll = y_train[window - 1:].values
    y_test_roll = y_test[window - 1:].values
    
    lstm = lstm_model(window)
    history = lstm.fit(X_train_roll,y_train_roll, validation_data=(X_test_roll, y_test_roll), epochs = epochs)
    pred_tr = lstm.predict(X_train_roll)
    pred_te = lstm.predict(X_test_roll)
    
    if standarize_output:
        pred_tr = destandarize(pred_tr,y_train_mean, y_train_std)
        pred_te = destandarize(pred_te,y_test_mean, y_test_std)
        y_train_roll = destandarize(y_train_roll,y_train_mean, y_train_std)
        y_test_roll = destandarize(y_test_roll,y_test_mean, y_test_std)
    
    if mode == "returns":
        pred_prices_tr = r2p(pred_tr, start)
        pred_prices_te = r2p(pred_te, start)
        real_prices_tr = r2p(y_train_orig, start)
        real_prices_te = r2p(y_test_orig, start)
        
    if mode == "diff":
        pred_prices_tr = diff2p(pred_tr, start)
        pred_prices_te = diff2p(pred_te, start)
        real_prices_tr = diff2p(y_train_orig, start)
        real_prices_te = diff2p(y_test_orig, start)
    
    print("TEST DATA")
    plt.plot(real_prices_te)
    plt.plot(pred_prices_te)
    plt.legend(["y_real", "y_pred"])
    plt.show()


    print("TRAIN DATA")
    plt.plot(real_prices_tr)
    plt.plot(pred_prices_tr)
    plt.legend(["y_real", "y_pred"])
    plt.show()

    plt.plot(history.history["loss"])
    plt.plot(history.history["val_loss"])
    plt.legend(["loss", "val_loss"])
    plt.show()
    return lstm, pred_tr, pred_te

In [308]:
mode = "returns"
shift = 1
df_prices = label_transformer(prices.copy(), shift = shift, mode = mode)
avg_predictions, df_prices = series_intersection(avg_predictions, df_prices)

threshold = "2019-11-01"
X_train = avg_predictions.loc[:threshold]
X_test = avg_predictions.loc[threshold:]
y_train = df_prices.loc[:threshold]
y_test = df_prices.loc[threshold:]

window = 7
ep = 15

# lstm, pred_tr, pred_te = lstm_train_and_predict(X_train, X_test, y_train, y_test, window, 
#                                                 standarize_output = False, mode = mode,
#                                                 start = 100, epochs = ep)
