Import relevant packages:

In [11]:
from numpy import array
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
import pandas as pd

Load the dataset (only the first 100 rows are kept):

In [12]:
# Read the sample CSV and keep only its first 100 rows.
df = pd.read_csv('df_sample.csv').iloc[:100]

In [13]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Dates,SPTR INDEX,SPTRINFT INDEX,SPTRENRS INDEX,SPTRFINL INDEX,SPTRHLTH INDEX,SPTRINDU INDEX,SPTRCOND INDEX,SPTRUTIL INDEX,SPTRMATR INDEX,SPTRCONS INDEX,SPTRTELS INDEX,SPTRRLST INDEX,tokens
0,3168,2019-08-06,1.01612,1.030542,1.007738,0.995008,1.018123,1.005322,1.015987,1.012665,1.020694,1.023919,1.014964,1.025104,washington president ride golf cart brother la...
1,3169,2019-08-07,0.985862,0.992882,0.973589,0.971418,0.990021,0.976317,0.980954,1.001413,0.974974,0.996222,0.987926,1.00016,traffic jam noida india indias national state ...
2,3170,2019-08-08,0.97003,0.967738,0.941347,0.957846,0.979952,0.95866,0.96118,1.004087,0.956772,0.998857,0.97105,0.998822,director communication white house founder sky...
3,3171,2019-08-09,0.990585,0.997728,0.966574,0.978562,0.989894,0.98526,0.981242,1.008074,0.980349,1.015946,0.990384,1.004403,bmw hogan cnbcbmw long king compact sport seda...
4,3172,2019-08-12,1.01459,1.026144,0.998408,1.007702,1.009933,1.008907,1.00745,1.017959,1.004197,1.032372,1.014145,1.016112,general motors chevrolet volt electric vehicle...


## Define x and y variables:

In [27]:
# One entry per row of df: that row's 'tokens' string split on single spaces.
token_list = [doc.split(" ") for doc in df['tokens']]

In [30]:
# Tokens of the first row (day 1).
token_list[0]

['washington',
 'president',
 'ride',
 'golf',
 'cart',
 'brother',
 'law',
 'nephew',
 'zack',
 'maple',
 'run',
 'golf',
 'course',
 'thurmont',
 'jaffe',
 'afp',
 'young',
 'brother',
 'die',
 'democratic',
 'presidential',
 'nominee',
 'say',
 'tweet',
 'call',
 'kind',
 'generous',
 'hard',
 'find',
 'word',
 'mind',
 'flood',
 'memory',
 'write',
 'post',
 'walk',
 'room',
 'light',
 'laughter',
 'cause',
 'death',
 'know',
 'bear',
 'young',
 'child',
 'brother',
 'old',
 'sibling',
 'tweethe',
 'marry',
 'follow',
 'divorce',
 'wife',
 'father',
 'child',
 'kind',
 'generous',
 'wonderful',
 'husband',
 'megan',
 'father',
 'zach',
 'simon',
 'fiona',
 'miss',
 'tweet',
 'job',
 'include',
 'stint',
 'prison',
 'guard',
 'private',
 'detective',
 'accord',
 'new',
 'york',
 'times',
 'president',
 'consultant',
 'deal',
 'broker',
 'outlet',
 'report',
 'presumptive',
 'speaker',
 'house',
 'minority',
 'leader',
 'president',
 'argue',
 'meeting',
 'white',
 'house',
 'washing

In [31]:
# Number of token lists (one per row of df).
# NOTE(review): this same `size` is later passed to Word2Vec as `size=` and used
# as the width of the embedding matrix, so the embedding dimensionality is tied
# to the number of rows — confirm this coupling is intended.
size = len(token_list)
size

100

In [32]:
import gensim

Train word2vec model:

In [33]:
# Fit Word2Vec on the tokenised documents.
model = gensim.models.Word2Vec(
    sentences=token_list,
    size=size,
    window=5,
    min_count=1,
    workers=4,
)
# Collect the learned vocabulary and report how many entries it holds.
words = [term for term in model.wv.vocab]
print('vocabulary size: %d' % len(words))

vocabulary size: 28993


In [34]:
# Show only a small sample: the vocabulary has tens of thousands of entries and
# printing all of it floods the notebook output.
print(words[:50])



Sanity-check the embeddings with a nearest-neighbour query:

In [51]:
# Words the trained model ranks as most similar to 'pharma', with their scores.
model.wv.most_similar('pharma')

[('cbi', 0.9809474349021912),
 ('undertake', 0.9801974296569824),
 ('overwhelmed', 0.979841947555542),
 ('hyper', 0.9774317741394043),
 ('setback', 0.9773530960083008),
 ('irgc', 0.9760451912879944),
 ('modafl', 0.9756420254707336),
 ('melbourne', 0.9754774570465088),
 ('designate', 0.9750545024871826),
 ('redstone', 0.9750233888626099)]

### Importing word embeddings to Keras

In [52]:
# Save the learned word vectors in word2vec text format (binary=False) so they
# can be read back from 'article_embeddings.txt' later in the notebook.
filename = 'article_embeddings.txt'
model.wv.save_word2vec_format(filename, binary=False)

In [54]:
# Check which class `model` currently refers to.
type(model)

gensim.models.word2vec.Word2Vec

In [57]:
import os
import numpy as np

# Load the saved vectors back into a {word: vector} dictionary.
embeddings_index = {}
# `with` guarantees the file handle is closed even if parsing fails part-way.
with open(os.path.join('', 'article_embeddings.txt'), encoding='utf-8') as f:
    # The word2vec text format starts with a "<vocab_size> <vector_size>" header
    # line; skip it so it is not stored as if it were a word.
    next(f, None)
    for line in f:
        values = line.split()
        if len(values) < 2:
            # Blank or malformed line: there is no vector to store.
            continue
        word = values[0]
        # Convert the textual coefficients to numbers instead of keeping strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [58]:
from keras.preprocessing.text import Tokenizer

# Fit a Keras Tokenizer on the token lists, then map each token list to a list
# of integer word indices (padding to a common length happens in a later cell).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(token_list)
sequences = tokenizer.texts_to_sequences(token_list)


In [71]:
# Vocabulary learned by the tokenizer (word -> integer index).
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad every sequence to the length of the longest token list.
max_length = max(map(len, token_list))
articles_pad = pad_sequences(sequences, maxlen=max_length)
print('Shape of article tensor:', articles_pad.shape)

Found 28993 unique tokens.
Shape of article tensor: (100, 12987)


In [72]:
# Build the (vocab_size, size) weight matrix for the Keras Embedding layer.
# Row i holds the vector of the word whose tokenizer index is i; rows without a
# usable vector stay all-zero. The extra "+ 1" row accounts for index 0, which
# the Keras Tokenizer does not assign to any word.
vocab_size = len(word_index) + 1
embedding_matrix = np.zeros((vocab_size, size))

for word, i in word_index.items():
    # Valid rows are 0 .. vocab_size - 1, so an index equal to vocab_size is
    # already out of range: compare with >= rather than >.
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    # Only copy vectors of the expected width. A shorter entry (e.g. the file's
    # header line parsed as a "word") would otherwise be broadcast across the
    # whole row or raise a shape error.
    if embedding_vector is not None and len(embedding_vector) == size:
        embedding_matrix[i] = embedding_vector

In [73]:
# Number of rows in the embedding matrix (unique tokens + 1).
print(vocab_size)

28994


In [77]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import LSTM
from keras.initializers import Constant

In [79]:
# Define the model: frozen pre-trained embeddings -> LSTM -> 12 outputs.
model = Sequential()
# Embedding layer initialised from the word2vec matrix and kept fixed
# (trainable=False), so training does not update the word vectors.
embedding_layer = Embedding(vocab_size,
                           size,
                           embeddings_initializer=Constant(embedding_matrix),
                           input_length=max_length,
                           trainable=False)
# Add embedding layer
model.add(embedding_layer)

# Add a LSTM layer with 50 internal units. No input_shape is passed: this is not
# the first layer, so it takes its input shape from the embedding output and an
# explicit input_shape here would be ignored.
model.add(LSTM(50))
# Add a Dense layer with 12 units.
model.add(Dense(12))
# Compile with the Adam optimizer and mean-squared-error loss.
model.compile(optimizer = 'adam', loss = 'mean_squared_error')
# Print summary of model. summary() prints the table itself and returns None,
# so it is not wrapped in print() (which would emit a stray "None" line).
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 12987, 100)        2899400   
_________________________________________________________________
lstm (LSTM)                  (None, 50)                30200     
_________________________________________________________________
dense (Dense)                (None, 12)                612       
Total params: 2,930,212
Trainable params: 30,812
Non-trainable params: 2,899,400
_________________________________________________________________
None


In [80]:
# Targets: every column whose name ends in " INDEX", selected by name rather
# than by position. In the header shown by df.head(), position 2 appears to be
# 'Dates', so the positional slice 2:14 would not line up with the INDEX columns.
y = df.filter(regex=r' INDEX$')
# The network ends in Dense(12), so exactly 12 target columns are expected.
assert y.shape[1] == 12, 'expected 12 target columns, got %d' % y.shape[1]

In [81]:
# Display the target frame.
y

Unnamed: 0,SPTR INDEX,SPTRINFT INDEX,SPTRENRS INDEX,SPTRFINL INDEX,SPTRHLTH INDEX,SPTRINDU INDEX,SPTRCOND INDEX,SPTRUTIL INDEX,SPTRMATR INDEX,SPTRCONS INDEX,SPTRTELS INDEX,SPTRRLST INDEX
0,1.016120,1.030542,1.007738,0.995008,1.018123,1.005322,1.015987,1.012665,1.020694,1.023919,1.014964,1.025104
1,0.985862,0.992882,0.973589,0.971418,0.990021,0.976317,0.980954,1.001413,0.974974,0.996222,0.987926,1.000160
2,0.970030,0.967738,0.941347,0.957846,0.979952,0.958660,0.961180,1.004087,0.956772,0.998857,0.971050,0.998822
3,0.990585,0.997728,0.966574,0.978562,0.989894,0.985260,0.981242,1.008074,0.980349,1.015946,0.990384,1.004403
4,1.014590,1.026144,0.998408,1.007702,1.009933,1.008907,1.007450,1.017959,1.004197,1.032372,1.014145,1.016112
...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.014603,1.027546,0.984551,1.010145,1.014173,1.008590,1.003711,1.017094,1.013828,1.017911,1.014926,1.022143
96,1.011528,1.017697,0.995243,0.999201,1.016243,1.007138,0.999620,1.031411,1.015282,1.019882,1.011398,1.025721
97,1.013015,1.020390,0.989093,0.999347,1.016806,1.012716,1.002937,1.032247,1.017791,1.015445,1.013259,1.033061
98,1.019861,1.029616,0.988904,1.011545,1.017006,1.020206,1.011266,1.037604,1.027085,1.019306,1.023903,1.024734


In [None]:
# Train for 2 epochs on the padded article sequences against the targets in y.
# NOTE(review): each input sequence is max_length timesteps long (12987 in the
# shape printed earlier), which likely makes LSTM training very slow; the saved
# output stops at "Epoch 1/2" — confirm the run actually completes.
model.fit(articles_pad, y, epochs=2, verbose=1)

Epoch 1/2
