In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter

# Load the labelled tweets; the file is opened explicitly as UTF-8 and closed by the `with`.
with open('stock_data.csv', encoding='utf8') as handle:
    df = pd.read_csv(handle, sep=',')

# Discard every row that has at least one missing value, then print the class balance.
df = df.dropna(axis='index', how='any')
print(df['Sentiment'].value_counts())

 1    3685
-1    2106
Name: Sentiment, dtype: int64


In [12]:
# Regex removal of various undesirable parts of a tweet
def clean_tweet(tweet):
  """Normalise a raw tweet into lowercase, space-separated words and sentence marks.

  Steps, in order: drop @handles and http(s) URLs, delete apostrophes so a
  contraction stays one word ("it's" -> "its"), blank out everything that is
  not an ASCII letter or one of . ! ?, pad those three marks with spaces so
  they become standalone tokens, collapse repeated spaces, and lowercase.

  Args:
    tweet: The raw tweet text (str).

  Returns:
    The cleaned string. A single leading and/or trailing space may remain.
  """
  # Handles may contain underscores; without "_" in the class, "@john_doe" left "doe" behind.
  tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
  # Consume the whole URL (including "-", "?", "=", "_" ...) up to the next whitespace, but
  # stop before trailing . ! ? so a sentence ending glued to the link is still kept.
  tweet = re.sub(r"https?://\S*[^\s.!?]", ' ', tweet)
  tweet = re.sub(r"[']", "", tweet) # Apostrophe removal
  tweet = re.sub(r"[^a-zA-Z.!?]", ' ', tweet) # Remove symbols that are not alphabetic or sentence endings
  # Only letters, spaces and . ! ? remain here, so padding just the three marks is enough
  # to make each of them its own token.
  tweet = re.sub(r"([.!?])", r" \1 ", tweet)
  tweet = re.sub(r" +", ' ', tweet) # Excess whitespace removal
  return tweet.lower()

In [13]:
# Shared NLTK resources used by sanitise(): a WordNet lemmatizer and the English stopword set.
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words('english'))

def sanitise(tweet):
    """Turn a raw tweet into a list of lemmatized, stopword-free tokens.

    The tweet is first normalised by clean_tweet(), then stripped and split on
    whitespace; tokens found in `stops` are discarded and the remaining ones
    are passed through `lemmatizer.lemmatize`.

    NOTE(review): clean_tweet() deletes apostrophes before this filter runs, so
    contractions arrive as e.g. "dont". Any apostrophe-bearing entries in the
    stopword list (if present) can therefore never match -- confirm that this
    is intended.
    """
    tokens = clean_tweet(tweet).strip().split()
    kept = [token for token in tokens if token not in stops]
    return [lemmatizer.lemmatize(token) for token in kept]

In [14]:
# Preview the sanitised token lists for every tweet; the result is only displayed, not stored.
df['Text'].map(sanitise)

0       [kicker, watchlist, xide, tit, soq, pnk, cpw, ...
1       [user, aap, movie, ., return, fea, geed, indic...
2       [user, id, afraid, short, amzn, looking, like,...
3                                               [mnta, .]
4                                                 [oi, .]
                              ...                        
5786    [industry, body, cii, said, discoms, likely, s...
5787    [gold, price, slip, r, investor, book, profit,...
5788    [worker, bajaj, auto, agreed, wage, cut, perio...
5789    [sharemarket, live, sensex, day, high, point, ...
5790    [sensex, nifty, climb, day, high, still, key, ...
Name: Text, Length: 5791, dtype: object

In [15]:
from gensim.models.word2vec import Word2Vec

# Tokenise every tweet once; each element is the token list returned by sanitise().
sentences = df['Text'].map(sanitise).tolist()

# Train with gensim's defaults (100-dimensional vectors, sg=0 i.e. CBOW, min_count=5).
# seed is pinned and workers=1 removes thread-scheduling jitter, so re-running this cell
# gives the same vectors (exact reproducibility across interpreter launches additionally
# requires PYTHONHASHSEED to be set, per the gensim documentation).
model = Word2Vec(sentences, seed=1, workers=1)
model.save('word2vec.model')  # persist the full model to disk
loaded_model = Word2Vec.load('word2vec.model')  # reload the saved model from disk

In [16]:
# Pull the word vectors out of the trained model, drop the model name, and persist the vectors.
from gensim.models import KeyedVectors
wv = model.wv # word vectors held by the trained Word2Vec model
# NOTE(review): once `model` is deleted below, re-running this cell on its own raises
# NameError on the line above until the training cell has been executed again.
del model # remove the `model` name; only `wv` is used for the rest of this cell
wv.save('word_vector') # write the vectors to the file 'word_vector'
loaded_wv = KeyedVectors.load('word_vector', mmap='r') # reload them; mmap='r' presumably memory-maps read-only -- confirm in gensim docs

In [9]:
# Look up the learned embedding for the token 'user'; the cell displays it as an array.
wv['user'] # individual vector

array([-0.10369869,  0.53822696,  0.16187547,  0.08480155, -0.04335291,
       -0.5769972 ,  0.45708922,  1.0282371 , -0.17969818, -0.18205358,
       -0.12290734, -0.53705883, -0.18131688,  0.43425286, -0.1035592 ,
       -0.265191  ,  0.2188293 , -0.09301266,  0.13381553, -1.0547245 ,
        0.37065858,  0.01911997,  0.20705305, -0.1713221 , -0.15284374,
       -0.02800358, -0.23205748, -0.25530067, -0.5764056 ,  0.01211332,
        0.35646   , -0.15278113,  0.29542106, -0.39232957, -0.2796067 ,
        0.38709983,  0.01232796, -0.43148988, -0.18238744, -1.1717057 ,
       -0.02951602, -0.1951926 , -0.03740108, -0.08533804,  0.5236154 ,
       -0.02411415, -0.1464998 ,  0.10647264,  0.29031974,  0.12835954,
        0.0192575 , -0.33260703, -0.17543937, -0.02802198, -0.08372337,
        0.41721797,  0.10505386,  0.00385863, -0.45018554, -0.06618115,
       -0.15038341,  0.29844606, -0.28011253,  0.10995841, -0.5077389 ,
        0.38989237,  0.16870242,  0.35169712, -0.33229858,  0.47

In [10]:
# Show the vocabulary as a plain dict mapping each token to its integer index.
# NOTE(review): this displays the entire vocabulary; consider previewing a slice instead.
dict(wv.key_to_index) # index of vector

{'.': 0,
 'aap': 1,
 '!': 2,
 'user': 3,
 'short': 4,
 '?': 5,
 'day': 6,
 'stock': 7,
 'today': 8,
 'volume': 9,
 'market': 10,
 'like': 11,
 'long': 12,
 'stop': 13,
 'good': 14,
 'high': 15,
 'goog': 16,
 'new': 17,
 'buy': 18,
 'watch': 19,
 'bac': 20,
 'still': 21,
 'nice': 22,
 'back': 23,
 'look': 24,
 'time': 25,
 'move': 26,
 'week': 27,
 'next': 28,
 'coronavirus': 29,
 'higher': 30,
 'see': 31,
 'break': 32,
 'go': 33,
 'price': 34,
 'trade': 35,
 'close': 36,
 'one': 37,
 'ong': 38,
 'point': 39,
 'sensex': 40,
 'triangle': 41,
 'nifty': 42,
 'breakout': 43,
 'weekly': 44,
 'call': 45,
 'f': 46,
 'could': 47,
 'looking': 48,
 'share': 49,
 'get': 50,
 'big': 51,
 'low': 52,
 'year': 53,
 'support': 54,
 'target': 55,
 'position': 56,
 'going': 57,
 'bullish': 58,
 'p': 59,
 'nfx': 60,
 'last': 61,
 'lower': 62,
 'green': 63,
 'u': 64,
 'earnings': 65,
 'gap': 66,
 'amzn': 67,
 'v': 68,
 'open': 69,
 'put': 70,
 'line': 71,
 'c': 72,
 'bank': 73,
 'rt': 74,
 'sell': 75,
 'ma