Implementation guided by https://towardsdatascience.com/word2vec-with-pytorch-implementing-original-paper-2cd7040120b0

You will need to install (via pip3): torch, matplotlib, numpy, nltk.  
You will also need to run (with python3 in terminal)  
`>>>import nltk`  
`>>>nltk.download('stopwords')`  
`>>>nltk.download('wordnet')`  
`>>>nltk.download('omw-1.4')`

In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader 
from functools import partial  
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter

with open('stock_data.csv', encoding='utf8') as csvfile:
    df = pd.read_csv(csvfile, delimiter=',')

df.dropna(axis=0, how='any', inplace=True)                         # Excludes null-containing rows
print()
print(df['Sentiment'].value_counts())


 1    3685
-1    2106
Name: Sentiment, dtype: int64


In [32]:
# Hyperparameters
word_frequency_requirement = 8
embed_dimension = 300 
embed_max_norm = 1
bow_size = 4
cbow_batchsize = 32

In [33]:
# Regex removal of various undesirable parts of a tweet
def clean_tweet(tweet):
  tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet) # Twitter handle removal
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet) # URL removal
  tweet = re.sub(r"[']", "", tweet) # Apostrophe removal
  tweet = re.sub(r"[^a-zA-Z.!?]", ' ', tweet) # Remove symbols that are not alphabetic or sentence endings
  tweet = re.sub(r"([^a-zA-Z])", r" \1 ", tweet) # Places spaces around sentence endings,
  # so they are encoded as their own words, rather than being lumped in with other words.
  tweet = re.sub(r" +", ' ', tweet) # Excess whitespace removal
  tweet = tweet.lower() # Send tweet to lowercase
  return tweet

In [34]:
# Prepare word lemmatizer and stopwords list for sanitisation
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words('english'))

def sanitise(tweet):
    tweet = clean_tweet(tweet)
    tweet = filter(lambda w: w not in stops, tweet.strip().split()) # Remove stopwords
    return list(map(lemmatizer.lemmatize, tweet)) # Lemmatize words.

In [35]:
san_df = pd.DataFrame([
    df['Text'].map(sanitise),
    df['Sentiment'].map(lambda x: torch.tensor([1,0]) if (x==1) else torch.tensor([0,1]))
    ]).T
    
indexes = [i for i, x in enumerate(san_df['Text']) if len(x) <= 5]
san_df.drop(indexes, inplace=True)
san_df.reset_index(drop=True, inplace=True)

print(san_df.Text[0])
san_df

['kicker', 'watchlist', 'xide', 'tit', 'soq', 'pnk', 'cpw', 'bpz', 'aj', 'trade', 'method', 'method', 'see', 'prev', 'post']


Unnamed: 0,Text,Sentiment
0,"[kicker, watchlist, xide, tit, soq, pnk, cpw, ...","[tensor(1), tensor(0)]"
1,"[user, aap, movie, ., return, fea, geed, indic...","[tensor(1), tensor(0)]"
2,"[user, id, afraid, short, amzn, looking, like,...","[tensor(1), tensor(0)]"
3,"[aap, user, current, downtrend, break, ., othe...","[tensor(0), tensor(1)]"
4,"[monday, relative, weakness, ., nyx, win, tie,...","[tensor(0), tensor(1)]"
...,...,...
4747,"[industry, body, cii, said, discoms, likely, s...","[tensor(0), tensor(1)]"
4748,"[gold, price, slip, r, investor, book, profit,...","[tensor(0), tensor(1)]"
4749,"[worker, bajaj, auto, agreed, wage, cut, perio...","[tensor(1), tensor(0)]"
4750,"[sharemarket, live, sensex, day, high, point, ...","[tensor(1), tensor(0)]"


In [36]:
# Counter class counts number of appearances of all words
word_count = Counter()
for tweet in san_df['Text']:
    word_count.update(tweet)
        
# Create a dictionary that maps words to their one-hot vector indices
vocab = [word for word in word_count if word_count[word] >= word_frequency_requirement] # vocab contains all words meeting the word frequency requirement.

dictionary = {word : i+1 for i, word in enumerate(vocab)} # dicionary is a mapping of each vocab word to its vector index.The +1 reserves the zero index.

dictionary[None] = 0 # Index 0 is reserved to be a blanket classification for all words below the word frequency requirement.

word_count.most_common(10)

[('.', 6656),
 ('aap', 781),
 ('!', 687),
 ('user', 611),
 ('?', 430),
 ('short', 415),
 ('day', 375),
 ('stock', 363),
 ('today', 323),
 ('volume', 292)]

In [37]:
max_tweet_length = max(len(x) for x in san_df['Text'])

encoded_df = pd.DataFrame([[list(map(lambda w : dictionary.get(w, 0), tweet)) for tweet in san_df['Text']]]).T

encoded_df[0] = encoded_df[0].map( lambda x: x + [0] * (max_tweet_length - len(x)) )

onehot_df = pd.DataFrame([
    [F.one_hot(torch.LongTensor(enc_tweet), len(dictionary)) for enc_tweet in encoded_df[0]],
    san_df['Sentiment']
    ]).T

In [38]:
 
class CBOW_Model(nn.Module):
    def __init__(self, vocab_size: int):

        super(CBOW_Model, self).__init__()

        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dimension,
            max_norm=embed_max_norm,
        )

        self.linear = nn.Linear(
            in_features=embed_dimension,
            out_features=vocab_size,
        )

    def forward(self, inputs_):

        x = self.embeddings(inputs_)
        x = x.mean(axis=1)
        
        x = self.linear(x)
        return x

In [39]:
def collate_cbow(batch, text_pipeline):
     batch_input, batch_output = [], []
     for text in batch:
         text_tokens_ids = text_pipeline(text)
         if len(text_tokens_ids) < bow_size * 2 + 1:
             continue
         if max_tweet_length:
             text_tokens_ids = text_tokens_ids[:max_tweet_length]
         for idx in range(len(text_tokens_ids) - bow_size * 2):
             token_id_sequence = text_tokens_ids[idx : (idx + bow_size * 2 + 1)]
             output = token_id_sequence.pop(bow_size)
             input_ = token_id_sequence
             batch_input.append(input_)
             batch_output.append(output)
     
     batch_input = torch.tensor(batch_input, dtype=torch.long)
     batch_output = torch.tensor(batch_output, dtype=torch.long)
     return batch_input, batch_output

In [40]:


dataloader = DataLoader(
         onehot_df[0],
         batch_size=batch_size,
         shuffle=True,         
         # collate_fn=partial(collate_cbow, text_pipeline=text_pipeline),
)

NameError: name 'data_iter' is not defined