Data sourced from https://www.kaggle.com/datasets/utkarshxy/stock-markettweets-lexicon-data  
Some data processing abridged from https://www.kaggle.com/code/juniorbueno/stock-market-sentimen-bert-tokenizer  
Various code snippets from COMP9444 assignment 'paraphrased'

You will need to install (via pip3): torch, matplotlib, numpy, nltk.  
You will also need to run (with python3 in terminal)  
`>>>import nltk`  
`>>>nltk.download('stopwords')`  
`>>>nltk.download('wordnet')`  
`>>>nltk.download('omw-1.4')`

In [150]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from random import shuffle

with open('stock_data.csv', encoding='utf8') as csvfile:
    df = pd.read_csv(csvfile, delimiter=',')

df.dropna(axis=0, how='any', inplace=True)                         # Excludes null-containing rows
print(df['Sentiment'].value_counts())

 1    3685
-1    2106
Name: Sentiment, dtype: int64


In [151]:
# Hyperparameters
word_frequency_requirement = 4 # the number of times a word has to appear to be given
# it's own encoding. All words under this limit are encoded as the same 'unknown' word.
train_proportion = 0.8
hidden_layer_size = 10
learning_rate = 0.005
batch_size = 32
epochs = 1000

In [152]:
# Regex removal of various undesirable parts of a tweet
def clean_tweet(tweet):
  tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet) # Twitter handle removal
  tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet) # URL removal
  tweet = re.sub(r"[']", "", tweet) # Apostrophe removal
  tweet = re.sub(r"[^a-zA-Z.!?]", ' ', tweet) # Remove symbols that are not alphabetic or sentence endings
  tweet = re.sub(r"([^a-zA-Z])", r" \1 ", tweet) # Places spaces around sentence endings,
  # so they are encoded as their own words, rather than being lumped in with other words.
  tweet = re.sub(r" +", ' ', tweet) # Excess whitespace removal
  tweet = tweet.lower() # Send tweet to lowercase
  return tweet

In [153]:
# Prepare word lemmatizer and stopwords list for sanitisation
lemmatizer = WordNetLemmatizer()
stops = set(stopwords.words('english'))

def tokenize(tweet):
    tweet = clean_tweet(tweet)
    tweet = filter(lambda w: w not in stops, tweet.strip().split()) # Remove stopwords
    return list(map(lemmatizer.lemmatize, tweet)) # Lemmatize words.

In [154]:
san_df = pd.DataFrame([
    df['Text'].map(tokenize),
    #df['Sentiment'].map(lambda x: torch.tensor([1,0]) if (x==1) else torch.tensor([0,1]))
    df['Sentiment'].map(lambda x: torch.tensor([1]) if (x==1) else torch.tensor([0]))
    ]).T
    
indexes = [i for i, x in enumerate(san_df['Text']) if len(x) <= 5]
san_df.drop(indexes, inplace=True)
san_df.reset_index(drop=True, inplace=True)

print(san_df.Text[0])
san_df

['kicker', 'watchlist', 'xide', 'tit', 'soq', 'pnk', 'cpw', 'bpz', 'aj', 'trade', 'method', 'method', 'see', 'prev', 'post']


Unnamed: 0,Text,Sentiment
0,"[kicker, watchlist, xide, tit, soq, pnk, cpw, ...",[tensor(1)]
1,"[user, aap, movie, ., return, fea, geed, indic...",[tensor(1)]
2,"[user, id, afraid, short, amzn, looking, like,...",[tensor(1)]
3,"[aap, user, current, downtrend, break, ., othe...",[tensor(0)]
4,"[monday, relative, weakness, ., nyx, win, tie,...",[tensor(0)]
...,...,...
4747,"[industry, body, cii, said, discoms, likely, s...",[tensor(0)]
4748,"[gold, price, slip, r, investor, book, profit,...",[tensor(0)]
4749,"[worker, bajaj, auto, agreed, wage, cut, perio...",[tensor(1)]
4750,"[sharemarket, live, sensex, day, high, point, ...",[tensor(1)]


In [155]:
# Counter class counts number of appearances of all words
word_count = Counter()
for tweet in san_df['Text']:
    word_count.update(tweet)
        
# Create a dictionary that maps words to their one-hot vector indices
vocab = [word for word in word_count if word_count[word] >= word_frequency_requirement] # vocab contains all words meeting the word frequency requirement.

dictionary = {word : i+1 for i, word in enumerate(vocab)} # dicionary is a mapping of each vocab word to its vector index.The +1 reserves the zero index.

dictionary[None] = 0 # Index 0 is reserved to be a blanket classification for all words below the word frequency requirement.

word_count

Counter({'kicker': 2,
         'watchlist': 27,
         'xide': 2,
         'tit': 2,
         'soq': 1,
         'pnk': 1,
         'cpw': 2,
         'bpz': 1,
         'aj': 4,
         'trade': 155,
         'method': 3,
         'see': 163,
         'prev': 4,
         'post': 49,
         'user': 611,
         'aap': 781,
         'movie': 6,
         '.': 6656,
         'return': 22,
         'fea': 1,
         'geed': 1,
         'indicator': 17,
         'year': 126,
         'awesome': 8,
         'id': 11,
         'afraid': 3,
         'short': 415,
         'amzn': 90,
         'looking': 107,
         'like': 256,
         'near': 45,
         'monopoly': 2,
         'ebooks': 1,
         'infrastructure': 2,
         'service': 13,
         'current': 15,
         'downtrend': 13,
         'break': 148,
         'otherwise': 3,
         'term': 85,
         'correction': 12,
         'med': 3,
         'monday': 28,
         'relative': 7,
         'weakness': 19,
     

In [156]:
max_tweet_length = max(len(x) for x in san_df['Text'])

# Map tokenized tweets to lists of dictionary indices
encoded_df = pd.DataFrame([[list(map(lambda w : dictionary.get(w, 0), tweet)) for tweet in san_df['Text']]]).T

# Pad encoded tweets with trailing zeros
encoded_df[0] = encoded_df[0].map( lambda x: x + [0] * (max_tweet_length - len(x)) )

# Map encoded tweets to onehot vector sequences and recombine with sentiment data
onehot_df = pd.DataFrame([
    [F.one_hot(torch.LongTensor(enc_tweet), len(dictionary)) for enc_tweet in encoded_df[0]],
    san_df['Sentiment']
    ]).T

In [157]:
# Shuffle data and split into training and testing data
train_dataset = onehot_df.sample(frac = train_proportion)
test_dataset = onehot_df.drop(train_dataset.index)

train_size = train_dataset.shape[0]
test_size = test_dataset.shape[0]

train_tensor = torch.utils.data.TensorDataset(torch.stack(tuple(train_dataset[0])).type(torch.float32), torch.stack(tuple(train_dataset[1])))
test_tensor = torch.utils.data.TensorDataset(torch.stack(tuple(test_dataset[0])).type(torch.float32), torch.stack(tuple(test_dataset[1])))

train_loader = torch.utils.data.DataLoader(dataset = train_tensor, batch_size = batch_size, shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset = test_tensor, batch_size = 1, shuffle = False)

In [158]:
class SRN_model(nn.Module):
    def __init__(self, num_input, num_hid, num_out):
        super().__init__()
        self.num_hid = num_hid
        self.batch_size = 1
        self.H0= nn.Parameter(torch.Tensor(num_hid))
        self.W = nn.Parameter(torch.Tensor(num_input, num_hid))
        self.U = nn.Parameter(torch.Tensor(num_hid, num_hid))
        self.hid_bias = nn.Parameter(torch.Tensor(num_hid))
        self.V = nn.Parameter(torch.Tensor(num_hid, num_out))
        self.out_bias = nn.Parameter(torch.Tensor(num_out))

        # Various initialisation schemes. Initialisation is important.
        nn.init.zeros_(self.H0)
        nn.init.xavier_normal_(self.W)
        nn.init.xavier_normal_(self.U)
        nn.init.zeros_(self.hid_bias)
        nn.init.xavier_normal_(self.V)
        nn.init.zeros_(self.out_bias)

    def init_hidden(self):
        H0 = torch.tanh(self.H0)
        return(H0.unsqueeze(0))
 
    def forward(self, seq):
        seq_size, _ = seq.size()
        h_t = self.init_hidden()
        for t in range(seq_size):
            x_t = seq[t]
            c_t = x_t @ self.W + h_t @ self.U + self.hid_bias
            h_t = torch.tanh(c_t)
        output = h_t @ self.V + self.out_bias
        return output

In [159]:
def train(net, criterion, optimizer, data, label):

    loss = 0
    outputs = []

    for i in range(data.shape[0]):
        net.init_hidden()

        # Forward
        output = net(data[i])

        # Apply output nonlinearity. Log_softmax chosen as it is suited for classification tasks
        outputs.append(F.log_softmax(output, dim=1))
    
    loss = criterion(torch.cat(outputs, dim=0), torch.squeeze(label,1))
    
    loss.backward()

    optimizer.step()

    return loss.data.item()

In [160]:
net = SRN_model(len(dictionary),hidden_layer_size,2)

# Negative log likelihood loss. Suited for classification tasks.
criterion = F.nll_loss

optimizer = optim.Adam(net.parameters(), lr=learning_rate, weight_decay=0.00001)

In [162]:
plot_loss = []
plot_correct = []

num_batches = train_size//batch_size

for e in range(epochs):
    loss = 0.

    # Trains on every training data item individually each epoch
    for data, label in train_loader:
        optimizer.zero_grad()
        loss += train(net, criterion, optimizer, data, label)

    # Evaluate proportion of the test set correctly predicted.
    correct = 0
    for data, label in test_loader:
        output = net(data[0])
        if (torch.argmax(output.data) == label[0][0]): correct += 1
    accuracy = correct/test_size*100

    # Append loss and accuracy results to lists for later plotting.
    plot_loss.append(loss/num_batches)
    plot_correct.append(accuracy)
    
    # Print loss and accuracy every epoch.
    print("Epoch %02d, loss = %f, accuracy = %.2f%%" % (e+1, loss / num_batches, accuracy))

Epoch 01, loss = 0.663568, accuracy = 62.74%
Epoch 02, loss = 0.662446, accuracy = 62.74%


KeyboardInterrupt: 

In [None]:
# Plot results
plt.plot(plot_loss)
plt.xlabel('Epoch')
plt.ylabel('Avg. Loss per Epoch (on Training Set)')
plt.show()

plt.plot(plot_correct)
plt.xlabel('Epoch')
plt.ylabel('Accuracy per Epoch (on Test Set)')
plt.show()