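You will need to install (via pip3): gensim, nltk, pandas, torch.  
You will also need to download the NLTK data listed below, either by running these commands in a python3 terminal session or by running the first code cell:  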
`>>>import nltk`  
`>>>nltk.download('stopwords')`  
`>>>nltk.download('wordnet')`  
`>>>nltk.download('omw-1.4')`

In [5]:
# Download the NLTK data packages this notebook relies on. Each call returns a
# status flag; the value of the last call is what the cell displays as output.
import nltk
nltk.download('stopwords')  # read later through nltk.corpus.stopwords
nltk.download('wordnet')  # presumably the data behind WordNetLemmatizer - confirm
nltk.download('omw-1.4')  # listed as required in the setup notes at the top of the notebook

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\roger\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\roger\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\roger\AppData\Roaming\nltk_data...


True

In [9]:
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
import torch
import pickle

# Read the labelled tweets; the file is opened explicitly so it is decoded as UTF-8.
with open('stock_data.csv', encoding='utf8') as csvfile:
    df = pd.read_csv(csvfile, delimiter=',')

# Discard every row that contains at least one null value.
df = df.dropna(axis=0, how='any')

# Tally the labels once and reuse the tally: 1 marks a positive tweet, -1 a negative one.
sentiment_counts = df['Sentiment'].value_counts()
num_positive = sentiment_counts[1]
num_negative = sentiment_counts[-1]
print(sentiment_counts)

154
 1    3685
-1    2106
Name: Sentiment, dtype: int64


In [3]:
# Hyperparameters
# Number of times a word has to appear to be given its own encoding, expressed
# as 0.13% of the number of rows in the dataset. All words under this limit are
# encoded as the same 'unknown' word.
word_frequency_requirement = 0.0013 * len(df)
sg = 0  # forwarded to Word2Vec as its `sg` argument
vector_size = 1000  # length of each word vector (and of each zero padding vector)

In [4]:
# Regex removal of various undesirable parts of a tweet
def clean_tweet(tweet):
    """Normalise a raw tweet into lowercase, space-separated tokens.

    Steps, in order: strip @handles and URLs, delete apostrophes (so a
    contraction collapses into one word, e.g. "don't" -> "dont"), replace every
    character that is not an ASCII letter or one of ``. ! ?`` with a space,
    isolate each ``. ! ?`` as its own token, collapse runs of spaces, and
    lowercase the result.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        str: The cleaned tweet. It may start and/or end with a single space, so
        callers that tokenise it should use ``str.split()``.
    """
    # Handles may contain underscores; without "_" in the class, the tail of a
    # handle such as "@some_user" would survive as the stray word "user".
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Consume the URL up to the next whitespace so hyphens, query strings, etc.
    # inside it do not leak into the text as stray words.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    tweet = re.sub(r"'", "", tweet)  # Apostrophe removal
    tweet = re.sub(r"[^a-zA-Z.!?]", ' ', tweet)  # Keep only letters and sentence endings
    # Put spaces around sentence endings so they are encoded as their own
    # words rather than being lumped in with a neighbouring word.
    tweet = re.sub(r"([.!?])", r" \1 ", tweet)
    tweet = re.sub(r" +", ' ', tweet)  # Excess whitespace removal
    return tweet.lower()

In [5]:
# Shared resources for sanitisation: the English stopword set and a WordNet lemmatizer.
stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def sanitise(tweet):
    """Clean a tweet, drop its stopwords and lemmatize the remaining words.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        list[str]: Lemmatized tokens of the cleaned tweet, in their original
        order, with every token found in ``stops`` left out.
    """
    # NOTE(review): clean_tweet deletes apostrophes before this filter runs, so
    # any stopword entry spelt with an apostrophe can never match a token here
    # - confirm whether that is intended.
    tokens = clean_tweet(tweet).split()
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stops]

In [6]:
# Tokenise every tweet, then fit a Word2Vec model on the tokenised corpus.
sentences = [sanitise(text) for text in df['Text']]
# min_count=1 is passed explicitly instead of the default of 5
# (other defaults: vector_size=100, sg=0 i.e. CBOW).
model = Word2Vec(sentences=sentences, vector_size=vector_size, sg=sg, min_count=1)
wv = model.wv  # word vectors of the trained model

In [8]:
# Column 0 holds each tweet as a list of per-word vectors; column 1 holds its
# class tensor (tensor([1]) for sentiment 1, tensor([0]) for anything else).
tweet_vectors = [[wv[token] for token in tokens] for tokens in sentences]
class_tensors = df['Sentiment'].map(lambda label: torch.tensor([1 if label == 1 else 0]))
encoded_df = pd.DataFrame([tweet_vectors, class_tensors]).T

max_tweet_length = max(map(len, encoded_df[0]))
print(max_tweet_length)

# Right-pad every tweet with zero vectors up to the longest tweet, then turn
# the padded list of vectors into a single FloatTensor.
zero_vector = [0] * vector_size
encoded_df[0] = encoded_df[0].map(
    lambda vectors: torch.FloatTensor(
        vectors + [zero_vector] * (max_tweet_length - len(vectors))
    )
)
print(encoded_df[0])

# Persist the encoded frame for later use.
with open("encoded_dataframe", "wb") as encoded_dataframe:
    pickle.dump(encoded_df, encoded_dataframe)

37
0       [[tensor(0.0115), tensor(0.0052), tensor(0.009...
1       [[tensor(0.2940), tensor(0.1189), tensor(0.233...
2       [[tensor(0.2940), tensor(0.1189), tensor(0.233...
3       [[tensor(0.0067), tensor(0.0027), tensor(0.005...
4       [[tensor(0.0643), tensor(0.0267), tensor(0.050...
                              ...                        
5786    [[tensor(0.1110), tensor(0.0445), tensor(0.087...
5787    [[tensor(0.0804), tensor(0.0314), tensor(0.063...
5788    [[tensor(0.0379), tensor(0.0162), tensor(0.029...
5789    [[tensor(0.0010), tensor(0.0017), tensor(0.001...
5790    [[tensor(0.2201), tensor(0.0922), tensor(0.182...
Name: 0, Length: 5791, dtype: object
