### Word2Vec 
word embedding using Stanford's word embedding tool (Google's Word2Vec is too slow and memory consuming)

In [3]:
from gensim.models import KeyedVectors
filename = "./embedding/GoogleNews-vectors-negative300.bin"
model = KeyedVectors.load_word2vec_format(filename, binary = True)

In [5]:
### STANFORD WORD2VEC ###
#create word2vec model:

from gensim.scripts.glove2word2vec import glove2word2vec

####SINCE ALREADY CREATED EMBEDDING FILE, DON'T NEED TO RUN CODE BELOW ###

#glove_input_file = './embedding/glove.6B.50d.txt'
#word2vec_output_file = './embedding/word2vec.txt'
#glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 50)

In [8]:
#create word2vec model: 

from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('./embedding/word2vec.txt', binary = False)

In [20]:
#https://stackoverflow.com/questions/49710537/pytorch-gensim-how-to-load-pre-trained-word-embeddings
# import weights from embedder into torch
weights = torch.FloatTensor(model.vectors)

In [23]:
len(weights[0])

50

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

money = pd.read_csv("./final_csvs/money.csv")
text = money["text"].str.strip("['[").str.strip("']") #.str.strip("By w+,")

In [10]:
result = model.most_similar(positive = ['woman', 'king'], negative = ['man'], topn=1)

In [29]:
model.

### Building the model using PyTorch tutorial by Robert Guthrie at
https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html

In [12]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

In [13]:
money = pd.read_csv("./final_csvs/money.csv")

In [59]:
text = money["text"]


#### My Code:

In [15]:
### initialize LSTM:
lstm = nn.LSTM(2,2) #input 2, output 2


For any embedding, need to get a dictionary of words that have values (index/position and embedding vector). Actually gensim does this for us automatically. example below: just use "model" as a dictionary and you get an embedding back:

In [35]:
model["jelly"]

array([-0.047361, -0.61568 , -0.46639 ,  0.12377 ,  0.82296 ,  0.4487  ,
       -0.50167 , -0.38806 ,  0.17726 ,  0.4479  ,  0.58177 ,  0.40022 ,
        1.0288  ,  0.48169 ,  0.014657,  0.12539 , -0.48038 ,  0.64939 ,
       -0.19735 , -1.0002  , -0.17713 , -0.51028 ,  1.359   , -0.026779,
       -0.29515 ,  0.22049 , -1.2501  ,  0.57992 ,  0.6166  , -0.67485 ,
        0.48102 , -0.45007 , -0.86688 ,  1.614   ,  0.13927 , -0.17601 ,
       -0.51826 ,  0.14967 ,  1.3347  ,  0.18456 ,  0.5102  ,  0.2141  ,
       -0.84844 , -0.35747 ,  0.703   ,  0.94865 ,  0.11861 , -0.50851 ,
       -0.89103 , -0.14026 ], dtype=float32)

references: https://medium.com/@martinpella/how-to-use-pre-trained-word-embeddings-in-pytorch-71ca59249f76
Used above link as starter code to try and make RNN

right now we have a dictionary, need to create an embedding layer. need a matrix of weights for embedding layer. shape = (dataset vocab length, word vec dimension)

In [61]:
%time
#get all the text into one vector... might take a little while.

full_text = text.sum()

Wall time: 1 ms


In [47]:
full_words = set(pd.Series(full_text.split(" ")).str.strip(",").str.strip(".").str.strip(" ").str.lower())

In [49]:
import re

In [67]:
full_text[:200]

"['[Potential ways to Save using a Free Library Card', 'Last year, I was contemplating to buy a premium subscription from one of the well-known investment research journals to improve my investment IQ."

In [98]:
#removing alphanumeric:  https://stackoverflow.com/questions/1276764/stripping-everything-but-alphanumeric-chars-from-a-string-in-python

#get rid of unnecessary punctuation
pattern = re.compile("[^\w\.\!\?-]+")
words = pattern.sub(" ", full_text)

#keep ending punctuation
pattern = re.compile("[\.]+")
periods = pattern.sub(" .", words)

pattern = re.compile("[\!]+")
exclaim = pattern.sub(" !", periods)

pattern = re.compile("[\?]+")
questions = pattern.sub(" ?", exclaim)

all_words = questions.lower().strip().split(" ")

In [99]:
article_words = list(set(all_words))

Making embedding layer

In [141]:
#need to make a matrix with the length of all the words in the vocab
len_matrix = len(article_words)

#should be a matrix of vocab across, word vector dimension down
weight_matrix = np.zeros((len_matrix, 50))

#words not found in Glove
words_not_found = 0
weird_words = []

#make a matrix
for i, word in enumerate(article_words):
    try: 
        weight_matrix[i] = model[word]
    except:
        weight_matrix[i] = np.random.normal(scale =0.6, size = (50,))
        words_not_found +=1
        weird_words.append(word)
        

In [142]:
#seems like there are a few words which aren't present in the encoding
"The amount of words not found is equal to {} which is {}%".format(words_not_found, round(words_not_found/len_matrix*100, 2))

#weird_words[:100]

'The amount of words not found is equal to 6533 which is 16.98%'

Make a NN with embedding layer = first layer. Needs to make the words quantitative (ie make them into vectors like one-hot encoding but with embedding rather than binary values.) Transforms the original input words:

In [150]:
weight_in_tensor = torch.from_numpy(weight_matrix)

In [154]:
def embedding_layer(weights, train=True):
    """Takes in a numpy array of weights as created by above and creates
    an embedding layer. Returns layer and dimensions"""
    
    #pytorch needs tensors:
    weights = torch.from_numpy(weights)
    
    #get dimensions of weight vector, words vs dimension vector
    embed_num, embed_dim = weights.shape[0], weights.shape[1]
    
    #the embedding layer needs to be this dimension of weights matrix
    embed_layer = nn.Embedding(embed_num, embed_dim)
    
    #load in the dictionary of weights from the embedder
    embed_layer.load_state_dict({"weight": weights})
    
    #updates conditions if the layers should be trained or not
    if train:
        embed_layer.weight.requires_grad = True
    
    #return the layer with the embedding, and its dimensions
    return embed_layer, embed_num, embed_dim

In [156]:
embedding_layer(weight_matrix, False)

(Embedding(38476, 50), 38476, 50)

Making the RNN: based on the code in the article:

In [157]:
class NN(nn.Module):
    """Making a NN LSTM model with embedding layer"""
    def __init__(self, weights, hidden_layer_size, num_layers):
        #use the super class (nn.Module) initializer
        super(self).__init__()
        self.embedding, embed_num, embed_dim = embedding_layer(weight_matrix)
        self.hidden_size = hidden_layer_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(embed_dim, hidden_layer_size, num_layers, batch_first = True)
    
    def forward(self, input_values, hidden):
        return self.lstm(self.embedding(input_values), hidden)
    
    def init_hidden(self, batch_size):
        return Variable(torch.zeros(self,nunm_laeyrs, batch_size, self.hidden_size))
        
        

In [159]:
lstm = NN(weight_matrix, 100, 2)

TypeError: super() argument 1 must be type, not NN