In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
#Necessary Library
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
#Import stopwords
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

import torch
torch.manual_seed(10)
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##Helper Function

In [None]:
stop_words = set(stopwords.words('english'))

def preprocess(corpus):
    result = []
    for i in corpus:
        i = gensim.utils.simple_preprocess(i, deacc=True) #Remove Punctuation
        i = " ".join(i)
        out = nltk.word_tokenize(i)
        out = [x.lower() for x in out]
        out = [x for x in out if x not in stop_words]
        result.append(" ". join(out))
    return result

def create_vocabulary(corpus):
    '''Creates a dictionary with all unique words in corpus with id'''
    id2word = {}
    vocabulary = {}
    i = 0
    for d in corpus:
        for w in d.split():
            if w not in vocabulary:
                vocabulary[w] = i
                i+=1
                id2word[i] = w
    return vocabulary, id2word

def prepare_dataset(corpus, window_size):
  '''
  neighbor to look at each direction
  total neighbor is neighbor*2, window size is neighbor*2+1
  '''
  neighbor = window_size//2
  total_neighbor = neighbor*2

  columns = ['Input', 'Output']
  

  all_row = []

  for doc in corpus:
      for i, w in enumerate(doc.split()):
          inp = w
          for n in range(1, neighbor+1):

              #look left
              if (i-n) >= 0:
                out = doc.split()[i-n]

              #look right
              if (i+n)<len(doc.split()):
                out = doc.split()[i+n]

              row = [inp,out]
              all_row.append(row)
  dataset = pd.DataFrame(all_row, columns=columns)
  return dataset

##Load Data

In [None]:
df = pd.read_csv('Review_word2vec_v1.csv')
data = df["Text"].values.tolist()

#Set Corpus
corpus=data

##Start Here

In [None]:
#Hyper-parameter
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
window = 3
embedding_dim = 50
batch = 128
num_epochs = 5
load_model = True

In [None]:
corpus = preprocess(corpus)
vocabulary, id2word = create_vocabulary(corpus)

In [None]:
train_data = prepare_dataset(corpus, window_size=window)

#Replace word with idx
train_data.Input = train_data.Input.map(vocabulary)
train_data.Output = train_data.Output.map(vocabulary)
# print(train_data.head())


train_data_input_loaded = DataLoader(train_data.Input.values, batch_size=batch) #.values Return a Numpy representation of the DataFrame.
train_data_output_loaded = DataLoader(train_data.Output.values, batch_size=batch)

In [None]:
#More Helper
vocab_size = len(vocabulary)

def convert_one_hot_tensor(tensor):
    '''Transform 1D tensor of word indexes to one-hot encoded 2D tensor'''
    size = [*tensor.shape][0]
    inp = torch.zeros(size, vocab_size).scatter_(1, tensor.unsqueeze(1), 1.)
    return inp.float()

In [None]:
class SG(nn.Module):
  def __init__(self, vocab_size, embedding_dim):
    super(SG, self).__init__()
    self.FC1 = nn.Linear(vocab_size, embedding_dim)
    self.FC2 = nn.Linear(embedding_dim, vocab_size)

  def forward(self, x):
    x = F.relu(self.FC1(x))
    x = self.FC2(x)
    return x

def save_checkpoint(checkpoint, filename="my_checkpoint.pth.tar"):
  # print("=> Saving Checkpoint")
  torch.save(checkpoint, filename)

def load_checkpoint(checkpoint):
  # print("=> Loading Checkpoint")
  model.load_state_dict(checkpoint["model"])
  optimizer.load_state_dict(checkpoint["optimizer"])

In [None]:
loss_function = torch.nn.CrossEntropyLoss()
model = SG(vocab_size, embedding_dim ).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

##Train

In [None]:
if load_model:
  load_checkpoint(torch.load("my_checkpoint.pth.tar"))
for epoch in range(num_epochs):
  total_loss = 0
  if epoch%2==0:
    checkpoint ={'model': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint)

  for x, y in zip(train_data_input_loaded, train_data_output_loaded):
    x = convert_one_hot_tensor(x)
    x = x.to(device)
    y = y.to(device)

    #forward
    scores = model(x)
    loss = loss_function(scores, y)

    #backward
    optimizer.zero_grad()
    loss.backward()

    #weight update SGD
    optimizer.step()
    total_loss += loss.item()
  if epoch%2==0:
    print(f'Epoch: {epoch} Loss: {total_loss}')


Epoch: 0 Loss: 153287.1627960205
Epoch: 2 Loss: 140333.50424957275
Epoch: 4 Loss: 133264.14703845978


#Load Trained File

In [None]:
vocab_size = 39859
embedding_dim = 50

checkpoint = torch.load("my_checkpoint.pth.tar")
model = checkpoint["model"]
optimizer = checkpoint["optimizer"]

print(model['FC1.weight'].size())
print(model['FC2.weight'].size())

word_vectors = model['FC1.weight']

word_vectors = np.transpose(word_vectors.cpu().numpy())

print(f'Vacabulary Size: {vocab_size} Embedding Dimension: {embedding_dim}')
print(word_vectors.shape)


torch.Size([50, 39859])
torch.Size([39859, 50])
Vacabulary Size: 39859 Embedding Dimension: 50
(39859, 50)


#Convert words to Tensor

In [None]:
words = ["Coffee", "Pasta" ,"Tuna", "Cookies"]
top = 10

word_idx = [vocabulary[word.lower()] for word in words]

words_hot = []
for id in word_idx:
  x = np.zeros(vocab_size)
  x[id] = 1

  words_hot.append(x)
words_hot = torch.tensor(words_hot, device=device).float()

W1 = torch.transpose(model['FC1.weight'], 0, 1).float()
W2 = torch.transpose(model['FC2.weight'],0, 1).float()

h = words_hot.mm(W1)
y_pred = h.mm(W2)

res_val, res_ind = y_pred.sort(descending=True, dim=1)
res_ind = res_ind[:][:top]

# res_arg = torch.argmax(y_pred, dim=1)

# res_idx = res_arg.cpu().numpy()

res_idx = res_ind.cpu().numpy()
for i in range(len(res_idx)):
  print(f'Top 10 word for {words[i]}')
  for j in range(top):
    print(id2word[res_idx[i][j]])

Top 10 word for Coffee
craving
items
quality
stores
mints
looks
thanks
trips
clock
great
Top 10 word for Pasta
feet
althoug
coconout
cashier
drown
lime
riddled
considered
unseasoned
amsterdan
Top 10 word for Tuna
sudorific
crisp
numb
packers
undertones
grinder
sometimes
incur
languages
determination
Top 10 word for Cookies
present
craving
trees
cup
clean
easy
straw
quality
items
since


#Word Analogy with Glove

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip

--2021-09-21 03:43:10--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-09-21 03:43:11--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-09-21 03:45:51 (5.14 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [None]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
import numpy as np

In [None]:
def read_data(file_name):
    with open(file_name,'r') as f:
        vocabluary = set()
        word2vec = {}

        for line in f:
          line = line.strip() #remove unecessary whitespce in the begining
          row = line.split()
          word = row[0] #first element is word
          vocabluary.add(word)
          word2vec[word] = np.array(row[1:], dtype=float)
    return vocabluary, word2vec

In [None]:
vocab, w2v = read_data("./glove.6B.300d.txt")
print(f'Vocabulary Size: {len(vocab)}')

Vocabulary Size: 400000


In [None]:
def cos_sim(u,v):
    """
    u: vector of 1st word
    v: vector of 2nd Word
    """
    numerator_ = u.dot(v)
    denominator_= np.sqrt(np.sum(np.square(u))) * np.sqrt(np.sum(np.square(v)))
    return numerator_/denominator_

In [None]:
def find_word4(word1, word2, word3):
    word4 = ""

    word1, word2, word3 = word1.lower(), word2.lower(), word3.lower()
    diffVec = w2v[word3] - (w2v[word1] - w2v[word2])#word1 - word2 = word3 - word4

    max_sim = -100000
    for word in vocab:
      vec = w2v[word]
      sim_ = cos_sim(diffVec, vec)
      if sim_> max_sim:
        max_sim = sim_
        word4 = word

    return word4

In [None]:
print("Spain is to Spanish as Germany is to"+find_word4('Spain','Spanish','Germany'))
print("Japan is to Tokyo as France is to "+find_word4('Japan','Tokyo','France'))
print("Woman is to Man as Queen is to "+find_word4('Woman','Man','Queen'))
print("Australia is to Hotdog as Italy is to "+find_word4('Australia','Hotdog','Italy'))

Spain is to Spanish as Germany is togerman
Japan is to Tokyo as France is to paris
Woman is to Man as Queen is to queen
Australia is to Hotdog as Italy is to hotdog


[1. SkipGram Intution Chris Tutorial](http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/)

[2. PyTorch Tutorial CBOW and N-gram](https://pytorch.org/tutorials/beginner/nlp/word_embeddings_tutorial.html)

[3. Skipgram Explained with Code (***)](https://www.kaggle.com/karthur10/skip-gram-implementation-with-pytorch-step-by-step#Skip-Gram-example-with-PyTorch)

[4. Skipgram and CBOW Pytorch clean code (***)](https://srijithr.gitlab.io/post/word2vec/)
