In [1]:
from __future__ import division
import argparse
import pandas as pd

# useful stuff
import numpy as np
from scipy.special import expit
from sklearn.preprocessing import normalize
from random import choices
from tqdm.notebook import tqdm
import nltk 
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from collections import defaultdict
import random
import re
import pickle
import time


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
## MOUNT DRIVE 
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import spacy
nlp = spacy.load('en')

In [4]:
def preprocess(sentence) :
  sentence = re.sub("[^a-zA-Z0-9]"," ", sentence) ## remove non alpha numerical  caracters
  sentence = re.sub(' +', ' ', sentence ) ## remove unnecessary spaces
  sentence = re.sub("[0-9]+","NUMTOKEN", sentence) ## create NUMTOKEN word for numbers
  return sentence

def text2sentences(path) :
    sentences = []
    with open(path) as f :
      for l in tqdm(f) :
        sentence = []
        doc = nlp(preprocess(l.lower()))
        for token in doc:
          sentence.append(token.lemma_)
        sentences.append(sentence)
    return sentences

def loadPairs(path):
	data = pd.read_csv(path, delimiter='\t')
	pairs = zip(data['word1'],data['word2'],data['similarity'])
	return pairs

In [14]:
class SkipGram():
    def __init__(self, sentences, path_save  , lr = 1e-2 , nEmbed=100, 
                 negativeRate=5, winSize = 5, minCount = 2):
      self.w2id = {} # word to ID mapping
      
      self.vocab = [] # list of valid words
      self.occurences = defaultdict(int)
      self.minCount = minCount
      self.get_vocab(sentences)
      self.trainset = self.subsample(sentences)
      ## Initialize weights 
      self.W_  = np.random.normal(0, 0.1, (nEmbed,len(self.vocab)))
      self.C_ = np.random.normal(0, 0.1, (nEmbed,len(self.vocab)))
      ## learning rate
      self.lr_ = lr
      self.neg_ = negativeRate
      self.winSize = winSize
      self.loss= []
      self.path_save = path_save
      ## random vector for unknown words in test time.
      self.random_vector = np.random.normal(0,1,(nEmbed,1))
    def get_vocab(self,sentences) :
        """
        Create vocab , word to id dict and a dict of occurences
        """ 
        for sentence in sentences : 
          for word in sentence :
            self.occurences[word]+=1

        self.occurences  = dict(self.occurences)
        ## filter by mincount
        self.occurences  = {k: v for k, v in self.occurences.items() if v>= self.minCount}
        self.vocab = list(self.occurences.keys())
        self.w2id = dict(zip(self.vocab,range(len(self.vocab))))

        ## occurences for negative sampling
        sum_neg = np.sum(np.array(list(self.occurences.values())) **(3/4))
        self.occurences_neg = {k:(v**(3/4))/sum_neg for k,v in self.occurences.items()}
    def subsample(self,sentences) :
      """
      This function subsamples words that are frequent in the dataset.
      """
      trainset_ = []
      total_occurences = sum(list(self.occurences.values())) ## normalize
      occurences_ratio = {k:v/total_occurences for k,v in self.occurences.items()} ## normalize 
      for sentence in sentences :
        current = []
        for word in sentence :
          ## if not in vocab (filtered by min count)
          if word not in self.vocab : 
            continue
          ## This is the probability to keep the word. The formulation is taken from word2vec paper. 
          prob = (np.sqrt(occurences_ratio[word]/1e-3)+1) * (1e-3/occurences_ratio[word])
          k = random.random()
          if k<prob :
            current.append(word) 
        trainset_.append(current)
      return trainset_


    def sample(self, omit):
    
        ## we extract self.neg_ + 2 samples because in the worst case, two samples correspond to omit set.
        rand_ = np.random.multinomial(n=self.neg_+2, pvals = list(self.occurences_neg.values()))
        rand_ = list(np.where(rand_>=1)[0])
        rand_ = [id_ for id_ in rand_ if id_ not in omit]
        return rand_[:self.neg_]

    def sigmoid(self,x) :
        return (1/(1+np.exp(-x)))
    def train(self):
        self.trainWords , self.acc = 0 , 0 
        for counter, sentence in tqdm(enumerate(self.trainset) , total = len(self.trainset)):
            sentence = list(filter(lambda word: word in self.vocab, sentence))

            for wpos, word in enumerate(sentence):
                wIdx = self.w2id[word]
                winsize = np.random.randint(self.winSize) + 1
                start = max(0, wpos - winsize)
                end = min(wpos + winsize + 1, len(sentence))

                for context_word in sentence[start:end]:
                    ctxtId = self.w2id[context_word]
                    if ctxtId == wIdx: continue
                    negativeIds = self.sample({wIdx, ctxtId})
                    self.trainWord(wIdx, ctxtId, negativeIds)
                    self.trainWords += 1
                  
            if counter % 1000 == 0:
                print (' > training %d of %d' % (counter, len(self.trainset)))
                acc_norm = self.acc/((1+self.neg_) * self.trainWords)
                print('loss : ',acc_norm)
                self.loss.append(acc_norm)
                self.trainWords = 0
                self.acc = 0.
                self.save(self.path_save)

    def trainWord(self, wordId, contextId, negativeIds):
        t1 = time.time()

        z_negatives = np.zeros((len(self.vocab),len(negativeIds)))
        for k,index in enumerate(negativeIds) :
            z_negatives[index,k] = 1
        x_w = self.W_[:,wordId].reshape(-1,1)
        y_c = self.C_[:,contextId].reshape(-1,1)
        Z_c = self.C_[:,[neg_id for neg_id in negativeIds]]
        t2 = time.time()
        ## compute gradients 
        negative_grad_W = np.zeros((self.W_.shape[0],1))
        
        
        sig_neg = self.sigmoid(x_w.T @ Z_c) 
        negative_grad_W =  - np.sum(sig_neg * Z_c, axis = 1).reshape(-1,1)
        negative_grad_C =  - np.sum(sig_neg * z_negatives ,axis=1).reshape(1,-1)

        self.acc -= np.sum(np.log(1-sig_neg + 1e-6),axis=1)[0]
        t3 = time.time()
        
        sig_pos = self.sigmoid( - x_w.T @ y_c)[0][0]
        self.acc -= np.log(1-sig_pos + 1e-6) 
        grad_w = np.zeros_like(self.W_)
        grad_w [:,wordId] = (sig_pos * y_c   + negative_grad_W).squeeze()
        grad_c = np.zeros_like(self.C_)
        
        for neg_id in negativeIds:
          grad_c[:,neg_id ] = x_w[:,0]*negative_grad_C[:,neg_id]
        grad_c[:,contextId]=sig_pos*x_w[:,0]

        t4 = time.time()

        ## update weights 
        #only one column in W_ needs update, the rest are zeros
        self.W_[:,wordId]+=self.lr_ * grad_w[:,wordId] ## gradient ascent since we want the argmax
        #only modified context words will be updated
        for c_id in negativeIds+[contextId]:
          self.C_[:,c_id]+=self.lr_ * grad_c[:,c_id]
        """
        print('x_w',t2 - t1)
        print("negative grad",t3-t2)
        print("grad" , t4 - t3)
        """

    def save(self,path):
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    def similarity(self,word1,word2):
        """
        computes similiarity between the two words. unknown words are mapped to one common vector
        :param word1:
        :param word2:
        :return: a float \in [0,1] indicating the similarity (the higher the more similar)
        """
        if word1 not in self.vocab  : 
          x_w = self.random_vector
        else : 
          id1 = self.w2id[word1]
          x_w = self.W_ [id1]

        if word2 not in self.vocab  : 
          x_c = self.random_vector
        else : 
          id2 = self.w2id[word2]
          x_c = self.W_ [id2]

        return self.sigmoid(x_w.T @ x_c)[0][0]
        

    @staticmethod
    def load(path):
        with open(path, 'rb') as f:
            return pickle.load(f)




# Main

In [6]:
test = False
text_path = '/content/drive/MyDrive/data_CS_MVA/NLP/train_20000.txt'

In [7]:
sentences = text2sentences(text_path)


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
#sg = SkipGram.load('/content/drive/MyDrive/model_skip_10000_2epoch.pkl')
sg = SkipGram(sentences , lr= 1e-2 , nEmbed = 100, negativeRate= 5 , 
              minCount= 5 , path_save ='/content/drive/MyDrive/model_skip_20000_1epoch.pkl' )
sg.train()

HBox(children=(FloatProgress(value=0.0, max=20000.0), HTML(value='')))

 > training 0 of 20000
loss :  0.6937914073659904


In [None]:
sg.save('/content/drive/MyDrive/model_skip_20000_1epoch.pkl')

NameError: ignored

In [None]:
def similarity_(word1,word2):
        """
        computes similiarity between the two words. unknown words are mapped to one common vector
        :param word1:
        :param word2:
        :return: a float \in [0,1] indicating the similarity (the higher the more similar)
        """
        if word1 not in sg.vocab  : 
          x_w = sg.random_vector
        else : 
          id1 = sg.w2id[word1]
          x_w = sg.W_[:,id1].reshape(-1,1)

        if word2 not in sg.vocab  : 
          x_c = sg.random_vector
        else : 
          id2 = sg.w2id[word2]
          x_c = sg.W_[:,id2].reshape(-1,1)

        return sg.sigmoid(x_w.T @ x_c)[0][0]


In [None]:
sg.vocab

In [None]:
key = "woman"
n_best = 15
all_similarities = np.array([similarity_(key,query) for query in sg.vocab])
best_argmaxes = all_similarities.argsort()[-n_best:][::-1]
for argmax in best_argmaxes :
  print(sg.vocab[argmax],all_similarities[argmax])

-PRON- 0.5292255896784983
the 0.5286880197584928
of 0.5285503032997688
and 0.5277966444149033
be 0.5275484744018022
s 0.5245874339006774
in 0.5245120601573754
NUMTOKEN 0.524376608274769
to 0.5222469721850704
that 0.5214711250668732
a 0.5208129189213827
from 0.520420156003861
have 0.5203921570852976
with 0.5191866305574115
for 0.5181405734003708


In [None]:
sg.similarity('nurse','woman')

0.9607249778445325

In [None]:
sg.w2id['new']

68

In [None]:
sg.save('/content/drive/MyDrive/model_skip_10000.pkl')