# Word2Vec

This notebook is a cleaned version of the Word2Vect practical, adapted to a French dataset, in order to get usable embeddings

In [1]:
import collections

import math
import numpy as np

import random
import sys
import time
import zipfile

import os
from pathlib import Path

In [4]:
ROOT_DIR='content'
ROOT_DIR = Path.home()
data_path = os.path.join(ROOT_DIR,'Desktop/NLProj/data/')
file = 'wikipediaTXT.txt'
with open(data_path+file, 'r',encoding='ISO-8859-1') as f:
    lines = f.readlines()
    # st is the abbreviation of "sentence" in the loop
    raw_dataset = [st.split() for st in lines]

'# sentences: %d' % len(raw_dataset)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc5 in position 6: invalid continuation byte

In [8]:
# tk is an abbreviation for "token" in the loop
counter = collections.Counter([tk for st in raw_dataset for tk in st])
counter = dict(filter(lambda x: x[1] >= 5, counter.items()))

In [9]:
idx_to_token = [tk for tk, _ in counter.items()]
token_to_idx = {tk: idx for idx, tk in enumerate(idx_to_token)}
dataset = [[token_to_idx[tk] for tk in st if tk in token_to_idx]
           for st in raw_dataset]
num_tokens = sum([len(st) for st in dataset])
'# tokens: %d' % num_tokens

'# tokens: 887100'

In [12]:
def discard(idx):
    return random.uniform(0, 1) < 1 - math.sqrt(
        1e-4 / counter[idx_to_token[idx]] * num_tokens)

subsampled_dataset = [[tk for tk in st if not discard(tk)] for st in dataset]
'# tokens: %d' % sum([len(st) for st in subsampled_dataset])

'# tokens: 375392'

In [218]:
def compare_counts(token):
    return '# %s: before=%d, after=%d' % (token, sum(
        [st.count(token_to_idx[token]) for st in dataset]), sum(
        [st.count(token_to_idx[token]) for st in subsampled_dataset]))

In [15]:
def get_centers_and_contexts(dataset, max_window_size):
    centers, contexts = [], []
    for st in dataset:
        # Each sentence needs at least 2 words to form a
        # "central target word - context word" pair
        if len(st) < 2:
            continue
        centers += st
        for center_i in range(len(st)):
            window_size = random.randint(1, max_window_size)
            indices = list(range(max(0, center_i - window_size),
                                 min(len(st), center_i + 1 + window_size)))
            # Exclude the central target word from the context words
            indices.remove(center_i)
            contexts.append([st[idx] for idx in indices])
    return centers, contexts

In [19]:
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)

In [20]:
def get_negatives(all_contexts, sampling_weights, K):
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        negatives = []
        while len(negatives) < len(contexts) * K:
            if i == len(neg_candidates):
                # An index of k words is randomly generated as noise words
                # based on the weight of each word (sampling_weights). For
                # efficient calculation, k can be set slightly larger
                i, neg_candidates = 0, random.choices(population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # Noise words cannot be context words
            if neg not in set(contexts):
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

sampling_weights = [counter[w]**0.75 for w in idx_to_token]
all_negatives = get_negatives(all_contexts, sampling_weights, 5)

In [23]:
import torch
import torch.nn as nn

In [24]:
from torch.utils.data import Dataset, DataLoader

In [25]:
class PTB_dataset(Dataset):
    
    def __init__(self, all_centers, all_contexts, all_negatives):
        self.all_centers, self.all_contexts_negatives, self.all_masks, self.all_labels = self.batchify(list(zip(all_centers,all_contexts,all_negatives)))
        
    def __len__(self):
        return len(self.all_centers)
    
    def __getitem__(self,idx):
        return self.all_centers[idx], self.all_contexts_negatives[idx], self.all_masks[idx], self.all_labels[idx]
        
    def batchify(self,data):
        max_len = max(len(c) + len(n) for _, c, n in data)
        centers, contexts_negatives, masks, labels = [], [], [], []
        for center, context, negative in data:
            cur_len = len(context) + len(negative)
            centers += [center]
            contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
            masks += [[1] * cur_len + [0] * (max_len - cur_len)]
            labels += [[1] * len(context) + [0] * (max_len - len(context))]
        return (torch.tensor(centers).view((-1, 1)), torch.tensor(np.array(contexts_negatives)),
            torch.tensor(np.array(masks)), torch.tensor(np.array(labels)))
        

In [26]:
ptbdata = PTB_dataset(all_centers, all_contexts, all_negatives)

In [28]:
batch_size = 512

data_loader = DataLoader(ptbdata, batch_size, shuffle=True,
                              num_workers=4)
for batch in data_loader:
    for name, data in zip(['centers', 'contexts_negatives', 'masks',
                           'labels'], batch):
        print(name, 'shape:', data.shape)
    break

centers shape: torch.Size([512, 1])
contexts_negatives shape: torch.Size([512, 60])
masks shape: torch.Size([512, 60])
labels shape: torch.Size([512, 60])


In [29]:
# taken from the spotlight library, see
# https://github.com/maciejkula/spotlight/blob/master/spotlight/layers.py
class ScaledEmbedding(nn.Embedding):
    """
    Embedding layer that initialises its values
    to using a normal variable scaled by the inverse
    of the emedding dimension.
    """
    def reset_parameters(self):
        """
        Initialize parameters.
        """
        self.weight.data.normal_(0, 1.0 / self.embedding_dim)
        if self.padding_idx is not None:
            self.weight.data[self.padding_idx].fill_(0)

In [95]:
class Skip_gram(nn.Module):
    def __init__(self, input_dim, embed_size = 100):
        super(Skip_gram, self).__init__()
        self.input_dim = input_dim
        self.embed_size = embed_size
        self.central_emb = ScaledEmbedding(self.input_dim,self.embed_size)
        self.context_emb = ScaledEmbedding(self.input_dim,self.embed_size)
        
    def forward(self, icent, icont):
        # (hint: dimensions are for icent (bs,1) for icont (bs,max_len) and output (bs,1,max_len))
        output = torch.einsum('aij,akj->aik', self.central_emb(icent), self.context_emb(icont))
        return output

In [98]:
net = Skip_gram(len(idx_to_token))

In [163]:
loss_fn = nn.BCEWithLogitsLoss(reduction='none')
def criterion(pred, label, mask):
    #
    return [loss_fn(p, l.type_as(p)) * m for p, l, m in zip(pred, label, mask)]

In [165]:
optimizer = torch.optim.Adam(net.parameters(),lr=0.005)

In [179]:
def train(n_epochs):
    
    for epoch in range(n_epochs):
        start, loss = time.time(), 0
        print(len(data_loader))
        for batch in data_loader:
            #
            cent, cont, mask, label = batch
            pred = torch.squeeze(net(cent, cont))
            loss = torch.sum(sum(criterion(pred, label, mask)))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, loss, time.time() - start))

In [181]:
train(6)

732
epoch 1, loss 2001.61, time 177.00s
732
epoch 2, loss 1889.13, time 170.48s
732
epoch 3, loss 1886.85, time 180.14s
732
epoch 4, loss 1813.41, time 170.84s
732
epoch 5, loss 1725.01, time 166.89s
732
epoch 6, loss 1662.56, time 171.37s


In [205]:
def get_similar_tokens(query_token, k, W):
    #
    # your code here
    #
    token_id = token_to_idx[query_token]
    cos_sim = nn.CosineSimilarity()
    cos = torch.Tensor([cos_sim(net.central_emb(torch.LongTensor([token_id])), net.central_emb(torch.LongTensor([i]))) for i in range(len(idx_to_token))])
    
    _,topk = torch.topk(cos, k=k+1,)
    for i in topk[1:]:# Remove the input words
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))

get_similar_tokens('chip', 3, net.central_emb.weight.data)

cosine sim=0.516: intel
cosine sim=0.438: screens
cosine sim=0.430: microprocessor


In [213]:
embs = [net.central_emb(torch.LongTensor([i])).tolist() for i in range(len(idx_to_token))]

In [215]:
%store embs

Stored 'embs' (list)


In [217]:
%store token_to_idx
%store idx_to_token

Stored 'token_to_idx' (dict)
Stored 'idx_to_token' (list)
