In this notebook we will learn how to implement subsampling of frequent words, a preprocessing step used when training a word2vec model.

# Importing The Necessary Stuff

In [1]:
import os
import random
import zipfile
from collections import Counter
from os.path import isfile, isdir
from urllib.request import urlretrieve

import numpy as np
from tqdm import tqdm

# Necessary Functions

In [2]:
def create_lookup_tables(words):
    """
    Build the two vocabulary mappings used to convert between words and ids.

    Ids are handed out in order of descending frequency, so the most common
    word in `words` receives id 0.

    :param words: Input list of words
    :return: Tuple of two dictionaries, (vocab_to_int, int_to_vocab)
    """
    frequency = Counter(words)
    # rank the distinct words from most to least frequent
    ranked = sorted(frequency, key=lambda token: frequency[token], reverse=True)

    vocab_to_int = {}
    int_to_vocab = {}
    # fill both directions of the mapping in a single pass over the ranking
    for idx, token in enumerate(ranked):
        int_to_vocab[idx] = token
        vocab_to_int[token] = idx

    return vocab_to_int, int_to_vocab

# Downloading The Dataset

In [3]:
# Directory the downloaded archive is extracted into
dataset_folder_path = 'data'
# Local filename the downloaded zip archive is saved under
dataset_filename = 'text8.zip'
# Label shown on the download progress bar
dataset_name = 'Text8 Dataset'

class DLProgress(tqdm):
    """
    tqdm progress bar whose `hook` method can be passed to `urlretrieve`
    as its `reporthook` callback.
    """
    # Block number seen on the previous `hook` call; the difference to the
    # current block number tells how many new blocks have arrived.
    last_block = 0

    def hook(self, block_num=1, block_size=1, total_size=None):
        """
        Advance the bar by the amount of data received since the last call.

        :param block_num: Number of blocks transferred so far
        :param block_size: Size of one block in bytes
        :param total_size: Total size of the download in bytes, or None / a
            negative value when the size is not known
        """
        # urlretrieve reports -1 when the server gives no size; keep whatever
        # total the bar already has instead of overwriting it with a bogus value
        if total_size is not None and total_size >= 0:
            self.total = total_size
        self.update((block_num - self.last_block) * block_size)
        self.last_block = block_num

# Download the archive only if it is not already present.
# The transfer goes to a temporary name first and is renamed once complete, so
# an interrupted download never leaves a truncated archive under the final
# filename (which the isfile check would mistake for a finished download).
if not isfile(dataset_filename):
    partial_filename = dataset_filename + '.part'
    with DLProgress(unit='B', unit_scale=True, miniters=1, desc=dataset_name) as pbar:
        urlretrieve(
            'http://mattmahoney.net/dc/text8.zip',
            partial_filename,
            pbar.hook)
    os.replace(partial_filename, dataset_filename)

# Extract whenever the corpus file itself is missing; checking only for the
# folder would skip extraction if the folder exists but is empty or incomplete.
extracted_corpus_path = os.path.join(dataset_folder_path, 'text8')
if not isfile(extracted_corpus_path):
    with zipfile.ZipFile(dataset_filename) as zip_ref:
        zip_ref.extractall(dataset_folder_path)
        

# Reading words from the given file

In [4]:
# Load the corpus as one flat list of whitespace-separated tokens.
with open('data/text8') as corpus_file:
    words = [token for row in corpus_file for token in row.split()]

# Show the first few tokens as a sanity check.
print(words[:30])

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst']


In [5]:
# Corpus size: every token first, then the number of distinct tokens.
total_word_count = len(words)
unique_word_count = len(set(words))
print("Total words: {}".format(total_word_count))
print("Unique words: {}".format(unique_word_count))

Total words: 17005207
Unique words: 253854


# Creating Dictionaries for Simplicity

This is done using the `create_lookup_tables` function defined above.


In [6]:
# Build both lookup directions, then encode the whole corpus as integer ids.
vocab_to_int, int_to_vocab = create_lookup_tables(words)
int_words = list(map(vocab_to_int.__getitem__, words))

# Subsampling

Words that show up often, such as "a", "an", "the", "of", and "for", don't provide much context to the nearby words. If we discard some of them, we can remove some of the noise from our data and in return get faster training and better representations. Mikolov et al. call this process subsampling. For each word $w_i$ in the training set, we'll discard it with probability given by:

$$P(w_i) = 1 - \sqrt{\frac{t}{f(w_i)}}$$

where $t$ is a threshold parameter and $f(w_i)$ is the frequency of word $w_i$ in the whole dataset, i.e. its count divided by the total number of words.

In [7]:
# Threshold t of the discard formula P(w) = 1 - sqrt(t / f(w))
threshold = 1e-5

# Private, seeded random generator: the subsample is the same on every
# Restart & Run All, and the global `random` state is left untouched.
subsample_seed = 42
subsample_rng = random.Random(subsample_seed)

# Relative frequency f(w) of every word id in the corpus
word_counts = Counter(int_words)
total_count = len(int_words)
freqs = {word: count/total_count for word, count in word_counts.items()}

# Discard probability per word id. For words rarer than the threshold the raw
# formula is negative, so it is clamped to 0 (such words are always kept).
p_drop = {word: max(0.0, 1 - np.sqrt(threshold/freqs[word])) for word in word_counts}

# Keep each occurrence with probability 1 - p_drop
train_words = [word for word in int_words if subsample_rng.random() < (1 - p_drop[word])]

# Map the surviving ids back to their words for inspection
sampled_words = [int_to_vocab[train_word] for train_word in train_words]

# Get Sampled words

In [8]:
# Peek at the first tokens that survived subsampling.
preview_size = 30
print(sampled_words[:preview_size])

['anarchism', 'originated', 'term', 'radicals', 'including', 'diggers', 'revolution', 'sans', 'culottes', 'revolution', 'pejorative', 'violent', 'destroy', 'organization', 'society', 'taken', 'self', 'anarchists', 'word', 'anarchism', 'derived', 'archons', 'political', 'belief', 'rulers', 'unnecessary', 'abolished', 'differing', 'anarchism', 'social']


In [9]:
# Number of tokens left after subsampling.
kept_word_count = len(train_words)
print("Total words: {}".format(kept_word_count))

Total words: 4981605


As you can see, not only has the total number of words decreased, but most of the very frequent, low-information words have been removed.
This makes the data better suited for training: the model will train faster and will be less biased toward very frequent words.