Importing necessary libraries and modules

In [None]:
from functools import partial
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch import optim

In [None]:
!pip install torchtext==0.15.1

Collecting torchtext==0.15.1
  Downloading torchtext-0.15.1-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting torch==2.0.0 (from torchtext==0.15.1)
  Downloading torch-2.0.0-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchdata==0.6.0 (from torchtext==0.15.1)
  Downloading torchdata-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (919 bytes)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.0->torchtext==0.15.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.0->torchtext==0.15.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.0->torchtext==0.15.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.0->torc

Installing required packages

In [None]:
!pip install 'portalocker>=2.0.0'
import portalocker



Importing torchtext modules

In [None]:
from torchtext import datasets
#from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


Setting up the device (GPU or CPU)

In [None]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
# The explicit index 0 is kept for both device types.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu", 0)

In [None]:
# IMDB datapipes (ShardingFilterIterDataPipe) yield (label, review_text)
# tuples, where the label is 1 or 2.
train_data = datasets.IMDB(split='train')
eval_data = datasets.IMDB(split='test')

# word2vec only needs the raw review text, so the labels are discarded.
mapped_train_data = [review for _, review in train_data]

# Materialise the test split, then keep the first 5000 reviews for evaluation.
eval_data_list = list(eval_data)
mapped_eval_data = [review for _, review in eval_data_list[:5000]]

print(type(mapped_train_data[0]))  # string

tokenizer = get_tokenizer("basic_english", language="en")
print(mapped_train_data[0:2])  # list of string

<class 'str'>
['I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and 

Building the vocabulary

In [None]:
# Minimum number of occurrences a token needs to get its own vocabulary entry.
min_word_freq = 20


def build_vocab(mapped_train_data, tokenizer):
    """Build a torchtext vocabulary from an iterable of raw text strings.

    Args:
        mapped_train_data: iterable of strings to tokenize and count.
        tokenizer: callable turning one string into a sequence of tokens.

    Returns:
        The vocabulary built with ``min_freq=min_word_freq`` and a single
        special token ``"<unk>"``, which is also set as the default index
        used for tokens that are not in the vocabulary.
    """
    unknown_token = "<unk>"
    token_stream = (tokenizer(text) for text in mapped_train_data)
    vocabulary = build_vocab_from_iterator(
        token_stream,
        specials=[unknown_token],
        min_freq=min_word_freq,
    )
    # Out-of-vocabulary lookups resolve to <unk> instead of raising.
    vocabulary.set_default_index(vocabulary[unknown_token])
    return vocabulary

In [None]:
# Create the vocabulary from the training reviews.
vocab = build_vocab(mapped_train_data, tokenizer)
vocab_size = len(vocab)
print(vocab_size)

# Hyperparameters (read as module-level globals by the collate functions and models)
window_size = 4     # tokens on each side of the centre word -> 2*4+1 = 9-token window
max_seq_len = 256   # reviews are truncated to this many tokens before windowing
max_norm = 1        # passed to nn.Embedding(max_norm=...) in SkipGram
embed_dim = 300     # embedding dimensionality
batch_size = 64     # number of reviews (not windows) per DataLoader batch


def text_pipeline(x):
    """Tokenize a raw string and map its tokens to vocabulary indices."""
    return vocab(tokenizer(x))


sample = text_pipeline("Hello World")
print(sample)
print(type(sample))

13351
[4644, 185]
<class 'list'>


Defining the collate_cbow function

In [None]:
def collate_cbow(batch, text_pipeline):
    """Turn a batch of raw reviews into CBOW (context -> centre word) examples.

    Reads the module-level ``window_size`` and ``max_seq_len``.

    Args:
        batch: iterable of raw review strings.
        text_pipeline: callable mapping a string to a list of token ids.

    Returns:
        ``(contexts, centres)`` as ``torch.long`` tensors. Every context row
        holds the ``2 * window_size`` ids surrounding the matching centre id.
        Reviews shorter than one full window contribute nothing.
    """
    span = window_size * 2 + 1  # full window: left context + centre + right context
    contexts, centres = [], []

    for review in batch:
        token_ids = text_pipeline(review)

        # Too short to hold even a single complete window.
        if len(token_ids) < span:
            continue

        if max_seq_len:
            token_ids = token_ids[:max_seq_len]

        # Slide the window one token at a time across the review.
        for start in range(len(token_ids) - span + 1):
            window = token_ids[start:start + span]
            centres.append(window[window_size])
            contexts.append(window[:window_size] + window[window_size + 1:])

    return (
        torch.tensor(contexts, dtype=torch.long),
        torch.tensor(centres, dtype=torch.long),
    )

Defining the collate_skipgram function

In [None]:
def collate_skipgram(batch, text_pipeline):
    """Turn a batch of raw reviews into skip-gram (centre -> context word) pairs.

    Reads the module-level ``window_size`` and ``max_seq_len``.

    Args:
        batch: iterable of raw review strings.
        text_pipeline: callable mapping a string to a list of token ids.

    Returns:
        ``(centres, neighbours)`` as 1-D ``torch.long`` tensors of equal
        length: one entry per (centre word, context word) pair, i.e.
        ``2 * window_size`` pairs for every window position. Reviews shorter
        than one full window contribute nothing.
    """
    span = window_size * 2 + 1  # full window: left context + centre + right context
    centres, neighbours = [], []

    for review in batch:
        token_ids = text_pipeline(review)

        # Too short to hold even a single complete window.
        if len(token_ids) < span:
            continue

        if max_seq_len:
            token_ids = token_ids[:max_seq_len]

        for start in range(len(token_ids) - span + 1):
            window = token_ids[start:start + span]
            centre = window[window_size]
            # Pair the centre word with every other position in the window,
            # left to right.
            for offset, neighbour in enumerate(window):
                if offset == window_size:
                    continue
                centres.append(centre)
                neighbours.append(neighbour)

    return (
        torch.tensor(centres, dtype=torch.long),
        torch.tensor(neighbours, dtype=torch.long),
    )

Creating DataLoaders for CBOW and Skip-Gram models

In [None]:
# Training DataLoaders.
# Both loaders iterate over the same raw reviews; only the collate function
# (which expands each review into CBOW or skip-gram examples) differs.
train_loader_options = {"batch_size": batch_size, "shuffle": True}

traindl_cbow = DataLoader(
    mapped_train_data,
    collate_fn=partial(collate_cbow, text_pipeline=text_pipeline),
    **train_loader_options,
)

traindl_skipgram = DataLoader(
    mapped_train_data,
    collate_fn=partial(collate_skipgram, text_pipeline=text_pipeline),
    **train_loader_options,
)

Evaluation DataLoaders

In [None]:
# Evaluation loaders are NOT shuffled: shuffling gives no benefit when no
# gradients are taken, and because each batch expands to a different number of
# windows, a shuffled order makes the mean-of-batch-losses validation metric
# vary from run to run.
evaldl_cbow = DataLoader(
        mapped_eval_data,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=partial(collate_cbow,text_pipeline=text_pipeline)
    )

evaldl_skipgram = DataLoader(
        mapped_eval_data,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=partial(collate_skipgram,text_pipeline=text_pipeline)
    )

Defining the CBOW class

In [None]:
class CBOW(nn.Module):
    """Continuous bag-of-words model: averaged context embeddings -> vocabulary logits.

    The embedding width is read from the module-level ``embed_dim``.
    """

    def __init__(self, vocab_size):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of tokens in the vocabulary; used both as the
                number of embeddings and as the number of output logits.
        """
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
        )
        self.linear = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Embed the context ids, average over dimension 1, and project to logits.

        Args:
            x: integer tensor of context word ids; dimension 1 is the one
               that gets averaged away.

        Returns:
            The output of the linear layer applied to the averaged embeddings.
        """
        context_vectors = self.embeddings(x)
        pooled = context_vectors.mean(dim=1)
        return self.linear(pooled)


Defining the SkipGram class

In [None]:
class SkipGram(nn.Module):
    """Skip-gram model: centre-word embedding -> vocabulary logits.

    The embedding width and the embedding norm cap are read from the
    module-level ``embed_dim`` and ``max_norm``.
    """

    # The constructor must be spelled __init__ (double underscores). With the
    # previous `_init_` spelling Python never called it, so SkipGram(vocab_size)
    # raised TypeError and no layers were ever created.
    def __init__(self, vocab_size):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: number of tokens in the vocabulary; used both as the
                number of embeddings and as the number of output logits.
        """
        super().__init__()
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            max_norm=max_norm
        )
        self.linear = nn.Linear(
            in_features=embed_dim,
            out_features=vocab_size,
        )

    def forward(self, x):
        """Embed the input word ids and project each embedding to logits.

        Args:
            x: integer tensor of centre-word ids.

        Returns:
            The output of the linear layer applied to the embeddings of ``x``.
        """
        x = self.embeddings(x)
        x = self.linear(x)
        return x

Defining the train_one_epoch function

In [None]:
def train_one_epoch(model,dataloader):
    """Run one optimisation pass over ``dataloader`` and record the mean loss.

    Relies on module-level ``device``, ``opt``, ``loss_fn`` and ``loss_dict``.

    Args:
        model: the network to train; switched to training mode here.
        dataloader: iterable of batches whose first two items are the inputs
            and the targets.

    Side effects:
        Updates the model parameters through ``opt``, prints the epoch loss,
        and appends it to ``loss_dict["train"]``.
    """
    model.train()
    batch_losses = []

    for batch in dataloader:
        inputs, targets = batch[0].to(device), batch[1].to(device)

        # Standard step: clear stale gradients, forward, backward, update.
        opt.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        opt.step()

        batch_losses.append(loss.item())

    # Unweighted mean of the per-batch losses.
    epoch_loss = np.mean(batch_losses)
    print("Train Epoch Loss:",round(epoch_loss,3))
    loss_dict["train"].append(epoch_loss)

Defining the validate_one_epoch function

In [None]:
def validate_one_epoch(model,dataloader):
    """Evaluate ``model`` over ``dataloader`` and record the mean loss.

    Relies on module-level ``device``, ``loss_fn`` and ``loss_dict``.

    Args:
        model: the network to evaluate; switched to eval mode here.
        dataloader: iterable of batches whose first two items are the inputs
            and the targets.

    Side effects:
        Appends the epoch loss to ``loss_dict["val"]``. Nothing is printed.
    """
    model.eval()
    batch_losses = []

    # No gradients are needed while evaluating.
    with torch.no_grad():
        for batch in dataloader:
            inputs, targets = batch[0].to(device), batch[1].to(device)
            batch_loss = loss_fn(model(inputs), targets)
            batch_losses.append(batch_loss.item())

    # Unweighted mean of the per-batch losses.
    epoch_loss = np.mean(batch_losses)
    loss_dict["val"].append(epoch_loss)

# Training configuration; loss_fn, opt and loss_dict are read as globals by
# train_one_epoch / validate_one_epoch.
loss_fn=nn.CrossEntropyLoss()
n_epochs=1
loss_dict={"train":[],"val":[]}

# Pick the architecture together with its matching train/eval loaders.
choice=input("Enter cbow/skipgram:").strip().lower()
if choice=="cbow":
    model=CBOW(vocab_size).to(device)
    dataloader_train=traindl_cbow
    dataloader_val=evaldl_cbow
elif choice=="skipgram":
    model=SkipGram(vocab_size).to(device)
    dataloader_train=traindl_skipgram
    dataloader_val=evaldl_skipgram
else:
    # Fail here with a clear message instead of a NameError on `model` below.
    raise ValueError(f"Unknown model choice {choice!r}; expected 'cbow' or 'skipgram'.")

opt=optim.Adam(params=model.parameters(),lr=0.001)

for e in range(n_epochs):
    print("Epoch=",e+1)
    train_one_epoch(model,dataloader_train)
    validate_one_epoch(model,dataloader_val)

for name,child in model.named_children():
    print(name,child)

# The learned word vectors live in the embedding layer.
trimmed_model=model.embeddings
print(trimmed_model)

# Show only the head of the vocabulary; the full list has thousands of entries.
print(vocab.get_itos()[:50])

# Look the probe words up instead of hard-coding their indices, so the
# comparison stays correct if the vocabulary changes.
film_idx,movie_idx=vocab.lookup_indices(["film","movie"])
print([film_idx,movie_idx])

# Index tensors must live on the same device as the embedding weights;
# no gradients are needed for this inspection.
with torch.no_grad():
    emb1=trimmed_model(torch.tensor([film_idx],device=device))
    emb2=trimmed_model(torch.tensor([movie_idx],device=device))
cos=torch.nn.CosineSimilarity()
print(cos(emb1,emb2))


Enter cbow/skipgram:cbow
Epoch= 1


KeyboardInterrupt: 