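# CBoW model (Word2Vec)

Note: the active code in this section trains on single (center word, context word) pairs, i.e. the same scheme used in the Skip-gram section further down. The commented-out `CBOWModel` class, which sums the embeddings of several context words before the linear layer, is the CBoW-style architecture.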

In [2]:
import torch
import numpy as np
import os
import pandas as pd


# Debugging switch: CUDA_LAUNCH_BLOCKING=1 is meant to make CUDA kernel launches synchronous.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

import torch
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# device = torch.device("cpu")

# Load the complaint spreadsheet and cast every column to str in one step.
# NOTE(review): missing cells presumably become the literal text "nan" after the cast — confirm
# that is acceptable for the tokenisation that follows.
file_path = r"C:\Users\jites\Desktop\Project_folder\complaint_data_4_columns.xlsx"
df = pd.read_excel(file_path).astype(str)
df.head(2)


Unnamed: 0,ChiefComplaint,Symptoms,Medicines,InvestigationName,Paragraph
0,Ho intermittent pain abdomen and flatulence X ...,"itching , vomiting , fatigue , weight_loss , h...","NEOROF 1 20ML INJECTION, URSODIL 300MG TABLETS","PACEMAKER ( DUAL ), MRI THORACIC SPINE (FULL S...",Ho intermittent pain abdomen and flatulence X ...
1,During routine health checkup gallstone detect...,"fatigue , weight_loss , restlessness , sweatin...","RYZODEG FLEXTOUCH PEN, COVATIL 250MG TABS CEFU...","( BED SIDE ) COLOR DOPPLER (CAROTID), GGT, MRI...",During routine health checkup gallstone detect...


In [3]:
# Keep only the first 1000 rows for the rest of the notebook.
df = df.head(1000)

In [4]:
# Whitespace-tokenise every paragraph; the result is one list of tokens per row.
sentences = [paragraph.split() for paragraph in df['Paragraph']]

In [4]:
len(sentences[7])

80

In [5]:
from collections import Counter

# Count every token, then number the distinct tokens in first-seen order.
vocab = Counter(token for tokens in sentences for token in tokens)
idx2word = dict(enumerate(vocab))  # integer id -> token
word2idx = {token: index for index, token in idx2word.items()}  # token -> integer id
vocab_size = len(vocab)

In [6]:
print(vocab_size)

5901


In [7]:
# vocab

In [8]:
def generate_training_data(sentences, word2idx, window_size=2):
    """Build (center, context) index pairs from tokenised sentences.

    For every token position, each neighbour lying at most ``window_size``
    positions to the left or right (clipped to the sentence bounds, and
    excluding the position itself) contributes one pair.

    Args:
        sentences: Iterable of token sequences.
        word2idx: Lookup from token to integer index, accessed as ``word2idx[token]``.
        window_size: Maximum distance between a center token and a context token.

    Returns:
        A list of ``(center_index, context_index)`` tuples ordered by sentence,
        then by center position, then by context position.
    """
    pairs = []
    for sentence in sentences:
        indices = [word2idx[word] for word in sentence]
        last_pos = len(indices) - 1
        for center_pos, center_idx in enumerate(indices):
            # Clip the window to the sentence instead of testing each offset.
            first_ctx = max(0, center_pos - window_size)
            last_ctx = min(last_pos, center_pos + window_size)
            pairs.extend(
                (center_idx, indices[ctx_pos])
                for ctx_pos in range(first_ctx, last_ctx + 1)
                if ctx_pos != center_pos
            )
    return pairs

# Materialise every (center, context) index pair for the corpus using the default window size.
training_data_pair = generate_training_data(sentences=sentences, word2idx=word2idx)


In [9]:
len(training_data_pair)

321736

In [6]:
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from sklearn.preprocessing import LabelEncoder

In [11]:
# class CBOWModel(nn.Module):
#     def __init__(self, vocab_size, embed_size):
#         super(CBOWModel, self).__init__()
#         self.embeddings = nn.Embedding(vocab_size, embed_size)
#         self.linear = nn.Linear(embed_size, vocab_size)

#     def forward(self, context):
#         context_embeds = self.embeddings(context).sum(dim=1)
#         output = self.linear(context_embeds)
#         return output


class Word2VecModel(nn.Module):
    """Embedding lookup followed by a linear projection onto the vocabulary.

    NOTE(review): although this class sits under the "CBoW" heading, ``forward``
    consumes word indices directly and performs no aggregation over a context
    window.

    Attributes:
        embeddings: ``nn.Embedding`` table with ``vocab_size`` rows of width ``embedding_dim``.
        linear: ``nn.Linear`` mapping an ``embedding_dim`` vector to ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, center_word):
        """Return the raw (un-normalised) vocabulary scores for ``center_word`` indices."""
        return self.linear(self.embeddings(center_word))

# Width of each learned word vector.
embedding_dim = 100

# Build the network, then place its parameters on the selected device.
model = Word2VecModel(vocab_size=vocab_size, embedding_dim=embedding_dim)
model = model.to(device)


In [12]:
# Wrap both indices of every (center, context) pair as scalar LongTensors on `device`.
training_data_tensor_list = [
    tuple(torch.tensor(word_index, dtype=torch.long, device=device) for word_index in pair)
    for pair in training_data_pair
]


In [13]:
len(training_data_tensor_list)

321736

In [14]:
# Objective and optimiser: cross-entropy over the vocabulary scores, plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# One optimiser step per (center, context) pair.
epochs = 3
for epoch in range(epochs):
    total_loss = 0  # running sum (not mean) of the per-pair losses in this epoch
    for center_word, context_word in training_data_tensor_list:
        optimizer.zero_grad()
        # unsqueeze(0) adds a leading axis of length 1 to the input and to the target.
        output = model(center_word.unsqueeze(0))
        loss = criterion(output, context_word.unsqueeze(0))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")

Epoch 1, Loss: 1461307.245071739
Epoch 2, Loss: 1283014.0503216684
Epoch 3, Loss: 1219265.5340221524


In [18]:
# Look up and print the learned vector of a single vocabulary entry.
word = "("
word_idx = word2idx[word]
word_idx_tensor = torch.tensor(word_idx, device=device)
embedding_vector = model.embeddings(word_idx_tensor).detach()  # detach: plain values, no autograd history
print(f"Embedding vector for '{word}': {embedding_vector}")


Embedding vector for '(': tensor([ 0.4380,  0.0652,  0.6872, -1.6896,  1.1090, -1.5445,  0.5307,  0.5110,
        -1.0391,  0.3894,  1.6048, -1.6513,  0.3369,  0.7499, -0.2149,  1.2155,
         1.0319, -1.7604,  0.4340,  0.8499, -0.9372, -1.1845,  2.6167,  0.3947,
         0.2730,  0.9358,  0.3197,  0.2985,  0.4007,  0.1425, -2.6224,  0.7531,
         0.6541, -0.0580, -0.3898, -0.9394, -1.1541, -0.6402, -1.3745,  1.4276,
        -1.3954, -1.3844,  1.1818,  0.0890, -0.2538, -0.1800, -0.1370,  1.0432,
         0.0836, -0.7071, -0.1433,  0.1105, -0.0124, -0.5025,  0.5373, -0.0951,
        -0.6760,  0.4060, -0.5962, -0.5114, -0.7854,  0.2070,  0.8996, -1.6358,
        -3.6977, -0.0247,  0.0085, -0.4834,  0.2817, -1.6896, -0.1866, -0.3705,
        -1.1271, -0.3929, -0.6774, -0.4786, -0.8275, -0.2733,  1.5070, -0.3191,
         1.1795,  0.6773, -0.4716, -1.1100, -0.4058, -0.2168,  1.0003, -0.2376,
        -0.8260, -0.1713,  1.5718,  0.1994, -0.7791, -1.4182, -0.3075, -1.1429,
         0.739

In [None]:
# import pickle

# model_path = "custom_word2vec_model.pth"
# vocab_path = "vocab.pkl"

#     ### Save the model state dictionary
# torch.save(model.state_dict(), model_path)

#     ### Save the vocabulary mappings
# with open(vocab_path, 'wb') as f:
#     pickle.dump((word2idx, idx2word), f)


In [None]:
# model_path = "custom_word2vec_model.pth"
# vocab_path = "vocab.pkl"

#     ### Load the vocabulary mappings
# with open(vocab_path, 'rb') as f:
#     word2idx, idx2word = pickle.load(f)

#     ### Recreate the model instance
# vocab_size = len(word2idx)
# embedding_dim = 100            ## This should match the dimension used during training
# model = Word2VecModel(vocab_size, embedding_dim)

#     ### Load the saved state dictionary into the model
# model.load_state_dict(torch.load(model_path))

#     ### Set the model to evaluation mode (optional, depends on your use case)
# model.eval()


# Skip-grams model (Word2Vec)

In [7]:
def generate_skipgram_training_data(sentences, word2idx, window_size=2):
    """Produce skip-gram training pairs as ``(center_index, context_index)`` tuples.

    Each token is paired with every other token located within ``window_size``
    positions of it inside the same sentence.

    Args:
        sentences: Iterable of token sequences.
        word2idx: Lookup from token to integer index, accessed as ``word2idx[token]``.
        window_size: Maximum distance between the center token and a context token.

    Returns:
        A list of ``(center_index, context_index)`` tuples ordered by sentence,
        then by center position, then by context position.
    """
    # Non-zero relative positions to probe around each center token.
    offsets = [step for step in range(-window_size, window_size + 1) if step != 0]

    pairs = []
    for sentence in sentences:
        indices = [word2idx[word] for word in sentence]
        n_tokens = len(indices)
        for center_word_pos, center_word in enumerate(indices):
            for step in offsets:
                neighbour_pos = center_word_pos + step
                if 0 <= neighbour_pos < n_tokens:
                    pairs.append((center_word, indices[neighbour_pos]))
    return pairs

# Materialise every (center, context) index pair for the corpus using the default window size.
training_data_skip_gram_pair = generate_skipgram_training_data(sentences=sentences, word2idx=word2idx)


In [8]:
class SkipGramModel(nn.Module):
    """Skip-gram scorer: embed the center word, then project onto the vocabulary.

    Attributes:
        embeddings: ``nn.Embedding`` table with ``vocab_size`` rows of width ``embedding_dim``.
        linear: ``nn.Linear`` mapping an ``embedding_dim`` vector to ``vocab_size`` scores.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, center_word):
        """Return the raw (un-normalised) vocabulary scores for ``center_word`` indices."""
        center_vectors = self.embeddings(center_word)
        return self.linear(center_vectors)

# Width of each learned word vector.
embedding_dim = 100

# Place the parameters on `device`. The skip-gram training tensors and the lookup tensor used
# later are created with device=device, so a model left on the CPU fails with a device-mismatch
# error whenever `device` is CUDA.
model = SkipGramModel(vocab_size, embedding_dim).to(device)

In [9]:
# Wrap both indices of every (center, context) pair as scalar LongTensors on `device`.
training_data_skip_gram_tensor_list = [
    tuple(torch.tensor(word_index, dtype=torch.long, device=device) for word_index in pair)
    for pair in training_data_skip_gram_pair
]


In [10]:
# Cross-entropy over the vocabulary scores, optimised with plain SGD.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# One optimiser step per (center, context) pair.
epochs = 3
for epoch in range(epochs):
    total_loss = 0  # summed (not averaged) loss over every pair in this epoch
    for center_word, context_word in training_data_skip_gram_tensor_list:
        optimizer.zero_grad()
        # Indexing with None adds a leading axis of length 1 (same result as unsqueeze(0)).
        output = model(center_word[None])
        loss = criterion(output, context_word[None])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss}")


Epoch 1, Loss: 1460225.8742566854
Epoch 2, Loss: 1282904.5380657166
Epoch 3, Loss: 1219169.7831053138


In [12]:
# Look up and print the learned vector of a single vocabulary entry.
word = "fatigue"
word_idx = word2idx[word]
word_idx_tensor = torch.tensor(word_idx, device=device)
embedding_vector = model.embeddings(word_idx_tensor).detach()  # detach: plain values, no autograd history
print(f"Embedding vector for '{word}': {embedding_vector}")

Embedding vector for 'fatigue': tensor([ 0.6784, -1.6613,  0.9200,  1.1474, -0.1259,  0.0896, -1.5391, -0.2119,
         0.0292,  1.0094,  0.2427,  0.8272,  0.8194,  1.3292, -0.0508,  1.0731,
        -0.3961,  0.0055,  0.8425,  0.2499, -1.3036,  0.1286,  0.8677, -1.2540,
         0.7143,  1.3858,  0.5125, -0.5405, -0.5231, -0.5525,  1.0788, -0.4218,
        -0.7048, -1.0020, -0.6864,  1.0180,  1.0795,  1.2585,  0.2061, -0.7169,
         0.5437,  1.3036,  1.0163, -0.3995, -1.1355,  0.6984,  0.6052, -0.4189,
         0.2060,  1.1750,  1.8251, -0.4229,  0.1029, -0.0470,  0.4065, -0.2243,
        -1.1510,  0.2954, -0.7789,  0.7182, -0.2550, -0.9052,  0.4521, -0.7722,
         0.7700, -0.7393, -0.3932,  0.3719,  0.7401,  2.9336,  0.9360,  0.0822,
         0.7936,  0.4257,  0.4557, -0.2305, -0.1107,  0.4488,  0.0765,  0.4439,
        -0.0958,  0.6893, -1.1718, -0.2316,  0.2174,  1.4285,  0.8127,  0.8827,
        -0.5775,  2.3087,  0.2505,  1.1557,  1.7782,  0.7352, -1.4089, -0.0446,
        

In [None]:
"""
While the underlying architecture of the CBOW and Skip-gram models might look similar when implemented, they fundamentally differ in 
how they approach learning word representations and in the details of their training objectives. Let's explore the differences.

1. Objective Function:
    CBOW (Continuous Bag of Words):
        Objective: Predict the center word given the context words.
        Input: Multiple context words (within a window size) surrounding a target word.
        Output: The model predicts the probability distribution over the vocabulary, aiming to maximize the probability of the 
                correct center word given the context words.
        Loss Function: The loss is calculated between the predicted center word and the actual center word.
    The CBOW model tends to be faster to train because it averages the embeddings of multiple context words and predicts a 
    single word (center word).

    Skip-gram:
        Objective: Predict context words given a single center word.
        Input: A single center word.
        Output: The model predicts multiple context words within the window size.
        Loss Function: The loss is calculated between the predicted context words and the actual context words. The model is trained 
                        to maximize the probability of predicting the correct context words for a given center word.
    The Skip-gram model typically requires more computations because it predicts multiple words (context words) for each input (center word).

2. Data Preparation:
    CBOW:
        The model is trained with input consisting of multiple context words and the target is the center word.
        Example: For the sentence "The cat sits on the mat," with a window size of 1, the model sees [the, sits] as input 
                and cat as the output. (With a window size of 2, the context for cat would be [the, sits, on].)
    Skip-gram:
        The model is trained with input consisting of a single center word and the target is the surrounding context words.
        Example: For the sentence "The cat sits on the mat," with a window size of 1, the model sees cat as input and [the, sits] 
                as outputs. (With a window size of 2, the outputs for cat would be [the, sits, on].)

3. Training Process:
    CBOW:
        The model aggregates the embeddings of context words (often by averaging) and uses them to predict the center word.
        Example:
            Input: [the, sits]
            Model: Embedding + Average (or sum) + Linear + Softmax
            Output: Probability distribution over the vocabulary for predicting the word "cat."
    Skip-gram:
        The model takes a single word and predicts the surrounding context words one by one.
        Example:
            Input: cat
            Model: Embedding + Linear + Softmax
            Output: Probability distribution over the vocabulary for predicting each word in the context window [the, sits].

4. Efficiency and Suitability:
    CBOW:
        Typically faster to train.
        More suitable when your dataset is smaller or when speed is a priority.
        Tends to smooth the representations of words because of the averaging step.
    Skip-gram:
        Typically slower to train because it predicts multiple outputs per input word.
        More suitable for large datasets and when detailed, high-quality word representations are desired.
        Often better at capturing rare word representations since each word is treated independently.

5. Architecture Overview:
    Despite the similar architecture (an embedding layer followed by a linear layer), the primary difference lies in:

    Input/Output handling:
        CBOW uses multiple words as input to predict a single word.
        Skip-gram uses a single word as input to predict multiple words.
    Training Focus:
        CBOW is context-to-center focused.
        Skip-gram is center-to-context focused.

    Visual Comparison:
        CBOW:
            Input: [context1, context2, ... contextN]
            Model: Embedding -> Aggregate (average/sum) -> Linear -> Softmax
            Output: center word
        Skip-gram:
            Input: center word
            Model: Embedding -> Linear -> Softmax
            Output: [context1, context2, ... contextN]
"""