# Custom Word2Vec Models

For Zhao et al.'s and Savani et al.'s debiasing methods, we created our own model and skip-gram traning loops below. 

In [53]:
#imports
import torch
from torch import nn, optim, sigmoid, softmax
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
import torch.nn.functional as F
import torch.fft

#to generate ski
import tensorflow
from keras.preprocessing.sequence import skipgrams 

In [21]:
#check which device pytorch will use
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

Using cpu device


### Defining the model 

2 layers: embedding and linear 

- **max_norm=1:** prevents embedding layer to grow uncontrollably -regularization 
- **embedding_dim:** dimension of each embedding vector (eg.300)
- **size_vocab:** size of vocabulary


Implementation as described in https://arxiv.org/abs/1301.3781 

Architecture:

![Screen%20Shot%202022-03-25%20at%2011.07.41%20AM.png](attachment:Screen%20Shot%202022-03-25%20at%2011.07.41%20AM.png)

Steps:
- Build the corpus vocabulary
- Build a skip-gram generator
- Build the skip-gram model architecture
- Train the Model
- Get Word Embeddings

In [56]:
class skipgram(nn.Module):
    
    def __init__(self, size_vocab, embedding_dim):
        super(skipgram, self).__init__()
        self.embeddings_target =  nn.Embedding(size_vocab, embedding_dim, max_norm=1).to(device) #what we care about
        self.embeddings_context = nn.Embedding(size_vocab, embedding_dim, max_norm=1).to(device) #used in loss calculation
        self.linear = nn.Linear(embedding_dim,1)
        

    def forward(self, target_tensor, context_tensor): #loss
        print ("target tensor: ", target_tensor)
        embedding_t = self.embeddings_target( torch.tensor([target_tensor]).int)
        embedding_c = self.embeddings_context( torch.tensor([context_tensor]).int)
        print("in forward: ", embedding_t, embedding_c)
        
        return F.sigmoid(self.linear(torch.mul(embedding_t, embedding_c))).squeeze()
        

In [60]:
class Word2Vec:
    
    def __init__(self, sentance_tokens, embedding_dim=300, LR=0.01, window_size=10, EPOCHS=10):
        #hyperparamters
        self.window_size = window_size
        self.embedding_dim = embedding_dim
        self.lr = LR
        self.epochs = EPOCHS
        
        #data, corpus
        self.sentance_tokens = sentance_tokens
        self.corpus_vocab = self.corpus_vocab()
        self.size_vocab = len(self.corpus_vocab)
        
        #model, loss, optimizer
        self.model = skipgram(self.size_vocab, self.embedding_dim)
        self.loss_fcn = nn.BCELoss()
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=self.lr)

    
    def corpus_vocab(self):

        #count frequency of each word
        vocab_counts = {}
        for sentance in self.sentance_tokens:
            for word in sentance:
                vocab_counts[word] = vocab_counts.get(word, 0) + 1


        #create corpus by assigning unique ids
        i = 1
        corpus_vocab = {}
        for k, v in sorted(vocab_counts.items(), key=lambda item: item[1]):
            corpus_vocab[k] = i
            i+=1

        return corpus_vocab

    def create_target_context_pairs(self):
        # generate [(target, context), 1] pairs as positive samples - contextually relevant pair
        # and [(target, random), 0] pairs as negative samples - contextually irrelevant pair
        
        #get the word ids from the corpus for all the sentances
        word_ids_datatset = [[self.corpus_vocab[word] for word in sentance] for sentance in self.sentance_tokens]
        
        #generate skipgrams (pairs) for all sentances
        skip_grams = [skipgrams(word_ids, vocabulary_size=self.size_vocab, window_size=self.window_size) for word_ids in word_ids_datatset]
        
        return skip_grams
    
    
    def train(self):
        
        skip_grams = self.create_target_context_pairs()
        total_loss = 0
        for epoch in range(self.epochs):
            
            #iterate through all target, context pairs
            for pairs, labels in skip_grams:
                
                # zero the gradients
                self.optimizer.zero_grad()
                
                # calculate loss, step optimizer  
                sentance_loss = 0
                for i in range (len(pairs)): #pairs in a sentance
                    target_tensor = pairs[i][0] #target word
                    context_tensor = pairs[i][1] #context word (true or random)
                    label = torch.Tensor([labels[i]]).int # 1- relevant, 0 - irrelevent
                    
                    print("before output")
                    output = self.model(target_tensor, context_tensor)
                    print("after output")
                    loss = self.loss_fcn(output,label)
                    sentance_loss += loss.data
                
                loss.backward()
                total_loss+= sentance_loss
                optimizer.step()
            
        print('Epoch:', epoch+1, ' Training Loss:', total_loss)
        
                
                    

    

In [61]:
word_2_vec = Word2Vec([['he', 'was', 'cool'], ['she', 'loved', 'meat'], ['you', 'do', 'nothing']])

word_2_vec.train()

before output
target tensor:  2


AttributeError: 'builtin_function_or_method' object has no attribute 'contiguous'

### Skip-gram Training 

(to be done in the future)

In [None]:
def train_word2vec(EPOCHS, LR, VOCAB_SIZE, EMBEDDING_DIM):
    
    #define model and optimizer
    model = skipgram(VOCAB_SIZE, EMBEDDING_DIM)
    optimizer = optim.Adam(model.parameters(),lr=LR)
    
    
    #loop over epochs
    for epoch in EPOCHS:
        
        
        
    
    
    
    
    #retrieve embeddings
    embeddings = list(model.parameters())[0]
    embeddings = embeddings.cpu().detach().numpy()
    return embeddings 
        