### Developer: Mani kanta

### Aim: Embedding generation using a Skip-Gram model with negative sampling

# Importing Libraries

In [4]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm
import zipfile
import collections
import math
import random

from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as Func
from torch.optim.lr_scheduler import StepLR
import time

from s3fs.core import S3FileSystem
import io
s3 = S3FileSystem()


In [5]:
def _load_npy_from_s3(key):
    """Load one ``.npy`` array from S3 and close the remote handle afterwards.

    Parameters
    ----------
    key : str
        ``bucket/path`` of the ``.npy`` object, opened through the notebook's
        ``s3`` filesystem object.

    Returns
    -------
    numpy.ndarray
        The array stored in the file.
    """
    # SECURITY NOTE: allow_pickle=True unpickles object arrays, which can run
    # arbitrary code. Only use it on buckets/objects you fully trust.
    with s3.open(key) as fh:
        return np.load(fh, allow_pickle=True)


# The training loop indexes these three arrays with the same position i, so
# they are expected to have the same length.
train_data = _load_npy_from_s3('prod-search-ranking-ml/data/Umang/pos_sample_3.npy')
within_city = _load_npy_from_s3('prod-search-ranking-ml/data/Umang/neg_sample_city_3.npy')
within_country = _load_npy_from_s3('prod-search-ranking-ml/data/Umang/neg_sample_country_3.npy')

    

In [9]:
# Number of entries loaded from pos_sample_3.npy.
len(train_data)

1966945

In [10]:
# Number of entries loaded from neg_sample_city_3.npy (the training loop indexes this array in step with train_data).
len(within_city)

1966945

In [11]:
# Number of entries loaded from neg_sample_country_3.npy (the training loop indexes this array in step with train_data).
len(within_country)

1966945

In [12]:
# NOTE(review): this import belongs in the import cell at the top of the notebook.
import pandas as pd
# Read the hotel table from Hotel.csv in the current working directory.
hotel=pd.read_csv("Hotel.csv")
# Last expression of the cell: displays (rows, columns).
hotel.shape

(81417, 5)

In [13]:
# Display the hotel DataFrame as the cell output (the rendering below is truncated with "..." rows).
hotel

Unnamed: 0.1,Unnamed: 0,hotel_id,city_id,city_name,country_id
0,0,94319,678.0,Nha-Trang,217
1,1,13195,4.0,Bangalore,1
2,2,89191,4.0,Bangalore,1
3,3,86999,421.0,Karimnagar,1
4,4,87204,677.0,Krabi,8
...,...,...,...,...,...
81412,81412,62801,1.0,Gurgaon,1
81413,81413,97059,14.0,Kolkata,1
81414,81414,110181,1.0,Gurgaon,1
81415,81415,5298,5.0,Mumbai,1


In [14]:
# Run on the GPU when CUDA is available to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [15]:
class skipgram(nn.Module):
    """Skip-gram model trained with negative sampling.

    Two embedding tables of shape ``(vocab_size, embedding_dim)`` are kept:

    * ``u_embeddings`` -- looked up for the centre item.
    * ``v_embeddings`` -- looked up for the positive and negative context items.

    Both tables are created with ``sparse=True``, so their gradients are
    sparse tensors; the optimizer used for training must support that.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create both embedding tables and initialise their weights.

        Parameters
        ----------
        vocab_size : int
            Number of rows in each embedding table.
        embedding_dim : int
            Length of each embedding vector.
        """
        super(skipgram, self).__init__()
        self.u_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.v_embeddings = nn.Embedding(vocab_size, embedding_dim, sparse=True)
        self.embedding_dim = embedding_dim
        self.init_emb()

    def init_emb(self):
        """Initialise ``u`` uniformly in ``[-0.5/dim, 0.5/dim)`` and ``v`` to zeros."""
        initrange = 0.5 / self.embedding_dim
        self.u_embeddings.weight.data.uniform_(-initrange, initrange)
        # The original used uniform_(-0, 0), which is just an all-zero init.
        self.v_embeddings.weight.data.zero_()

    def forward(self, u_pos, v_pos, v_neg_city, v_neg_country):
        """Return the negative-sampling loss for one centre item.

        The value returned is::

            -( sum(logsigmoid( u . v_pos))
             + sum(logsigmoid(-u . v_neg_city))
             + sum(logsigmoid(-u . v_neg_country)) )

        Parameters
        ----------
        u_pos : torch.LongTensor
            Index of the centre item. The code adds a leading axis to its
            embedding and broadcasts it against the context embeddings, so a
            single (0-d) index is what the maths below is written for.
        v_pos : torch.LongTensor
            Index/indices of the positive context item(s).
        v_neg_city : torch.LongTensor
            Indices of the negative samples passed as the "city" group.
        v_neg_country : torch.LongTensor
            Indices of the negative samples passed as the "country" group.

        Returns
        -------
        torch.Tensor
            0-d tensor holding the loss (to be minimised).
        """
        embed_u = self.u_embeddings(u_pos)
        embed_v = self.v_embeddings(v_pos)

        # Leading axis so the centre vector broadcasts against every context row.
        embed_u = embed_u.unsqueeze(0)

        # Positive term: dot product of centre and context vectors.
        score = torch.mul(embed_u, embed_v)
        score = torch.sum(score, dim=1)
        log_target = F.logsigmoid(score).squeeze()

        neg_embed_v_city = self.v_embeddings(v_neg_city)
        neg_embed_v_country = self.v_embeddings(v_neg_country)

        # Negative terms: the sign flip rewards *low* similarity to negatives.
        neg_score_city = torch.mul(neg_embed_v_city, embed_u)
        neg_score_city = torch.sum(neg_score_city, dim=1)
        sum_log_neg_score_city = F.logsigmoid(-1*neg_score_city).squeeze()

        neg_score_country = torch.mul(neg_embed_v_country, embed_u)
        neg_score_country = torch.sum(neg_score_country, dim=1)
        sum_log_neg_score_country = F.logsigmoid(-1*neg_score_country).squeeze()

        loss = log_target.sum() + sum_log_neg_score_city.sum() + sum_log_neg_score_country.sum()

        return -1*loss

    def input_embeddings(self):
        """Return the ``u`` embedding table as a NumPy array on the CPU."""
        return self.u_embeddings.weight.data.cpu().numpy()

    def save_embedding(self, file_name, id2word):
        """Write the ``u`` embeddings to a text file, one row per line.

        Each line is ``<label> <x0> <x1> ...`` separated by single spaces.

        Parameters
        ----------
        file_name : str
            Path of the file to (over)write.
        id2word : callable or mapping/sequence
            Maps a row index to its label. A callable is invoked as
            ``id2word(idx)`` (the original behaviour); anything else is
            indexed as ``id2word[idx]``. Labels are converted with ``str``.
        """
        # Move to CPU/NumPy once so the values can be formatted as text.
        embeds = self.u_embeddings.weight.data.cpu().numpy()
        lookup = id2word if callable(id2word) else id2word.__getitem__
        # `with` guarantees the file is flushed and closed, even on error.
        with open(file_name, 'w') as fo:
            for idx, row in enumerate(embeds):
                word = str(lookup(idx))
                # str.join needs strings; joining raw tensor elements raised TypeError.
                embed = ' '.join(str(value) for value in row)
                fo.write(word+' '+embed+'\n')

In [None]:
# Hyper-parameters, named here so they can be tuned in one place.
VOCAB_SIZE = 81417      # equals the row count reported for Hotel.csv
EMBEDDING_DIM = 200
LEARNING_RATE = 0.2
NUM_EPOCHS = 10

# .to(device) already moves the model to the GPU when one is available,
# so no separate model.cuda() call is needed.
model = skipgram(VOCAB_SIZE, EMBEDDING_DIM).to(device)

optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    start = time.time()
    # Plain Python float: accumulating the loss *tensor* would keep every
    # iteration's autograd history alive and steadily grow memory use.
    total_loss = 0.0
    batch_num = 0
    batch_new = 0  # kept from the original cell; never updated in this loop

    # One optimisation step per training sample.
    for i in tqdm(range(len(train_data))):
        center_id = torch.tensor(train_data[i][0]).to(device)
        positive = torch.tensor(train_data[i][1]).to(device)
        neg_within_city = torch.tensor(within_city[i]).to(device)
        neg_within_country = torch.tensor(within_country[i]).to(device)

        optimizer.zero_grad()

        loss = model(center_id, positive, neg_within_city, neg_within_country)

        loss.backward()

        optimizer.step()

        # .item() extracts the scalar value and drops the graph reference.
        total_loss += loss.item()

        batch_num = batch_num + 1
    print("epoch done")
    print("Epoch Loss ", epoch ," ", total_loss)
print("Optimization Finished!")

 19%|█▊        | 368130/1966945 [08:06<35:14, 756.03it/s]


KeyboardInterrupt: 

In [17]:
# Select the compute device: GPU when CUDA is available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)
print()

# Extra diagnostics for device 0, printed only when running on CUDA.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    # Byte counts are converted to GiB and rounded to one decimal place.
    print(
        'Allocated:',
        round(torch.cuda.memory_allocated(0) / 1024**3, 1),
        'GB',
    )
    print(
        'Cached:   ',
        round(torch.cuda.memory_reserved(0) / 1024**3, 1),
        'GB',
    )

Using device: cuda

Tesla T4
Memory Usage:
Allocated: 0.1 GB
Cached:    0.1 GB


7.064330460110516

In [1]:
# NOTE(review): ad-hoc ratio of two unlabeled counts — TODO document what they represent or remove this scratch cell.
167115/143509

1.1644914256248737

In [None]:
# NOTE(review): bare literal with no assignment or explanation — TODO confirm its purpose or remove this scratch cell.
0.01