In [1]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

In [2]:
from torch.utils.data import DataLoader
import torch.optim as optim
import torch
import torch.nn.functional as F

import numpy as np
import pandas as pd

from src.ml.data_loader import Sequences, SequencesDataset
from src.ml.skipgram import SkipGram
from src.utils.logger import logger
from src.utils.io_utils import load_model

In [3]:
batchsize = 8
shuffle = False
num_workers = 4
emb_dim = 8
epochs = 3
initial_lr=0.025
MODEL_PATH = '../model'

In [4]:
dataset = 'electronics'

In [7]:
sequences = Sequences('../data/{}_sequences_samp.npy'.format(dataset),
                      '../data/{}_edges_val_samp.csv'.format(dataset))

2019-12-05 18:08:35,095 - Sequences loaded (length = 5,000)
2019-12-05 18:08:35,193 - Validation set loaded: (100000, 4)
2019-12-05 18:08:35,200 - Word frequency calculated
2019-12-05 18:08:35,238 - Adding val products to word2id, original size: 28695
2019-12-05 18:08:35,307 - Added val products to word2id, updated size: 133050
2019-12-05 18:08:35,311 - No. of unique tokens: 133050
2019-12-05 18:08:36,542 - Model saved to model/word2id
2019-12-05 18:08:37,750 - Model saved to model/id2word
2019-12-05 18:08:37,751 - Word2Id and Id2Word created and saved
2019-12-05 18:08:37,775 - Convert sequence and wordfreq to ID
2019-12-05 18:08:37,832 - Discard probability calculated
2019-12-05 18:08:39,397 - Negative sample table created


In [8]:
sequences_dset = SequencesDataset(sequences)

In [9]:
sequences_dload = DataLoader(sequences_dset, batch_size=batchsize, shuffle=shuffle, num_workers=num_workers, collate_fn=sequences_dset.collate)

In [10]:
device = 'cpu'

In [11]:
skipgram = SkipGram(sequences.n_unique_tokens, emb_dim).to(device)

In [12]:
optimizer = optim.SparseAdam(skipgram.parameters(), lr=initial_lr)

for epoch in range(epochs):
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, len(sequences_dload))
    
    running_loss = 0
    for i, batches in enumerate(sequences_dload):

        centers = batches[0].to(device)
        contexts = batches[1].to(device)
        neg_contexts = batches[2].to(device)

        optimizer.zero_grad()
        loss = skipgram.forward(centers, contexts, neg_contexts)
        loss.backward()
        optimizer.step()

        scheduler.step()
        running_loss = running_loss * 0.9 + loss.item() * 0.1

        if i % 1000 == 0:
            logger.info('Epoch: {:,}, Seq Count: {:,}/{}, Loss: {:.4f}, Lr: {:.6f}'.format(epoch, i, len(sequences_dload), running_loss,
                                                                                        optimizer.param_groups[0][
                                                                                            'lr']))
            running_loss = 0

    skipgram.save_embeddings(file_name='{}/skipgram_epoch_{}.npy'.format(MODEL_PATH, epoch))

2019-12-05 18:08:43,723 - Epoch: 0, Seq Count: 0/625, Loss: 0.4159, Lr: 0.025000


KeyboardInterrupt: 

In [14]:
emb_center = skipgram.center_embeddings(centers)  # Get embeddings for center word
emb_context = skipgram.context_embeddings(contexts)  # Get embeddings for context word
emb_neg_context = skipgram.context_embeddings(neg_contexts)  # Get embeddings for negative context words

In [16]:
emb_center.shape

torch.Size([514, 8])

In [17]:
emb_context.shape

torch.Size([514, 8])

In [18]:
emb_neg_context.shape

torch.Size([514, 5, 8])

### Save torch params

In [21]:
torch.save(skipgram.state_dict(), '../model/skipgram_sample.pt')

In [22]:
model = SkipGram(sequences.n_unique_tokens, emb_dim).to(device)

In [23]:
model.load_state_dict(torch.load('../model/skipgram_sample.pt'))

<All keys matched successfully>

In [24]:
model.eval()

SkipGram(
  (center_embeddings): Embedding(7757, 8, sparse=True)
  (context_embeddings): Embedding(7757, 8, sparse=True)
)

### Check with validation

In [5]:
val = pd.read_csv('../data/{}_edges_val.csv'.format(dataset), dtype={'product1': 'object', 'product2': 'object'})

In [6]:
sample_idx = np.random.randint(0, val.shape[0], 100000)

In [7]:
val_samp = val.iloc[sample_idx]

In [8]:
val_samp.head()

Unnamed: 0,product1,product2,edge
1055342,b002goovnk,b008mrzsh8,1
535317,b00aodd3js,b00f0rrcqi,1
737360,b005abj0h8,b00dzrguao,1
1333506,b0002exjra,b000067rrx,0
2376672,b00dziz6qc,b008mogskm,0


In [63]:
val_samp = pd.read_csv('../data/books_edges_train.csv', nrows=100, dtype={'product1': 'object', 'product2': 'object'})
val_samp['edge'] = np.where(val_samp['weight'] > 1, 1, 0)
val_samp.to_csv('../data/books_edges_train_samp.csv')

In [50]:
val_samp = pd.read_csv('../data/books_edges_val_samp.csv', dtype={'product1': 'object', 'product2': 'object'})

In [51]:
word2id = load_model('../model/word2id')

2019-12-03 15:39:30,897 - Model loaded from: ../model/word2id (Size: 969863 bytes)


In [52]:
word2id_func =  np.vectorize(sequences.get_product_id)

In [53]:
val_samp['product1_id'] = word2id_func(val_samp['product1'].values)
val_samp['product2_id'] = word2id_func(val_samp['product2'].values)

In [54]:
val_samp = val_samp[(val_samp['product1_id'] > -1) & (val_samp['product2_id'] > -1)]

In [55]:
val_samp

Unnamed: 0.1,Unnamed: 0,product1,product2,edge,product1_id,product2_id
2834,2460300,0062060244,1578643031,1,2516,7166
5158,4981598,0060501960,0439441609,1,167,259
5954,2408135,1606903888,1616553707,1,3733,3745
6342,1651623,0989103137,1492206601,1,1519,1480
6352,7430236,1440213747,1607058529,1,5796,5781
...,...,...,...,...,...,...
96025,2622723,0071819541,0875632157,1,469,610
96551,4896177,0060580461,0812980557,1,3226,3115
97112,5555367,014311753X,0385349580,1,1188,1053
97705,4657423,0822572257,0824603621,1,6256,6231


In [43]:
product1_emb = model.get_center_emb(torch.LongTensor(product1_id))
product2_emb = model.get_center_emb(torch.LongTensor(product2_id))

RuntimeError: index out of range: Tried to access index -1 out of table with 7756 rows. at ../aten/src/TH/generic/THTensorEvenMoreMath.cpp:418

In [32]:
product1_emb

tensor([[-0.2244,  0.1584, -0.2018,  0.2065,  0.2086, -0.2308,  0.2165, -0.2227],
        [-0.2875, -0.1795,  0.0093,  0.2784,  0.2273, -0.2611,  0.0504, -0.2687],
        [-0.2470, -0.2357, -0.0490, -0.3754,  0.3267,  0.3455, -0.0134, -0.3363],
        [-0.2553, -0.1330, -0.1388, -0.1701,  0.2362, -0.2577,  0.0798, -0.0122],
        [-0.1892, -0.2706, -0.3198, -0.3673, -0.3417, -0.3226,  0.1317, -0.3117],
        [ 0.0570,  0.0895, -0.1997, -0.2253,  0.0674, -0.2361, -0.2316, -0.2013],
        [-0.1330, -0.2262, -0.3651, -0.0874, -0.0853, -0.3225, -0.3004, -0.2572],
        [-0.2493, -0.0244, -0.2456,  0.2454, -0.0101, -0.2079, -0.1544, -0.0653],
        [-0.3194, -0.3137,  0.3596, -0.2866, -0.2485,  0.0837, -0.2537, -0.3440],
        [-0.2278, -0.2868, -0.3016,  0.0993, -0.0986, -0.3850,  0.3154, -0.3355]],
       grad_fn=<EmbeddingBackward>)

In [None]:
cos_sim = F.cosine_similarity(product1_emb, product2_emb)
cos_sim

In [None]:
cos_sim.detach().numpy()

In [None]:
x = np.array([-0.2257,  0.2379, -0.2139,  0.2115,  0.2185, -0.2326,  0.2114, -0.2235])
y = np.array([-0.2150, -0.1220,  0.0284,  0.2917,  0.1297, -0.2589, -0.1423, -0.2585])

In [None]:
np.inner(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))

In [None]:
product1_tensor

In [None]:
print(emb)

In [None]:
skipgram.state_dict()