In [1]:
import os
import json

import torch
import numpy as np
import pandas as pd

from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from tqdm.notebook import tqdm

# Setup

In [2]:
embeddings = np.random.randn(4000, 10)

In [3]:
embeddings[30]

array([-1.30504386,  0.90032524, -1.31469559, -1.22241271, -2.15079437,
        1.32285032,  0.39819086,  1.3206229 , -2.7392509 , -0.03906275])

In [4]:
data_dir = os.path.join(os.curdir, "data")
vocab_path = os.path.join(data_dir, "word-level-vocab.json")
dataset_path = os.path.join(data_dir, "clean-tweets.tsv")

with open(vocab_path, "rt") as f:
    vocab = json.load(f)
    
dataset = pd.read_csv(filepath_or_buffer=dataset_path, sep="\t")

In [5]:
tweets = dataset["clean_text"].tolist()

In [6]:
OOV_TOKEN = "[OOV]"
PAD_TOKEN = "[PAD]"

OOV_INDEX = vocab.get(OOV_TOKEN)

print(f"Vocab Size = {len(vocab)}")

Vocab Size = 10998


In [7]:
tokenized_tweets = [[vocab.get(token) for token in tweet.split(" ") if token in vocab] for tweet in tweets]

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Skip Gram with Negative Sampling

In [9]:
tokenized_tweets

[[4, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17],
 [18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
 [32, 33, 35, 36, 39, 40, 41, 43, 45, 47, 48],
 [49, 50, 51, 52, 53, 54, 55, 56, 52, 57, 58, 59, 52, 60, 61, 62],
 [63, 64, 65, 66, 67, 68, 70],
 [71, 72, 73, 74, 75, 76, 77, 80],
 [83, 84, 85, 86, 87, 88, 89],
 [91, 92],
 [93,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  61,
  104,
  105,
  106,
  107,
  108,
  109],
 [57,
  110,
  111,
  112,
  113,
  61,
  75,
  114,
  115,
  116,
  117,
  61,
  118,
  119,
  120,
  121,
  122],
 [123, 124, 126, 127, 128, 129, 131, 132, 134, 135, 138],
 [140, 142, 143, 144, 147, 149, 150, 152],
 [156, 157, 158, 161, 23, 161],
 [163, 166, 168, 169, 172, 174, 175],
 [176, 177, 179, 180, 182, 183, 185, 186, 187, 188, 189, 190, 191, 192, 195],
 [196, 95, 198, 199, 200, 201, 203, 204, 205, 206, 207, 208, 209, 210, 211],
 [57, 110, 119, 212, 214, 216, 218],
 [220, 221, 222, 224, 225, 226, 227, 231],
 [235, 237, 238, 239, 240],
 [

In [10]:
WINDOW_SIZE = 4
NEGATIVE_SAMPLES_COUNT = 5

In [11]:
samples = []
actual_tokens = sorted(vocab.values())[2:]

for tweet in tqdm(tokenized_tweets):
    for index in range(WINDOW_SIZE, len(tweet) - WINDOW_SIZE):
        target = tweet[index]
        context = tweet[index - WINDOW_SIZE: index] + tweet[index + 1: index + WINDOW_SIZE + 1]
        samples.extend([(target, c, 1) for c in context])
        
        negative_samples = np.random.choice(a=actual_tokens, size=WINDOW_SIZE* 2, replace=False)
        samples.extend([(target, n, 0) for n in negative_samples])

samples = np.random.permutation(samples)

  0%|          | 0/4000 [00:00<?, ?it/s]

In [12]:
class Word2VecNgram(nn.Module):
    def __init__(self, embedding_size: int, vocab_size: int, padding_idx: int):
        super().__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.padding_idx = padding_idx
        
        self.target_embedding = nn.Embedding(num_embeddings=self.vocab_size, 
                                      embedding_dim=self.embedding_size, 
                                      padding_idx=self.padding_idx, max_norm=1)
        self.context_embedding = nn.Embedding(num_embeddings=self.vocab_size, 
                                      embedding_dim=self.embedding_size, 
                                      padding_idx=self.padding_idx, max_norm=1)
        
    
    def forward(self, target, context):
        target_embedding = self.target_embedding(target)
        context_embedding = self.context_embedding(context)
        
        output = torch.sum(target_embedding * context_embedding, -1).unsqueeze(1)
        
        return torch.sigmoid(output)

In [13]:
model = Word2VecNgram(embedding_size=256, vocab_size=len(vocab), padding_idx=vocab.get(PAD_TOKEN)).to(device)


In [14]:
dataset = torch.tensor(samples)
dataset = TensorDataset(dataset[:, 0], dataset[:, 1], dataset[:, 2])
dataloader = DataLoader(dataset=dataset, batch_size=64, shuffle=True)

In [15]:
critertion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

epochs = 5

In [None]:
for epoch in range(epochs):
    epoch_loss = 0
    for batch in dataloader:
        optimizer.zero_grad()
        target, context, label = batch

        target, context, label = target.to(device), context.to(device), label.to(device)

        prediction = model(target, context)

        loss = critertion(prediction, label.unsqueeze(1).float())

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    print(epoch_loss / len(dataloader))



In [None]:
vocab

In [None]:
embed = nn.Embedding.from_pretrained(embeddings=model.target_embedding.weight).cpu()

In [None]:
torch.argmax(torch.matmul(embed.weight , embed.weight[168].unsqueeze(1))) 

In [None]:
itos = {value: key for key, value in vocab.items()}

In [None]:
[itos[index.item()] for index in  torch.topk(torch.nn.functional.cosine_similarity(embed.weight, embed.weight[6487]), k=10).indices] 

In [None]:
all_words = [word for tweet in tweets for word in tweet.split(" ") if word in vocab]

In [None]:
from collections import Counter

In [None]:
counter = Counter(all_words)

In [None]:
vocab["النصر"]

In [None]:
counter.most_common()