<a href="https://colab.research.google.com/github/JonathanSum/TorchAudio_and_TorchTextNotes/blob/main/Represent_words_with_embeddings(0_10)).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget -q https://raw.githubusercontent.com/MicrosoftDocs/pytorchfundamentals/main/nlp-pytorch/torchnlp.py

In [27]:
import torch
import torchtext
import numpy as np
from torchnlp import *
train_dataset, test_dataset, classes, vocab = load_dataset()
vocab_size = len(vocab)
print("Vocab size = ",vocab_size)

Loading dataset...
Building vocab...
Vocab size =  95811


In [4]:
class EmbedClassifier(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embed_dim)
        self.fc = torch.nn.Linear(embed_dim, num_class)

    def forward(self, x):
        x = self.embedding(x)
        x = torch.mean(x,dim=1)
        return self.fc(x)

In [5]:
def padify(b):
    # b is the list of tuples of length batch_size
    #   - first element of a tuple = label, 
    #   - second = feature (text sequence)
    # build vectorized sequence
    v = [encode(x[1]) for x in b]
    # first, compute max length of a sequence in this minibatch
    l = max(map(len,v))
    return ( # tuple of two tensors - labels and features
        torch.LongTensor([t[0]-1 for t in b]),
        torch.stack([torch.nn.functional.pad(torch.tensor(t),(0,l-len(t)),mode='constant',value=0) for t in v])
    )

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, collate_fn=padify, shuffle=True)

In [5]:
torchtext.__version__

'0.10.0'

In [6]:
class EmbedClassifier(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = torch.nn.EmbeddingBag(vocab_size, embed_dim)
        self.fc = torch.nn.Linear(embed_dim, num_class)

    def forward(self, text, off):
        x = self.embedding(text, off)
        return self.fc(x)

In [7]:
net = EmbedClassifier(vocab_size,32,len(classes)).to(device)

In [8]:
def offsetify(b):
    # first, compute data tensor from all sequences
    x = [torch.tensor(encode(t[1])) for t in b]
    # now, compute the offsets by accumulating the tensor of sequence lengths
    o = [0] + [len(t) for t in x]
    o = torch.tensor(o[:-1]).cumsum(dim=0)
    return ( 
        torch.LongTensor([t[0]-1 for t in b]), # labels
        torch.cat(x), # text 
        o
    )

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, collate_fn=offsetify, shuffle=True)

In [None]:
# i = 1
# box1 = None
# box2 = None
# for labels,text,off in train_loader:
#     box1 = text
#     box2 = off
#     print(text.shape)
#     print(off)
#     break

torch.Size([778])
tensor([  0,  37,  62,  90, 134, 176, 320, 383, 430, 471, 510, 558, 604, 644,
        699, 738])


In [None]:
# len(box1)

688

In [9]:
net = EmbedClassifier(vocab_size,32,len(classes)).to(device)

def train_epoch_emb(net,dataloader,lr=0.01,optimizer=None,loss_fn = torch.nn.CrossEntropyLoss(),epoch_size=None, report_freq=200):
    optimizer = optimizer or torch.optim.Adam(net.parameters(),lr=lr)
    loss_fn = loss_fn.to(device)
    net.train()
    total_loss,acc,count,i = 0,0,0,0
    for labels,text,off in dataloader:
        optimizer.zero_grad()
        labels,text,off = labels.to(device), text.to(device), off.to(device)
        out = net(text, off)
        loss = loss_fn(out,labels) #cross_entropy(out,labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss
        _,predicted = torch.max(out,1)
        acc+=(predicted==labels).sum()
        count+=len(labels)
        i+=1
        if i%report_freq==0:
            print(f"{count}: acc={acc.item()/count}")
        if epoch_size and count>epoch_size:
            break
    return total_loss.item()/count, acc.item()/count


# train_epoch_emb(net,train_loader, lr=4, epoch_size=25000)

In [10]:
import gensim.downloader as api
w2v = api.load('word2vec-google-news-300')

KeyboardInterrupt: ignored

In [None]:
for w,p in w2v.most_similar('neural'):
    print(f"{w} -> {p}")

In [None]:
w2v.most_similar(positive=['king','woman'],negative=['man'])[0]

In [None]:
# vocab.get_itos()

In [None]:
embed_size = len(w2v.get_vector('hello'))
print(f'Embedding size: {embed_size}')

net = EmbedClassifier(vocab_size,embed_size,len(classes))

print('Populating matrix, this will take some time...',end='')
found, not_found = 0,0
for i,w in enumerate(vocab.get_itos()):
    try:
        net.embedding.weight[i].data = torch.tensor(w2v[w])
        found+=1
    except:
        net.embedding.weight[i].data = torch.normal(0.0,1.0,(embed_size,))
        not_found+=1

print(f"Done, found {found} words, {not_found} words missing")
net = net.to(device)

In [None]:
train_epoch_emb(net,train_loader, lr=4, epoch_size=25000)

In [28]:
vocab = torchtext.vocab.GloVe(name='6B', dim=50)
# vocab = torchtext.vocab.FastText(language='en')


.vector_cache/glove.6B.zip: 0.00B [00:00, ?B/s][A
.vector_cache/glove.6B.zip:   0%|          | 8.19k/862M [00:01<37:39:07, 6.36kB/s][A
.vector_cache/glove.6B.zip:   0%|          | 16.4k/862M [00:01<27:40:59, 8.65kB/s][A
.vector_cache/glove.6B.zip:   0%|          | 49.2k/862M [00:01<19:42:35, 12.2kB/s][A
.vector_cache/glove.6B.zip:   0%|          | 98.3k/862M [00:01<14:01:04, 17.1kB/s][A
.vector_cache/glove.6B.zip:   0%|          | 205k/862M [00:01<9:54:48, 24.2kB/s]  [A
.vector_cache/glove.6B.zip:   0%|          | 426k/862M [00:02<6:59:12, 34.3kB/s][A
.vector_cache/glove.6B.zip:   0%|          | 860k/862M [00:02<4:54:48, 48.7kB/s][A
.vector_cache/glove.6B.zip:   0%|          | 1.73M/862M [00:02<3:26:54, 69.3kB/s][A
.vector_cache/glove.6B.zip:   0%|          | 3.47M/862M [00:02<2:24:55, 98.8kB/s][A
.vector_cache/glove.6B.zip:   1%|          | 6.34M/862M [00:02<1:41:20, 141kB/s] [A
.vector_cache/glove.6B.zip:   1%|          | 9.27M/862M [00:02<1:10:55, 200kB/s][A
.vector_cac

In [29]:
# get the vector corresponding to kind-man+woman
qvec = vocab.vectors[vocab.stoi['king']]-vocab.vectors[vocab.stoi['man']]+1.3*vocab.vectors[vocab.stoi['woman']]
# find the index of the closest embedding vector 
d= torch.sum((vocab.vectors-qvec)**2,dim=1)
min_idx = torch.argmin(d)
# find the corresponding word
vocab.itos[min_idx]

'queen'

In [86]:
def offsetify(b):
    # first, compute data tensor from all sequences
    # for t in b:
    #   print(t)
    x = [torch.tensor(encode(t[1],voc=vocab)) for t in b] # pass the instance of vocab to encode function!
    # now, compute the offsets by accumulating the tensor of sequence lengths
    o = [0] + [len(t) for t in x]
    o = torch.tensor(o[:-1]).cumsum(dim=0)
    return ( 
        torch.LongTensor([t[0]-1 for t in b]), # labels
        torch.cat(x), # text 
        o
    )

In [87]:
net = EmbedClassifier(len(vocab),len(vocab.vectors[0]),len(classes))
net.embedding.weight.data = vocab.vectors
net = net.to(device)

In [43]:
words1 = encode("Happy Sugar Sugar",voc=vocab)

In [52]:
t1 ='Lessons from Hollinger fiasco Hollinger International Inc. #39;s special committee #39;s report that details the apparent abuse by senior executives at the company, primarily Conrad Black and David Radler, is the latest in a series of scandals that have shocked corporate boardrooms across the '

In [53]:
words1 = encode(t1,voc=vocab)

In [81]:
words1.shape

AttributeError: ignored

In [83]:
len(words1)

49

In [85]:
torch.cat(words1,0).size()

torch.Size([2450])

In [91]:
type(words1)

49

In [95]:
 torch.tensor([t.numpy() for t in words1]).shape

torch.Size([49, 50])

In [90]:
words1[0].shape

torch.Size([50])

In [None]:
x1 = [torch.tensor(encode("Sugar",voc=vocab).item)]


In [88]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, collate_fn=offsetify, shuffle=True)
train_epoch_emb(net,train_loader, lr=4, epoch_size=25000)

ValueError: ignored