In [3]:
import torch
import numpy as np

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device is: {device}\n")

Device is: cuda



# Tensors

#1. The only datatype the model will understand is tensor (multi dimensional matrix)

#2. You will have to convert the feature vector to a tensor 

#3. The entries in the tensor could be float, int,....

#4. https://pytorch.org/docs/stable/tensors.html (lists all the available types)

In [5]:
# general way of creating tensor - torch.tensor(<list/numpy array>, dtype=<dtype>)
x = torch.tensor([1,2,3,4])
y = torch.tensor([1,2,3,4],dtype=torch.float32)
z = torch.FloatTensor([1,2,3,4])

In [6]:
z

tensor([1., 2., 3., 4.])

In [7]:
z.dtype

torch.float32

In [8]:
x = np.array([1,2,3,4])
x_t = torch.tensor(x)

In [9]:
x_t

tensor([1, 2, 3, 4])

In [10]:
# you can get back numpy array back from tensor
x_n = x_t.numpy()

In [11]:
x_n

array([1, 2, 3, 4])

In [12]:
# similar to numpy there are different ways of creating tensors
x = torch.ones((1,8))
y = torch.zeros((1,2))

In [13]:
x

tensor([[1., 1., 1., 1., 1., 1., 1., 1.]])

In [14]:
# similar way of accessing elements as numpy
y[0][0] = 1.0

In [15]:
y[0,0] # this also works

tensor(1.)

In [16]:
# reshaping matrix
# We want to reshape the matrix of form 3x3 to 1x9
z = torch.tensor([[1,2,3],[4,5,6],[7,8,9]])
z.shape

torch.Size([3, 3])

In [17]:
w = z.reshape(1,9)
w.shape

torch.Size([1, 9])

In [18]:
# Now that we know how to create and manipulate tensors, we move onto the second part which is calculating gradients
# You donot need to calculate gradients from scratch..
# Autograd module in Pytorch does it for you

In [19]:
x.requires_grad

False

In [20]:
x = torch.ones((1,7),requires_grad=True)

In [21]:
y = torch.sum(x*x)

In [22]:
y.backward() # calculates gradient of y wrt all variables (x in this example)

In [23]:
print(x.grad) # dy/dx

tensor([[2., 2., 2., 2., 2., 2., 2.]])


# Creating a neural network

In [24]:
# We will implement the word2vec algorthm which is a simple feed-forward network

<img src="word2vec.png">

In [25]:
import torch.nn as nn

In [26]:
class Word2vec(nn.Module):
    def __init__(self, v_size, dimension):
        super().__init__()
        self.v_size = v_size # vocabulary size
        self.dim = dimension # dimension of embedding
        self.W_e = nn.Parameter(torch.rand((self.v_size, self.dim)))
        self.W_c = nn.Parameter(torch.rand((self.dim, self.v_size)))
        self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, t_w):
        e_w = self.W_e[t_w]
        out = torch.matmul(e_w, self.W_c)
        out = self.softmax(out)
        return out 

# Pre-processing

In [27]:
from nltk.tokenize import word_tokenize

In [28]:
# We will now generate center/target-context word pairs

In [29]:
sentences = []
with open('abc_news/abcnews-date-text.csv', 'r') as fs:
    for line in fs:
        #print(line)
        ind = line.find(",")                   
        text = line[ind+1:]
        sentences.append(text)

In [30]:
import string
puncts = string.punctuation

In [31]:
from nltk.corpus import stopwords
s_words = stopwords.words('english')

In [32]:
n_sentences = []
i = 0
for sent in sentences:
    words = word_tokenize(sent)
    words = [w.lower() for w in word_tokenize(sent) if w not in puncts and w.lower() not in s_words]
    n_sentences.append(words)
    i+=1
    if i==10000:
        break

In [33]:
n_sentences[0]

['aba', 'decides', 'community', 'broadcasting', 'licence']

In [34]:
len(n_sentences)

10000

In [35]:
from collections import Counter

In [36]:
word_count = Counter()

In [37]:
for sent in n_sentences:
    for w in sent:
        word_count[w]+=1

In [38]:
len(word_count)

9939

In [39]:
# Filtering out the words that occurred less than 5 times
word2index = {}
index2word = {}
i=0
for w in word_count:
    if word_count[w]>5:
        word2index[w] = i
        index2word[i] = w
        i+=1

In [40]:
len(word2index), len(index2word)

(1986, 1986)

In [41]:
# now we remove the words which appeared less than 5 times in the corpus
all_sentences = []
for sent in n_sentences:
    s = [w for w in sent if w in word2index]
    all_sentences.append(s)

In [42]:
from tqdm import tqdm

In [43]:
# we consider a context window of 2
window_size = 2
word_context_pairs = []
for sent in tqdm(all_sentences):
    #print(sent)
    for i, word in enumerate(sent):
        t_word = word2index[word]
        for j in range(1,window_size+1):
            if i-j>=0:
                c_word = word2index[sent[i-j]]
                word_context_pairs.append((t_word, c_word))
            if i+j<len(sent):
                c_word = word2index[sent[i+j]]
                word_context_pairs.append((t_word, c_word))

100%|██████████| 10000/10000 [00:00<00:00, 97242.54it/s]


In [44]:
len(word_context_pairs)

95444

# Training

In [45]:
def get_batches(batch_size=64):
    for i in range(0, len(word_context_pairs), batch_size):
        yield word_context_pairs[i:i+batch_size]

In [46]:
for batch in get_batches():
    print(batch)
    break

[(0, 1), (1, 0), (2, 3), (2, 4), (3, 2), (3, 4), (3, 5), (4, 3), (4, 5), (4, 2), (5, 4), (5, 3), (6, 7), (6, 8), (7, 6), (7, 8), (7, 9), (8, 7), (8, 9), (8, 6), (9, 8), (9, 7), (10, 11), (10, 12), (11, 10), (11, 12), (11, 13), (12, 11), (12, 13), (12, 10), (12, 14), (13, 12), (13, 14), (13, 11), (13, 15), (14, 13), (14, 15), (14, 12), (14, 16), (15, 14), (15, 16), (15, 13), (16, 15), (16, 14), (10, 11), (10, 14), (11, 10), (11, 14), (11, 17), (14, 11), (14, 17), (14, 10), (14, 18), (17, 14), (17, 18), (17, 11), (18, 17), (18, 14), (19, 20), (19, 21), (20, 19), (20, 21), (21, 20), (21, 19)]


In [47]:
X,y = list(zip(*batch))

In [48]:
from torch.optim import SGD # for optimization

In [49]:
model = Word2vec(len(word2index), 100)
epochs = 5
optimizer = SGD(model.parameters(), lr=0.001)
criterion = nn.NLLLoss() # negative log-likelihood loss
for e in range(epochs):
    for i, batch in tqdm(enumerate(get_batches())):
        X,y = list(zip(*batch))
        X = torch.tensor(X)
        y = torch.tensor(y)
        out = model(X)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

1492it [00:00, 2644.09it/s]
1492it [00:00, 2533.31it/s]
1492it [00:00, 2576.79it/s]
1492it [00:00, 2568.53it/s]
1492it [00:00, 2506.17it/s]


In [50]:
def generate_embeddings(model, word):
    index = word2index[word]
    return model.W_e[index].detach()

In [51]:
index2word[1]

'licence'

In [52]:
generate_embeddings(model, 'licence')

tensor([0.5728, 0.6929, 0.3726, 0.4095, 0.7465, 0.4397, 0.3540, 0.1893, 0.2499,
        0.1384, 0.1784, 0.9345, 0.5357, 0.7073, 0.0518, 0.8645, 0.8236, 0.6776,
        0.9739, 0.8267, 0.8614, 0.3344, 0.1896, 0.6776, 0.1808, 0.9190, 0.6413,
        0.7563, 0.8817, 0.1728, 0.5030, 0.0912, 0.7279, 0.1359, 0.4017, 0.7962,
        0.6545, 0.0746, 0.9268, 0.1942, 0.5248, 0.7611, 0.1689, 0.7647, 0.7647,
        0.9167, 0.0886, 0.9614, 0.3198, 0.2808, 0.9089, 0.4913, 0.6085, 0.7777,
        0.1603, 0.1803, 0.4515, 0.7839, 0.3217, 0.6338, 0.9672, 0.8717, 0.5108,
        0.0908, 0.4319, 0.0722, 0.4457, 0.8063, 0.1259, 0.7349, 0.9336, 0.4338,
        0.2817, 0.6508, 0.2732, 0.5957, 0.7631, 0.1359, 0.9362, 0.3379, 0.0844,
        0.5994, 0.9929, 0.6280, 0.5156, 0.1353, 0.6112, 0.5196, 0.8100, 0.4631,
        0.6538, 0.7504, 0.7148, 0.6516, 0.8599, 0.2443, 0.8138, 0.0674, 0.6872,
        0.8376])

# Tasks

Download the Game of Thrones Corpus. Perform pre-processing to remove stopwords and punctuations.   

In [53]:
from nltk.corpus import stopwords
import string
import os
import re

In [54]:
stop_words = stopwords.words('english')
punctuations = string.punctuation
punctuations = punctuations + '”“’'

In [55]:
whole_text = []
for filename in os.listdir('GOT'):
    if filename.endswith('.txt'):
        with open(os.path.join('GOT', filename)) as file:
            for line in file:
                whole_text.append(line)
whole_text[:10]

['This edition contains the complete text of the original hardcover edition.\n',
 '\n',
 'NOT ONE WORD HAS BEEN OMITTED.\n',
 '\n',
 'A CLASH OF KINGS\n',
 '\n',
 'A Bantam Spectra Book\n',
 '\n',
 'PUBLISHING HISTORY\n',
 '\n']

In [56]:
tokenized_lines = []
for line in whole_text:
    words = [w.lower() for w in word_tokenize(line) if w not in punctuations and w.lower() not in s_words]
    tokenized_lines.append(words)
tokenized_lines[:5]

[['edition',
  'contains',
  'complete',
  'text',
  'original',
  'hardcover',
  'edition'],
 [],
 ['one', 'word', 'omitted'],
 [],
 ['clash', 'kings']]

In [57]:
with open('cleaned_text.txt', 'w') as file:
    for line in tokenized_lines:
        if not line:
            continue
        file.write(' '.join(line) + '\n')

Obtain the word2index and index2word maps for all the unique words in the corpus

In [58]:
word_count = Counter()
for line in tokenized_lines:
    for word in line:
        word_count[word] += 1
print(len(word_count))

27848


In [59]:
word2index = {}
index2word = {}
i = 0
for w in word_count:
    if word_count[w]>=5:
        word2index[w] = i
        index2word[i] = w
        i += 1

In [60]:
cleaned_text = []
for line in tokenized_lines:
    l = [w for w in line if w in word2index]
    cleaned_text.append(l)
print(cleaned_text[:5])

[['edition', 'complete', 'edition'], [], ['one', 'word'], [], ['clash', 'kings']]


In [61]:
with open('cleaned_text.txt', 'w') as file:
    for line in cleaned_text:
        if not line:
            continue
        file.write(' '.join(line) + '\n')

Train word2vec algorithm on this corpus.

In [62]:
# we consider a context window of 2
window_size = 2
word_context_pairs = []
for line in tqdm(cleaned_text):
    for i, word in enumerate(line):
        t_word = word2index[word]
        for j in range(1,window_size+1):
            if i-j>=0:
                c_word = word2index[line[i-j]]
                word_context_pairs.append((t_word, c_word))
            if i+j<len(line):
                c_word = word2index[line[i+j]]
                word_context_pairs.append((t_word, c_word))

100%|██████████| 92493/92493 [00:01<00:00, 85553.83it/s] 


In [63]:
def get_batches(batch_size=64):
    for i in range(0, len(word_context_pairs), batch_size):
        yield word_context_pairs[i:i+batch_size]

In [65]:
model = Word2vec(len(word2index), 100).to(device)
epochs = 5
optimizer = SGD(model.parameters(), lr=0.001)
criterion = nn.NLLLoss() # negative log-likelihood loss
for e in range(epochs):
    for i, batch in tqdm(enumerate(get_batches())):
        X,y = list(zip(*batch))
        X = torch.tensor(X).to(device)
        y = torch.tensor(y).to(device)
        out = model(X)
        loss = criterion(out, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

51502it [00:27, 1865.33it/s]
51502it [00:27, 1858.92it/s]
51502it [00:26, 1934.79it/s]
51502it [00:27, 1889.08it/s]
51502it [00:26, 1916.63it/s]


In [66]:
def generate_embeddings(model, word):
    index = word2index[word]
    return model.W_e[index].detach()

In [67]:
print(generate_embeddings(model, 'sansa'))

tensor([0.3638, 0.3721, 0.3662, 0.2150, 0.3640, 0.5664, 0.9018, 0.2554, 0.9234,
        0.1999, 0.5275, 0.1165, 0.6489, 0.3467, 0.6048, 0.0961, 0.0184, 0.7646,
        0.2026, 0.9545, 0.9708, 0.1207, 0.8303, 0.9559, 0.8189, 0.3335, 0.1994,
        0.6889, 0.8779, 0.5850, 0.6059, 0.3932, 0.9026, 0.1976, 0.8631, 0.9511,
        0.3445, 0.7543, 0.9619, 0.5030, 0.6944, 0.9572, 0.2966, 0.2563, 0.6139,
        0.1872, 0.2527, 0.9310, 0.7091, 0.6723, 0.7228, 0.0354, 0.2942, 0.4424,
        0.7459, 0.8883, 0.2450, 0.8049, 0.0234, 0.5913, 0.7089, 0.4237, 0.9630,
        0.7962, 0.9481, 0.2312, 0.8720, 0.2341, 0.2099, 0.3356, 0.7303, 0.0973,
        0.3383, 0.8200, 0.7255, 0.5373, 0.7961, 0.9242, 0.1236, 0.8007, 0.4954,
        0.6857, 0.2211, 0.5509, 0.6727, 0.8436, 0.8034, 0.2699, 0.9625, 0.9039,
        0.0415, 0.1289, 0.2299, 0.8344, 0.3997, 0.0583, 0.2278, 0.1008, 0.1953,
        0.1639], device='cuda:0')


Write a function to compute similarity between embeddings of two words

In [68]:
from sklearn.metrics.pairwise import cosine_similarity

Compute similarity between two characters in GOT.

In [76]:
embedding_1 = generate_embeddings(model, 'daenerys').to('cpu')
embedding_2 = generate_embeddings(model, 'tyrion').to('cpu')
print(cosine_similarity(embedding_1.reshape(1, -1), embedding_2.reshape(1, -1)))

[[0.77543]]
