# Import Necessary Libraries

In [1]:
import torch
import torchtext
print(torch.__version__)
print(torchtext.__version__)

1.13.1
0.14.1


# Set device as GPU if available

In [2]:
if torch.cuda.is_available():
    dev = 'cuda'
elif torch.backends.mps.is_available():
    dev = 'mps'
else:
    dev = 'cpu'
device = torch.device(dev)    
print(f'Device: {device}')

Device: mps


# PyTorch warmup

### 1. Use torch.randn to create two tensors of size (29, 30, 32) and (32, 100)

In [3]:
tensor_a = torch.randn(29, 30, 32)
print('Tensor of Size (29, 30, 32): \n' , tensor_a)

tensor_b = torch.randn(32, 100)
print('Tensor of Size (32, 100): \n' , tensor_b)

Tensor of Size (29, 30, 32): 
 tensor([[[ 0.3367, -0.0778,  0.0546,  ...,  0.8098, -0.2722,  0.0287],
         [-0.2116, -0.6549,  1.6391,  ..., -0.4656, -0.7392, -0.2576],
         [ 0.5508,  0.4069,  1.3977,  ...,  0.3417, -0.3126, -1.4643],
         ...,
         [ 0.0874,  0.4584,  0.2170,  ..., -1.2799,  0.0577,  0.3215],
         [-0.6971,  1.4693,  0.2514,  ..., -0.1359, -0.6667, -0.1972],
         [ 1.1785, -0.2125,  0.2029,  ...,  0.2271,  1.0148, -0.5254]],

        [[-1.1548, -0.0106,  0.3458,  ...,  1.6871,  0.0092, -0.1228],
         [ 0.6263,  0.4055, -0.6232,  ..., -0.1954, -0.6879, -1.0670],
         [ 1.0371,  0.0730, -0.2725,  ...,  0.5270, -0.3998,  1.4140],
         ...,
         [ 0.7513,  1.6772,  0.2534,  ..., -0.1046, -0.6020,  1.3206],
         [-1.1698,  0.9286, -1.1145,  ...,  0.6316,  0.8283,  1.4860],
         [-1.1292, -1.1039, -0.5689,  ..., -0.9585, -0.6074,  0.7523]],

        [[-0.7420, -1.1133,  0.5136,  ..., -1.4099, -1.0709,  0.5335],
         [-0.1

### 2. Use  torch.matmul  to matrix multiply the two tensors

In [4]:
product = torch.matmul(tensor_a, 
                       tensor_b)
print('Product of tensor_a and tensor_b: \n' , product)

print('Shape of Tensor: ' , product.shape)

Product of tensor_a and tensor_b: 
 tensor([[[-5.9818e+00, -1.7661e+00, -1.6034e+00,  ..., -3.0387e+00,
          -3.7300e-01, -1.1283e+00],
         [ 4.9403e+00, -2.2627e-01,  1.4862e+00,  ..., -1.2954e+01,
           1.8034e+00,  2.4297e+00],
         [-1.8042e+00, -1.0814e+01,  1.9619e+00,  ...,  2.5432e-01,
          -2.8195e-01, -1.5294e+00],
         ...,
         [-1.9089e-01,  8.1660e+00,  3.3491e+00,  ...,  9.2679e+00,
           5.1710e+00, -5.0976e+00],
         [-3.1682e-01, -9.1286e-01, -4.4653e+00,  ...,  3.5076e+00,
          -5.4038e+00,  8.5440e-01],
         [-7.3638e+00,  2.7562e+00,  1.4914e+00,  ..., -2.3446e+00,
          -5.7287e+00,  7.7993e-01]],

        [[-4.8814e+00, -4.5118e+00,  5.8500e+00,  ..., -1.7505e+00,
          -9.7674e+00,  6.5590e+00],
         [-1.1605e+01, -8.1115e-01, -2.7348e+00,  ...,  3.6612e+00,
          -1.6930e+00,  2.9778e+00],
         [ 1.3404e+00,  5.8532e+00, -1.3133e+01,  ...,  2.7410e+00,
          -3.1024e+00, -2.4988e+00],
   

### 3. What is the difference between torch.matmul , torch.mm , torch.bmm , and torch.einsum , and the @ operator?

1. torch.matmul() ->
2. torch.mm() ->
3. torch.bmm() ->
4. torch.einsum() ->
5. @ operator -> It is a shorthand for the torch.matmul() function.

### 4. Use torch.sum on the resulting tensor, passing the optional argument of dim=1 to sum across the 1st dimension. Before you run this, can you predict the size?

In [5]:
tensor_sum = torch.sum(product, 
                       dim = 1)
print('Sum of Tensor across 1st Dimension: \n' , tensor_sum)

print('Shape of Tensor: ' , tensor_sum.shape)

Sum of Tensor across 1st Dimension: 
 tensor([[ -0.3894, -43.6030,  -4.2514,  ...,   0.7059,   3.6391,  20.6740],
        [-73.2093,  69.4370, -48.8484,  ...,  55.7981, -16.5465,  42.0442],
        [ 29.6912,  45.8306,   9.9017,  ...,  28.8507,  19.8293, -26.6173],
        ...,
        [-23.6733, -24.7574, -25.7522,  ..., -11.3934, -32.3515,  54.4481],
        [-33.7505,  31.8131,  45.4374,  ...,  18.3431,   1.0394,  32.7575],
        [ -0.6389, -63.0511,  38.4923,  ...,  -2.8087,  13.5516,  45.7638]])
Shape of Tensor:  torch.Size([29, 100])


### 5. Create a new long tensor of size  (3, 10)

In [6]:
long_tensor = torch.ones((3, 10), 
                         dtype = torch.long)
print('Long Tensor: \n' , long_tensor)

long_tensor[0, 0] = 2
long_tensor[1, 2] = 4
long_tensor[2, 4] = 6

print('Updated Long Tensor: \n' , long_tensor)

Long Tensor: 
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Updated Long Tensor: 
 tensor([[2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 4, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 6, 1, 1, 1, 1, 1]])


### 6. Use this new long tensor to index into the tensor from step 2

In [7]:
indexed_tensor = product[long_tensor]
print('Indexed Tensor: \n' , indexed_tensor)

print('Shape of Tensor: ' , indexed_tensor.shape)

Indexed Tensor: 
 tensor([[[[ 9.3840e+00,  4.8832e-01, -3.3650e+00,  ..., -4.5400e+00,
           -7.1142e-01, -7.8463e-01],
          [-5.2652e+00,  6.5980e+00,  5.4461e+00,  ...,  5.3439e+00,
            7.1507e+00, -3.1384e+00],
          [-7.6232e+00, -9.1756e-01, -2.4831e+00,  ..., -6.1729e+00,
           -4.1371e-02,  1.0506e+01],
          ...,
          [-9.3179e+00, -4.4575e+00,  5.9407e+00,  ...,  9.2306e+00,
           -4.2039e+00,  1.1471e+01],
          [ 9.2039e+00,  4.2572e+00,  1.2229e+01,  ...,  4.4297e+00,
            4.0283e+00,  4.2574e+00],
          [ 7.9123e-03, -4.2531e+00,  8.1606e+00,  ...,  2.4967e+00,
            6.3144e+00, -1.6940e+00]],

         [[-4.8814e+00, -4.5118e+00,  5.8500e+00,  ..., -1.7505e+00,
           -9.7674e+00,  6.5590e+00],
          [-1.1605e+01, -8.1115e-01, -2.7348e+00,  ...,  3.6612e+00,
           -1.6930e+00,  2.9778e+00],
          [ 1.3404e+00,  5.8532e+00, -1.3133e+01,  ...,  2.7410e+00,
           -3.1024e+00, -2.4988e+00],
  

### 7. Use  torch.mean  to average across the last dimension in the tensor from step 6

In [8]:
mean_tensor = torch.mean(indexed_tensor, 
                         dim = 3)
print(mean_tensor)

print('Shape of Tensor: ' , mean_tensor.shape)

tensor([[[ 0.2157,  1.0234, -0.8735,  0.1761,  0.2776, -0.4006,  1.1887,
           0.4868,  0.2982, -0.5467,  0.3171,  0.6192,  0.1026,  0.5908,
          -0.8421,  0.2589, -1.1169, -0.1902,  0.1689,  0.9301,  0.1618,
           0.6100,  0.7669,  0.6138, -0.7244, -0.8433, -0.2402, -0.0565,
           0.6954, -0.1287],
         [ 0.3585, -0.2706, -0.6280,  0.2025, -0.3965,  0.0791, -0.2661,
           1.6628, -0.5922, -0.0110, -0.3937, -0.6176,  0.0131, -0.2943,
          -0.2396,  0.0329, -0.5518,  0.6359, -0.0845,  0.1698,  0.1838,
           0.1674,  0.1482, -0.1087, -0.1855,  0.4301, -0.6898,  0.0579,
           0.3037,  0.9831],
         [ 0.3585, -0.2706, -0.6280,  0.2025, -0.3965,  0.0791, -0.2661,
           1.6628, -0.5922, -0.0110, -0.3937, -0.6176,  0.0131, -0.2943,
          -0.2396,  0.0329, -0.5518,  0.6359, -0.0845,  0.1698,  0.1838,
           0.1674,  0.1482, -0.1087, -0.1855,  0.4301, -0.6898,  0.0579,
           0.3037,  0.9831],
         [ 0.3585, -0.2706, -0.6280, 

### 8. Redo step 2. on the GPU and compare results from step 2

In [9]:
tensor_a_cuda = tensor_a.to(device = device)
tensor_b_cuda = tensor_b.to(device = device)
product_gpu = torch.matmul(tensor_a, tensor_b)
print('Product of tensor_a and tensor_b on GPU: \n' , product_gpu)

print('Shape of Tensor: ' , product_gpu.shape)

Product of tensor_a and tensor_b on GPU: 
 tensor([[[-5.9818e+00, -1.7661e+00, -1.6034e+00,  ..., -3.0387e+00,
          -3.7300e-01, -1.1283e+00],
         [ 4.9403e+00, -2.2627e-01,  1.4862e+00,  ..., -1.2954e+01,
           1.8034e+00,  2.4297e+00],
         [-1.8042e+00, -1.0814e+01,  1.9619e+00,  ...,  2.5432e-01,
          -2.8195e-01, -1.5294e+00],
         ...,
         [-1.9089e-01,  8.1660e+00,  3.3491e+00,  ...,  9.2679e+00,
           5.1710e+00, -5.0976e+00],
         [-3.1682e-01, -9.1286e-01, -4.4653e+00,  ...,  3.5076e+00,
          -5.4038e+00,  8.5440e-01],
         [-7.3638e+00,  2.7562e+00,  1.4914e+00,  ..., -2.3446e+00,
          -5.7287e+00,  7.7993e-01]],

        [[-4.8814e+00, -4.5118e+00,  5.8500e+00,  ..., -1.7505e+00,
          -9.7674e+00,  6.5590e+00],
         [-1.1605e+01, -8.1115e-01, -2.7348e+00,  ...,  3.6612e+00,
          -1.6930e+00,  2.9778e+00],
         [ 1.3404e+00,  5.8532e+00, -1.3133e+01,  ...,  2.7410e+00,
          -3.1024e+00, -2.4988e+0

### 9. Write a pure PyTorch program to compute the value of $\sqrt{2}$ up to 4 decimal places without using the square root or other math functions from any of the libraries. 
### Hint: Notice that the answer is the (positive) root of the equation, $$𝑥^2 −2 = 0$$ 
### To find the root, you might want to use "Newton's Method": $$𝑥_{𝑛+1} = 𝑥_{𝑛} − \frac{𝑓(𝑥)}{𝑓′(𝑥)}$$


# Fail-fast prototyping

When building neural networks, you want things to either work or fail fast. Long iteration loops are 
the worst enemy of a machine learning practitioner. \
For e.g., while writing code, you might want to incrementally test your code by doing something 
like this:

batch_size = 32 \
num_features = 512 \
embedding_size = 16

\# construct a dummy input \
x = torch.randn(batch_size, num_features)

\# we want to project the input to embedding_size \
fc = torch.nn.Linear(num_features, embedding_size)

\# test if that works \
print(fc(x).shape)

# Fail-fast exercises

### 1. [Glove](https://nlp.stanford.edu/projects/glove/) has 300 dimension embeddings. Design an nn.Module that takes a sentence of max_len words, tokenizes words by spaces, represents the sentence by averaging the glove embeddings of constituent words. What is the shape of the resulting sentence embedding? When you implement this, you will need to make some assumptions. What are they?

In [66]:
# Load GloVe Embeddings
from torchtext.vocab import GloVe

GLOVE_DIM = 300
glove = GloVe(name = '840B', 
              dim = GLOVE_DIM)

print(f'Loaded {len(glove.itos)} words present in GloVe')

embeddings_tensor = glove.vectors
embeddings_tensor = embeddings_tensor.to(device = device)

Loaded 2196017 words present in GloVe


In [11]:
NUM_SENT = 512
sents = list()
for i in range(NUM_SENT):
    sents.append('This is the quest zero and it has a deadline this Sunday March 29')
print(len(sents))

512


In [12]:
import torch.nn as nn

class GloveEmbeddingAvg(nn.Module):
    
    def __init__(self, max_len):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding.from_pretrained(embeddings_tensor)
        
    def forward(self, 
                sent):
        # Tokenize the sentence by spaces
        tokens = sent.split(' ')[:self.max_len]
        # Get idx of each token from the GloVe dictionary
        glove_dict_indexes = [glove.stoi[token] for token in tokens]
        # Convert it into Tensor
        glove_dict_indexes = torch.tensor(glove_dict_indexes, 
                                          device = device)
        # Get Word Embeddings for all tokens
        word_embeds = self.embedding(glove_dict_indexes)
        # Sentence Embedding = Average of Word Embeddings
        sent_embeds = word_embeds.mean(dim = 0)
        # Reshape Sentence Embedding as a 2D Tensor
        return sent_embeds.view(1, -1)

MAX_LEN = 10
glove_embeds_avg = GloveEmbeddingAvg(MAX_LEN).to(device)

print(glove_embeds_avg(sents[0]).shape)

torch.Size([1, 300])


### 2. How will you modify step 1. so that the sentence embeddings are in $R^{50}$ ?
BONUS: Can you think of more than one way to do this? What are the implications of each method?

In [13]:
class GloveEmbeddingAvg_50_Dim(nn.Module):
    
    def __init__(self, max_len):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding.from_pretrained(embeddings_tensor)
        self.fc = nn.Linear(GLOVE_DIM, 50)
        
    def forward(self, 
                x):
        # Slice each sentence to Max Length
        x = x[:, :self.max_len]
        # Get Word Embeddings for all tokens
        word_embeds = self.embedding(x) # [BATCH_SIZE, MAX_LEN, GLOVE_DIM]
        # Sentence Embedding = Average of Word Embeddings
        sent_embeds = word_embeds.mean(dim = 0) # [MAX_LEN, GLOVE_DIM]
        # Linear Layer to reduce Sentence Embedding Dimension to 50
        return self.fc(sent_embeds) # [MAX_LEN, 50]

MAX_LEN = 10
glove_embeds_avg_50_dim = GloveEmbeddingAvg_50_Dim(MAX_LEN).to(device)

### 3. Quickly test your answer in step 2. with a batch of 512 sentences on the GPU.

In [18]:
# Tokenize a Sentence
def tokenize(sent):
    # Tokenize the sentence by spaces
    tokens = sent.split(' ')
    # Get idx of each token from the GloVe dictionary
    glove_dict_indexes = [glove.stoi[token] for token in tokens]
    return glove_dict_indexes

# Create Tokenized Sentence Corpus
tokenized_sents = list()
for sent in sents:
    tokenized_sents.append(tokenize(sent))
tokenized_sents = torch.tensor(tokenized_sents, 
                               device = device)

# Run forward pass
BATCH_SIZE = 512
for i in range(0, len(tokenized_sents), BATCH_SIZE):
    batch = tokenized_sents[i:i+BATCH_SIZE]
    sentence_embeddings = glove_embeds_avg_50_dim(batch)
    print(sentence_embeddings.shape)

torch.Size([10, 50])


### Congratulations! You almost implemented the model in the Deep Averaging Networks (DAN) paper!

# 4. Task: 
### Create a   MultiEmbedding  Module that can take two sets of indices, embed them, and concat the results. You might remember it from the previous lecture where we had to produce an embedding for "green apple" from embeddings of "green" and "apple". Your  MultiEmbedding class should work with the following test code.

In [19]:
class MultiEmbedding(nn.Module):
    
    def __init__(self, 
                 num_emb, 
                 size_emb1, 
                 size_emb2):
        super().__init__()
        self.embedding_A = nn.Embedding(num_emb, size_emb1)
        self.embedding_B = nn.Embedding(num_emb, size_emb2)
        
    def forward(self, 
                indices1, 
                indices2):
        embed_A = self.embedding_A(indices1)
        embed_B = self.embedding_B(indices2)
        # Concatenate the Embeddings
        return torch.cat((embed_A, embed_B), 
                         dim = -1)

In [20]:
# Test code: instantiate a MultiEmbedding with the sizes for each embedding. 
# For this example, you can just randomly initialize each interior embedding. 
# In a practical setting, you might support methods for initializing with 
# combinations of embeddings, such as GloVe 300d vectors and word2vec 200d 
# vectors, yielding 500d embeddings. Both embeddings share a vocabulary/range 
# of supported indices indicated by `num_emb`

NUM_EMB = 10000
SIZE_EMB1 = 300
SIZE_EMB2 = 200
BATCH_SIZE = 64
NUM_LENGTH = 10

multiemb = MultiEmbedding(NUM_EMB, 
                          SIZE_EMB1, 
                          SIZE_EMB2).to(device)

# You can then call this with a pair of indices where each value is in 0 <= i < num_emb
indices1 =  torch.randint(0, 
                          NUM_EMB, 
                          (BATCH_SIZE, NUM_LENGTH), 
                          dtype = torch.long, 
                          device = device) # long tensor of shape (batch, num_length)
indices2 =  torch.randint(0, 
                          NUM_EMB, 
                          (BATCH_SIZE, NUM_LENGTH), 
                          dtype = torch.long, 
                          device = device) # long tensor of shape (batch, num_length)
output = multiemb(indices1, 
                  indices2)
print(output.shape) # should be (batch, num_length, size_emb1 + size_emb2)

torch.Size([64, 10, 500])


# 5. Datasets and DataLoaders: 
### Read this short post on PyTorch Dataset and DataLoaders. Often in prototyping we need to generate dummy datasets to test our models. Implement a PyTorch Dataset class that generates up to  num_sentences  random sentences of length up to  max_len words. For each sentence, generate a binary label. You should be able to test your code as follows:

In [82]:
class DeepAveragingNetwork(nn.Module):
    
    def __init__(self, max_len):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding.from_pretrained(embeddings_tensor)
        self.fc = nn.Linear(GLOVE_DIM, 50)
        
    def forward(self, 
                sent):
        # Tokenize the sentence by spaces
        tokens = sent.split(' ')[:self.max_len]
        # Get idx of each token from the GloVe dictionary
        glove_dict_indexes = [glove.stoi[token] for token in tokens]
        # Convert it into Tensor
        glove_dict_indexes = torch.tensor(glove_dict_indexes, 
                                          device = device)
        # Get Word Embeddings for all tokens
        word_embeds = self.embedding(glove_dict_indexes)
        # Sentence Embedding = Average of Word Embeddings
        sent_embeds = word_embeds.mean(dim = 0)
        return self.fc(sent_embeds.view(1, -1))

model = DeepAveragingNetwork(MAX_LEN).to(device)

In [89]:
import random
from torch.utils.data import Dataset

class DummySentenceLabelDataset(Dataset):
    
    def __init__(self, 
                 num_sentences, 
                 max_len):
        self.num_sentences = num_sentences
        self.max_len = max_len
        self.sents = self.generate_sents()
        self.labels = self.generate_labels()
        
    def __len__(self):
        return self.num_sentences
        
    def __getitem__(self, 
                    idx):
        return self.sents[idx], self.labels[idx]
    
    def get_random_sent(self, word_list):
        sent = ''
        for i in range(self.max_len):
            word = random.choice(word_list)
            sent += word + ' '
        return sent
    
    def generate_sents(self):
        word_list = ['Hello', 'World', 'Python', 'Function', 'Random', 'Sentence', 
                     'List', 'Words', 'Generates', '20', 'Example', 'Simple', 'Program', 
                     'Easy', 'Understand', 'Learn', 'Code', 'Implement', 'Execute', 'Run', 
                     'Brazil', 'India', 'Chat', 'India', 'Golden', 'State', 'Warriors']
        sents = list()
        for i in range(self.num_sentences):
            sent = self.get_random_sent(word_list)
            sents.append(sent)
        return sents
    
    def generate_labels(self):
        labels = list()
        for i in range(self.num_sentences):
            labels.append(random.randint(0, 1))
        return labels

NUM_SENTENCES = 10
MAX_LEN = 20
dataset = DummySentenceLabelDataset(num_sentences = NUM_SENTENCES, 
                                    max_len = MAX_LEN)

#### Let's measure the error rate for one epoch

In [97]:
error = 0.0
for sentence, label in dataset:
  print(f'Sentence: {sentence},\nLabel: {label}')
  prediction = model(sentence)
  error += abs(prediction - label)
print('==================================================================')  
print(f'Error rate: {error/len(dataset)}')

Sentence: State India Hello Chat List Understand Python Warriors Sentence Learn Example Words State Run 20 Brazil India Chat Understand Chat ,
Label: 1
Sentence: India Sentence Understand Example Execute Easy Implement Simple Run Generates World India Function Chat Simple State India Program State Warriors ,
Label: 1
Sentence: Hello World Example Random Execute Python Example Random Example Function Hello Run Sentence Run Words Words Warriors List Python Program ,
Label: 0
Sentence: Learn Execute Chat Function Easy Run Brazil India Example Code Function Warriors Hello List Chat Warriors Chat Simple Random Generates ,
Label: 0
Sentence: Words Sentence Golden India 20 Generates Execute Sentence Brazil Words Easy Golden Program Simple 20 Function Simple Simple Simple List ,
Label: 1
Sentence: Warriors Understand Understand World Run Hello Learn Example Run Program 20 Understand Brazil Random Warriors Program Hello World Example Example ,
Label: 1
Sentence: Example Brazil Learn Understand 