# Import Necessary Libraries

In [1]:
import torch
import torchtext
print(torch.__version__)
print(torchtext.__version__)

1.13.1
0.14.1


# Set device as GPU if available

In [2]:
if torch.cuda.is_available():
    dev = 'cuda'
elif torch.backends.mps.is_available():
    dev = 'mps'
else:
    dev = 'cpu'
device = torch.device(dev)    
print(f'Device: {device}')

Device: mps


# PyTorch warmup

### 1. Use torch.randn to create two tensors of size (29, 30, 32) and (32, 100)

In [3]:
tensor_a = torch.randn(29, 30, 32)
print('Tensor of Size (29, 30, 32): \n' , tensor_a)

tensor_b = torch.randn(32, 100)
print('Tensor of Size (32, 100): \n' , tensor_b)

Tensor of Size (29, 30, 32): 
 tensor([[[ 1.4315,  0.2560,  0.8974,  ..., -2.6647, -1.3073,  0.1583],
         [-0.6114, -0.3433, -1.3451,  ..., -0.2320, -1.5039,  2.9262],
         [ 1.1454,  0.0537,  0.9149,  ...,  0.0663,  0.4858, -1.6646],
         ...,
         [-0.3856, -0.9098, -2.0143,  ..., -0.4296, -1.1758, -0.9505],
         [ 0.8244, -0.3193, -0.5848,  ..., -0.3575, -1.0483, -0.0702],
         [ 0.7738,  0.6565, -0.4229,  ...,  0.2583, -0.0219, -1.6786]],

        [[-0.4414,  1.0857, -0.5175,  ..., -1.9632,  1.1334,  0.8850],
         [ 0.1220, -0.1350,  0.2817,  ..., -0.1495, -0.0962, -1.2270],
         [ 0.4937,  0.2788, -0.6888,  ..., -0.1183,  1.3999,  0.4678],
         ...,
         [ 1.5856, -1.0178,  0.2445,  ...,  0.5461, -1.5699,  0.7392],
         [-0.3451, -0.4753,  2.6023,  ..., -0.5233, -0.2562,  1.7506],
         [-0.6029,  0.4195, -0.3989,  ...,  1.0408, -0.4963, -0.2334]],

        [[ 0.8383,  0.2752,  1.0120,  ...,  0.1503,  0.6751,  1.0003],
         [-1.5

### 2. Use  torch.matmul  to matrix multiply the two tensors

In [4]:
product = torch.matmul(tensor_a, 
                       tensor_b)
print('Product of tensor_a and tensor_b: \n' , product)

print('Shape of Tensor: ' , product.shape)

Product of tensor_a and tensor_b: 
 tensor([[[-1.2935e+01,  1.0707e+01,  2.3989e+00,  ..., -1.2342e+01,
           1.3075e+00,  3.2499e+00],
         [ 2.3836e+00, -7.7838e+00, -6.2557e+00,  ...,  3.4082e+00,
           5.9395e+00,  7.6136e+00],
         [ 6.9409e+00, -6.9336e-01, -1.8721e+00,  ...,  1.6076e+01,
          -1.9905e+00,  7.5179e+00],
         ...,
         [-1.6975e+00,  6.7376e+00, -1.0056e+00,  ..., -1.6540e+01,
           1.4331e+01,  4.9957e+00],
         [-4.5444e+00,  7.5032e+00,  2.5883e+00,  ..., -9.4577e-01,
          -2.1814e+00,  4.5546e+00],
         [-2.6276e+00,  3.1530e+00,  8.3507e+00,  ..., -1.6448e+01,
           2.5821e+00, -3.0676e+00]],

        [[ 5.4222e+00, -6.4050e+00, -9.5461e-02,  ..., -1.0439e+01,
          -6.7244e+00, -2.8128e+00],
         [ 1.3937e+00,  2.8340e+00, -5.8951e+00,  ..., -1.4838e+00,
           1.3206e+00,  3.5743e+00],
         [ 1.0266e+01,  5.1957e+00,  6.6383e+00,  ..., -6.0728e+00,
           2.4461e+00, -3.6607e+00],
   

### 3. What is the difference between torch.matmul , torch.mm , torch.bmm , and torch.einsum , and the @ operator?

1. torch.matmul() ->
2. torch.mm() ->
3. torch.bmm() ->
4. torch.einsum() ->
5. @ operator -> It is a shorthand for the torch.matmul() function.

### 4. Use torch.sum on the resulting tensor, passing the optional argument of dim=1 to sum across the 1st dimension. Before you run this, can you predict the size?

In [5]:
tensor_sum = torch.sum(product, 
                       dim = 1)
print('Sum of Tensor across 1st Dimension: \n' , tensor_sum)

print('Shape of Tensor: ' , tensor_sum.shape)

Sum of Tensor across 1st Dimension: 
 tensor([[-50.0316,  30.5396, -22.1811,  ..., -72.7584,  17.5020,  52.8702],
        [-24.5906,  66.6769,  -3.0805,  ..., -51.3462,  15.9958,  17.6778],
        [ -4.3321,  52.3295,  -0.1449,  ...,  38.0942,  11.8364, -38.1551],
        ...,
        [-17.1601,   8.4417, -27.3307,  ...,  21.3270, -37.3248,  24.2061],
        [-38.0967,  67.0543,  42.0357,  ..., -42.0369, -36.2919,  -0.6370],
        [-66.3404,  13.5168, -15.9936,  ...,   7.9241,  -7.6681,  40.2515]])
Shape of Tensor:  torch.Size([29, 100])


### 5. Create a new long tensor of size  (3, 10)

In [6]:
long_tensor = torch.ones((3, 10), 
                         dtype = torch.long)
print('Long Tensor: \n' , long_tensor)

long_tensor[0, 0] = 2
long_tensor[1, 2] = 4
long_tensor[2, 4] = 6

print('Updated Long Tensor: \n' , long_tensor)

Long Tensor: 
 tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
Updated Long Tensor: 
 tensor([[2, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 4, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 6, 1, 1, 1, 1, 1]])


### 6. Use this new long tensor to index into the tensor from step 2

In [7]:
indexed_tensor = product[long_tensor]
print('Indexed Tensor: \n' , indexed_tensor)

print('Shape of Tensor: ' , indexed_tensor.shape)

Indexed Tensor: 
 tensor([[[[ -3.0986,   9.5812,   1.5876,  ...,   2.4863,   0.0659,   1.2564],
          [ -8.7685,   0.2723,   1.4871,  ...,  -6.7436,   9.4754,  -0.8763],
          [-11.6087,   4.7631,   3.8919,  ...,   4.0621,   2.9426,   8.9158],
          ...,
          [-10.9076,   2.3546,   2.4360,  ...,   4.4684,   3.0328,   4.2456],
          [ -8.0787,   3.0227,   0.1128,  ...,  14.9838,  10.0184,  -9.0426],
          [ -2.9087,  -7.6347,   3.0203,  ...,   2.4480,  -7.7035, -14.7399]],

         [[  5.4222,  -6.4050,  -0.0955,  ..., -10.4389,  -6.7244,  -2.8128],
          [  1.3937,   2.8340,  -5.8951,  ...,  -1.4838,   1.3206,   3.5743],
          [ 10.2663,   5.1957,   6.6383,  ...,  -6.0728,   2.4461,  -3.6607],
          ...,
          [ -4.3729,   5.9844,   2.9867,  ...,  -9.2773,   2.5517,   5.6538],
          [ -6.3688,   2.8970,   3.2849,  ...,   7.0410,   3.2560,  -3.8223],
          [  0.6447,   5.0446,  11.4363,  ...,   3.5320,  -2.0670,  -2.0816]],

         [[ 

### 7. Use  torch.mean  to average across the last dimension in the tensor from step 6

In [8]:
mean_tensor = torch.mean(indexed_tensor, 
                         dim = 3)
print(mean_tensor)

print('Shape of Tensor: ' , mean_tensor.shape)

tensor([[[ 0.3231,  0.2444, -0.5558, -0.2634,  0.3927,  0.0657,  0.5092,
          -0.6199,  0.1039,  0.0898, -0.3455, -0.5221,  0.4114,  0.0585,
           0.8071, -0.5141, -0.4428, -0.6532,  0.2653,  0.0141, -0.0559,
           0.5494,  0.7992, -0.3725, -0.3948,  1.1148,  0.2472,  0.3472,
          -0.5406,  0.2302],
         [-0.5990, -0.1416,  0.0622, -0.5265,  0.0953,  0.3992, -0.0054,
           0.2694,  0.6207,  0.0644,  0.6903, -1.0322, -0.4999,  0.1124,
           0.5429, -0.1972, -0.3976, -0.8685,  0.0675, -1.0969,  0.3917,
          -1.0370, -0.9075, -0.1823,  0.6936,  0.1614, -0.8640, -0.2586,
           0.3212,  0.0760],
         [-0.5990, -0.1416,  0.0622, -0.5265,  0.0953,  0.3992, -0.0054,
           0.2694,  0.6207,  0.0644,  0.6903, -1.0322, -0.4999,  0.1124,
           0.5429, -0.1972, -0.3976, -0.8685,  0.0675, -1.0969,  0.3917,
          -1.0370, -0.9075, -0.1823,  0.6936,  0.1614, -0.8640, -0.2586,
           0.3212,  0.0760],
         [-0.5990, -0.1416,  0.0622, 

### 8. Redo step 2. on the GPU and compare results from step 2

In [9]:
tensor_a_cuda = tensor_a.to(device = device)
tensor_b_cuda = tensor_b.to(device = device)
product_gpu = torch.matmul(tensor_a, tensor_b)
print('Product of tensor_a and tensor_b on GPU: \n' , product_gpu)

print('Shape of Tensor: ' , product_gpu.shape)

Product of tensor_a and tensor_b on GPU: 
 tensor([[[-1.2935e+01,  1.0707e+01,  2.3989e+00,  ..., -1.2342e+01,
           1.3075e+00,  3.2499e+00],
         [ 2.3836e+00, -7.7838e+00, -6.2557e+00,  ...,  3.4082e+00,
           5.9395e+00,  7.6136e+00],
         [ 6.9409e+00, -6.9336e-01, -1.8721e+00,  ...,  1.6076e+01,
          -1.9905e+00,  7.5179e+00],
         ...,
         [-1.6975e+00,  6.7376e+00, -1.0056e+00,  ..., -1.6540e+01,
           1.4331e+01,  4.9957e+00],
         [-4.5444e+00,  7.5032e+00,  2.5883e+00,  ..., -9.4577e-01,
          -2.1814e+00,  4.5546e+00],
         [-2.6276e+00,  3.1530e+00,  8.3507e+00,  ..., -1.6448e+01,
           2.5821e+00, -3.0676e+00]],

        [[ 5.4222e+00, -6.4050e+00, -9.5461e-02,  ..., -1.0439e+01,
          -6.7244e+00, -2.8128e+00],
         [ 1.3937e+00,  2.8340e+00, -5.8951e+00,  ..., -1.4838e+00,
           1.3206e+00,  3.5743e+00],
         [ 1.0266e+01,  5.1957e+00,  6.6383e+00,  ..., -6.0728e+00,
           2.4461e+00, -3.6607e+0

### 9. Write a pure PyTorch program to compute the value of $\sqrt{2}$ up to 4 decimal places without using the square root or other math functions from any of the libraries. 
### Hint: Notice that the answer is the (positive) root of the equation, $$𝑥^2 −2 = 0$$ 
### To find the root, you might want to use "Newton's Method": $$𝑥_{𝑛+1} = 𝑥_{𝑛} − \frac{𝑓(𝑥)}{𝑓′(𝑥)}$$


# Fail-fast prototyping

When building neural networks, you want things to either work or fail fast. Long iteration loops are 
the worst enemy of a machine learning practitioner. \
For e.g., while writing code, you might want to incrementally test your code by doing something 
like this:

batch_size = 32 \
num_features = 512 \
embedding_size = 16

\# construct a dummy input \
x = torch.randn(batch_size, num_features)

\# we want to project the input to embedding_size \
fc = torch.nn.Linear(num_features, embedding_size)

\# test if that works \
print(fc(x).shape)

# Fail-fast exercises

### 1. [Glove](https://nlp.stanford.edu/projects/glove/) has 300 dimension embeddings. Design an nn.Module that takes a sentence of max_len words, tokenizes words by spaces, represents the sentence by averaging the glove embeddings of constituent words. What is the shape of the resulting sentence embedding? When you implement this, you will need to make some assumptions. What are they?

In [10]:
# Load GloVe Embeddings
from torchtext.vocab import GloVe

GLOVE_DIM = 300
glove = GloVe(name = '840B', 
              dim = GLOVE_DIM)

print(f'Loaded {len(glove.itos)} words present in GloVe')

embeddings_tensor = glove.vectors
embeddings_tensor = embeddings_tensor.to(device = device)

Loaded 2196017 words present in GloVe


In [11]:
NUM_SENT = 512
sents = list()
for i in range(NUM_SENT):
    sents.append('This is the quest zero and it has a deadline this Sunday March 29')
print(len(sents))

512


In [38]:
import torch.nn as nn

class GloveEmbeddingAvg(nn.Module):
    
    def __init__(self, max_len):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding.from_pretrained(embeddings_tensor)
        
    def forward(self, 
                sent):
        # Tokenize the sentence by spaces
        tokens = sent.split(' ')[:self.max_len]
        # Get idx of each token from the GloVe dictionary
        glove_dict_indexes = [glove.stoi[token] for token in tokens]
        # Convert it into Tensor
        glove_dict_indexes = torch.tensor(glove_dict_indexes, 
                                          device = device)
        # Get Word Embeddings for all tokens
        word_embeds = self.embedding(glove_dict_indexes)
        # Sentence Embedding = Average of Word Embeddings
        sent_embeds = word_embeds.mean(dim = 0)
        # Reshape Sentence Embedding as a 2D Tensor
        return sent_embeds.view(1, -1)

MAX_LEN = 10
glove_embeds_avg = GloveEmbeddingAvg(MAX_LEN).to(device)

print(glove_embeds_avg(sents[0]).shape)

torch.Size([1, 300])


### 2. How will you modify step 1. so that the sentence embeddings are in $R^{50}$ ?
BONUS: Can you think of more than one way to do this? What are the implications of each method?

In [34]:
class GloveEmbeddingAvg_50_Dim(nn.Module):
    
    def __init__(self, max_len):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding.from_pretrained(embeddings_tensor)
        self.fc = nn.Linear(GLOVE_DIM, 50)
        
    def forward(self, 
                x):
        # Slice each sentence to Max Length
        x = x[:, :self.max_len]
        # Get Word Embeddings for all tokens
        word_embeds = self.embedding(x) # [BATCH_SIZE, MAX_LEN, GLOVE_DIM]
        # Sentence Embedding = Average of Word Embeddings
        sent_embeds = word_embeds.mean(dim = 0) # [MAX_LEN, GLOVE_DIM]
        # Linear Layer to reduce Sentence Embedding Dimension to 50
        return self.fc(sent_embeds) # [MAX_LEN, 50]

MAX_LEN = 10
glove_embeds_avg_50_dim = GloveEmbeddingAvg_50_Dim(MAX_LEN).to(device)

### 3. Quickly test your answer in step 2. with a batch of 512 sentences on the GPU.

In [35]:
# Tokenize a Sentence
def tokenize(sent):
    # Tokenize the sentence by spaces
    tokens = sent.split(' ')
    # Get idx of each token from the GloVe dictionary
    glove_dict_indexes = [glove.stoi[token] for token in tokens]
    return glove_dict_indexes

# Create Tokenized Sentence Corpus
tokenized_sents = list()
for sent in sents:
    tokenized_sents.append(tokenize(sent))
tokenized_sents = torch.tensor(tokenized_sents, 
                               device = device)

# Run forward pass
BATCH_SIZE = 512
for i in range(0, len(tokenized_sents), BATCH_SIZE):
    batch = tokenized_sents[i:i+BATCH_SIZE]
    sentence_embeddings = glove_embeds_avg_50_dim(batch)
    print(sentence_embeddings.shape)

torch.Size([10, 50])


### Congratulations! You almost implemented the model in the Deep Averaging Networks (DAN) paper!

# 4. Task: 
### Create a   MultiEmbedding  Module that can take two sets of indices, embed them, and concat the results. You might remember it from the previous lecture where we had to produce an embedding for "green apple" from embeddings of "green" and "apple". Your  MultiEmbedding class should work with the following test code.

In [15]:
class MultiEmbedding(nn.Module):
    
    def __init__(self, 
                 num_emb, 
                 size_emb1, 
                 size_emb2):
        super().__init__()
        self.embedding_A = nn.Embedding(num_emb, size_emb1)
        self.embedding_B = nn.Embedding(num_emb, size_emb2)
        
    def forward(self, 
                indices1, 
                indices2):
        embed_A = self.embedding_A(indices1)
        embed_B = self.embedding_B(indices2)
        # Concatenate the Embeddings
        return torch.cat((embed_A, embed_B), 
                         dim = -1)

In [16]:
# Test code: instantiate a MultiEmbedding with the sizes for each embedding. 
# For this example, you can just randomly initialize each interior embedding. 
# In a practical setting, you might support methods for initializing with 
# combinations of embeddings, such as GloVe 300d vectors and word2vec 200d 
# vectors, yielding 500d embeddings. Both embeddings share a vocabulary/range 
# of supported indices indicated by `num_emb`

NUM_EMB = 10000
SIZE_EMB1 = 300
SIZE_EMB2 = 200
BATCH_SIZE = 64
NUM_LENGTH = 10

multiemb = MultiEmbedding(NUM_EMB, 
                          SIZE_EMB1, 
                          SIZE_EMB2).to(device)

# You can then call this with a pair of indices where each value is in 0 <= i < num_emb
indices1 =  torch.randint(0, 
                          NUM_EMB, 
                          (BATCH_SIZE, NUM_LENGTH), 
                          dtype = torch.long, 
                          device = device) # long tensor of shape (batch, num_length)
indices2 =  torch.randint(0, 
                          NUM_EMB, 
                          (BATCH_SIZE, NUM_LENGTH), 
                          dtype = torch.long, 
                          device = device) # long tensor of shape (batch, num_length)
output = multiemb(indices1, 
                  indices2)
print(output.shape) # should be (batch, num_length, size_emb1 + size_emb2)

torch.Size([64, 10, 500])


# 5. Datasets and DataLoaders: 
### Read this short post on PyTorch Dataset and DataLoaders. Often in prototyping we need to generate dummy datasets to test our models. Implement a PyTorch Dataset class that generates up to  num_sentences  random sentences of length up to  max_len words. For each sentence, generate a binary label. You should be able to test your code as follows: