# Week 1 - Task 1 | GroundZero AI


---



### Import the necessary stuff!

Let's first import all the libraries we are going to use in this task.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.optim import Adam

# Part 1 - How is RNN limited? (Context and Vanishing Gradient)
As you have learnt, RNNs lose context over long sequences. Let's visualize that through an example.


In [None]:
### CODE STARTS HERE ###

# Length of the sequence; the variable must be named 'sequence_length'.
# Try the values 20, 100, 1000 and 2000 and compare the results.
sequence_length = 20
### CODE ENDS HERE ###

# Sample the sine function at sequence_length evenly spaced points between 0 and 4*pi.
sample_points = np.linspace(0, 4 * np.pi, num=sequence_length)
# Turn the 1D samples into a 3D array of shape (1, sequence_length, 1).
data = np.sin(sample_points).reshape(1, sequence_length, 1)

In [None]:
# Create a simple RNN Model
class RNNModel(nn.Module):
    """Single recurrent layer followed by a linear read-out applied at every time step.

    Args:
        sequence_length: Kept so existing calls keep working; the value is not used
            anywhere in the model.
        hidden_size: Width of the recurrent hidden state. Defaults to 10, the value
            that was previously hard-coded.
    """

    def __init__(self, sequence_length, hidden_size=10):
        super().__init__()
        self.rnn = nn.RNN(input_size=1, hidden_size=hidden_size, batch_first=True)
        self.dense = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one prediction per time step for a (batch, time, 1) input tensor."""
        # nn.RNN returns (per-step outputs, final hidden state); only the former is needed.
        hidden_states, _ = self.rnn(x)
        return self.dense(hidden_states)
# Build the model instance that the training cell optimizes.
rnn_model = RNNModel(sequence_length=sequence_length)

In [None]:
# Train the model to reproduce the sequence it is given as input.
optimizer = Adam(rnn_model.parameters(), lr=0.01)
criterion = nn.MSELoss()
# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor constructor.
data_tensor = torch.tensor(data, dtype=torch.float32)
for epoch in range(10):
    optimizer.zero_grad()  # drop the gradients left over from the previous step
    outputs = rnn_model(data_tensor)
    loss = criterion(outputs, data_tensor)  # the target is the input sequence itself
    loss.backward()
    optimizer.step()

# Find out the RNN outputs. The already-built input tensor is reused, and no_grad
# skips gradient tracking so the result can be converted to a NumPy array.
with torch.no_grad():
    predictions = rnn_model(data_tensor).numpy()

In [None]:
# Plot the original sequence against the RNN outputs.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(data[0, :, 0], label="Original Sequence", marker='o')
ax.plot(predictions[0, :, 0], label="RNN Output", linestyle='--', marker='x')
ax.set_title("RNN and Context Retention")
# Axis labels so the figure can be read on its own.
ax.set_xlabel("Time step")
ax.set_ylabel("Value")
ax.legend()
plt.show()

## Takeaways
Notice that as you increase the length of the sequence, the gap between the original sequence and the final RNN output increases. This shows how contextual information is often lost when RNNs process long sequences.


---



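Now let's talk about the vanishing gradient problem. When the error is backpropagated through many time steps and several stacked recurrent layers, the gradient is multiplied by small factors again and again, so it can shrink until it is close to zero. Below we train a deeper RNN for a growing number of epochs and look at how training behaves.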

In [None]:
# Define a deeper RNN model for the vanishing-gradient experiment.
class VanishingRNNModel(nn.Module):
    """Three stacked recurrent layers followed by a linear read-out at every time step.

    Args:
        sequence_length: Kept so existing calls keep working; the value is not used
            anywhere in the model.
        hidden_size: Width of every recurrent hidden state. Defaults to 10, the value
            that was previously hard-coded.
    """

    def __init__(self, sequence_length, hidden_size=10):
        super().__init__()
        self.rnn1 = nn.RNN(input_size=1, hidden_size=hidden_size, batch_first=True)
        self.rnn2 = nn.RNN(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.rnn3 = nn.RNN(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.dense = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one prediction per time step for a (batch, time, 1) input tensor."""
        # Each layer consumes the per-step outputs of the previous one; the final
        # hidden state returned alongside them is not needed.
        for layer in (self.rnn1, self.rnn2, self.rnn3):
            x, _ = layer(x)
        return self.dense(x)
# Build the deeper model instance used in the cells that follow.
vanishing_rnn_model = VanishingRNNModel(sequence_length=sequence_length)

In [None]:
# Loss function and optimizer for the deeper model.
criterion = nn.MSELoss()
optimizer = Adam(params=vanishing_rnn_model.parameters(), lr=0.01)

In [None]:
### CODE STARTS HERE ###

# Train the model for 10, 20 and 50 epochs and see how the graph looks every time.
# The per-epoch losses are stored in a variable named 'history'.
num_epochs = 50
history = {'loss': []}
# torch.tensor with an explicit dtype replaces the legacy torch.FloatTensor constructor.
data_tensor = torch.tensor(data, dtype=torch.float32)

# Start every run from a freshly initialised model and optimizer. Without this,
# re-running the cell with a different num_epochs would keep training the weights
# left over from the previous run, so the 10/20/50-epoch curves would not be comparable.
vanishing_rnn_model = VanishingRNNModel(sequence_length)
optimizer = Adam(vanishing_rnn_model.parameters(), lr=0.01)

for epoch in range(num_epochs):
    optimizer.zero_grad()  # drop the gradients left over from the previous step
    outputs = vanishing_rnn_model(data_tensor)
    loss = criterion(outputs, data_tensor)  # the target is the input sequence itself
    loss.backward()
    optimizer.step()
    history['loss'].append(loss.item())
### CODE ENDS HERE ###

In [None]:
# Draw the per-epoch training loss stored in history['loss'].
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(history['loss'], label="Training Loss", marker='o')
ax.set_title("Vanishing Gradient in Deeper RNNs")
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

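Notice how, at higher epoch counts, the curve flattens out very close to zero. Keep in mind that the plotted quantity is the training loss. The vanishing gradient problem itself is about the gradient signal shrinking as it is propagated back through many time steps and layers, and that is what makes RNNs hard to train on long sequences!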


---



# Part 2 - Embeddings

Now let's try to code up the input embeddings!

In [None]:
# Input sentence for the embedding demo - please feel free to use your own sentence as well!
sentence = "Transformers have revolutionized natural language processing and machine learning."

In [None]:
# Build the vocabulary: the distinct whitespace-separated words of the sentence.
words = sentence.split()
vocab = set(words)
vocab_size = len(vocab)
# Note: a set has no guaranteed order, so the printed order may differ between runs.
print(vocab)

In [None]:
### CODE STARTS HERE ###
# Map every word to its index. The vocabulary is sorted before numbering because
# set iteration order can change between runs, which would otherwise give the
# same word a different index each time.
word_to_index = {word: index for index, word in enumerate(sorted(vocab))}
# The reverse mapping: index -> word.
index_to_word = {index: word for word, index in word_to_index.items()}
# Token ids of the sentence as an integer tensor with a leading batch axis,
# i.e. shape (1, number of words); the later cells index tokens and embeddings that way.
tokens = torch.tensor([[word_to_index[word] for word in sentence.split()]], dtype=torch.long)
### CODE ENDS HERE ###
print(tokens)

In [None]:
# Embedding table with one 8-dimensional vector per vocabulary entry.
embedding_layer = nn.Embedding(vocab_size, 8)

# Look up the embedding vector of every token by passing the tokens through the layer.
embeddings = embedding_layer(tokens)

In [None]:
# Let's see what the embeddings look like: print the result of embedding_layer(tokens).
print(embeddings)

In [None]:
# Visualize the embeddings: one scatter series per token, embedding dimension on the x-axis.
# The embedding output tracks gradients, so it is detached before being handed to matplotlib.
embedding_values = embeddings.detach().numpy()
# Plain Python ints are needed as keys for the index_to_word dictionary.
token_ids = tokens[0].tolist()
dimension_axis = range(embedding_values.shape[2])

fig, ax = plt.subplots(figsize=(10, 6))
for i, token_id in enumerate(token_ids):
    # Only the first five tokens are labelled, to keep the legend readable.
    label = f"Word: {index_to_word[token_id]}" if i < 5 else ""
    ax.scatter(dimension_axis, embedding_values[0, i, :], label=label)
ax.set_title("Input Embeddings")
ax.set_xlabel("Embedding dimension")
ax.set_ylabel("Embedding value")
ax.legend(loc="best")
plt.show()

---

# Part 3 - Positional Embeddings

In this section, we code up the positional embedding values.

The positional embedding formulas are given as follows:
For a position (pos) and embedding dimension (i):


$$PE_{pos, 2i} = \sin\left(\frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}}\right)$$
$$PE_{pos, 2i+1} = \cos\left(\frac{pos}{10000^{\frac{2i}{d_{\text{model}}}}}\right)
$$

In [None]:
### CODE STARTS HERE ###
# One positional vector per token, as wide as the input embeddings
# (embeddings is indexed as [batch, position, dimension] elsewhere in this notebook).
num_positions, d_model = embeddings.shape[1:]

# Column vector 0..num_positions-1 so it broadcasts against the denominators.
positions = np.arange(num_positions)[:, np.newaxis]
# 10000^(2i / d_model) for every even dimension index 2i.
denominator = np.power(10000.0, np.arange(0, d_model, 2) / d_model)
# Initialize everything as zero, then fill in the sine and cosine terms.
pos_embed = np.zeros((num_positions, d_model))
angles = positions / denominator
pos_embed[:, 0::2] = np.sin(angles)  # even dimensions: PE(pos, 2i)
# Odd dimensions: PE(pos, 2i+1). The slice keeps an odd d_model valid, where there
# is one fewer odd column than even columns.
pos_embed[:, 1::2] = np.cos(angles[:, : d_model // 2])
### CODE ENDS HERE ###

In [None]:
# Show the first 5 entries of pos_embed.
print(pos_embed[:5])



---


# Part 4 - Encoder Input

We finally add up the input embeddings we made and positional embeddings to get the final input for the encoders.

$$Encoder \space input = input \space embeddings + positional \space embeddings$$

In [None]:
### CODE STARTS HERE ###
# Encoder input = input embeddings + positional embeddings.
# pos_embed is converted to a tensor with the embeddings' dtype before the addition.
# Assumes pos_embed is shaped (position, dimension) so that it broadcasts over the
# leading batch axis of embeddings - TODO confirm against the Part 3 cell.
final_input = embeddings + torch.as_tensor(pos_embed, dtype=embeddings.dtype)
### CODE ENDS HERE ###

In [None]:
# Let us check what the final input is: print final_input as assigned in the previous cell.
print(final_input)