<a href="https://colab.research.google.com/github/ankuj/teaching/blob/main/intro_nlp_day_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


<br>
====================================================<br>
RNN Improvements Practical<br>
Vanishing/Exploding Gradients, GRUs and LSTMs<br>
====================================================<br>
Learning Goals:<br>
- Understand vanishing and exploding gradients in RNNs<br>
- Apply gradient clipping as a solution<br>
- Implement GRU and LSTM for text classification<br>
- Compare GRU vs LSTM on IMDB Sentiment Dataset<br>
====================================================<br>


In [6]:
!pip install torch torchtext



In [56]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

====================================================<br>
TASK 1 — Conceptual<br>
====================================================


<br>
Q1: Why do RNNs suffer from vanishing or exploding gradients?<br>
Write your answer here:<br>

That is because in the long seqences, the gradents becomes very small or very big in the training and this happens becuase of the multiplactions as it repeated , which make it hard for RNNs to get to learn patterns over long time steps.



====================================================<br>
TASK 2 — Demonstrate Exploding Gradients <br>
====================================================

In [57]:
rnn = nn.RNN(input_size=1, hidden_size=1,  nonlinearity='relu', batch_first=True) # Simple RNN model , add relu nonlinearity
x = torch.ones((1, 300, 1))          # long sequence (becuase of that it will explode the gradents) - I edited from 1, 50, 1 to 1, 200, 1 (optional)
target = torch.tensor([1])  # fake label

criterion = nn.MSELoss() 
optimizer = optim.SGD(rnn.parameters(), lr=1.0) # High learning rate to explode gradients

In [58]:
print("\n--- Task 2: Exploding Gradients Demonstration ---")
# Your code here  
for epoch in range(5):
    optimizer.zero_grad() # To clear old gradients from the last step
    output, _ = rnn(x)
    loss = criterion(output[:, -1, :], target.float().unsqueeze(1)) #Because we want to get the output of the last time step and compare it with the target 
    loss.backward() # Backpropagation
    optimizer.step()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


--- Task 2: Exploding Gradients Demonstration ---
Epoch 1, Loss: 0.3436098098754883
Epoch 2, Loss: 0.6835104823112488
Epoch 3, Loss: 1.0
Epoch 4, Loss: 1.0
Epoch 5, Loss: 1.0



<br>
# --- TIP ---<br>
You should observe gradient norms growing very large, signifying an exploding gradient problem. Why is that the case? How can you remedy that?<br>


====================================================<br>
TASK 3 — Apply Gradient Clipping (15 mins)<br>
====================================================

In [None]:
print("\n--- Task 3: Gradient Clipping ---")
rnn = nn.RNN(input_size=1, hidden_size=1, batch_first=True)
optimizer = optim.SGD(rnn.parameters(), lr=1.0) 



--- Task 3: Gradient Clipping ---


In [None]:
# Your code here

for epoch in range(5):
    optimizer.zero_grad() # To clear old gradients from the last step
    output, _ = rnn(x)
    loss = criterion(output[:, -1, :], target.float().unsqueeze(1))
    loss.backward() # Backpropagation
    
    # Gradient clipping
    # If its larger than 1.0, it scales all gradients down proportionally
    torch.nn.utils.clip_grad_norm_(rnn.parameters(), max_norm=1.0) #  makes sure the total size of all gradients does not exceed 1.0, which prevents exploding gradients.
    
    optimizer.step()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 0.14726056158542633
Epoch 2, Loss: 2.5889881726470776e-05
Epoch 3, Loss: 2.584683352324646e-05
Epoch 4, Loss: 2.5803215976338834e-05
Epoch 5, Loss: 2.576024053269066e-05


------------------------------<br>
Task 4: Manual Forward Pass <br>
------------------------------

In [None]:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def task4_manual_forward_pass():
    """
    Compute a forward pass manually (hidden and output state) for a small LSTM using the activation functions in the formula.
    Input sequence length T=3, input size=2, hidden size=2
    """
    # Input: 3 timesteps, 2 features each
    x_seq = [np.array([0.5, -1.0]), # timestep 1 input vector
             np.array([1.0, 0.0]), # timestep 2 input vector
             np.array([-0.5, 0.5])] # timestep 3 input vector
    h_prev = np.zeros(2)  # hidden state
    c = np.zeros(2)       # cell state

    # Input gate
    W_i = np.array([[0.5, 0.1], # weights for input to input gate
                    [0.2, 0.3]])
    U_i = np.array([[0.1, 0.0], # weights for hidden to input gate
                    [0.0, 0.1]])
    b_i = np.zeros(2)      # bias for input gate

    # Forget gate
    W_f = np.array([[-0.3, 0.2],
                    [ 0.1, 0.4]])
    U_f = np.array([[0.2, 0.0],
                    [0.0, 0.2]])
    b_f = np.zeros(2)

    # Output gate
    W_o = np.array([[0.25, -0.15],
                    [0.05,  0.20]])
    U_o = np.array([[-0.1, 0.0],
                    [ 0.0, -0.1]])
    b_o = np.zeros(2)

    # cell input
    W_g = np.array([[0.4, -0.3],
                    [0.2,  0.5]])
    U_g = np.array([[0.05, 0.0],
                    [0.0,  0.05]])
    b_g = np.zeros(2)

    print("\n--- Manual LSTM Forward Pass ---")
    for t, x in enumerate(x_seq, 1):
        # Gates
        i = sigmoid(W_i @ x + U_i @ h_prev + b_i)
        f = sigmoid(W_f @ x + U_f @ h_prev + b_f)
        o = sigmoid(W_o @ x + U_o @ h_prev + b_o)
        g = np.tanh( W_g @ x + U_g @ h_prev + b_g)

        # Update states
        c = f * c + i * g
        h_prev = o * np.tanh(c)

        print(f"\nTime {t}, x={x}")
        print(f"i={i}, f={f}, o={o}, g={g}")
        print(f"c_t={c}, h_t={h_prev}")

    print("\nFinal hidden:", h_prev)
    print("Final cell:", c)
    
task4_manual_forward_pass()


--- Manual LSTM Forward Pass ---

Time 1, x=[ 0.5 -1. ]
i=[0.53742985 0.450166  ], f=[0.41338242 0.41338242], o=[0.56831998 0.45636131], g=[ 0.46211716 -0.37994896]
c_t=[ 0.24835555 -0.17104011], h_t=[ 0.13831331 -0.07730372]

Time 2, x=[1. 0.]
i=[0.6257042  0.54791987], f=[0.43233337 0.52112224], o=[0.55876926 0.51442859], g=[0.38585067 0.19365789]
c_t=[0.34880078 0.01697621], h_t=[0.18736181 0.00873221]

Time 3, x=[-0.5  0.5]
i=[0.45480772 0.51271556], f=[0.57137721 0.53786398], o=[0.44553295 0.51852321], g=[-0.32804142  0.14931194]
c_t=[0.05010105 0.08568544], h_t=[0.02230301 0.04432148]

Final hidden: [0.02230301 0.04432148]
Final cell: [0.05010105 0.08568544]


====================================================<br>
Preprocess IMDB dataset<br>
====================================================

In [65]:
print("\n--- Loading IMDB dataset ---")


--- Loading IMDB dataset ---


====================================================<br>
TASK 5 — Implement LSTM Sentiment Classifier on the IMDB dataset<br>
====================================================

In [None]:
# Load Custom Dataset about bars reviews with two keys "review" and "sentiment" positive or negative

import random

positive_phrases = [
    "The cocktails were amazing",
    "Loved the cozy atmosphere",
    "Great music and vibe",
    "Bartenders were very friendly",
    "The mojito was perfect",
    "Excellent service and staff",
    "Happy hour deals were great",
    "The drinks tasted fantastic",
    "Best bar experience ever",
    "Highly recommend this place"
]

negative_phrases = [
    "The drinks were overpriced",
    "Too noisy and crowded",
    "Service was terrible",
    "The cocktails tasted watered down",
    "The bartender was rude",
    "We waited too long for drinks",
    "The place smelled bad",
    "Not enough seating",
    "The floor was sticky",
    "Worst bar experience ever"
]

# Generate 100 positive and 100 negative reviews
bar_reviews = []

for i in range(100):
    review = random.choice(positive_phrases) + f" (review {i+1})"
    bar_reviews.append({"review": review, "sentiment": "positive"})

for i in range(100):
    review = random.choice(negative_phrases) + f" (review {i+1})"
    bar_reviews.append({"review": review, "sentiment": "negative"})

for r in bar_reviews[:5]:
    print(r)

{'review': 'Happy hour deals were great (review 1)', 'sentiment': 'positive'}
{'review': 'Highly recommend this place (review 2)', 'sentiment': 'positive'}
{'review': 'The mojito was perfect (review 3)', 'sentiment': 'positive'}
{'review': 'Excellent service and staff (review 4)', 'sentiment': 'positive'}
{'review': 'Best bar experience ever (review 5)', 'sentiment': 'positive'}


In [None]:
import random
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader, TensorDataset

# Tokenizer & manual vocab
tok = get_tokenizer("basic_english")

tokens = set()
for r in bar_reviews:
    tokens.update(tok(r["review"]))

PAD, UNK = 0, 1
token2id = {"<pad>": PAD, "<unk>": UNK}
for w in sorted(tokens):
    if w not in token2id:
        token2id[w] = len(token2id)

def encode(text, maxlen=20):
    ids = [token2id.get(t, UNK) for t in tok(text)]
    ids = ids[:maxlen]
    if len(ids) < maxlen:
        ids += [PAD] * (maxlen - len(ids))
    return ids  # list[int] of length maxlen

MAXLEN = 20
X_all = [encode(r["review"], MAXLEN) for r in bar_reviews]
y_all = [1.0 if r["sentiment"] == "positive" else 0.0 for r in bar_reviews]


pairs = list(zip(X_all, y_all))
random.shuffle(pairs)
X_all, y_all = zip(*pairs)

split = int(0.8 * len(X_all))
X_train = torch.tensor(X_all[:split], dtype=torch.long)
y_train = torch.tensor(y_all[:split], dtype=torch.float32)
X_test  = torch.tensor(X_all[split:], dtype=torch.long)
y_test  = torch.tensor(y_all[split:], dtype=torch.float32)


BATCH_SIZE = 32
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(TensorDataset(X_test,  y_test ), batch_size=BATCH_SIZE, shuffle=False)

# GRU text classifier

class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, emb=64, hid=128, pad_idx=PAD):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb, padding_idx=pad_idx)
        self.gru = nn.GRU(emb, hid, batch_first=True)
        self.fc  = nn.Linear(hid, 1)
    def forward(self, x):
        _, h = self.gru(self.emb(x))        # h: (1, B, H)
        logits = self.fc(h.squeeze(0))      # (B, 1)
        return logits.squeeze(1)            # raw logits (no sigmoid)

model = GRUClassifier(vocab_size=len(token2id))
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

#  Train & eval
EPOCHS = 30

for ep in range(1, EPOCHS + 1):
    # ---- train ----
    model.train()
    running_loss = 0.0
    for Xb, yb in train_loader:
        optimizer.zero_grad()
        logits = model(Xb)
        loss = criterion(logits, yb)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # ---- test ----
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for Xb, yb in test_loader:
            logits = model(Xb)
            preds = (torch.sigmoid(logits) > 0.5).float()
            correct += (preds == yb).sum().item()
            total   += yb.numel()
    acc = correct / total if total else 0.0
    print(f"Epoch {ep}/{EPOCHS}  Loss: {running_loss/len(train_loader):.4f}  Test Acc: {acc:.2f}")

Epoch 1/30  Loss: 0.6934  Test Acc: 0.42
Epoch 2/30  Loss: 0.6928  Test Acc: 0.42
Epoch 3/30  Loss: 0.6926  Test Acc: 0.42
Epoch 4/30  Loss: 0.6922  Test Acc: 0.42
Epoch 5/30  Loss: 0.6911  Test Acc: 0.42
Epoch 6/30  Loss: 0.6837  Test Acc: 0.68
Epoch 7/30  Loss: 0.6144  Test Acc: 0.68
Epoch 8/30  Loss: 0.4067  Test Acc: 0.85
Epoch 9/30  Loss: 0.1487  Test Acc: 0.90
Epoch 10/30  Loss: 0.0788  Test Acc: 0.88
Epoch 11/30  Loss: 0.0653  Test Acc: 0.90
Epoch 12/30  Loss: 0.0643  Test Acc: 0.90
Epoch 13/30  Loss: 0.0649  Test Acc: 0.90
Epoch 14/30  Loss: 0.0652  Test Acc: 0.90
Epoch 15/30  Loss: 0.0622  Test Acc: 0.90
Epoch 16/30  Loss: 0.0608  Test Acc: 0.90
Epoch 17/30  Loss: 0.0605  Test Acc: 0.90
Epoch 18/30  Loss: 0.0609  Test Acc: 0.90
Epoch 19/30  Loss: 0.0602  Test Acc: 0.90
Epoch 20/30  Loss: 0.0604  Test Acc: 0.90
Epoch 21/30  Loss: 0.0604  Test Acc: 0.90
Epoch 22/30  Loss: 0.0601  Test Acc: 0.90
Epoch 23/30  Loss: 0.0598  Test Acc: 0.90
Epoch 24/30  Loss: 0.0596  Test Acc: 0.90
E

====================================================<br>
TASK 6 — Swap LSTM with GRU and repeat Task 4<br>
====================================================

In [None]:
class GRUClassifier(nn.Module):


====================================================<br>
TASK 7 <br>
====================================================

Compare loss curves for the LSTM and GRU classifiers. Which performs better and why?<br>