# Exercise 6: Attention

## Task 1: Implementation of Self Attention

Padding is added at the end of the sequence in transformers... why not at the beginning?

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
import numpy as np
from nltk.tokenize import word_tokenize
from gensim.downloader import load as gensim_load
from datasets import load_dataset, concatenate_datasets

In [23]:
glove = gensim_load('glove-wiki-gigaword-100')

In [25]:
def load_imdb(n_samples=100, seed=42):
    """Load a class-balanced, shuffled subset of the IMDB training split.

    Args:
        n_samples: Total number of reviews requested. Half are drawn from each
            class, so an odd value yields ``n_samples - 1`` reviews.
        seed: Seed used for every shuffle (per-class sampling and the final
            mix), so the same seed always yields the same subset.

    Returns:
        A ``(texts, labels)`` pair taken from the "text" and "label" columns
        of the balanced subset.
    """
    dataset = load_dataset("imdb", split="train")

    # How many samples per class
    n_per_class = n_samples // 2

    def sample_class(label):
        # Keep one class only, shuffle it, then take the first n_per_class rows.
        subset = dataset.filter(lambda x: x["label"] == label)
        return subset.shuffle(seed=seed).select(range(n_per_class))

    pos = sample_class(1)
    neg = sample_class(0)

    # Combine and shuffle so the two classes are interleaved.
    balanced = concatenate_datasets([pos, neg]).shuffle(seed=seed)

    texts = balanced["text"]
    labels = balanced["label"]
    return texts, labels

# Balanced review subset used below: load_imdb draws n_samples // 2 reviews per class.
texts, labels = load_imdb(n_samples=10000) # adjust n_samples based on your computational resources

In [None]:
import nltk
#nltk.download('punkt_tab')

def vectorize(tokens, max_len=100):
    """Turn a raw text into a fixed-length list of GloVe vectors.

    The text is lower-cased and tokenized with ``word_tokenize``. Only the
    first ``max_len`` tokens are kept; tokens missing from ``glove`` become
    zero vectors, and zero vectors are appended at the end until the list has
    ``max_len`` entries.

    Args:
        tokens: The raw text to embed (a string, despite the parameter name).
        max_len: Number of vectors in the returned list.

    Returns:
        A list of ``max_len`` arrays, each of length ``glove.vector_size``.
    """
    words = word_tokenize(tokens.lower())
    dim = glove.vector_size

    # Embedded representation of the input tokens. Zero vectors are created as
    # float32 so they do not silently upcast the batch to float64 when stacked
    # (np.zeros defaults to float64).
    X = [
        glove[word] if word in glove else np.zeros(dim, dtype=np.float32)
        for word in words[:max_len]
    ]

    # Right-pad short texts; range() is empty when the text already fills max_len.
    X.extend(np.zeros(dim, dtype=np.float32) for _ in range(max_len - len(X)))

    return X



[nltk_data] Downloading package punkt_tab to
[nltk_data]     c:\Users\balkh\miniconda3\envs\myenv\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Create X and y
# Stack the per-review vectors into one ndarray first: building a tensor
# directly from a nested list of ndarrays takes PyTorch's slow path (and warns).
X = torch.tensor(np.array([vectorize(t) for t in texts]), dtype=torch.float32)
Y = torch.tensor(labels, dtype=torch.long)


In [None]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Queries, keys and values are all linear projections of the same input,
    each of width ``d_k``.

    Args:
        d_model: Size of the last dimension of the input.
        d_k: Size of the query/key/value projections (and of the output).
    """

    def __init__(self, d_model, d_k):
        super().__init__()
        self.d_model = d_model
        # Original attribute name, kept so existing code reading `.model` still works.
        self.model = d_model
        self.d_k = d_k

        # Learnable projections; nn.Linear computes x @ weight.T + bias.
        self.W_Q = nn.Linear(d_model, d_k)
        self.W_K = nn.Linear(d_model, d_k)
        self.W_V = nn.Linear(d_model, d_k)

    def forward(self, x, mask=None):
        """Apply self-attention over the second-to-last axis of ``x``.

        Args:
            x: Tensor of shape ``(..., seq_len, d_model)``.
            mask: Optional tensor of shape ``(..., seq_len)``; truthy entries
                mark real tokens and falsy entries mark positions (such as
                padding) that no query may attend to. ``None`` (the default)
                lets every position attend to every position.

        Returns:
            Tensor of shape ``(..., seq_len, d_k)``.
        """
        queries = self.W_Q(x)
        keys = self.W_K(x)
        values = self.W_V(x)

        # Query-key dot products, scaled by sqrt(d_k) to keep the softmax
        # from saturating as d_k grows.
        scores = torch.matmul(queries, keys.transpose(-2, -1)) / (self.d_k ** 0.5)

        if mask is not None:
            # Broadcast the per-key mask across all queries. The most negative
            # finite value is used instead of -inf so that a fully masked row
            # yields uniform weights rather than NaN.
            key_mask = mask.to(torch.bool).unsqueeze(-2)
            scores = scores.masked_fill(~key_mask, torch.finfo(scores.dtype).min)

        weights = F.softmax(scores, dim=-1)

        # Attention-weighted sum of the value vectors.
        return torch.matmul(weights, values)

## Task 2: Adding a Classification Layer

In [None]:
class BinaryClassificationModel(nn.Module):
    """Self-attention encoder followed by a small MLP that emits two class logits.

    Args:
        d_model: Size of the last dimension of the input.
        d_k: Output width of the attention layer and input width of the MLP.
        temp: Softmax temperature. It is only stored on the instance as
            ``temperature``; ``forward`` returns unscaled logits.
        dim: Width of the hidden layer of the MLP.
    """

    def __init__(self, d_model, d_k, temp=2, dim=128):
        super().__init__()

        # Attention stage: mixes information across the positions of a sequence.
        self.attention = SelfAttention(d_model, d_k)

        # Classification head: expand to a wider hidden space, apply a
        # non-linearity, then project down to one logit per class. Each
        # nn.Linear carries its own bias term.
        head = [nn.Linear(d_k, dim), nn.ReLU(), nn.Linear(dim, 2)]
        self.layers = nn.Sequential(*head)

        # Not used inside forward(); exposed so the caller can divide the
        # logits by it before the loss.
        self.temperature = temp

    def forward(self, x):
        """Return the raw (unscaled) class logits, one row per input sequence."""
        attended = self.attention(x)

        # Average over dim 1 so each sequence is summarized by a single vector
        # and gets a single prediction.
        sequence_summary = torch.mean(attended, dim=1)

        return self.layers(sequence_summary)

## Task 3: Training

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset


def train_model(model, X, Y, epochs=10, lr=1e-3, batch_size=8):
    """Train ``model`` in place with Adam and cross-entropy, printing the loss per epoch.

    The logits are divided by ``model.temperature`` before being passed to
    ``nn.CrossEntropyLoss``, which applies the softmax internally.

    Args:
        model: Module returning class logits and exposing a ``temperature`` attribute.
        X: Input tensor; the first dimension indexes examples.
        Y: Tensor of integer class labels aligned with ``X``.
        epochs: Number of passes over the data.
        lr: Adam learning rate.
        batch_size: Mini-batch size.
    """
    model.train()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loader = DataLoader(TensorDataset(X, Y), batch_size=batch_size, shuffle=True)
    n_examples = len(loader.dataset)

    for epoch in range(epochs):
        total_loss = 0
        for batch_x, batch_y in loader:
            optimizer.zero_grad()

            # Temperature is applied to the logits here, not inside the model.
            loss = criterion(model(batch_x) / model.temperature, batch_y)
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch figure is a per-example mean
            # even when the last batch is smaller.
            total_loss += loss.item() * batch_x.size(0)

        avg_loss = total_loss / n_examples
        print(f"epoch {epoch+1}, loss: {avg_loss:.3f}")


In [65]:
from sklearn.metrics import accuracy_score, recall_score, f1_score

# d_model is the size of the last axis of X, i.e. the embedding width
# (100 here, because the vectors come from glove-wiki-gigaword-100).
d_model = X.shape[-1] # determine d_model (e.g., based on X)
# Width of the query/key/value projections inside the attention layer.
d_k = 64 # try different values for d_k
# temp is stored on the model as `temperature`; train_model divides the logits by it.
model = BinaryClassificationModel(d_model=d_model, temp=6, d_k=d_k)
# Uses train_model's default epochs, learning rate and batch size.
train_model(model, X, Y)

def evaluate_model(model, X, Y, temperature=1.0):
    """Print and return accuracy, recall and F1 of ``model`` on ``(X, Y)``.

    Args:
        model: Module returning class logits of shape ``(n_examples, n_classes)``.
        X: Input tensor, evaluated in a single forward pass.
        Y: Tensor of integer class labels aligned with ``X``.
        temperature: Accepted for interface compatibility and not used. For
            any positive temperature, scaling the logits does not change
            their argmax, so the predictions would be the same.

    Returns:
        Dict with keys ``"accuracy"``, ``"recall"`` and ``"f1"``.
    """
    model.eval()
    with torch.no_grad():
        logits = model(X)
        # Softmax is monotonic, so the predicted class is the argmax of the
        # logits themselves.
        preds = torch.argmax(logits, dim=1)

    # Move to CPU first: .numpy() raises on tensors that live on another device.
    preds_np = preds.cpu().numpy()
    labels_np = Y.cpu().numpy()

    acc = accuracy_score(labels_np, preds_np)
    rec = recall_score(labels_np, preds_np)
    f1 = f1_score(labels_np, preds_np)

    print(f"Accuracy: {acc:.4f}")
    print(f"Recall:   {rec:.4f}")
    print(f"F1 Score: {f1:.4f}")

    return {"accuracy": acc, "recall": rec, "f1": f1}

#evaluate_model(model, X, Y)

epoch 1, loss: 0.553
epoch 2, loss: 0.500
epoch 3, loss: 0.483
epoch 4, loss: 0.475
epoch 5, loss: 0.466
epoch 6, loss: 0.462
epoch 7, loss: 0.454
epoch 8, loss: 0.448
epoch 9, loss: 0.444
epoch 10, loss: 0.440


## Task 4: Inference

In [None]:
def predict_sentiment(text):
    """Classify a single review with the module-level ``model``.

    Args:
        text: Raw review text; it is embedded with ``vectorize``.

    Returns:
        ``"positive"`` if the predicted class is 1, otherwise ``"negative"``.
    """
    model.eval()

    # Stack the token vectors into one ndarray before building the tensor;
    # torch.tensor on a list of ndarrays takes a slow path and warns.
    features = np.asarray(vectorize(text), dtype=np.float32)
    x = torch.tensor(features, dtype=torch.float32).unsqueeze(0)  # add batch axis

    with torch.no_grad():
        logits = model(x)
        # Softmax is monotonic, so the argmax of the logits is the predicted class.
        pred = torch.argmax(logits, dim=1).item()

    sentiment = "positive" if pred == 1 else "negative"
    return sentiment


In [71]:
# Sanity check on a short review with clearly positive wording.
predict_sentiment("This movie was fantastic and full of suspense!")

'positive'

In [72]:
# Sanity check on a short review with clearly negative wording.
predict_sentiment("This movie was boring")

'negative'

In [78]:
# A longer review that mixes praise and criticism, to see how the model
# handles ambiguous sentiment.
review = (
    "While the film certainly had its moments of brilliance, especially in its cinematography and musical score, "
    "it was ultimately bogged down by a convoluted plot and inconsistent character development. "
    "The lead actor gave a compelling performance, yet the supporting cast felt miscast and underused. "
    "By the end, I wasn’t sure whether I was moved or just confused. It’s a film that aims high, "
    "but its ambition may have outpaced its execution."
)

predict_sentiment(review)

'negative'

Continuing with the analogy proposed in the lecture with a search engine, think of a particular engine, the engine of your preferences. What would be the keys (K), the queries (Q) and the values (V )?
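The query (Q) would be what you type into the search box.
The keys (K) would be the indexed page titles/keywords that the query is matched against.
The values (V) would be the information from the web pages that is returned.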

The square root of the key dimension ($\sqrt{d_k}$) in the softmax is essentially a scaling factor that keeps the scores in a controlled range.

(The key dimension $d_k$ is usually large, and the dot product only makes the values larger.)

If you scale by $\sqrt{d_k}$, the softmax is more stable: its outputs stay spread between 0 and 1 instead of saturating.

We can compare this with cosine similarity, since we eventually end up with a similarity matrix between the keys and the queries.


During inference, the results are less constrained and you may get "hallucinations". This is not tuning: when tuning, you need to retrain the model after you tweak the parameters.


