## **Problem 7: Transformers**

# Question 1:

# Part 4.

Last Layer Output

In [2]:
import torch
import torch.nn as nn
import math
import numpy as np
from collections import Counter
from itertools import chain
import random
from torch.utils.data import Dataset, DataLoader
from seqeval.metrics import classification_report, f1_score, accuracy_score
import os
import warnings
warnings.filterwarnings("ignore")

def load_clean_ner_data(tokens_path, labels_path):
    with open(tokens_path, "r", encoding="utf-8") as token_file, open(labels_path, "r", encoding="utf-8") as label_file:
        token_lines = token_file.readlines()
        label_lines = label_file.readlines()

    sentences, labels = [], []
    
    for t_line, l_line in zip(token_lines, label_lines):
        tokens = t_line.strip().split()
        lbls = l_line.strip().split()
        if len(tokens) == len(lbls) and len(tokens) > 0:
            sentences.append([t.strip() for t in tokens])
            labels.append([l.strip().lower().replace("_", "-") for l in lbls])
    return sentences, labels

# Read and merge ARMAN & PEYMA datasets
arman_sent, arman_lab = load_clean_ner_data("arman-tokens.txt", "arman-labels.txt")
peyma_sent, peyma_lab = load_clean_ner_data("peyma-tokens.txt", "peyma-labels.txt")
all_sentences = arman_sent + peyma_sent
all_labels = arman_lab + peyma_lab

# Combine and shuffle all data, then split into train (%80) and test (%20) sets
combined = list(zip(all_sentences, all_labels))
random.seed(42)
random.shuffle(combined)
split_idx = int(0.8 * len(combined))
train_sentences, train_labels = zip(*combined[:split_idx])
test_sentences, test_labels = zip(*combined[split_idx:])

all_labels_unique = sorted(set(l for seq in (train_labels + test_labels) for l in seq))
label2id = {l: i for i, l in enumerate(all_labels_unique)}
id2label = {i: l for l, i in label2id.items()}

token_counter = Counter(chain(*train_sentences))
token2id = {"<pad>": 0, "<unk>": 1}
for tok, count in token_counter.items():
    if count > 1:
        token2id[tok] = len(token2id)
id2token = {i: t for t, i in token2id.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NER dataset with padding and masks
class NERDataset(Dataset):
    def __init__(self, sentences, labels, max_len, token2id, label2id):
        self.sentences = sentences
        self.labels = labels
        self.max_len = max_len
        self.token2id = token2id
        self.label2id = label2id

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = [self.token2id.get(t, self.token2id["<unk>"]) for t in self.sentences[idx]]
        tags = [self.label2id[l] for l in self.labels[idx]]
        tokens = tokens[:self.max_len]
        tags = tags[:self.max_len]
        attn_mask = [1] * len(tokens)
        pad_len = self.max_len - len(tokens)
        tokens += [0] * pad_len
        tags += [-100] * pad_len
        attn_mask += [0] * pad_len
        return {
            "input_ids": torch.tensor(tokens),
            "labels": torch.tensor(tags),
            "attention_mask": torch.tensor(attn_mask)
        }

# Prepare NER datasets and loaders
MAX_LEN = 128
train_data = NERDataset(train_sentences, train_labels, MAX_LEN, token2id, label2id)
test_data = NERDataset(test_sentences, test_labels, MAX_LEN, token2id, label2id)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16)

class PositionalEncoding(nn.Module):
    # Positional encoding for token positions
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

class MultiHeadAttention(nn.Module):
    # Standard multi-head self-attention
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        B = q.size(0)
        q = self.w_q(q).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)
        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(2)
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        context = attn @ v
        context = context.transpose(1, 2).contiguous().view(B, -1, self.num_heads * self.d_k)
        return self.w_o(context)

class PositionwiseFeedForward(nn.Module):
    # Feed-forward block used in transformer
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.linear2(self.dropout(torch.relu(self.linear1(x))))

class SublayerConnection(nn.Module):
    # Wraps sublayer with norm, dropout, and residual
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, d_ff, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, heads, dropout)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.sublayers = nn.ModuleList([SublayerConnection(d_model, dropout) for _ in range(2)])

    def forward(self, x, mask):
        x = self.sublayers[0](x, lambda x: self.attn(x, x, x, mask))
        return self.sublayers[1](x, self.ffn)

# Full transformer-based NER model
class TransformerNER(nn.Module):
    def __init__(self, vocab_size, d_model, heads, d_ff, num_layers, max_len, num_labels):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pe = PositionalEncoding(d_model, max_len)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads, d_ff, 0.1) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, num_labels)

    def forward(self, input_ids, mask, labels=None):
        x = self.embedding(input_ids)
        x = self.pe(x)
        for layer in self.encoder:
            x = layer(x, mask)
        x = self.norm(x)
        logits = self.classifier(x)
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss(ignore_index=-100)(logits.view(-1, logits.size(-1)), labels.view(-1))
        return {"loss": loss, "logits": logits}

# Model hyperparameters
d_model = 128
num_heads = 4
ff_dim = 512
num_layers = 6
vocab_size = len(token2id)
num_labels = len(label2id)

model = TransformerNER(
    vocab_size=vocab_size,
    d_model=d_model,
    heads=num_heads,
    d_ff=ff_dim,
    num_layers=num_layers,
    max_len=MAX_LEN,
    num_labels=num_labels
).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)


# Training phase
for epoch in range(15):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        mask = batch["attention_mask"].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, mask, labels)
        loss = outputs["loss"]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/15, Loss: {total_loss / len(train_loader):.6f}")

# Predict on test set 
model.eval()
all_preds,all_labels_eval = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        mask = batch["attention_mask"].to(device)
        logits = model(input_ids, mask)["logits"].argmax(-1)

        for p, l, m in zip(logits, labels, mask):
            true_seq = [id2label[i.item()] for i, msk in zip(l, m) if i.item() != -100 and msk.item() == 1]
            pred_seq = [id2label[i.item()] for i, msk, gt in zip(p, m, l) if gt.item() != -100 and msk.item() == 1]
            all_labels_eval.append(true_seq)
            all_preds.append(pred_seq)

# compute evaluation metrics
accuracy = 100 * accuracy_score(all_labels_eval, all_preds)
f1 = 100 * f1_score(all_labels_eval, all_preds, average="weighted")

print(f"\nAccuracy: {accuracy:.4f}%")
print(f"F1_Score: {f1:.4f}%")
print(classification_report(all_labels_eval, all_preds))

Epoch 1/15, Loss: 0.398327
Epoch 2/15, Loss: 0.242080
Epoch 3/15, Loss: 0.178929
Epoch 4/15, Loss: 0.139925
Epoch 5/15, Loss: 0.112427
Epoch 6/15, Loss: 0.091104
Epoch 7/15, Loss: 0.075981
Epoch 8/15, Loss: 0.063532
Epoch 9/15, Loss: 0.054402
Epoch 10/15, Loss: 0.047018
Epoch 11/15, Loss: 0.041197
Epoch 12/15, Loss: 0.036242
Epoch 13/15, Loss: 0.032756
Epoch 14/15, Loss: 0.029624
Epoch 15/15, Loss: 0.026554

Accuracy: 97.8419%
F1_Score: 80.8414%
              precision    recall  f1-score   support

           _       0.78      0.81      0.79     10398
         dat       0.43      0.57      0.49       357
       event       0.85      0.91      0.88       396
         fac       0.85      0.93      0.89       281
         loc       0.86      0.88      0.87      3238
         mon       0.30      0.47      0.37       113
         org       0.78      0.83      0.81      3941
         pct       0.47      0.79      0.59        71
         per       0.61      0.61      0.61       928
        p

Averaging Last 3 Layers Output

In [None]:
import torch
import torch.nn as nn
import math
import numpy as np
from collections import Counter
from itertools import chain
import random
from torch.utils.data import Dataset, DataLoader
from seqeval.metrics import classification_report, f1_score, accuracy_score
import os
import warnings
warnings.filterwarnings("ignore")

def load_clean_ner_data(tokens_path, labels_path):
    with open(tokens_path, "r", encoding="utf-8") as token_file, open(labels_path, "r", encoding="utf-8") as label_file:
        token_lines = token_file.readlines()
        label_lines = label_file.readlines()

    sentences, labels = [], []
    
    for t_line, l_line in zip(token_lines, label_lines):
        tokens = t_line.strip().split()
        lbls = l_line.strip().split()
        if len(tokens) == len(lbls) and len(tokens) > 0:
            sentences.append([t.strip() for t in tokens])
            labels.append([l.strip().lower().replace("_", "-") for l in lbls])
    return sentences, labels

# Read and merge ARMAN & PEYMA datasets
arman_sent, arman_lab = load_clean_ner_data("arman-tokens.txt", "arman-labels.txt")
peyma_sent, peyma_lab = load_clean_ner_data("peyma-tokens.txt", "peyma-labels.txt")
all_sentences = arman_sent + peyma_sent
all_labels = arman_lab + peyma_lab

# Combine and shuffle all data, then split into train (%80) and test (%20) sets
combined = list(zip(all_sentences, all_labels))
random.seed(42)
random.shuffle(combined)
split_idx = int(0.8 * len(combined))
train_sentences, train_labels = zip(*combined[:split_idx])
test_sentences, test_labels = zip(*combined[split_idx:])

all_labels_unique = sorted(set(l for seq in (train_labels + test_labels) for l in seq))
label2id = {l: i for i, l in enumerate(all_labels_unique)}
id2label = {i: l for l, i in label2id.items()}

token_counter = Counter(chain(*train_sentences))
token2id = {"<pad>": 0, "<unk>": 1}
for tok, count in token_counter.items():
    if count > 1:
        token2id[tok] = len(token2id)
id2token = {i: t for t, i in token2id.items()}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NER dataset with padding and masks
class NERDataset(Dataset):
    def __init__(self, sentences, labels, max_len, token2id, label2id):
        self.sentences = sentences
        self.labels = labels
        self.max_len = max_len
        self.token2id = token2id
        self.label2id = label2id

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        tokens = [self.token2id.get(t, self.token2id["<unk>"]) for t in self.sentences[idx]]
        tags = [self.label2id[l] for l in self.labels[idx]]
        tokens = tokens[:self.max_len]
        tags = tags[:self.max_len]
        attn_mask = [1] * len(tokens)
        pad_len = self.max_len - len(tokens)
        tokens += [0] * pad_len
        tags += [-100] * pad_len
        attn_mask += [0] * pad_len
        return {
            "input_ids": torch.tensor(tokens),
            "labels": torch.tensor(tags),
            "attention_mask": torch.tensor(attn_mask)
        }

# Prepare NER datasets and loaders
MAX_LEN = 128
train_data = NERDataset(train_sentences, train_labels, MAX_LEN, token2id, label2id)
test_data = NERDataset(test_sentences, test_labels, MAX_LEN, token2id, label2id)
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16)

class PositionalEncoding(nn.Module):
    # Positional encoding for token positions
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

class MultiHeadAttention(nn.Module):
    # Standard multi-head self-attention
    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        self.d_k = d_model // num_heads
        self.num_heads = num_heads
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):
        B = q.size(0)
        q = self.w_q(q).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        k = self.w_k(k).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        v = self.w_v(v).view(B, -1, self.num_heads, self.d_k).transpose(1, 2)
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.d_k)
        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(2)
            scores = scores.masked_fill(mask == 0, -1e9)
        attn = torch.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        context = attn @ v
        context = context.transpose(1, 2).contiguous().view(B, -1, self.num_heads * self.d_k)
        return self.w_o(context)

class PositionwiseFeedForward(nn.Module):
    # Feed-forward block used in transformer
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.linear2(self.dropout(torch.relu(self.linear1(x))))

class SublayerConnection(nn.Module):
    # Wraps sublayer with norm, dropout, and residual
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, d_ff, dropout):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, heads, dropout)
        self.ffn = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.sublayers = nn.ModuleList([SublayerConnection(d_model, dropout) for _ in range(2)])

    def forward(self, x, mask):
        x = self.sublayers[0](x, lambda x: self.attn(x, x, x, mask))
        return self.sublayers[1](x, self.ffn)

# Full transformer-based NER model
class TransformerNER(nn.Module):
    def __init__(self, vocab_size, d_model, heads, d_ff, num_layers, max_len, num_labels):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pe = PositionalEncoding(d_model, max_len)
        self.encoder = nn.ModuleList([EncoderLayer(d_model, heads, d_ff, 0.1) for _ in range(num_layers)])
        self.norm = nn.LayerNorm(d_model)
        self.classifier = nn.Linear(d_model, num_labels)
        self.num_layers = num_layers 

    def forward(self, input_ids, mask, labels=None):
        x = self.embedding(input_ids)
        x = self.pe(x)
        encoder_outputs = [] 
        for layer in self.encoder:
            x = layer(x, mask)
            encoder_outputs.append(x) 

        num_layers_to_average = 3 
        if self.num_layers < num_layers_to_average:
            avg_layers_output = torch.stack(encoder_outputs, dim=0).mean(dim=0)
        else:
            avg_layers_output = torch.stack(encoder_outputs[-num_layers_to_average:], dim=0).mean(dim=0)
        x_processed = self.norm(avg_layers_output) 
        logits = self.classifier(x_processed) 
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss(ignore_index=-100)(logits.view(-1, logits.size(-1)), labels.view(-1))
        return {"loss": loss, "logits": logits}

# Model hyperparameters
d_model = 128
num_heads = 4
ff_dim = 512
num_layers = 6
vocab_size = len(token2id)
num_labels = len(label2id)

model = TransformerNER(
    vocab_size=vocab_size,
    d_model=d_model,
    heads=num_heads,
    d_ff=ff_dim,
    num_layers=num_layers,
    max_len=MAX_LEN,
    num_labels=num_labels
).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)


# Training phase
for epoch in range(15):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        mask = batch["attention_mask"].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, mask, labels)
        loss = outputs["loss"]
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/15, Loss: {total_loss / len(train_loader):.6f}")

# Predict on test set 
model.eval()
all_preds,all_labels_eval = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        mask = batch["attention_mask"].to(device)
        logits = model(input_ids, mask)["logits"].argmax(-1)

        for p, l, m in zip(logits, labels, mask):
            true_seq = [id2label[i.item()] for i, msk in zip(l, m) if i.item() != -100 and msk.item() == 1]
            pred_seq = [id2label[i.item()] for i, msk, gt in zip(p, m, l) if gt.item() != -100 and msk.item() == 1]
            all_labels_eval.append(true_seq)
            all_preds.append(pred_seq)

# compute evaluation metrics
accuracy = 100 * accuracy_score(all_labels_eval, all_preds)
f1 = 100 * f1_score(all_labels_eval, all_preds, average="weighted")

print(f"\nAccuracy: {accuracy:.4f}%")
print(f"F1_Score: {f1:.4f}%")
print(classification_report(all_labels_eval, all_preds))

Epoch 1/15, Loss: 0.407909
Epoch 2/15, Loss: 0.246975
Epoch 3/15, Loss: 0.183764
Epoch 4/15, Loss: 0.144216
Epoch 5/15, Loss: 0.116433
Epoch 6/15, Loss: 0.095507
Epoch 7/15, Loss: 0.080167
Epoch 8/15, Loss: 0.067811
Epoch 9/15, Loss: 0.058009
Epoch 10/15, Loss: 0.050300
Epoch 11/15, Loss: 0.044389
Epoch 12/15, Loss: 0.038998
Epoch 13/15, Loss: 0.035698
Epoch 14/15, Loss: 0.031886
Epoch 15/15, Loss: 0.029089

Accuracy: 97.8806%
F1_Score: 81.2971%
              precision    recall  f1-score   support

           _       0.79      0.82      0.80     10398
         dat       0.45      0.48      0.46       357
       event       0.85      0.90      0.88       396
         fac       0.86      0.92      0.89       281
         loc       0.84      0.89      0.86      3238
         mon       0.17      0.19      0.18       113
         org       0.79      0.84      0.82      3941
         pct       0.44      0.54      0.48        71
         per       0.63      0.62      0.63       928
        p