In [1]:
from huggingface_hub import notebook_login
import os
#os.environ["CUDA_LAUNCH_BLOCKING"]="1"
os.environ["TOKENIZERS_PARALLELISM"]="true"

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from tqdm import tqdm
from utils import *
import gc
import tiktoken
from previous_chapters import *

model_name = "gpt2-large"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

ltype = "curlora"
for param in model.parameters():
        param.requires_grad = False

for name, module in model.named_modules():
    if isinstance(module, type(model.transformer.h[0].attn)):
        if ltype == "lora":
            module.c_attn = LinearWithLoRA(module.c_attn, 8, 1)
        else:
            module.c_attn = LinearWithCURLoRA(module.c_attn, 8, 1)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters after: {total_params:,}")

model.to(device)
    
ppl = calculate_perplexity(model, tokenizer, txt)
print("Perplexity:", round(ppl, 2))
    
torch.manual_seed(1311)
num_classes = 2
lm_head = model.lm_head
in_features=1280
model.lm_head = torch.nn.Linear(in_features=in_features, out_features=num_classes)
model.to(device)

Total trainable parameters after: 2,304
Perplexity: 28.25


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): LinearWithCURLoRA(
            (linear): Conv1D()
            (curlora): CURModule()
          )
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=2, bias=True)
)

In [3]:
torch.cuda.empty_cache()
_ = gc.collect()

In [4]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=lr)
num_epochs = 3
num_training_steps = num_epochs * len(mrpc_dataset["train"])
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)



In [5]:
# Convert to PyTorch datasets
train_dataset = mrpc_dataset["train"]
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i in tqdm(range(0, len(train_dataset), batch_size)):
        batch = train_dataset[i:i+batch_size]
        inputs = tokenizer(batch["sentence1"], batch["sentence2"], return_tensors="pt",
                           truncation=True, padding = True, max_length = max_len).to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)["logits"][:, -1, :]#.cpu()
        y = torch.LongTensor(batch["label"]).to(device)
        loss = torch.nn.functional.cross_entropy(outputs, y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        torch.cuda.empty_cache()
        _ = gc.collect()

    print(f"Epoch {epoch + 1}, Average loss: {total_loss / len(train_dataset)}")

100%|██████████| 115/115 [03:51<00:00,  2.01s/it]


Epoch 1, Average loss: 0.02003725999184237


100%|██████████| 115/115 [03:58<00:00,  2.07s/it]


Epoch 2, Average loss: 0.019793559812342456


100%|██████████| 115/115 [03:39<00:00,  1.91s/it]

Epoch 3, Average loss: 0.0193952102795032





In [6]:
print("Evaluating on MRPC...")
mrpc_accuracy = evaluate_mrpc(model, tokenizer, mrpc_dataset, device)
print(f"MRPC Accuracy: {mrpc_accuracy:.4f}")

Evaluating on MRPC...


100%|██████████| 408/408 [02:06<00:00,  3.23it/s]

MRPC Accuracy: 0.7034





In [7]:
mrpc_head = model.lm_head

In [8]:
torch.manual_seed(1311)

num_classes = 2
model.lm_head = torch.nn.Linear(in_features=in_features, out_features=num_classes).to(device)

In [9]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=lr)
num_epochs = 3
num_training_steps = num_epochs * len(sst_dataset["train"])
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)

In [10]:
# Convert to PyTorch datasets
train_dataset = sst_dataset["train"]

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i in tqdm(range(0, 5000, batch_size)):
        batch = train_dataset[i:i+batch_size]
        inputs = tokenizer(batch["sentence"], return_tensors="pt",
                           truncation=True, padding = True, max_length = max_len).to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)["logits"][:, -1, :]#.cpu()
        y = torch.LongTensor(batch["label"]).to(device)
        loss = torch.nn.functional.cross_entropy(outputs, y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        torch.cuda.empty_cache()
        _ = gc.collect()

    print(f"Epoch {epoch + 1}, Average loss: {total_loss / len(train_dataset)}")

100%|██████████| 157/157 [03:23<00:00,  1.29s/it]


Epoch 1, Average loss: 0.001623428777327302


100%|██████████| 157/157 [03:16<00:00,  1.25s/it]


Epoch 2, Average loss: 0.001548231452516296


100%|██████████| 157/157 [02:52<00:00,  1.10s/it]

Epoch 3, Average loss: 0.0014444273413506473





In [11]:
print("Evaluating on SST-2...")
accuracy = evaluate_sst2(model, tokenizer, sst_dataset, device)
print(f"SST-2 Accuracy: {accuracy:.4f}")

Evaluating on SST-2...


100%|██████████| 872/872 [04:25<00:00,  3.28it/s]

SST-2 Accuracy: 0.7569





In [12]:
torch.cuda.empty_cache()
_ = gc.collect()

In [13]:
sst_head = model.lm_head
model.lm_head = mrpc_head

In [14]:
print("Evaluating on MRPC...")
mrpc_accuracy = evaluate_mrpc(model, tokenizer, mrpc_dataset, device)
print(f"MRPC Accuracy: {mrpc_accuracy:.4f}")

Evaluating on MRPC...


100%|██████████| 408/408 [02:05<00:00,  3.25it/s]

MRPC Accuracy: 0.7059





In [15]:
torch.manual_seed(1311)

num_classes = 3
model.lm_head = torch.nn.Linear(in_features=in_features, out_features=num_classes).to(device)

In [16]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=lr)
num_epochs = 5
num_training_steps = num_epochs * len(sentiment_dataset["test"])

In [17]:
# Convert to PyTorch datasets
train_dataset = sentiment_dataset["test"]

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i in tqdm(range(0, len(train_dataset), batch_size)):
        batch = train_dataset[i:i+batch_size]
        inputs = tokenizer(batch["text"], return_tensors="pt",
                           truncation=True, padding = True, max_length = max_len).to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)["logits"][:, -1, :]#.cpu()
        y = torch.LongTensor(batch["sentiment"]).to(device) // 4
        loss = torch.nn.functional.cross_entropy(outputs, y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()
        _ = gc.collect()

    print(f"Epoch {epoch + 1}, Average loss: {total_loss / len(train_dataset)}")

100%|██████████| 16/16 [00:18<00:00,  1.13s/it]


Epoch 1, Average loss: 0.03077170767458567


100%|██████████| 16/16 [00:17<00:00,  1.12s/it]


Epoch 2, Average loss: 0.022606948472888595


100%|██████████| 16/16 [00:18<00:00,  1.13s/it]


Epoch 3, Average loss: 0.021655397482186436


100%|██████████| 16/16 [00:19<00:00,  1.23s/it]


Epoch 4, Average loss: 0.02075899245748558


100%|██████████| 16/16 [00:19<00:00,  1.20s/it]

Epoch 5, Average loss: 0.020433013817392678





In [18]:
print("Evaluating on Sentiment140...")
sentiment_accuracy = evaluate_sentiment(model, tokenizer, sentiment_dataset, device)
print(f"Sentiment Accuracy: {sentiment_accuracy:.4f}")

Evaluating on Sentiment140...


100%|██████████| 8/8 [00:06<00:00,  1.33it/s]

Sentiment Accuracy: 0.9971





In [19]:
sentiment_head = model.lm_head
sentiment_head

Linear(in_features=1280, out_features=3, bias=True)

In [20]:
model.lm_head = mrpc_head
print("Evaluating on MRPC...")
mrpc_accuracy = evaluate_mrpc(model, tokenizer, mrpc_dataset, device)
print(f"MRPC Accuracy: {mrpc_accuracy:.4f}")

Evaluating on MRPC...


100%|██████████| 408/408 [02:03<00:00,  3.30it/s]

MRPC Accuracy: 0.7059





In [21]:
model.lm_head = sst_head
print("Evaluating on SST-2...")
accuracy = evaluate_sst2(model, tokenizer, sst_dataset, device)
print(f"SST-2 Accuracy: {accuracy:.4f}")

Evaluating on SST-2...


100%|██████████| 872/872 [04:24<00:00,  3.30it/s]

SST-2 Accuracy: 0.7603





In [22]:
torch.cuda.empty_cache()
_ = gc.collect()

In [23]:
model.lm_head = lm_head
ppl = calculate_perplexity(model, tokenizer, txt)
print("Perplexity:", round(ppl, 2))

Perplexity: 28.24


In [24]:
text = "every effort moves you"
input_ids = tokenizer.encode(text, return_tensors="pt").to(0)

output = model.generate(input_ids, do_sample=False, max_length=500)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


every effort moves you forward," he said.

"I'm not going to be a politician. I'm not going to be a politician who's going to be in the media and say, 'I'm going to do this and I'm going to do that.' I'm going to do it the right way. I'm going to do it the right way."

The mayor said he's not going to be a politician who's going to be in the media and say, 'I'm going to do this and I'm going to do that.' I'm going to do it the right way. - Mayor Rob Ford

Ford said he's not going to be a politician who's going to be in the media and say, "I'm going to do this and I'm going to do that."

"I'm not going to be a politician who's going to be in the media and say, 'I'm going to do this and I'm going to do that.' I'm going to do it the right way," he said.

"I'm not going to be a politician who's going to be in the media and say, 'I'm going to do this and I'm going to do that.' I'm going to do it the right way."

Ford said he's not going to be a politician who's going to be in the media and 

In [25]:
torch.cuda.empty_cache()
_ = gc.collect()