In [1]:
from huggingface_hub import notebook_login
import os
#os.environ["CUDA_LAUNCH_BLOCKING"]="1"
os.environ["TOKENIZERS_PARALLELISM"]="true"

In [2]:
import torch
import torch.nn as nn
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from tqdm import tqdm
from utils import *
import gc

model_name = "gpt2-large"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

ltype = "lora"
for param in model.parameters():
        param.requires_grad = False

for name, module in model.named_modules():
    if isinstance(module, type(model.transformer.h[0].attn)):
        if ltype == "lora":
            module.c_attn = LinearWithLoRA(module.c_attn, 8, 1)
        else:
            module.c_attn = LinearWithCURLoRA(module.c_attn, 8, 1)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total trainable parameters after: {total_params:,}")

model.to(device)
    
ppl = calculate_perplexity(model, tokenizer, txt)
print("Perplexity:", round(ppl, 2))
    
torch.manual_seed(1311)
num_classes = 2
lm_head = model.lm_head
in_features=1280
    
model.lm_head = torch.nn.Linear(in_features=in_features, out_features=num_classes)
model.to(device)

Total trainable parameters after: 1,474,560
Perplexity: 28.25


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1280)
    (wpe): Embedding(1024, 1280)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-35): 36 x GPT2Block(
        (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): LinearWithLoRA(
            (linear): Conv1D()
            (lora): LoRALayer()
          )
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1280, out_features=2, bias=True)
)

In [3]:
torch.cuda.empty_cache()
_ = gc.collect()

In [4]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=lr)
num_epochs = 3
num_training_steps = num_epochs * len(mrpc_dataset["train"])
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)



In [5]:
train_dataset = mrpc_dataset["train"]
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i in tqdm(range(0, len(train_dataset), batch_size)):
        batch = train_dataset[i:i+batch_size]
        inputs = tokenizer(batch["sentence1"], batch["sentence2"], return_tensors="pt",
                           truncation=True, padding = True, max_length = max_len).to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)["logits"][:, -1, :]#.cpu()
        y = torch.LongTensor(batch["label"]).to(device)
        loss = torch.nn.functional.cross_entropy(outputs, y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        torch.cuda.empty_cache()
        _ = gc.collect()

    print(f"Epoch {epoch + 1}, Average loss: {total_loss / len(train_dataset)}")

100%|██████████| 115/115 [03:04<00:00,  1.61s/it]


Epoch 1, Average loss: 0.019906573972564227


100%|██████████| 115/115 [03:10<00:00,  1.65s/it]


Epoch 2, Average loss: 0.01902800353168791


100%|██████████| 115/115 [03:08<00:00,  1.64s/it]

Epoch 3, Average loss: 0.01624104251420875





In [6]:
print("Evaluating on MRPC...")
mrpc_accuracy = evaluate_mrpc(model, tokenizer, mrpc_dataset, device)
print(f"MRPC Accuracy: {mrpc_accuracy:.4f}")

Evaluating on MRPC...


100%|██████████| 408/408 [00:09<00:00, 41.93it/s]

MRPC Accuracy: 0.7917





In [7]:
mrpc_head = model.lm_head

In [8]:
torch.manual_seed(1311)

num_classes = 2
model.lm_head = torch.nn.Linear(in_features=in_features, out_features=num_classes).to(device)

In [9]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=lr)
num_epochs = 3
num_training_steps = num_epochs * len(sst_dataset["train"])
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)

In [10]:
# Convert to PyTorch datasets
train_dataset = sst_dataset["train"]

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i in tqdm(range(0, 5000, batch_size)):
        batch = train_dataset[i:i+batch_size]
        inputs = tokenizer(batch["sentence"], return_tensors="pt",
                           truncation=True, padding = True, max_length = max_len).to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)["logits"][:, -1, :]#.cpu()
        y = torch.LongTensor(batch["label"]).to(device)
        loss = torch.nn.functional.cross_entropy(outputs, y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        torch.cuda.empty_cache()
        _ = gc.collect()

    print(f"Epoch {epoch + 1}, Average loss: {total_loss / len(train_dataset)}")

100%|██████████| 157/157 [02:43<00:00,  1.04s/it]


Epoch 1, Average loss: 0.001606773968788933


100%|██████████| 157/157 [02:00<00:00,  1.30it/s]


Epoch 2, Average loss: 0.0007389305756085664


100%|██████████| 157/157 [02:43<00:00,  1.04s/it]

Epoch 3, Average loss: 0.0005056121178319505





In [11]:
print("Evaluating on SST-2...")
accuracy = evaluate_sst2(model, tokenizer, sst_dataset, device)
print(f"SST-2 Accuracy: {accuracy:.4f}")

Evaluating on SST-2...


100%|██████████| 872/872 [00:17<00:00, 49.94it/s]

SST-2 Accuracy: 0.9369





In [12]:
torch.cuda.empty_cache()
_ = gc.collect()

In [13]:
sst_head = model.lm_head
model.lm_head = mrpc_head

In [14]:
print("Evaluating on MRPC...")
mrpc_accuracy = evaluate_mrpc(model, tokenizer, mrpc_dataset, device)
print(f"MRPC Accuracy: {mrpc_accuracy:.4f}")

Evaluating on MRPC...


100%|██████████| 408/408 [00:08<00:00, 50.64it/s]

MRPC Accuracy: 0.7647





In [15]:
torch.manual_seed(1311)

num_classes = 3
model.lm_head = torch.nn.Linear(in_features=in_features, out_features=num_classes).to(device)

In [16]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=lr)
num_epochs = 5
num_training_steps = num_epochs * len(sentiment_dataset["test"])

In [17]:
train_dataset = sentiment_dataset["test"]

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for i in tqdm(range(0, len(train_dataset), batch_size)):
        batch = train_dataset[i:i+batch_size]
        inputs = tokenizer(batch["text"], return_tensors="pt",
                           truncation=True, padding = True, max_length = max_len).to(device)
        optimizer.zero_grad()
        outputs = model(**inputs)["logits"][:, -1, :]#.cpu()
        y = torch.LongTensor(batch["sentiment"]).to(device) // 4
        loss = torch.nn.functional.cross_entropy(outputs, y)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()
        _ = gc.collect()

    print(f"Epoch {epoch + 1}, Average loss: {total_loss / len(train_dataset)}")

100%|██████████| 16/16 [00:17<00:00,  1.07s/it]


Epoch 1, Average loss: 0.018428838695866995


100%|██████████| 16/16 [00:18<00:00,  1.17s/it]


Epoch 2, Average loss: 0.00886059582353117


100%|██████████| 16/16 [00:19<00:00,  1.22s/it]


Epoch 3, Average loss: 0.005677207840614051


100%|██████████| 16/16 [00:18<00:00,  1.13s/it]


Epoch 4, Average loss: 0.0038867912977096067


100%|██████████| 16/16 [00:17<00:00,  1.07s/it]

Epoch 5, Average loss: 0.0031792531912046744





In [18]:
print("Evaluating on Sentiment140...")
sentiment_accuracy = evaluate_sentiment(model, tokenizer, sentiment_dataset, device)
print(f"Sentiment Accuracy: {sentiment_accuracy:.4f}")

Evaluating on Sentiment140...


100%|██████████| 8/8 [00:07<00:00,  1.02it/s]

Sentiment Accuracy: 0.9229





In [19]:
sentiment_head = model.lm_head
sentiment_head

Linear(in_features=1280, out_features=3, bias=True)

In [20]:
model.lm_head = mrpc_head
print("Evaluating on MRPC...")
mrpc_accuracy = evaluate_mrpc(model, tokenizer, mrpc_dataset, device)
print(f"MRPC Accuracy: {mrpc_accuracy:.4f}")

Evaluating on MRPC...


100%|██████████| 408/408 [00:09<00:00, 44.22it/s]

MRPC Accuracy: 0.4877





In [21]:
model.lm_head = sst_head
print("Evaluating on SST-2...")
accuracy = evaluate_sst2(model, tokenizer, sst_dataset, device)
print(f"SST-2 Accuracy: {accuracy:.4f}")

Evaluating on SST-2...


100%|██████████| 872/872 [00:18<00:00, 45.93it/s]

SST-2 Accuracy: 0.8979





In [22]:
torch.cuda.empty_cache()
_ = gc.collect()

In [23]:
model.lm_head = lm_head
ppl = calculate_perplexity(model, tokenizer, txt)
print("Perplexity:", round(ppl, 2))

Perplexity: 42.96


In [24]:
text = "every effort moves you"
input_ids = tokenizer.encode(text, return_tensors="pt").to(0)

output = model.generate(input_ids, do_sample=False, max_length=500)
print(tokenizer.decode(output[0], skip_special_tokens=True))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


every effort moves you closer to the story and the characters, and the beautiful landscapes, and the wonderful music, by the author, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by the composer, and by the wonderful music by t

In [25]:
torch.cuda.empty_cache()
_ = gc.collect()