In [7]:
import os
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset as BaseDataset
from torch.utils.data import DataLoader as BaseDataLoader

from transformers import GPT2Tokenizer, AutoModelForCausalLM, GPT2LMHeadModel, AutoTokenizer
from transformers import TrainingArguments, Trainer

from peft import LoraConfig, get_peft_model

from datasets import load_dataset

device="cuda"

In [8]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [9]:
config = {
    "emb_dim" : 768,
    "letter_emb_dim": 1024,
    "vocab_size" : tokenizer.vocab_size,
    "save_path": "./models/v4_peft.pth"
}

In [10]:
class Dataset(BaseDataset):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.data = load_dataset("csv", data_files="dataset.csv")["train"]

    def __getitem__(self, ix):
        item = self.data[ix]
        return item


    def __len__(self, ):
        return len(self.data)


In [11]:
class CustomCollator:
    def __call__(self, batch):
        titles = [item["title"] for item in batch if item is not None]
        ctx_embs = torch.tensor([json.loads(item["context_embedding"]) for item in batch if item is not None], dtype=torch.float)

        tokenized_title = tokenizer(titles,
                  padding="longest",
                  truncation=True,
                  return_tensors="pt")
        
        attention_mask = torch.stack([torch.cat([torch.tensor([1,], dtype=torch.long), mask], dim=-1) for mask in tokenized_title["attention_mask"]]).to(device)

        input_ids = tokenized_title["input_ids"][:, :-1].long()
        targets = tokenized_title["input_ids"]
        targets = targets.masked_fill(targets == tokenizer.pad_token_id, -100)

        return {
            "attention_mask": attention_mask[:, :-1].to(device),
            "letter_emb": ctx_embs.to(device),
            "input_ids": input_ids.to(device),
            "label": targets.to(device)
        }

dataset = Dataset(tokenizer)
collator_fn = CustomCollator()

In [6]:
class Model(nn.Module):

    def __init__(self, tokenizer, config):
        super().__init__()
        self.tokenizer = tokenizer
        self.letter_projection = nn.Sequential(nn.Linear(config["letter_emb_dim"], config["letter_emb_dim"] * 2),
                                                nn.Linear(config["letter_emb_dim"] * 2, config["emb_dim"]))
        self.gpt = AutoModelForCausalLM.from_pretrained("gpt2")
        tp = 0
        for p in self.letter_projection.parameters():
            tp += p.numel()
            p.requires_grad=True
        for p in self.gpt.lm_head.parameters():
            tp += p.numel()
            p.requires_grad=True

        print("number of trainable params:", tp)

    @classmethod
    def from_pretrained(cls, tokenizer, config):
        print("check model existance...")
        if os.path.isfile(config["save_path"]):
            print("Loading the model...")
            self = cls(tokenizer, config)
            self.load_state_dict(torch.load(config["save_path"], weights_only=True))
            print("loaded successfully!")
        else:
            print(f"couldn't find the {config['save_path']} file!")
            print("Creating a new model...")
            self = cls(tokenizer, config)
        return self

    def save(self, ):
        torch.save(self.state_dict(), config["save_path"])
        print(f"Model saved at {config['save_path']}!")
    
    def forward(self, attention_mask, letter_emb, input_ids, label):
        letter_emb = self.letter_projection(letter_emb).unsqueeze(1)
        x = self.gpt.transformer.wte(input_ids)
        x += self.gpt.transformer.wpe(torch.arange(x.shape[1]).to(device))
        x = torch.cat([letter_emb, x], dim=1)

        output = self.gpt(inputs_embeds=x,
            attention_mask=attention_mask,
            return_dict=True,
            labels=label
        )
        return output

    
    @torch.no_grad
    def generate(self, letter_emb):
        model.eval()
        letter_emb = torch.tensor(json.loads(letter_emb)).view(1,1,-1).to(device)
        letter_emb = self.letter_projection(letter_emb)
        output = model.gpt.generate(
        inputs_embeds=letter_emb,
        attention_mask=torch.ones((1, 1), dtype=torch.long).to(device),
        do_sample=True,
        top_p=0.9,
        temperature=0.9,
        num_beams=5,
        max_length=128,
        min_length=1,
        repetition_penalty=1.0,
        length_penalty=1.0,
        num_return_sequences=1,)

model = Model(tokenizer, config)
model.to(device);

number of trainable params: 42270208


In [None]:
# peft_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["k_proj", "q_proj"],
#     lora_dropout=0.1,
#     task_type="CAUSAL_LM"
# )

# model.gpt = get_peft_model(model.gpt, peft_config)

# model.gpt.print_trainable_parameters()

In [None]:
train_args = TrainingArguments(
    output_dir="./cache/",
    learning_rate=1e-3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=32,
    num_train_epochs=10,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    save_strategy="epoch",
    logging_steps=10,
    logging_strategy="steps",
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    save_safetensors=False,
    fp16=True
)

trainer = Trainer(model=model,
        args=train_args,
        data_collator=collator_fn,
        train_dataset=dataset)

In [9]:
trainer.train()
model.save()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,5.0827
20,3.6352
30,3.295
40,3.0943
50,2.7665
60,2.5262
70,2.2333
80,2.0817
90,1.8289
100,1.6956


Model saved at ./models/v4_peft.pth!


In [11]:
''.join(model.generate(dataset[0]["context_embedding"]), )

Setting `pad_token_id` to `eos_token_id`:9 for open-end generation.


'گزارش  گزارش  گزارش گزارش    عنوان گزارش  گزارش گزارش گزارش   گزارش  گزارش   گزارش بروزرسانی گزارش   گزارش       گزارش   گزارش عنوان گزارش گزارش  راه   گزارش عنوان گزارش گزارش گزارش گزارش  گزارش گزارش  گزارش  گزارش  گزارش   گزارش گزارش     گزارش گزارش گزارش راه  گزارش  گزارش گزارش  گزارش   گزارش  عنوان کنترل    گزارش گزارش گزارش گزارش گزارش گزارش گزارش   گزارش   عنوان گزارش  گزارش    گزارش گزارش گزارش   گزارش گزارش گزارش     گزارش  عنوان گزارش گزارش   '

In [12]:
ix = 10
sample_data = dataset[ix]

In [4]:
with torch.no_grad():
    model.eval()
    letter_emb = model.letter_projection(torch.tensor(json.loads(sample_data["context_embedding"])).to(device).view(1, 1,-1))
    output = model.gpt.generate(
        inputs_embeds=letter_emb,
        attention_mask=torch.ones((1, 1), dtype=torch.long).to(device),
        do_sample=True,
        top_p=0.9,
        temperature=0.9,
        num_beams=5,
        max_length=24,
        min_length=1,
        repetition_penalty=1.0,
        length_penalty=1.0,
        num_return_sequences=1,
    )

    output_ids = tokenizer.batch_decode(output, skip_special_tokens=True)
    print('Generated title:', ''.join(output_ids))
    print('True title:', sample_data["title"])
    print('context:', sample_data["context"])

NameError: name 'dataset' is not defined

In [17]:
tokenizer.vocab_size

50257