In [1]:
import os
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset as BaseDataset
from torch.utils.data import DataLoader, random_split

from transformers import GPT2Tokenizer, AutoModelForCausalLM, GPT2LMHeadModel, AutoTokenizer
from transformers import TrainingArguments, Trainer

from peft import LoraConfig, get_peft_model

from datasets import load_dataset

device="cuda"
model_name = "HooshvareLab/gpt2-fa"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({"additional_special_tokens": ["<letter>", "</letter>"]})

2

In [3]:
tokenizer.pad_token_id = tokenizer.eos_token_id

In [4]:
config = {
    "emb_dim" : 768,
    "letter_emb_dim": 1024,
    "vocab_size" : tokenizer.vocab_size,
    "save_path": "./models/v12.pth"
}

In [5]:
class Dataset(BaseDataset):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.data = load_dataset("csv", data_files="datasets/datasets_new.csv")["train"]

    def __getitem__(self, ix):
        item = self.data[ix]
        return item


    def __len__(self, ):
        return len(self.data)


In [6]:
class CustomCollator:
    def __call__(self, batch):
        titles = ["</letter>" + item["title"] for item in batch if item is not None]
        ctx_embs = torch.tensor([json.loads(item["context_embedding"]) for item in batch if item is not None], dtype=torch.float)

        tokenized_title = tokenizer(titles,
                  padding="max_length",
                  truncation=True,
                  return_tensors="pt",
                  max_length=256)
        
        attention_mask = torch.stack([torch.cat([torch.tensor([1,], dtype=torch.long), mask], dim=-1) for mask in tokenized_title["attention_mask"]]).to(device)
        input_ids = tokenized_title["input_ids"][:, 1:-1].long()
        targets = tokenized_title["input_ids"][:, 1:]
        targets = torch.cat([torch.ones(targets.shape[0], 1).fill_(-100).long(), targets.masked_fill(targets == tokenizer.pad_token_id, -100)], dim=1)
        targets[:, 2] = -100

        return {
            "attention_mask": attention_mask[:, :-1].to(device),
            "letter_emb": ctx_embs.to(device),
            "input_ids": input_ids.to(device),
            "label": targets.to(device)
        }

train_dataset, eval_dataset = random_split(Dataset(tokenizer), [0.97, 0.03])
collator_fn = CustomCollator()

In [7]:
class Model(nn.Module):

    def __init__(self, tokenizer, config):
        super().__init__()
        self.tokenizer = tokenizer
        self.letter_projection = nn.Sequential(nn.Linear(config["letter_emb_dim"], config["letter_emb_dim"] // 2),
                                                nn.Linear(config["letter_emb_dim"] // 2, config["emb_dim"]))
        self.gpt = GPT2LMHeadModel.from_pretrained(model_name)
        self.gpt.resize_token_embeddings(len(tokenizer))

        tp = 0
        for p in self.letter_projection.parameters():
            tp += p.numel()
            p.requires_grad=True
        for p in self.gpt.lm_head.parameters():
            tp += p.numel()
            p.requires_grad=True
        for p in self.gpt.transformer.wte.parameters():
            tp += p.numel()
            p.requires_grad=True
        for p in self.gpt.transformer.wpe.parameters():
            tp += p.numel()
            p.requires_grad=True
        
        print(f"number of trainable params:{tp:,}")


    @classmethod
    def from_pretrained(cls, tokenizer, config):
        print("check model existance...")
        if os.path.isfile(config["save_path"]):
            print("Loading the model...")
            self = cls(tokenizer, config)
            self.load_state_dict(torch.load(config["save_path"], weights_only=True))
            print("loaded successfully!")
        else:
            print(f"couldn't find the {config['save_path']} file!")
            print("Creating a new model...")
            self = cls(tokenizer, config)
        return self

    def save(self, ):
        torch.save(self.state_dict(), config["save_path"])
        print(f"Model saved at {config['save_path']}!")
    
    def forward(self, attention_mask, letter_emb, input_ids, label):
        start_letter_token_id = tokenizer("<letter>", return_tensors='pt')["input_ids"][:, 0].to(device)
        start_letter_token_emb = self.gpt.transformer.wte(start_letter_token_id).to(device)
        letter_emb = self.letter_projection(letter_emb).unsqueeze(1).to(device)
        x = self.gpt.transformer.wte(input_ids)
        x = torch.cat([start_letter_token_emb.repeat(x.shape[0], 1, 1), letter_emb, x], dim=1)
        x += self.gpt.transformer.wpe(torch.arange(x.shape[1]).to(device))
        # print(x.shape, attention_mask.shape, tokenizer("<letter>", return_tensors='pt'))
        output = self.gpt(inputs_embeds=x,
            attention_mask=attention_mask,
            return_dict=True,
            labels=label
        )
        return output

    
    @torch.no_grad
    def generate(self, letter_emb):
        self.gpt.eval()
        letter_emb = torch.tensor(json.loads(letter_emb)).view(1,1,-1).to(device)
        letter_emb = self.letter_projection(letter_emb)
        output = self.gpt.generate(
        inputs_embeds=letter_emb,
        attention_mask=torch.ones((1, 1), dtype=torch.long).to(device),
        do_sample=True,
        top_p=0.9,
        temperature=0.9,
        num_beams=5,
        max_length=128,
        min_length=1,
        repetition_penalty=1.0,
        length_penalty=1.0,
        num_return_sequences=1,)
        return self.tokenizer.batch_decode(output)

model = Model.from_pretrained(tokenizer, config)
model.to(device);

check model existance...
Loading the model...


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


number of trainable params:66,221,824
loaded successfully!


In [8]:
train_args = TrainingArguments(
    output_dir="./cache/v12/",
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=8,
    num_train_epochs=100,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    save_strategy="epoch",
    logging_steps=10,
    logging_strategy="steps",
    eval_strategy="steps",
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    save_safetensors=False,
)

trainer = Trainer(model=model,
        args=train_args,
        data_collator=collator_fn,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset)

In [None]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
10,1.6826,1.82025
20,1.9163,1.862472
30,1.9588,1.901913
40,2.0032,1.937918
50,2.0232,1.966914
60,2.0643,2.023161
70,2.0682,2.027923
80,2.1506,2.036496
90,2.0687,2.063663
100,2.0769,2.083252


In [26]:
model.save()

Model saved at ./models/v12.pth!


In [31]:
ix = 4
sample_data = train_dataset[ix]
with torch.no_grad():
    model.eval()
    elt = model.gpt.transformer.wte(tokenizer("</letter>", return_tensors='pt')["input_ids"][:, 1:2].to(device))
    slt = model.gpt.transformer.wte(tokenizer("<letter>", return_tensors='pt')["input_ids"][:, 1:2].to(device))
    letter_emb = model.letter_projection(torch.tensor(json.loads(sample_data["context_embedding"])).to(device).view(1, 1,-1))
    letter_emb = torch.cat([slt, letter_emb, elt], dim=1)
    letter_emb += model.gpt.transformer.wpe(torch.arange(letter_emb.shape[1]).to(device))
    output = model.gpt.generate(
        inputs_embeds=letter_emb,
        attention_mask=torch.ones((1, 1), dtype=torch.long).to(device),
        do_sample=True,
        top_p=0.5,
        temperature=0.5,
        num_beams=5,
        max_length=16,
        min_length=1,
        repetition_penalty=1.0,
        length_penalty=1.0,
        num_return_sequences=1,
    )

    output_ids = tokenizer.batch_decode(output, skip_special_tokens=True)
    print('Generated title:', ''.join(output_ids))
    print('True title:', sample_data["title"])
    print('context:', sample_data["context"])

Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


Generated title: های برنامههایهایهایهایهایهایهایهایهایهایهایهایهای
True title: تأییدیه برنامه آموزشی
context: 

بسمه تعالی





شماره: 

تاریخ: 

پیوست: 


جناب آقای مهندس مسعود مومن زاده نائینی
مدیریت محترم اداره فناوری اطلاعات بیمه آسیا

با سلام و احترام
به استحضار می‎رساند برنامه آموزشی ارسالی از طرف نماینده محترم کارفرما مورد تایید این شرکت می‏باشد. 
همچنین طی جلسات هماهنگی فعالیت‏های ابتدای پروژه که در روزهای ۵ / ۳ / ۹۴، ۲۵ / ۳ / ۹۴، ۲۶ / ۳ / ۹۴، ۲۷ / ۳ / ۹۴ و ۳۱ / ۳ / ۹۴ در آن شرکت محترم برگزار گردید مذاکرات لازم در این خصوص صورت گرفته و برنامه آموزشی که به تایید طرفین رسید به پیوست حضورتان ارسال می‎گردد. 




باسپاس
الهه مقدس زاده
مدیر پشتیبانی و استقرار سیستم‌ها




