In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Installing collected 

In [2]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from datasets import load_from_disk, Dataset, DatasetDict, load_dataset
from torch.nn.utils.rnn import pad_sequence
import copy

In [3]:
model_name = "sberbank-ai/mGPT"

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained(model_name, padding_size='right')

special_tokens_dict = {
    'bos_token': '<BOS>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<PAD>'}

num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

In [41]:
class PoemDataset(Dataset):
    def __init__(self, dataset_dict, tokenizer, seq_length=1024):
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.dataset = dataset_dict

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):

      data = self.dataset[idx]
      poet, poem = data.get("poet", ""), data.get("poem", "")
      lines = " ".join(poem)
      first_line = lines.partition('\n')[0]

      text = f"<BOS> {poet} : {first_line}\n<|endoftext|>"
      # print(f'i:{idx}, text= {text}')


      input_encoding = self.tokenizer.encode_plus(
                                 text,
                                 max_length=self.seq_length,
                                 padding='max_length',
                                 return_attention_mask=True,
                                 return_tensors='pt',
                                 truncation=True
                                )

      input_ids = input_encoding['input_ids'].flatten()
      attention_masks = input_encoding['attention_mask'].flatten()

      target_encoding = self.tokenizer.encode_plus(
                                 poem,
                                 max_length=self.seq_length,
                                 padding='max_length',
                                 return_attention_mask=True,
                                 return_tensors='pt',
                                 truncation=True
                                )
      target_ids = target_encoding['input_ids'].flatten()


      return {
            'input_ids': input_ids,
            'attention_mask': attention_masks,
            'target_ids': target_ids
        }


In [42]:
dataset_dict = load_from_disk('turkish_poems_cleaned')
#dataset_dict = load_dataset("beratcmn/instruction-turkish-poems")
poemDataset = PoemDataset(dataset_dict, tokenizer)

In [43]:
a = dataset_dict[1]
a.get("poet", "")

'Ümit Yaşar Oğuzcan'

In [44]:
def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

train_size, val_size = train_val_split(0.9, poemDataset)
train_dataset, val_dataset = random_split(poemDataset, [train_size, val_size])

In [45]:
train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=1)

val_dataloader = DataLoader(val_dataset,
                            sampler=SequentialSampler(val_dataset),
                            batch_size=1)

In [11]:
model = GPT2LMHeadModel.from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)
model.resize_token_embeddings(len(tokenizer))

model_org = copy.deepcopy(model)

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.45G [00:00<?, ?B/s]

In [46]:
model = copy.deepcopy(model_org)

In [47]:
# Freeze transformer layers except the first and the last one. Do not freeze any layernorms
for n, p in model.named_parameters():
    if 'transformer.h' in n:
        layer_num = int(n.split('.')[2])
        if 'ln_' not in n and layer_num > 0 and layer_num < 23:
            p.requires_grad = False
            #print('Freeze', n)

In [48]:
model.cuda()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-7)

In [49]:
model.train()
for epoch in range(2):
    print('Epoch', epoch)

    train_losses = []
    print('Training...')

    progressbar = tqdm(train_dataloader)
    for batch in progressbar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(model.device)
        labels = batch['target_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        progressbar.set_description("Loss: %.3f" % np.mean(train_losses[-10:]))


    val_losses = []
    print('Validating...')


    progressbar = tqdm(val_dataloader)
    for batch in progressbar:

        input_ids = batch['input_ids'].to(model.device)
        labels = batch['target_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)

        with torch.no_grad():

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            val_losses.append(loss.item())

            progressbar.set_description("Loss: %.3f" % np.mean(val_losses[-10:]))


Epoch 0
Training...


Loss: 2.943: 100%|██████████| 450/450 [05:20<00:00,  1.40it/s]


Validating...


Loss: 1.436: 100%|██████████| 50/50 [00:11<00:00,  4.50it/s]


Epoch 1
Training...


Loss: 1.379: 100%|██████████| 450/450 [05:20<00:00,  1.40it/s]


Validating...


Loss: 1.222: 100%|██████████| 50/50 [00:11<00:00,  4.51it/s]


In [71]:
torch.save(model.state_dict(), 'model.pth')

In [None]:
model = GPT2LMHeadModel.from_pretrained(model_name)

In [72]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
model.resize_token_embeddings(len(tokenizer))
model.load_state_dict(torch.load('/content/drive/MyDrive/model.pth'))

<All keys matched successfully>

In [50]:
eos_id = tokenizer.encode("<EOS>")

In [69]:
input_ids = tokenizer.encode("<BOS>Abdurrahim Karakoç : Yoktur aşkın mantığı,\n", return_tensors = "pt").cuda()
input_ids = input_ids.to(model.device)
out = model.generate(
        input_ids,
        num_beams=5,
        num_return_sequences=1,
        max_length=100,
        eos_token_id=eos_id,
        do_sample=True,
        top_k=6,
        top_p=0.92,
        temperature=0.9,
        no_repeat_ngram_size=3)

In [70]:
for sample_output in out:
    print("{} ".format(tokenizer.decode(sample_output, skip_special_tokens=True)))

Abdurrahim Karakoç : Yoktur aşkın mantığı,
Yoktur bir aşk hikayesi, Yoktur bir şiir,
Şiir yoktur aşka,
Aşk yoktur şiire,
İnsan yoktur,
Dünya yoktur.
İşte bu yüzdendir,
Bu yüzden aşk,
Çünkü aşk vardır,
Böyle bir aşktır 
