In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected 

In [2]:
import torch
import numpy as np
from tqdm import tqdm
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from datasets import load_from_disk, Dataset, DatasetDict, load_dataset
from torch.nn.utils.rnn import pad_sequence
import copy

In [3]:
model_name = "sberbank-ai/mGPT"

In [4]:
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

special_tokens_dict = {
    'bos_token': '<BOS>',
    'eos_token': '<EOS>',
    'pad_token': '<PAD>'}

num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/606 [00:00<?, ?B/s]

In [5]:
class PoemDataset(Dataset):
    def __init__(self, dataset_dict, tokenizer, seq_length=1024):
        self.tokenizer = tokenizer
        self.seq_length = seq_length
        self.dataset = dataset_dict

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):

      data = self.dataset[idx]
      poet, poem = data.get("poet", ""), data.get("poem", "")
      lines = " ".join(poem)
      first_line = lines.partition('\n')[0]

      text = f"<BOS> {poet} : {first_line}<EOS>"

      input_encoding = self.tokenizer(
                                 text,
                                 max_length=self.seq_length,
                                 padding='max_length',
                                 return_attention_mask=True,
                                 return_tensors='pt',
                                 truncation=True
                                )

      input_ids = input_encoding['input_ids'].squeeze(0)
      attention_masks = input_encoding['attention_mask'].squeeze(0)

      target_encoding = self.tokenizer(
                                 poem,
                                 max_length=self.seq_length,
                                 padding='max_length',
                                 return_attention_mask=True,
                                 return_tensors='pt',
                                 truncation=True
                                )
      target_ids = target_encoding['input_ids'].squeeze(0)


      return {
            'input_ids': input_ids,
            'attention_mask': attention_masks,
            'target_ids': target_ids
        }


In [6]:
dataset_dict = load_from_disk('turkish_poems_cleaned')
#dataset_dict = load_dataset("beratcmn/instruction-turkish-poems")
poemDataset = PoemDataset(dataset_dict, tokenizer)

In [7]:
def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

train_size, val_size = train_val_split(0.9, poemDataset)
train_dataset, val_dataset = random_split(poemDataset, [train_size, val_size])

In [8]:
train_dataloader = DataLoader(train_dataset,
                              sampler=RandomSampler(train_dataset),
                              batch_size=2)

val_dataloader = DataLoader(val_dataset,
                            sampler=SequentialSampler(val_dataset),
                            batch_size=2)

In [9]:
model = GPT2LMHeadModel.from_pretrained(model_name, gradient_checkpointing=True, use_cache=False)
model.resize_token_embeddings(len(tokenizer))

model_org = copy.deepcopy(model)

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.45G [00:00<?, ?B/s]

In [None]:
model = copy.deepcopy(model_org)

In [10]:
# Freeze transformer layers except the first and the last one. Do not freeze any layernorms
for n, p in model.named_parameters():
    if 'transformer.h' in n:
        layer_num = int(n.split('.')[2])
        if 'ln_' not in n and layer_num > 0 and layer_num < 23:
            p.requires_grad = False
            #print('Freeze', n)

In [13]:
#model.cuda()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-7)

In [14]:
model.train()
for epoch in range(2):
    print('Epoch', epoch)

    train_losses = []

    progressbar = tqdm(train_dataloader)
    for batch in progressbar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(model.device)
        labels = batch['target_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        progressbar.set_description("Loss: %.3f" % np.mean(train_losses[-10:]))


    val_losses = []

    progressbar = tqdm(val_dataloader)
    for batch in progressbar:

        input_ids = batch['input_ids'].to(model.device)
        labels = batch['target_ids'].to(model.device)
        attention_mask = batch['attention_mask'].to(model.device)

        with torch.no_grad():

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            val_losses.append(loss.item())

            progressbar.set_description("Loss: %.3f" % np.mean(val_losses[-10:]))


Epoch 0


  0%|          | 0/225 [00:00<?, ?it/s]


IndexError: index 2 is out of bounds for dimension 0 with size 2

In [None]:
model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(100003, 2048)
    (wpe): Embedding(2048, 2048)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-23): 24 x GPT2Block(
        (ln_1): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=2048, out_features=100003, bias=False)
)

In [None]:
torch.save(model.state_dict(), 'model.pth')

In [None]:
model = GPT2LMHeadModel.from_pretrained(model_name)

In [None]:
model.resize_token_embeddings(len(tokenizer))
model.load_state_dict(torch.load('/content/drive/MyDrive/model.pth'))

<All keys matched successfully>

In [None]:
input_ids = tokenizer.encode("<BOS>Abdurrahim Karakoç : Huzursuzluktur beni yoran,", return_tensors = "pt").cuda()
input_ids = input_ids.to(model.device)
out = model.generate(
        input_ids,
        num_beams=10,
        num_return_sequences=1,
        max_length=100,
        eos_token_id=5,
        do_sample=True,
        top_k=3,
        top_p=0.85,
        no_repeat_ngram_size=2)

In [None]:
for sample_output in out:
    print("{} ".format(tokenizer.decode(sample_output, skip_special_tokens=True)))

Abdurrahim Karakoç : Huzursuzluktur beni yoran, beni rahatsız eden şeyler. İçimde bir huzur vardır. Benim için bir yalnızlıktır.Bir insanın hissettiği hüzün, bir insan için çok büyük bir şey. Hüzne kapılıp, kendini kaybedince, hırçınlaşır, içine kapanırsın. Bir insanı yor 
