# GPT2 Finetuning on Wikibooks Dataset

## 1. Загружаем датасет

In [2]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('/kaggle/input/wikibooks-dataset/wikibooks.sqlite')

df = pd.read_sql_query("SELECT * FROM ru LIMIT 3300", conn)

In [3]:
df.head()

Unnamed: 0,title,url,abstract,body_text,body_html
0,Викиучебник: Техника и технология средств масс...,https://ru.wikibooks.org/wiki/%D0%A2%D0%B5%D1%...,* [станция|Рабочая станция];,Рабочая станция;\nСервер;\nПерсональный компью...,"<div class=""mw-parser-output""><ul><li><a href=..."
1,Викиучебник: АОН/Пилотское свидетельство,https://ru.wikibooks.org/wiki/%D0%90%D0%9E%D0%...,Гражданское пилотское свидетельство - разрешен...,В Википедии имеется статья по теме «Свидетельс...,"<div class=""mw-parser-output""><div class=""info..."
2,Викиучебник: Книга программиста/Структуры данн...,https://ru.wikibooks.org/wiki/%D0%9A%D0%BD%D0%...,К оглавлению,"К оглавлению\nВсе программы, код которых вылож...","<div class=""mw-parser-output""><p><a href=""/wik..."
3,Викиучебник: Тесты НМО/Гигиенические основы и ...,https://ru.wikibooks.org/wiki/%D0%A2%D0%B5%D1%...,Гигиенические основы и медицинский контроль за...,Гигиенические основы и медицинский контроль за...,"<div class=""mw-parser-output""><p><b>Гигиеничес..."
4,Викиучебник: Коктейли/Пенная фея,https://ru.wikibooks.org/wiki/%D0%9A%D0%BE%D0%...,Пенная фея,Пенная фея\n\nДжин Old Tom — 60 г\nАбсент — 15...,"<div class=""mw-parser-output""><p><b>Пенная фея..."


In [4]:
df = df[df['body_text'] != '']

## 2. Train test split

In [5]:
from sklearn.model_selection import train_test_split

train_texts, test_texts = train_test_split(df['body_text'], test_size=0.2, random_state=0)

In [6]:
train_texts.shape, test_texts.shape

((2635,), (659,))

### Сохраняем тексты в файлы

In [7]:
with open("train.txt", "w") as file:
    file.write("\n".join(train_texts.tolist()))

with open("valid.txt", "w") as file:
    file.write("\n".join(test_texts.tolist()))

## 3. Запускаем дообучение

In [1]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForLanguageModeling, TextDataset
from transformers import AdamW, get_cosine_schedule_with_warmup

2024-05-29 18:36:27.095505: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-29 18:36:27.095605: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-29 18:36:27.231307: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [9]:
device = "cuda"

model_name_or_path = 'ai-forever/rugpt3small_based_on_gpt2'

# tokenizer based on GPT2 for text preprocessing
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# loading a pre-trained model based on GPT2
model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/551M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [10]:
train_dataset = TextDataset(tokenizer=tokenizer,file_path='/kaggle/working/train.txt', 
                            block_size=64)
  
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, 
                                                mlm=False)



In [11]:
training_args = TrainingArguments(
    output_dir = "./finetuned_model",
    overwrite_output_dir = True,
    num_train_epochs = 10,
    gradient_accumulation_steps = 2,
    fp16 = True,
    per_device_train_batch_size = 64,
    learning_rate = 0.0002,
    optim = 'adafactor',
    lr_scheduler_type = 'cosine',
    save_steps=1000,
    seed=42
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [13]:
!rm -rf wandb & rm -rf finetuned_model

In [14]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss
500,2.9582
1000,2.5202
1500,2.2749
2000,2.0875
2500,1.9182
3000,1.7681
3500,1.6326
4000,1.5065
4500,1.4311
5000,1.3807


TrainOutput(global_step=5860, training_loss=1.8585257103825592, metrics={'train_runtime': 6702.7479, 'train_samples_per_second': 111.953, 'train_steps_per_second': 0.874, 'total_flos': 2.4489040453632e+16, 'train_loss': 1.8585257103825592, 'epoch': 9.99})

## 4. Генерируем примеры текста

In [13]:
import torch

def generate(prompt, do_sample=True, num_beams=2, temperature=1.5, top_p=0.9, max_length=75):
    
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

    model.eval()
    with torch.no_grad():
        out = model.generate(input_ids, 
                            do_sample=do_sample,
                            num_beams=num_beams,
                            temperature=temperature,
                            top_p=top_p,
                            max_length=max_length,
                            )

    print(list(map(tokenizer.decode, out))[0])

In [5]:
from transformers import AutoConfig

In [8]:
config = AutoConfig.from_pretrained("./finetuned_model/checkpoint-5000")

In [10]:
device = "cuda"

model_name_or_path = 'ai-forever/rugpt3small_based_on_gpt2'

# tokenizer based on GPT2 for text preprocessing
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)

# loading a pre-trained model based on GPT2
model = GPT2LMHeadModel(config=config).to(device)

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

In [24]:
generate("женщина", max_length=30)

женщина тык✂✂ потерять предательство назначения получимварда 123ike геопол геополгрегрегре отдельными 123 проведено путями полиэти контур 123 123 лезетшней Ставрополь Ставропольota
