# Init

In [1]:
from os import environ as ENV
from uuid import uuid4
from datetime import date
import numpy as np
import pandas as pd
import torch
from google.colab import userdata, drive

In [None]:
# Mount Google Drive inside the Colab VM. The relative 'drive/MyDrive/...' paths
# used later in this notebook resolve under this mount point only when the
# working directory is /content -- NOTE(review): confirm the kernel's cwd.
drive.mount('/content/drive')

In [3]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

# Make the chosen device the default one for PyTorch.
torch.set_default_device(device)

In [None]:
# One shared seed for both random number generators used in this notebook
# (NumPy drives the train/validation split, PyTorch drives sampling).
seed = 42

np.random.seed(seed)
torch.manual_seed(seed)

In [5]:
# Everything lives under the same Drive folder; paths are relative to the cwd.
data_dir = 'drive/MyDrive/ml/data'

data_path  = f'{data_dir}/baguettes_clean.tsv'   # cleaned source table
model_path = 'drive/MyDrive/ml/models/baguette'  # fine-tuned model directory

# Selects which numbered train/validation file pair is used.
iteration_step = 1

train_data_path = f'{data_dir}/baguettes_train-{iteration_step}.txt'
valid_data_path = f'{data_dir}/baguettes_valid-{iteration_step}.txt'

In [6]:
# Export the configuration as environment variables so that the shell cells
# below (the run_clm.py command reads TRAIN_PATH, VALID_PATH and MODEL_PATH)
# can pick it up.
ENV['HF_HOME'] = '/root/hf_home'
ENV.update({
  'HF_TOKEN':       userdata.get('HF_TOKEN'),  # read from the Colab secrets store
  'MODEL_PATH':     model_path,
  'TRAIN_PATH':     train_data_path,
  'VALID_PATH':     valid_data_path,
  'WANDB_DISABLED': 'true',
})

In [None]:
%%bash
# Install transformers from a shallow clone of its main branch (the run_clm.py
# example script fetched later in this notebook is taken from main as well).
# Abort on the first failing command instead of carrying on silently.
set -euo pipefail

# Re-running the cell must not fail on an already existing checkout.
if [ ! -d transformers ]; then
  git clone --depth 1 https://github.com/huggingface/transformers
fi

cd transformers
pip install .

In [None]:
# %pip installs into the environment of the running kernel; both packages are
# resolved in a single pip invocation.
%pip install datasets evaluate

In [None]:
# Fetch the causal-LM fine-tuning example script. -O overwrites run_clm.py on
# re-runs; without it wget would save the new copy as run_clm.py.1 and the
# stale script would keep being executed.
!wget -O run_clm.py https://raw.githubusercontent.com/huggingface/transformers/main/examples/pytorch/language-modeling/run_clm.py

# Data

In [None]:
# The cleaned dataset is a tab-separated table; show the first rows as a sanity check.
column_separator = '\t'
data = pd.read_csv(filepath_or_buffer=data_path, sep=column_separator)
data.head(n=5)

In [None]:
files_count = 1    # number of numbered files each split is spread over
valid_size  = 0.4  # fraction of rows held out for validation

row_count = len(data)

# Draw the validation rows without replacement; every remaining row is training data.
valid_ind = np.random.choice(row_count, int(row_count * valid_size), replace=False)
# A set makes each membership test O(1); scanning the NumPy array for every row
# would make building the complement quadratic in the number of rows.
valid_set = set(valid_ind.tolist())
train_ind = [row for row in range(row_count) if row not in valid_set]

valid_per_file = len(valid_ind) // files_count
train_per_file = len(train_ind) // files_count


def write_split(indices, per_file, split_name):
  """Write the 'baguette' texts selected by `indices` to numbered text files.

  Creates baguettes_<split_name>-<n>.txt for n = 1..files_count in the current
  working directory, one text per line. Every file receives `per_file`
  consecutive entries of `indices`; the last file additionally receives the
  remainder, so no row is dropped when the split size is not divisible by
  files_count.

  NOTE(review): these files land in the current working directory, while
  train_data_path / valid_data_path in the Init section point to
  drive/MyDrive/ml/data/ -- confirm they are copied there before training.
  """
  texts = data['baguette']

  for file_no in range(1, files_count + 1):
    begin = per_file * (file_no - 1)
    end   = len(indices) if file_no == files_count else per_file * file_no

    # Explicit UTF-8 so the output does not depend on the platform default encoding.
    with open(f'baguettes_{split_name}-{file_no}.txt', 'w', encoding='utf-8') as file:
      file.write('\n'.join(texts.iloc[indices[begin:end]]))


write_split(valid_ind, valid_per_file, 'valid')
write_split(train_ind, train_per_file, 'train')

# Train

In [None]:
# Fine-tune the pretrained checkpoint with the run_clm.py example script.
# The three ${...} values are the environment variables exported in the Init
# section. Flags are grouped as: model, data, output, what to run, batching,
# schedule/optimizer, precision.
!python run_clm.py \
  --model_name_or_path sberbank-ai/rugpt3small_based_on_gpt2 \
  --model_type gpt2 \
  --dataset_config_name plain_text \
  --train_file ${TRAIN_PATH} \
  --validation_file ${VALID_PATH} \
  --block_size 2048 \
  --output_dir ${MODEL_PATH} \
  --overwrite_output_dir \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --gradient_accumulation_steps 4 \
  --gradient_checkpointing True \
  --num_train_epochs 10 \
  --optim adafactor \
  --fp16 True

# Run

In [10]:
import time
import re
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [11]:
# Directory the fine-tuned tokenizer and model are loaded from below
# (same value as model_path in the Init section).
model_path = 'drive/MyDrive/ml/models/baguette'

In [12]:
# Literal spellings of the special tokens.
TOKEN_BOS, TOKEN_EOS, TOKEN_PAD = '<BOS>', '<EOS>', '<PAD>'

# Mapping handed to tokenizer.add_special_tokens further down.
special_tokens_dict = dict(
  bos_token=TOKEN_BOS,
  eos_token=TOKEN_EOS,
  pad_token=TOKEN_PAD,
)

In [13]:
# Load the tokenizer stored with the fine-tuned model and register the special tokens.
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
tokenizer.add_special_tokens(special_tokens_dict)

# Point every <name>_id attribute (bos/eos/pad) at the id of its token.
for token_name, token in special_tokens_dict.items():
  setattr(tokenizer, f'{token_name}_id', tokenizer.convert_tokens_to_ids(token))

# Load the weights, move them to the selected device and resize the embedding
# matrix to the tokenizer's vocabulary size.
model = GPT2LMHeadModel.from_pretrained(model_path)
model = model.to(device)
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [17]:
def _clean_generated_text(text, tokenizer):
  """Strip special-token debris from decoded text and normalise the '@' separators."""
  # BOS/EOS carry no content; every PAD token becomes an '@' separator.
  text = (
    text
    .replace(tokenizer.bos_token, '')
    .replace(tokenizer.eos_token, '')
    .replace(tokenizer.pad_token, '\n@\n')
  )

  text = re.sub(r'(@\s*@\s*)', '@', text)  # merge two separators divided only by whitespace
  text = re.sub('\n+', '', text)           # drop every line break
  text = re.sub(r'<+(\w+)?>*', '', text)   # remove leftover '<WORD>'-like fragments
  # NOTE(review): besides the stray symbols this also deletes every Latin
  # B/E/O/S/P/A/D character from the text -- presumably meant to catch pieces
  # of broken special tokens; confirm this is acceptable for the expected output.
  text = re.sub(r'[BEOSPAD<>|&%]+', '', text)

  # Drop whatever follows the last separator; text without any separator is kept whole.
  last_separator = text.rfind('@')
  if last_separator != -1:
    text = text[:last_separator]

  return text.replace(' @ ', '@').replace('@', '\n@\n')


def gen_fragment(context, tokenizer, model, device, temperature=1.0, min_size=128, max_size=256,
                 top_p=0.6, top_k=100, no_repeat_ngram_size=6, max_context_tokens=1024):
  """Sample a continuation of `context` from `model` and return the cleaned text.

  Args:
    context: Prompt text; it is upper-cased before tokenisation.
    tokenizer: Tokenizer providing encode/decode, the bos/eos/pad token strings
      and the pad/eos token ids.
    model: Model exposing `generate`.
    device: Device the prompt tensor is moved to.
    temperature: Sampling temperature passed to `generate`.
    min_size: Minimum number of tokens generated on top of the prompt.
    max_size: Maximum number of tokens generated on top of the prompt.
    top_p: Nucleus-sampling threshold passed to `generate`.
    top_k: Top-k sampling cut-off passed to `generate`.
    no_repeat_ngram_size: Passed to `generate` unchanged.
    max_context_tokens: Only this many trailing prompt tokens are kept.

  Returns:
    The decoded text (prompt included) with special tokens removed, the part
    after the last '@' separator dropped and the remaining separators placed
    on their own lines.
  """
  input_ids = tokenizer.encode(context.upper(), add_special_tokens=False, return_tensors='pt').to(device)
  # Keep only the most recent tokens of an over-long prompt.
  input_ids = input_ids[:, -max_context_tokens:]
  input_size = input_ids.size(1)

  output_sequences = model.generate(
    input_ids=input_ids,
    # The length limits passed here count the prompt tokens as well.
    max_length=input_size + max_size,
    min_length=input_size + min_size,
    top_p=top_p,
    top_k=top_k,
    do_sample=True,
    num_return_sequences=1,
    temperature=temperature,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
    no_repeat_ngram_size=no_repeat_ngram_size
  )

  # A single sequence is requested, so row 0 is the only result.
  out = tokenizer.decode(output_sequences[0], clean_up_tokenization_spaces=True)

  return _clean_generated_text(out, tokenizer)

In [None]:
# Try the model on a sample prompt; a temperature above 1 makes sampling more random.
prompt = 'Напиши багет'
gen_fragment(prompt, tokenizer=tokenizer, model=model, device=device, temperature=1.6)