In [1]:
!pip install transformers pynvml datasets accelerate torch==2.0.1 --quiet

In [2]:
import json
from typing import Optional
import logging
from dataclasses import dataclass, field
import tqdm
import torch
import torch.optim
from torch.utils.data import Dataset
import transformers
from transformers import TrainingArguments, Trainer
from transformers import HfArgumentParser
from pynvml import *



In [1]:
import csv


# Load the raw rows from the CSV export into `templates`.
# Columns 1, 2 and 3 of each row become "question", "context" and "answer";
# column 0 is not used.
templates = []
# The encoding is stated explicitly: the file holds Cyrillic text and the
# platform default encoding is not guaranteed to be UTF-8.
with open('/kaggle/input/rusql-to-text/dataset.csv', 'r', newline='', encoding='utf-8') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',')
    # Skip the first row (presumably the header); the default avoids a
    # StopIteration when the file is empty.
    next(spamreader, None)
    for row in spamreader:
        if not row:
            # csv.reader yields an empty list for a blank line.
            continue
        templates.append({
            "question": row[1].strip(),
            "context": row[2].strip(),
            "answer": row[3].strip(),
        })

In [2]:
# Build the training records. For each template, the context is reduced to the
# first CREATE statement whose table name occurs (as a substring) in the answer;
# if none matches, the context is left empty.
data = []
for template in templates:
    matched_ddl = ""
    ddl_chunks = template["context"].strip().split("CREATE")
    # Element 0 is whatever precedes the first "CREATE", so it is skipped.
    for chunk in ddl_chunks[1:]:
        header = chunk.split('(\n')[0]
        # Drop the first 7 characters (the " TABLE " that follows "CREATE").
        name = header[7:].strip()
        if '.' in name:
            # Schema-qualified name: keep the part after the first dot.
            name = name.split('.')[1]
        if name not in template['answer']:
            continue
        matched_ddl = "CREATE" + chunk
        break
    record = {
        "question": template["question"],
        "context": matched_ddl,
        "answer": template["answer"],
    }
    data.append(record)

In [3]:
# Inspect the first record to sanity-check the extracted table context.
data[0]

{'question': 'Вывести всю информацию об игровых боссах.',
 'context': 'CREATE TABLE terraria.boss (\n  id_boss INT(255) NOT NULL PRIMARY KEY,\n  damage INT(255) NOT NULL,\n  level INT(255) NOT NULL,\n  magic_trick VARCHAR(255) NOT NULL,\n  name VARCHAR(255) NOT NULL,\n  speed INT(255) NOT NULL,\n);\n\n',
 'answer': 'SELECT * FROM boss;'}

In [6]:
# Show every record for which no table definition was matched.
for record in (entry for entry in data if entry['context'] == ""):
    print(record)

In [7]:
from datasets import Dataset, Features, Value


# with open(r"/kaggle/input/text-to-sql/know_sql_val3.json") as f:
#     templates = json.load(f)
    

# Wrap the records in a Hugging Face dataset.
# `Dataset` here is `datasets.Dataset`, imported at the top of this cell; that
# import rebinds the `Dataset` name imported earlier from torch.utils.data.
# The result has a single column named 'train' holding one record per row, so
# `dataset['train']` is what later code iterates over.
dictionary = {"train": data}

record_schema = {column: Value('string') for column in ("question", "context", "answer")}
features = Features({'train': record_schema})

dataset = Dataset.from_dict(dictionary, features=features)
dataset

Dataset({
    features: ['train'],
    num_rows: 757
})

In [8]:
def load_samples(dataset_path, tokenizer, max_input_len=128, max_output_len=768):
    """Tokenize prompt/answer pairs and keep those within the length limits.

    Args:
        dataset_path: Despite its name, the dataset object itself. It must
            support ``dataset_path['train']`` and yield mappings with the keys
            'context', 'question' and 'answer'.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            truncation=..., max_length=...)`` that returns a list of token ids.
        max_input_len: A sample is kept only if its prompt has strictly fewer
            tokens than this.
        max_output_len: A sample is kept only if its reply has strictly fewer
            tokens than this.

    Returns:
        A list of ``{'input_tokens': [...], 'output_tokens': [...]}`` dicts.
    """
    samples = []
    # Read from the argument rather than from a notebook-level global, so the
    # function works on whatever dataset the caller passes in.
    for sample in tqdm.tqdm(dataset_path['train']):
        try:
            seed = '<SC6>' + "По таблице:\n" + sample['context'] + "\nВыполни задание:" + sample['question'] + '<extra_id_0>'
            reply = '<extra_id_0>' + sample['answer']
            # Prompts are truncated to 1024 tokens before the length check below.
            input_tokens = tokenizer.encode(seed, add_special_tokens=False, truncation=True, max_length=1024)
            output_tokens = tokenizer.encode(reply, add_special_tokens=False)
        except Exception as ex:
            # Best effort: report the failing sample and continue with the rest.
            print(ex)
            continue
        if len(input_tokens) < max_input_len and len(output_tokens) < max_output_len:
            samples.append({'input_tokens': input_tokens, 'output_tokens': output_tokens})

    return samples

In [9]:
class SFTDataset(torch.utils.data.Dataset):
    """Map-style dataset of pre-tokenized (input, output) pairs.

    Every item is padded to the longest input / output found in ``samples``.

    The base class is spelled out in full because the bare name ``Dataset`` is
    rebound to ``datasets.Dataset`` by a later import in this notebook.

    Args:
        samples: Iterable of dicts with 'input_tokens' and 'output_tokens'
            (lists of token ids).
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``;
            used to look up the ids of '<s>', '</s>' and '<pad>'.
        dataset: Unused. Accepted so that existing calls of the form
            ``SFTDataset(samples, tokenizer, dataset)`` keep working.
    """

    def __init__(self, samples, tokenizer, dataset=None):
        self.tokenizer = tokenizer
        self.max_input_len = 0
        self.max_output_len = 0
        self.samples = []

        self.bos_token_id = tokenizer.encode('<s>', add_special_tokens=False)[0]
        self.eos_token_id = tokenizer.encode('</s>', add_special_tokens=False)[0]
        self.pad_token_id = tokenizer.encode('<pad>', add_special_tokens=False)[0]

        for sample in samples:
            input_ids = sample['input_tokens']
            # Every target sequence is terminated with the end-of-sequence id.
            output_ids = sample['output_tokens'] + [self.eos_token_id]
            self.samples.append((input_ids, output_ids))
            self.max_input_len = max(self.max_input_len, len(input_ids))
            self.max_output_len = max(self.max_output_len, len(output_ids))

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.samples)

    def __getitem__(self, index: int):
        """Return one padded sample as a dict of LongTensors.

        'input_ids' is right-padded with the pad id and 'attention_mask' is 0
        on the padding; 'labels' is right-padded with -100.
        """
        input_ids, output_ids = self.samples[index]

        input_npad = self.max_input_len - len(input_ids)
        attention_mask = [1] * len(input_ids) + [0] * input_npad
        input_ids = input_ids + input_npad * [self.pad_token_id]

        output_npad = self.max_output_len - len(output_ids)
        labels = output_ids + output_npad * [-100]

        return {
            'input_ids': torch.LongTensor(input_ids),
            'attention_mask': torch.LongTensor(attention_mask),
            'labels': torch.LongTensor(labels),
        }

In [18]:
# Identifier of the checkpoint that is loaded and fine-tuned below.
pretrained_model_name = "ai-forever/FRED-T5-large"

In [None]:
# Load the tokenizer and the T5 conditional-generation model for the chosen checkpoint.
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrained_model_name)
model = transformers.T5ForConditionalGeneration.from_pretrained(pretrained_model_name)


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/574 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

You are using a model of type gpt2 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at ai-forever/rugpt3large_based_on_gpt2 and are newly initialized: ['transformer.encoder.block.0.layer.0.SelfAttention.o.weight', 'transformer.decoder.block.3.layer.1.EncDecAttention.v.weight', 'transformer.encoder.block.3.layer.0.layer_norm.weight', 'transformer.encoder.block.1.layer.1.layer_norm.weight', 'transformer.decoder.block.3.layer.1.EncDecAttention.k.weight', 'transformer.decoder.block.4.layer.0.SelfAttention.k.weight', 'transformer.decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'transformer.encoder.block.5.layer.1.layer_norm.weight', 'transformer.decoder.block.2.layer.1.EncDecAttention.k.weight', 'transformer.decoder.block.3.layer.0.SelfAttention.o.weight', 'transformer.decoder.block.5.layer.0.SelfAttention.k.weight', 'transformer.lm_head.weight', 'transformer.encoder.block.4.layer.0.SelfAttention.o.weight', 'transformer.encoder.block.3.layer.0.SelfAttention.o.we

In [12]:
device = "cuda"
model.to(device)

# Register the markers that SFTDataset looks up ('<s>', '</s>', '<pad>') as
# the tokenizer's special tokens.
num_added_tokens = tokenizer.add_special_tokens({'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>'})

# If registering them grew the vocabulary past the embedding matrix, the new
# ids would index out of range during training. Grow the embeddings in that
# case only, so an embedding matrix that is already large enough is left alone.
if len(tokenizer) > model.get_input_embeddings().num_embeddings:
    model.resize_token_embeddings(len(tokenizer))

# Keep the count returned by add_special_tokens as the cell's displayed value.
num_added_tokens

3

In [13]:
# Tokenize every record; samples outside load_samples' length limits are dropped.
train_samples = load_samples(dataset, tokenizer)

100%|██████████| 757/757 [00:00<00:00, 1536.67it/s]


In [14]:
# Number of samples that passed the length filter.
len(train_samples)

56

In [15]:
# Show which accelerate version is installed in this kernel.
import accelerate
print(accelerate.__version__)

0.25.0


In [16]:
# Fine-tuning configuration. The learning rate is defined once and shared by
# the training arguments and the explicitly constructed optimizer.
LEARNING_RATE = 5e-5

training_args = TrainingArguments(
    output_dir="./finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=35,  # 35 single-sample steps are accumulated per optimizer update
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="constant",
    fp16=True,
    bf16=False,
    save_steps=1500,
    save_total_limit=1,
)

train_dataset = SFTDataset(train_samples, tokenizer, dataset)

# An explicit AdamW instance is supplied; the scheduler slot is left as None.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=None,
    optimizers=(optimizer, None),
)

In [17]:
%%time
# Run fine-tuning; the returned object's .metrics are logged and saved in a later cell.
train_result = trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 15.89 GiB total capacity; 15.06 GiB already allocated; 24.12 MiB free; 15.54 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [22]:
# Log and save the metrics of the training run under the "train3" label.
metrics = train_result.metrics
trainer.log_metrics("train3", metrics)
trainer.save_metrics("train3", metrics)

***** train3 metrics *****
  epoch                    =        5.0
  total_flos               =  4226552GF
  train_loss               =        0.0
  train_runtime            = 0:16:50.76
  train_samples_per_second =      1.904
  train_steps_per_second   =      0.054


In [23]:
# Authenticate with the Hugging Face Hub (interactive widget) before uploading.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
# Upload the tokenizer and the model to the same Hub repository.
hub_repo_id = "yuraz28/FRED-T5-know_sql-test5"
tokenizer.push_to_hub(hub_repo_id)
model.push_to_hub(hub_repo_id)

model.safetensors:   0%|          | 0.00/3.28G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/yuraz28/FRED-T5-know_sql-test5/commit/29cdbcd4b777f36f00ecf655c3d8dea6f21bc26d', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='29cdbcd4b777f36f00ecf655c3d8dea6f21bc26d', pr_url=None, pr_revision=None, pr_num=None)