# Transfer learning

### Обучить модель которая будет генерировать заголовки к постам

In [1]:
# Скачивание датасета
# !wget https://www.dropbox.com/s/ykqk49a8avlmnaf/ru_all_split.tar.gz

In [2]:
# Разархивация

# import tarfile

# tar = tarfile.open("ru_all_split.tar.gz")
# tar.extractall(path='./ru_all_split')
# tar.close()

In [3]:
# Список файлов

import os

os.listdir('./ru_all_split')

['.DS_Store', 'ru_all_val.jsonl', 'ru_all_train.jsonl', 'ru_all_test.jsonl']

In [4]:
# Импорт данных в pandas dataframe

import pandas as pd

train_data = pd.read_json('./ru_all_split/ru_all_train.jsonl', lines=True) 
test_data = pd.read_json('./ru_all_split/ru_all_test.jsonl', lines=True) 

In [5]:
# Импорт данных в датасет hugging face

from datasets import Dataset

train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

In [6]:
# Ограничим размер датасета (при исходном умирает ядро)

train_dataset = train_dataset.select(range(100))
test_dataset = test_dataset.select(range(50))

In [7]:
train_dataset['title'][0]

'В разных районах Омска появились свои "Эйфелевы башни"'

In [8]:
# Модель

model_name = "IlyaGusev/rut5_base_sum_gazeta"

In [9]:
def len_tok(text):
    return len(text.split())

In [10]:
# Максимальная длинна текста и заголовка

max_len_text, max_len_tl = max(map(len_tok, train_dataset['text'])), max(map(len_tok, train_dataset['title']))
max_len_text, max_len_tl

(710, 23)

In [11]:
max(train_dataset['title'])

'Японские учёные обнаружили новый вид живых существ'

In [12]:
# Токенизация

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    tokenized_input = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=max_len_text)
    tokenized_label = tokenizer(batch['title'], padding='max_length', truncation=True, max_length=max_len_tl)

    tokenized_input['labels'] = tokenized_label['input_ids']

    return tokenized_input

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=8)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=8)

train_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('numpy', columns=['input_ids', 'attention_mask', 'labels'])



  0%|          | 0/13 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

In [13]:
# Сохранение на диск

train_dataset.save_to_disk('./ru_all_split/train')
test_dataset.save_to_disk('./ru_all_split/test')

In [14]:
# Загрузка модели

from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained(model_name)

In [15]:
# Тренировочные аргументы

output_dir = './ru_all_split/output'

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_accumulation_steps=1, # Number of eval steps to keep in GPU (the higher, the mor vRAM used)
    prediction_loss_only=True, # If I need co compute only loss and not other metrics, setting this to true will use less RAM
    learning_rate=0.000001,
    evaluation_strategy='steps', # Run evaluation every eval_steps
    save_steps=1000, # How often to save a checkpoint
    save_total_limit=1, # Number of maximum checkpoints to save
    remove_unused_columns=True, # Removes useless columns from the dataset
    run_name='run_headlines', # Wandb run name
    logging_steps=500, # How often to log loss to wandb
    eval_steps=500, # How often to run evaluation on the val_set
    logging_first_step=False, # Whether to log also the very first training step to wandb
    load_best_model_at_end=True, # Whether to load the best model found at each evaluation.
    metric_for_best_model="loss", # Use loss to evaluate best model.
    greater_is_better=False # Best model is the one with the lowest loss, not highest.
)

In [16]:
# Обучение

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)



In [17]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: timestamp, text, title, url. If timestamp, text, title, url are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 100
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 39
  batch[k] = torch.tensor([f[k] for f in features])


Step,Training Loss,Validation Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=39, training_loss=12.268614157652243, metrics={'train_runtime': 2885.4741, 'train_samples_per_second': 0.104, 'train_steps_per_second': 0.014, 'total_flos': 282781117440000.0, 'train_loss': 12.268614157652243, 'epoch': 3.0})

In [18]:
# Сохранение модели

trainer.save_model(output_dir + '/model')

Saving model checkpoint to ./ru_all_split/output/model
Configuration saved in ./ru_all_split/output/model/config.json
Model weights saved in ./ru_all_split/output/model/pytorch_model.bin


In [23]:
# Индекс для создания заголовка

index = 11

In [24]:
# Работа модели

import torch

input_text = test_dataset['text'][index]

with torch.no_grad():
    tokenized_text = tokenizer(input_text, truncation=True, padding=True, return_tensors='pt')

    source_ids = tokenized_text['input_ids']
    source_mask = tokenized_text['attention_mask'] 

    generated_ids = model.generate(
        input_ids = source_ids,
        attention_mask = source_mask, 
        max_length=512,
        num_beams=7,
        temperature = 1.3,
        repetition_penalty=1, 
        length_penalty=1, 
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [25]:
# Сравнение исходного и предсказанного заголовка

print('\nИсходный заголовок:\n{}'.format(test_dataset['title'][index]))
print('\nПредсказанный заголовок:\n{}'.format(pred))


Исходный заголовок:
Шнабель из ЕЦБ: решение конституционного суда Германии напрямую на нас не повлияет -- FT

Предсказанный заголовок:
Решение Конституционного суда Германии против программы покупки облигаций Европейского центрального банка не окажет прямого влияния на ЕЦБ и не приведет к тому, что Бундесбанку придется выйти из схемы, считают эксперты.
