In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import pandas as pd
from datasets import load_dataset, load_metric
from sklearn.utils import shuffle
import torch 
import mlflow
import numpy as np
from transformers import T5ForConditionalGeneration, T5Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
metric = load_metric('rouge')

  metric = load_metric('rouge')


In [3]:
model_name = 'cointegrated/rut5-base-multitask'

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [4]:
train = pd.read_csv('data\\train_small.csv', index_col=0)
train = shuffle(train[train['size']=='small'])
train_pairs = list(zip(train.source, train.target))

val = pd.read_csv('data\\eval_small.csv', index_col=0)
val = val[val['size']=='small']
eval_pairs = list(zip(val.source, val.target))

print(train.shape[0], val.shape[0])

12000 1200


In [5]:
print(train_pairs[0])
print(eval_pairs[0])

('simplify | Применение антипсихотиков приводит к сердечно-сосудистым и лёгочным нарушениям, что по крайней мере отчасти объясняет повышенный риск смертности.', 'Повышенный риск смертности при приеме антипсихотиков можно объяснить возникающими нарушениями работы легких, сердца и сосудов.')
('simplify | Лиза П. Джексон — вице-президент по охране окружающей среды, образовательной политике и социальным инициативам.', 'Лиза П. Джексон - активный человек.')


In [6]:
train_pairs = train_pairs[:200]
eval_pairs = eval_pairs[:50]

In [7]:
class SimplificationDataset(torch.utils.data.Dataset):
    def __init__(self, pairs, tokenizer, max_length):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, index):
        source_text, target_text = self.pairs[index]
        source_encoding = self.tokenizer(source_text, truncation=True, max_length=self.max_length, padding="max_length")
        target_encoding = self.tokenizer(target_text, truncation=True, max_length=self.max_length, padding="max_length")

        return {
            "input_ids": source_encoding["input_ids"],
            "attention_mask": source_encoding["attention_mask"],
            "labels": target_encoding["input_ids"],
        }

In [8]:
max_length = 512
train_dataset = SimplificationDataset(train_pairs, tokenizer, max_length=max_length)
eval_dataset = SimplificationDataset(eval_pairs, tokenizer, max_length=max_length)

In [10]:
def compute_rouge(pred):
    predictions, labels = pred
    #decode the predictions
    decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    #decode labels
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    #compute results
    res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)
    #get %
    res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

    pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    res['gen_len'] = np.mean(pred_lens)

    return {k: round(v, 4) for k, v in res.items()}

In [11]:
experiment_name = "rut5"
mlflow.set_experiment(experiment_name)
current_experiment = dict(mlflow.get_experiment_by_name(experiment_name))
exp_id = current_experiment['experiment_id']

In [12]:
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [13]:
save_dir = 'rut5_test'

args = Seq2SeqTrainingArguments(
    save_dir, #save directory
    overwrite_output_dir = 'True',
    learning_rate=1e-5,
    per_device_train_batch_size= 6,
    per_device_eval_batch_size= 6,
    num_train_epochs=10,
    predict_with_generate=True,
    evaluation_strategy='steps',
    logging_steps=100,
    save_steps=10000,
    fp16=True
    )

trainer = Seq2SeqTrainer(
    model, 
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_rouge
)

Using cuda_amp half precision backend


In [14]:
#, run_name='first_test'
with mlflow.start_run(experiment_id=exp_id):
    # Логируем параметры
    mlflow.log_param('model_name', model_name)
    #mlflow.log_param('max_length', max_length)
    mlflow.log_param('train_data_size', len(train_dataset))
    mlflow.log_param('batch_size', args.per_device_train_batch_size)
    
    trainer.train()
    
    # Логируем метрики
    metrics = trainer.evaluate()
    for key, value in metrics.items():
        mlflow.log_metric(key, value)

    # Сохраняем модель и логируем путь к ней
    output_dir = f'./{save_dir}'
    trainer.save_model(output_dir)
    mlflow.log_artifacts(output_dir, artifact_path='models')
    
mlflow.end_run()

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

***** Running training *****
  Num examples = 200
  Num Epochs = 10
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 340
  Number of trainable parameters = 244309248


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
100,0.0,,2.0,2.0,2.0,2.0,18.9
200,0.0,,2.0,2.0,2.0,2.0,18.9
300,0.0,,2.0,2.0,2.0,2.0,18.9


***** Running Evaluation *****
  Num examples = 50
  Batch size = 6
***** Running Evaluation *****
  Num examples = 50
  Batch size = 6
***** Running Evaluation *****
  Num examples = 50
  Batch size = 6


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 50
  Batch size = 6


Saving model checkpoint to ./rut5_test
Configuration saved in ./rut5_test\config.json
Model weights saved in ./rut5_test\pytorch_model.bin
tokenizer config file saved in ./rut5_test\tokenizer_config.json
Special tokens file saved in ./rut5_test\special_tokens_map.json
