# Общая информация
__Цель:__ сделать fine-tuning mGPT by Sber

__Задачи:__
1) Выбрать вопросы с определенным тегом, например python

2) Понять формат входных и выходных данных: например, перед вопросом, возможно, надо ставить [QUESTION]

3) Сделать torch Dataset

4) Определить, как делать evaluation

5) Выбрать способ трекинга экспериментов

6) Проанализировать результаты

# Импорт библиотек

In [10]:
import pickle

import numpy as np
import pandas as pd
import torch
import wandb
from datasets import load_metric
from torch.utils.data import Dataset, random_split
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          IntervalStrategy, Trainer, TrainingArguments)

In [11]:
# Fix the torch random seed so that runs are repeatable
torch.manual_seed(42)

<torch._C.Generator at 0x18f046ac9f0>

# Чтение данных

In [12]:
# Do not truncate long text columns when a frame is displayed.
pd.set_option("display.max_colwidth", None)
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load pickles that come from a trusted source.
with open("../../data/filtered_df.p", "rb") as f:
    df = pickle.load(f)

Отсортируем датасет по времени и разобьём его на train и test

In [13]:
# Sort ascending by question date so that the positional split below
# keeps the most recent questions for the test part.
df = df.sort_values("Q_Date")

In [14]:
# Keep only the rows whose Tag field contains "python". The membership test is
# applied to the Tag column alone (no per-row Series is built), and the result
# is a proper boolean mask even when the frame is empty.
df = df.loc[df["Tag"].apply(lambda tags: "python" in tags).astype(bool)]

In [15]:
# Chronological 75/25 split by position (df is sorted by Q_Date above).
# Plain positional slicing is used instead of np.split, whose use on a
# DataFrame is deprecated in recent pandas releases.
split_idx = int(0.75 * len(df))
train_df, test_df = df.iloc[:split_idx], df.iloc[split_idx:]

# Загрузим модель

In [16]:
# Hugging Face Hub id of the checkpoint to fine-tune.
# NOTE(review): the notebook's stated goal mentions mGPT, while this id points
# to GPT-Neo 1.3B — confirm which model is intended.
model_name = "EleutherAI/gpt-neo-1.3B"

In [20]:
# Load the checkpoint's tokenizer and register explicit BOS / EOS / PAD tokens.
# The same literal markers are written into the training text by Q_A_Dataset,
# and the dedicated pad token is what padding="max_length" fills with.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    bos_token="<|startoftext|>",
    eos_token="<|endoftext|>",
    pad_token="<|pad|>",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
# Load the pretrained causal language model for the checkpoint chosen above
model = AutoModelForCausalLM.from_pretrained(model_name)

In [22]:
# The tokenizer was created with additional special tokens, so the embedding
# matrix is resized to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 2048)

Отберем только те строки, в тегах которых есть слово _python_

In [23]:
# Longest tokenized question body in the filtered frame; this value is reused
# below as the fixed padding/truncation length of every encoded example.
# NOTE(review): only Q_Body is measured here, while the dataset encodes the
# question and the answer together — confirm this is the intended budget.
questions = df.Q_Body
max_length = max(len(tokenizer.encode(body)) for body in questions)
print(f"Max length: {max_length}")

Max length: 1055


In [24]:
class Q_A_Dataset(Dataset):
    """Question/answer dataset for causal-LM fine-tuning.

    Every row of ``df`` whose ``Tag`` contains ``tag`` is rendered as one text
    (start marker, "Question: <Q_Body>", newline, "Answer: <A_Body>", end
    marker) and tokenized to exactly ``max_length`` ids (truncated or padded).

    Labels are a copy of the input ids in which the prompt part (everything up
    to and including "Answer:") and all padding positions are set to -100, so
    only answer tokens contribute to the loss.

    Args:
        df: frame with ``Tag``, ``Q_Body`` and ``A_Body`` columns.
        tokenizer: callable returning a mapping with ``"input_ids"`` and
            ``"attention_mask"``; must accept ``truncation``, ``max_length``
            and ``padding`` keyword arguments.
        max_length: fixed length of every encoded example.
        tag: value that must be contained in a row's ``Tag`` for it to be kept.
    """

    # Label value used for positions that must not contribute to the loss.
    IGNORE_INDEX = -100

    def __init__(self, df, tokenizer, max_length, tag):
        # Membership test on the Tag column only; the explicit bool cast keeps
        # the mask valid when the frame has no rows.
        keep = df["Tag"].apply(lambda tags: f"{tag}" in tags).astype(bool)
        df = df.loc[keep]

        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        self.answers = []
        self.questions = []

        for _, row in df.iterrows():
            prompt = f"<|startoftext|>Question: {row.Q_Body}\nAnswer:"
            prep_text = f"{prompt} {row.A_Body}<|endoftext|>"

            # Number of leading tokens that belong to the prompt.
            question_len = len(tokenizer(prompt)["input_ids"])

            encoding_dict = tokenizer(
                prep_text, truncation=True, max_length=max_length, padding="max_length"
            )
            input_ids = torch.tensor(encoding_dict["input_ids"])
            attn_mask = torch.tensor(encoding_dict["attention_mask"])

            labels = input_ids.clone()
            labels[:question_len] = self.IGNORE_INDEX
            # Padding is not a prediction target either; without this the loss
            # would also be computed on every pad position.
            labels[attn_mask == 0] = self.IGNORE_INDEX

            self.input_ids.append(input_ids)
            self.attn_masks.append(attn_mask)
            self.labels.append(labels)

            self.answers.append(row.A_Body)
            self.questions.append(row.Q_Body)

    def __len__(self):
        """Number of encoded examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, labels, answer, question)``.

        With an integer ``idx`` each element is a single item; with a slice
        each element is the corresponding list of items.
        """
        return (
            self.input_ids[idx],
            self.attn_masks[idx],
            self.labels[idx],
            self.answers[idx],
            self.questions[idx],
        )

In [25]:
# Encode both splits; every example is padded/truncated to the same max_length
train_dataset = Q_A_Dataset(train_df, tokenizer, max_length=max_length, tag="python")
test_dataset = Q_A_Dataset(test_df, tokenizer, max_length=max_length, tag="python")

In [26]:
# Drop the notebook-level name of the full frame; the visible cells below work
# with the encoded datasets only.
del df

# Определим evaluation metrics

Определить метрики качества \ __BLEU, ROUGE__

Трекинг модели в wandb

API HF Trainer

In [28]:
# Metric objects used by evaluate() further down.
# NOTE(review): `datasets.load_metric` is reported as deprecated in newer
# `datasets` releases in favour of the separate `evaluate` package — confirm
# against the installed version.
bleu = load_metric("bleu")
rouge = load_metric("rouge")

# Авторизуемся в wandb

In [110]:
# Authenticate with Weights & Biases (prompts for an API key when none is stored)
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\porsh/.netrc


True

In [111]:
# Open a W&B run under the given project and entity
wandb.init(project="QA specific domain", entity="myashka")

[34m[1mwandb[0m: Currently logged in as: [33mmyashka[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [112]:
# W&B environment switch for model logging (presumably read by the
# transformers W&B integration — confirm against its documentation)
%env WANDB_LOG_MODEL=true

env: WANDB_LOG_MODEL=true


In [113]:
# Further W&B environment switches (presumably: what to watch on the model and
# whether W&B console output is silenced — confirm against the W&B docs)
%env WANDB_WATCH=all
%env WANDB_SILENT=true

env: WANDB_WATCH=all
env: WANDB_SILENT=true


In [114]:
# Close the run opened by wandb.init above.
# NOTE(review): this runs before the Trainer is created, and the Trainer is
# configured with report_to="wandb" — confirm that the training run still ends
# up in the intended project.
wandb.finish()

# Определим Trainer и запустим обучение

In [29]:
def data_collator(data):
    """Collate dataset items into the keyword batch passed to the model.

    Args:
        data: either a list of per-sample tuples
            ``(input_ids, attention_mask, labels, answer, question)`` — the
            form a DataLoader hands to its collate function — or the tuple of
            per-field lists obtained by slicing the dataset
            (e.g. ``train_dataset[0:2]``).

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``, each
        stacked along a new leading batch dimension. The raw answer/question
        strings are dropped.
    """
    # A dataset slice is already a tuple of columns; a list of samples has to
    # be transposed into columns first. Indexing data[0] directly (as before)
    # would pick the first *sample* of a DataLoader batch instead of the
    # input_ids column.
    columns = data if isinstance(data, tuple) else tuple(zip(*data))
    input_ids, attention_mask, labels = columns[:3]
    return {
        "input_ids": torch.stack(list(input_ids)),
        "attention_mask": torch.stack(list(attention_mask)),
        "labels": torch.stack(list(labels)),
    }

In [30]:
# Sanity check on two sliced examples: the stacked batch is (2, max_length)
data_collator(train_dataset[0:2])["input_ids"].shape

torch.Size([2, 1055])

In [None]:
# Training configuration.
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    logging_steps=10,
    # load_best_model_at_end requires evaluation to run on the same schedule as
    # checkpoint saving; without an evaluation strategy TrainingArguments
    # rejects this configuration.
    # NOTE(review): newer transformers releases call this argument
    # `eval_strategy` — adjust if the installed version warns about it.
    evaluation_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    load_best_model_at_end=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to="wandb",  # enable logging to W&B
    run_name="gpt_neo_first_run",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # The Trainer's DataLoader passes a list of per-sample tuples
    # (input_ids, attention_mask, labels, answer, question); take the matching
    # field from every sample and stack it into a batch tensor. The string
    # fields are not forwarded to the model.
    data_collator=lambda batch: {
        "input_ids": torch.stack([sample[0] for sample in batch]),
        "attention_mask": torch.stack([sample[1] for sample in batch]),
        "labels": torch.stack([sample[2] for sample in batch]),
    },
)

In [118]:
# Inspect the complete collated batch for the same two sliced examples
data_collator(train_dataset[0:2])

{'input_ids': tensor([[[50257, 24361,    25,  ..., 50258, 50258, 50258]],
 
         [[50257, 24361,    25,  ..., 50258, 50258, 50258]]]),
 'attention_mask': tensor([[[1, 1, 1,  ..., 0, 0, 0]],
 
         [[1, 1, 1,  ..., 0, 0, 0]]]),
 'labels': tensor([[[-100, -100, -100,  ..., -100, -100, -100]],
 
         [[-100, -100, -100,  ..., -100, -100, -100]]])}

# Evaluation

### #TODO
1. Дописать evaluate
2. Прогнать через модель dataset

In [31]:
# Toy example for trying out answer extraction: `question` is the prompt
# prefix of `text`, the remainder of `text` is the answer.
text = "Question: What is the weather like?\nAnswer: Today is a good day!\n"
question = "Question: What is the weather like?\nAnswer: "

In [32]:
# Cut the prompt prefix off the full text, leaving only the answer part.
# (Calling the metric object without arguments cannot yield a string.)
text[len(question):]

'Today is a good day!\n'

In [134]:
def evaluate(model, test_dataset, tokenizer):
    """Generate an answer for every test question and score it with BLEU and ROUGE-1.

    Relies on the notebook-level ``bleu`` and ``rouge`` metric objects and on
    the notebook-level ``max_length``.

    Args:
        model: causal language model exposing ``generate``; switched to eval
            mode here.
        test_dataset: iterable of
            ``(input_ids, attention_mask, labels, answer, question)`` items;
            only the raw ``answer`` and ``question`` strings are used.
        tokenizer: tokenizer matching ``model``.

    Returns:
        Tuple ``(original_text, predicted_text, original_answer,
        predicted_answer, mean_bleu, mean_rouge1_f)``. The first four are lists
        with one string per example; the last two are means over all examples.
    """
    model.eval()

    original_text, predicted_text, original_answer, predicted_answer = [], [], [], []

    bleu_scores = []
    rouge_scores = []

    for _, _, _, answer, question in tqdm(test_dataset):

        original_text.append(f"Question: {question}\nAnswer: {answer}")
        original_answer.append(f"Answer: {answer}")

        # Same prompt format as the one used to build the training examples.
        text_to_answer = f"<|startoftext|>Question: {question}\nAnswer:"

        # Keep the prompt on the model's device; generate() fails otherwise
        # when the model lives on a GPU.
        encoded_prompt = tokenizer(text_to_answer, return_tensors="pt").to(model.device)
        prompt_len = encoded_prompt.input_ids.shape[1]

        # Greedy decoding: sampling knobs (top_k / top_p / temperature) do not
        # apply when do_sample=False, and exactly one sequence is requested.
        # NOTE(review): max_length counts prompt + generated tokens — confirm
        # the budget is large enough for the longest prompts.
        generated_output = model.generate(
            encoded_prompt.input_ids,
            attention_mask=encoded_prompt.attention_mask,
            do_sample=False,
            max_length=max_length,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
        )

        # Decode the single returned sequence as a whole, dropping special
        # tokens such as the pad / start / end markers.
        generated_q_a = tokenizer.decode(generated_output[0], skip_special_tokens=True)
        # The answer is whatever was generated after the prompt tokens; cutting
        # on token positions avoids depending on the decoded text's length.
        generated_a = tokenizer.decode(
            generated_output[0][prompt_len:], skip_special_tokens=True
        ).strip()

        predicted_text.append(generated_q_a)
        predicted_answer.append(generated_a)

        # BLEU takes tokenized input: one token list per prediction and a list
        # of reference token lists per prediction. An empty prediction is
        # scored as 0 directly instead of being passed to the metric.
        prediction_tokens = generated_a.split()
        if prediction_tokens:
            bleu_score = bleu.compute(
                predictions=[prediction_tokens], references=[[answer.split()]]
            )["bleu"]
        else:
            bleu_score = 0.0
        bleu_scores.append(bleu_score)

        # ROUGE takes lists of raw strings.
        rouge_scores.append(
            rouge.compute(predictions=[generated_a], references=[answer])[
                "rouge1"
            ].mid.fmeasure
        )

    return (
        original_text,
        predicted_text,
        original_answer,
        predicted_answer,
        np.mean(bleu_scores),
        np.mean(rouge_scores),
    )

In [103]:
# NOTE(review): leftover debugging cell — in the visible code `generated_q_a`
# is only assigned inside evaluate(), so this name does not exist on a fresh
# kernel.
generated_q_a

"This partly depends on your mod_wsgi configuration. If you configure it to use only one thread per process, then global variables are safe--although I wouldn't recommend using them, for a variety of reasons. In a multi-thread configuration, there is nothing guaranteeing that requests won't get mixed up if you use global variables.\nYou should be able to find some more local place to stash the data you need between pre_save and post_save. I'd recommend putting some more thought into your design.\n<|startoftext|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|><|pad|>