# Общая информация
__Цель:__ сделать fine-tuning GPT neo с 1,2B параметров

__Задачи:__
1) Выбрать вопросы с определенным тегом, например python

2) Понять формат входных и выходных данных, например перед вопросов, возможно надо ставить [QUESTION]

3) Сделать torch Dataset

4) Определить, как делать evaluation

5) Способ трэкинга

6) Проанализировать результаты

# Импорт библиотек

In [7]:
# from google.colab import drive

# drive.mount("/content/drive")

In [1]:
#!pip install transformers

In [50]:
import pickle
import warnings

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          IntervalStrategy, Trainer, TrainingArguments)

warnings.filterwarnings("ignore")

In [3]:
#!pip install pynvml

In [4]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [20]:
print_gpu_utilization()

GPU memory occupied: 5809 MB.


In [21]:
# зафиксируем random seed
torch.manual_seed(42)
np.random.seed(42)

# Чтение данных

In [7]:
# df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/vkr_data/filtered_df.csv")

In [22]:
pd.set_option("display.max_colwidth", None)

In [23]:
pd.set_option("display.max_colwidth", None)
with open(f"../../data/prepared/filtered_df.p", "rb") as f:
    df = pickle.load(f)

Отсортируем датасет по времени и разобьем его на train test

In [24]:
df = df.sort_values("Q_Date")

In [25]:
df = df.loc[df.apply(lambda x: f"python" in x.Tag, axis=1)]

In [26]:
train_df, test_df = np.split(df, [int(0.75 * len(df))])

# Загрузим модель

In [9]:
device = torch.device("cuda") if torch.cuda.is_available else "cpu"

In [10]:
model_name = "EleutherAI/gpt-neo-1.3B"

In [27]:
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    bos_token="<|startoftext|>",
    eos_token="<|endoftext|>",
    pad_token="<|pad|>",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Downloading: 100%|████████████████████████████████████████████████████████████████| 4.95G/4.95G [03:14<00:00, 27.3MB/s]


In [26]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 2048)

In [27]:
print_gpu_utilization()

GPU memory occupied: 6486 MB.


In [None]:
for n, p in model.named_parameters():
    if "transformer.h" in n:
        layer_num = int(n.split(".")[2])
        if "ln_" not in n and layer_num > 0 and layer_num < 23:
            p.requires_grad = False
            print("Freeze", n)

In [None]:
print_gpu_utilization()

GPU memory occupied: 6314 MB.


Отберем только те строки, в тегах которых есть слово _python_

In [28]:
questions = df.Q_Body
max_length = max([len(tokenizer.encode(question)) for question in questions])
print(f"Max length: {max_length}")

Max length: 1055


In [29]:
class Q_A_Dataset(Dataset):
    def __init__(self, df, tokenizer, max_length, tag):
        df = df.loc[df.apply(lambda x: f"{tag}" in x.Tag, axis=1)]
        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        self.answers = []
        self.questions = []

        for _, row in df.iterrows():
            prep_text = f"<|startoftext|>Question: {row.Q_Body}\nAnswer: {row.A_Body}<|endoftext|>"

            question_len = len(
                tokenizer(
                    f"<|startoftext|>Question: {row.Q_Body}\nAnswer:",
                )["input_ids"]
            )

            encoding_dict = tokenizer(
                prep_text, truncation=True, max_length=max_length, padding="max_length"
            )

            self.input_ids.append(torch.tensor(encoding_dict["input_ids"]))
            self.attn_masks.append(torch.tensor(encoding_dict["attention_mask"]))
            self.labels.append(torch.tensor(encoding_dict["input_ids"]))
            self.labels[-1][:question_len] = -100

            self.answers.append(row.A_Body)
            self.questions.append(row.Q_Body)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (
            self.input_ids[idx],
            self.attn_masks[idx],
            self.labels[idx],
            self.answers[idx],
            self.questions[idx],
        )

In [30]:
train_dataset = Q_A_Dataset(train_df, tokenizer, max_length=max_length, tag="python")
test_dataset = Q_A_Dataset(test_df, tokenizer, max_length=max_length, tag="python")

In [31]:
del df, train_df, test_df

# Авторизумеся в wandb

In [76]:
#!pip install wandb

In [32]:
import wandb

wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user/.netrc


True

In [33]:
run = wandb.init(project="QA specific domain", entity="myashka")

[34m[1mwandb[0m: Currently logged in as: [33mmyashka[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
%env WANDB_LOG_MODEL=true

env: WANDB_LOG_MODEL=true


In [None]:
%env WANDB_WATCH=all
%env WANDB_SILENT=true

env: WANDB_WATCH=all
env: WANDB_SILENT=true


# Определим Trainer и запустим обучение

In [None]:
model.config.use_cache = False

In [None]:
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/vkr_data/results",
    num_train_epochs=5,
    logging_steps=10,
    load_best_model_at_end=True,
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/Colab Notebooks/vkr_data/logs",
    report_to="wandb",  # enable logging to W&B
    run_name="gpt_neo_second_run",
    evaluation_strategy="epoch",
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=lambda data: {
        "input_ids": torch.stack([f[0] for f in data]),
        "attention_mask": torch.stack([f[1] for f in data]),
        "labels": torch.stack([f[0] for f in data]),
    },
)

Using cuda_amp half precision backend


In [None]:
print_gpu_utilization()

GPU memory occupied: 6314 MB.


In [None]:
trainer.train()

***** Running training *****
  Num examples = 1315
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 2
  Total optimization steps = 820
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
0,5.2856,5.066072
1,2.6316,3.651306
2,8.0558,8.023211


***** Running Evaluation *****
  Num examples = 439
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-164
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-164/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-164/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 439
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-328
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-328/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-328/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 439
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-492
Configuration saved in /content/drive/MyDrive/Colab Notebooks/v

Epoch,Training Loss,Validation Loss
0,5.2856,5.066072
1,2.6316,3.651306
2,8.0558,8.023211
3,8.2075,8.6931
4,8.671,8.78241


Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-656
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-656/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-656/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 439
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-820
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-820/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-820/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-328 (score: 3.651305913925171).
Using cuda_amp half precision backend
Saving model checkpoint to /tmp/tmpu9sguz0z
Configur

TrainOutput(global_step=820, training_loss=5.610103132666611, metrics={'train_runtime': 12026.6484, 'train_samples_per_second': 0.547, 'train_steps_per_second': 0.068, 'total_flos': 5.027265341472768e+16, 'train_loss': 5.610103132666611, 'epoch': 5.0})

In [None]:
wandb.finish()

VBox(children=(Label(value='5114.658 MB of 5114.658 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.…

0,1
eval/loss,▃▁▇██
eval/runtime,█▇▂▁▁
eval/samples_per_second,▁▂▇██
eval/steps_per_second,▁▁███
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,▁▂▃▅▆████▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁
train/loss,▁▃▂▂▁▂▂▂▄▄▃▂▁▂▁▁▁▂▂▂▂▄▄▆▆▇▇▇▆█▇▇▇██▇██▆█
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,8.78241
eval/runtime,223.8346
eval/samples_per_second,1.961
eval/steps_per_second,0.491
train/epoch,5.0
train/global_step,820.0
train/learning_rate,0.0
train/loss,8.671
train/total_flos,5.027265341472768e+16
train/train_loss,5.6101


# Evaluation

In [36]:
#!pip install bert_score, datasets, rouge_score, evaluate

In [16]:
from evaluate import load

In [10]:
# model = AutoModelForCausalLM.from_pretrained(
#     "/content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-328"
# ).to(device)

In [13]:
model = AutoModelForCausalLM.from_pretrained("./results/checkpoint-328").to(device)

In [34]:
model.config.use_cache = True

In [17]:
import nltk
from datasets import load_metric

rouge = load_metric("rouge")
bertscore = load("bertscore")

Downloading builder script: 100%|█████████████████████████████████████████████████| 7.95k/7.95k [00:00<00:00, 7.67MB/s]


In [35]:
text_table = wandb.Table(
    columns=[
        "question",
        "generated_answer",
        "original_answer",
        "bert_precision",
        "bert_recall",
        "bert_f1",
        "rouge_score",
        "bleu_score",
        "len_of_generated_answer",
    ]
)

In [51]:
def evaluate_model(model, test_dataset, tokenizer, device, text_table, wandb_run):
    model.eval()

    bleu_scores = []
    rouge_scores = []
    bert_scores = []
    bert_precisions = []
    bert_recalls = []
    bert_f1s = []

    for encoded_ids, _, _, answer, question in tqdm(test_dataset):

        question_len = len(f"Question: {question}\nAnswer: ")

        text_to_answer = f"<|startoftext|>Question: {question}\nAnswer:"

        enc_text_to_answer = tokenizer(
            text_to_answer, return_tensors="pt"
        ).input_ids.to(device)

        generated_output = model.generate(
            enc_text_to_answer,
            do_sample=False,
            top_k=50,
            max_length=max_length,
            top_p=0.90,
            temperature=0,
            num_return_sequences=0,
            pad_token_id=tokenizer.eos_token_id,
        ).to("cpu")

        generated_q_a = tokenizer.decode(generated_output[0], skip_special_tokens=True)
        generated_a = generated_q_a[question_len:]

        bert_score = bertscore.compute(
            predictions=[generated_a], references=[answer], lang="en"
        )

        bleu_score = nltk.translate.bleu_score.sentence_bleu(answer, generated_a)
        rouge_score = rouge.compute(predictions=[generated_a], references=[answer])[
            "rouge1"
        ].mid.fmeasure

        text_table.add_data(
            question,
            generated_a,
            answer,
            bert_score["precision"],
            bert_score["recall"],
            bert_score["f1"],
            rouge_score,
            bleu_score,
            len(generated_a),
        )

    wandb_run.log({"evaluation_table": text_table})

    return (
        np.mean(bleu_scores),
        np.mean(rouge_scores),
        np.mean(bert_precisions),
        np.mean(bert_recalls),
        np.mean(bert_f1s),
    )

In [48]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


In [None]:
bleu_score, rouge_score, bert_precision, bert_recall, bert_f1 = evaluate_model(
    model, test_dataset, tokenizer, device, text_table, run
)

In [53]:
print(bleu_score, rouge_score, bert_precision, bert_recall, bert_f1)

nan nan nan nan nan


Обучение прошло неудачно, генерируется бред и повторяющиеся слова

__Возможные причины:__
1) Не нужно замораживать веса при обучении

2) Плохой выбор гиперпараметров

3) Параметры генерации, возможно увеличить температуру

4) Ошибка в реализации data collator, в следствие чего модель генерировала и вопрос в том числе

__Возможные варианты__
1) Обучить с весами

2) Проверить способность дата коллектора

3) Использовать deepspeed для более быстрого обучения

4) Попробовать дефолтные гиперпараметры