# Общая информация
__Цель:__ сделать fine-tuning GPT neo

__Задачи:__

1) Сделать генерацию с разными вариантами температуры

2) Выполнить то же самое для версии, с пофикшенным data collator

3) Сравнить адекватность

4) Дообучить модель (если потребуется)

5) Сравнить дообученную модель первой версии и последней (если потребуется)

# Начальная инициализация

## Установка библиотек

In [None]:
#!pip install pynvml

In [None]:
#!pip install transformers

In [None]:
#!pip install wandb

In [None]:
#!pip install bert_score datasets rouge_score evaluate

## Подключение гугл диска

In [15]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import pickle
import warnings

import nltk
import numpy as np
import pandas as pd
import torch
from datasets import load_metric
from evaluate import load
from torch.utils.data import Dataset, random_split
from tqdm.auto import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          IntervalStrategy, Trainer, TrainingArguments)

warnings.filterwarnings("ignore")

In [17]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [18]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


## Фиксация сидов

In [19]:
# зафиксируем random seed
torch.manual_seed(42)
np.random.seed(42)

# Данные

## Чтение данных

In [24]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/vkr_data/filtered_df.csv")

In [25]:
pd.set_option("display.max_colwidth", None)

In [26]:
# pd.set_option("display.max_colwidth", None)
# with open( f'../../data/filtered_df.p', 'rb') as f:
#     df = pickle.load(f)

## Разбиение на train test

In [27]:
df = df.sort_values("Q_Date")

In [28]:
df = df.loc[df.apply(lambda x: f"python" in x.Tag, axis=1)]

In [29]:
train_df, test_df = np.split(df, [int(0.85 * len(df))])

# Model part

In [20]:
device = torch.device("cuda") if torch.cuda.is_available else "cpu"

In [21]:
model_name = "EleutherAI/gpt-neo-1.3B"

## Загрузка токенизатора

In [30]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Downloading:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [None]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


In [31]:
questions = df.Q_Body
max_length = max([len(tokenizer.encode(question)) for question in questions])
print(f"Max length: {max_length}")

Max length: 1055


In [None]:
a = np.array([len(tokenizer.encode(question)) for question in questions])
print(len(a))
sum(a <= 600)

1754


1745

In [None]:
max_length = 600

## Создание датасета

In [32]:
class Q_A_Dataset(Dataset):
    def __init__(self, df, tokenizer, max_length, tag):
        df = df.loc[df.apply(lambda x: f"{tag}" in x.Tag, axis=1)]
        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        self.answers = []
        self.questions = []

        for _, row in df.iterrows():
            prep_text = f"Question: {row.Q_Body}\nAnswer: {row.A_Body}"

            question_len = len(
                tokenizer(
                    f"Question: {row.Q_Body}\nAnswer:",
                )["input_ids"]
            )

            encoding_dict = tokenizer(
                prep_text, truncation=True, max_length=max_length, padding="max_length"
            )

            self.input_ids.append(torch.tensor(encoding_dict["input_ids"]))
            self.attn_masks.append(torch.tensor(encoding_dict["attention_mask"]))
            self.labels.append(torch.tensor(encoding_dict["input_ids"]))
            self.labels[-1][:question_len] = -100

            self.answers.append(row.A_Body)
            self.questions.append(row.Q_Body)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (
            self.input_ids[idx],
            self.attn_masks[idx],
            self.labels[idx],
            self.answers[idx],
            self.questions[idx],
        )

In [33]:
train_dataset = Q_A_Dataset(train_df, tokenizer, max_length=max_length, tag="python")
test_dataset = Q_A_Dataset(test_df, tokenizer, max_length=max_length, tag="python")

In [34]:
del df, train_df, test_df

## Загрузка модели

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Downloading:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

In [None]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 2048)

### Заморозка весов

In [35]:
for n, p in model.named_parameters():
    if "transformer.h" in n:
        layer_num = int(n.split(".")[2])
        if "ln_" not in n and layer_num > 0 and layer_num < 23:
            p.requires_grad = False
            print("Freeze", n)

Freeze transformer.h.1.attn.attention.k_proj.weight
Freeze transformer.h.1.attn.attention.v_proj.weight
Freeze transformer.h.1.attn.attention.q_proj.weight
Freeze transformer.h.1.attn.attention.out_proj.weight
Freeze transformer.h.1.attn.attention.out_proj.bias
Freeze transformer.h.1.mlp.c_fc.weight
Freeze transformer.h.1.mlp.c_fc.bias
Freeze transformer.h.1.mlp.c_proj.weight
Freeze transformer.h.1.mlp.c_proj.bias
Freeze transformer.h.2.attn.attention.k_proj.weight
Freeze transformer.h.2.attn.attention.v_proj.weight
Freeze transformer.h.2.attn.attention.q_proj.weight
Freeze transformer.h.2.attn.attention.out_proj.weight
Freeze transformer.h.2.attn.attention.out_proj.bias
Freeze transformer.h.2.mlp.c_fc.weight
Freeze transformer.h.2.mlp.c_fc.bias
Freeze transformer.h.2.mlp.c_proj.weight
Freeze transformer.h.2.mlp.c_proj.bias
Freeze transformer.h.3.attn.attention.k_proj.weight
Freeze transformer.h.3.attn.attention.v_proj.weight
Freeze transformer.h.3.attn.attention.q_proj.weight
Freeze t

In [36]:
print_gpu_utilization()

GPU memory occupied: 5920 MB.


# Авторизумеся в wandb

In [37]:
import wandb

wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [38]:
run = wandb.init(project="QA specific domain", entity="myashka")

[34m[1mwandb[0m: Currently logged in as: [33mmyashka[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [39]:
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
%env WANDB_SILENT=true

env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all
env: WANDB_SILENT=true


# Trainer и обучение

In [23]:
model.config.use_cache = False

In [40]:
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/vkr_data/results",
    num_train_epochs=5,
    logging_steps=10,
    load_best_model_at_end=True,
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/Colab Notebooks/vkr_data/logs",
    report_to="wandb",  # enable logging to W&B
    run_name="gpt_neo_third_run",
    evaluation_strategy="epoch",
    gradient_checkpointing=True,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=lambda data: {
        "input_ids": torch.stack([f[0] for f in data]),
        "attention_mask": torch.stack([f[1] for f in data]),
        "labels": torch.stack([f[2] for f in data]),
    },
)

Using cuda_amp half precision backend


In [41]:
trainer.train(
    resume_from_checkpoint="/content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-746"
)

Loading model from /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-746.
You are resuming training from a checkpoint trained with 4.22.2 of Transformers but your current version is 4.23.1. This is not recommended and could yield to errors or unwanted behaviors.
***** Running training *****
  Num examples = 1490
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 1865
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 2
  Continuing training from global step 746
  Will skip the first 2 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


0it [00:00, ?it/s]

Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
3,0.23,0.358668
4,0.2306,0.368467
5,0.2365,0.374936


***** Running Evaluation *****
  Num examples = 264
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-1119
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-1119/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-1119/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 264
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-1492
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-1492/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-1492/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 264
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-1865
Configuration saved in /content/drive/MyDrive/Colab Note

TrainOutput(global_step=1865, training_loss=0.13226373387405763, metrics={'train_runtime': 7960.727, 'train_samples_per_second': 0.936, 'train_steps_per_second': 0.234, 'total_flos': 5.6988933040128e+16, 'train_loss': 0.13226373387405763, 'epoch': 5.0})

In [42]:
wandb.finish()

0,1
eval/loss,▁▅█
eval/runtime,█▁█
eval/samples_per_second,▁█▁
eval/steps_per_second,▁█▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,▂▂▃▃▄▅▄▂▅▃▄█▅█▂▃▅▂▂▂▃▁▃▃▃▃▃▃▂▄▂▂▂▂▁▂▂▂▂▄
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.37494
eval/runtime,144.4379
eval/samples_per_second,1.828
eval/steps_per_second,0.457
train/epoch,5.0
train/global_step,1865.0
train/learning_rate,0.0
train/loss,0.2365
train/total_flos,5.6988933040128e+16
train/train_loss,0.13226


# Evaluation

## Подгрузка нужной модели

In [22]:
model = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-746"
).to(device)

In [None]:
model.config.use_cache = True

In [None]:
rouge = load_metric("rouge")
bertscore = load("bertscore")

In [None]:
text_table = wandb.Table(
    columns=[
        "question",
        "generated_answer",
        "original_answer",
        "bert_precision",
        "bert_recall",
        "bert_f1",
        "rouge_score",
        "bleu_score",
        "len_of_generated_answer",
    ]
)

## Функция генерации ответа на вопрос с promt

In [None]:
def generate_answer(model, tokenizer, deivce, question, temp=0):
    model.eval()

    question_len = len(f"Question: {question}\nAnswer:")

    text_to_answer = f"Question: {question}\nAnswer:"

    enc_text_to_answer = tokenizer(text_to_answer, return_tensors="pt").input_ids.to(
        device
    )

    generated_output = model.generate(
        enc_text_to_answer,
        do_sample=False,
        top_k=50,
        max_length=max_length,
        top_p=0.9,
        temperature=temp,
        num_return_sequences=0,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    ).to("cpu")

    del enc_text_to_answer

    generated_q_a = tokenizer.decode(generated_output[0], skip_special_tokens=True)

    generated_a = generated_q_a[question_len:]

    return generated_a

In [None]:
print(test_dataset[1][4])

While writing an application parsing command line arguments I would like to run it with various parameters.
I don't want to create a Run Configuration for every possible command line argument that I want my script to test with. Is there a way in PyCharm (and I guess with any JetBrains IDE) to make a Run Configuration that asks for the Script parameters when executed?
I am currently using PyCharm 3.1 (EAP).



In [None]:
text = generate_answer(model, tokenizer, device, test_dataset[1][4])

In [None]:
text

" You can use the Run Configurations feature to specify the arguments that you want your script executed with, but you can't specify a list of arguments. You'll have to use a custom Run configuration. \n"

In [None]:
print(test_dataset[1][3])

Currently the only possibility is to use the "Before launch | Show this page" option.



In [None]:
bert_score = bertscore.compute(
    predictions=[text], references=[test_dataset[1][3]], lang="en"
)
bert_score

{'precision': [0.8648298382759094],
 'recall': [0.8778269290924072],
 'f1': [0.8712799549102783],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.11(hug_trans=4.22.2)'}

## Проверка качества модели

In [None]:
def evaluate(model, test_dataset, tokenizer, device, text_table, wandb_run):
    model.eval()

    bleu_scores = []
    rouge_scores = []
    bert_scores = []
    bert_precisions = []
    bert_recalls = []
    bert_f1s = []

    for _, _, _, answer, question in tqdm(test_dataset):

        generated_a = generate_answer(model, tokenizer, device, question, 0.0)

        bert_score = bertscore.compute(
            predictions=[generated_a], references=[answer], lang="en"
        )

        bleu_score = nltk.translate.bleu_score.sentence_bleu(answer, generated_a)
        rouge_score = rouge.compute(predictions=[generated_a], references=[answer])[
            "rouge1"
        ].mid.fmeasure

        text_table.add_data(
            question,
            generated_a,
            answer,
            bert_score["precision"][0],
            bert_score["recall"][0],
            bert_score["f1"][0],
            rouge_score,
            bleu_score,
            len(generated_a),
        )

    wandb_run.log({"evaluation_table": text_table})

    return (
        np.mean(bleu_scores),
        np.mean(rouge_scores),
        np.mean(bert_precisions),
        np.mean(bert_recalls),
        np.mean(bert_f1s),
    )

In [None]:
bleu_score, rouge_score, bert_precision, bert_recall, bert_f1 = evaluate(
    model, test_dataset, tokenizer, device, text_table, run
)

  0%|          | 0/264 [00:00<?, ?it/s]

Input length of input_ids is 1060, but `max_length` is set to 600. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Input length of input_ids is 670, but `max_length` is set to 600. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [None]:
print(bleu_score, rouge_score, bert_precision, bert_recall, bert_f1)

nan nan nan nan nan


In [None]:
wandb.finish()