# Общая информация
__Цель:__ сделать fine-tuning GPT neo

# Начальная инициализация

## Установка библиотек

In [13]:
#!pip install pynvml

In [14]:
#!pip install transformers

In [15]:
#!pip install wandb

In [16]:
#!pip install bert_score datasets rouge_score evaluate

## Подключение гугл диска

In [5]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
import pickle
import warnings

import nltk
import numpy as np
import pandas as pd
import torch
from datasets import load_metric
from evaluate import load
from torch.utils.data import Dataset, random_split
from tqdm.auto import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          IntervalStrategy, Trainer, TrainingArguments)

warnings.filterwarnings("ignore")
from nltk.translate.bleu_score import sentence_bleu

In [7]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [8]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


## Фиксация сидов

In [9]:
# зафиксируем random seed
torch.manual_seed(42)
np.random.seed(42)

# Данные

## Чтение данных

In [10]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/vkr_data/df.csv")

In [11]:
pd.set_option("display.max_colwidth", None)

In [12]:
# pd.set_option("display.max_colwidth", None)
# with open( f'../../data/filtered_df.p', 'rb') as f:
#     df = pickle.load(f)

## Разбиение на train test

In [17]:
df = df.sort_values("Q_date_open")

In [18]:
train_df, test_df = np.split(df, [int(0.85 * len(df))])

# Model part

In [19]:
device = torch.device("cuda") if torch.cuda.is_available else "cpu"

In [20]:
model_name = "EleutherAI/gpt-neo-1.3B"

## Загрузка токенизатора

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Downloading:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [22]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


In [23]:
questions = df.Q_Body
q_max_length = max([len(tokenizer.encode(question)) for question in questions])
print(f"Question max length: {q_max_length}")

Question max length: 590


In [24]:
titles = df.Q_Title
t_max_length = max([len(tokenizer.encode(title)) for title in titles])
print(f"Title max length: {t_max_length}")

Title max length: 32


In [25]:
max_length = t_max_length + q_max_length

## Создание датасета

In [26]:
class Q_A_Dataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        self.answers = []
        self.questions = []
        self.titles = []

        for _, row in df.iterrows():
            prep_text = f"Answer the next question./nQuestion: {row.Q_Body}\nAnswer: {row.A_Body}"

            question_len = len(
                tokenizer(
                    f"Answer the next question./nQuestion: {row.Q_Body}\nAnswer:",
                )["input_ids"]
            )

            encoding_dict = tokenizer(
                prep_text, truncation=True, max_length=max_length, padding="max_length"
            )

            self.input_ids.append(torch.tensor(encoding_dict["input_ids"]))
            self.attn_masks.append(torch.tensor(encoding_dict["attention_mask"]))
            self.labels.append(torch.tensor(encoding_dict["input_ids"]))
            self.labels[-1][:question_len] = -100

            self.answers.append(row.A_Body)
            self.questions.append(row.Q_Body)
            self.titles.append(row.Q_Title)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (
            self.input_ids[idx],
            self.attn_masks[idx],
            self.labels[idx],
            self.answers[idx],
            self.questions[idx],
            self.titles[idx],
        )

In [27]:
train_dataset = Q_A_Dataset(train_df, tokenizer, max_length=max_length)
test_dataset = Q_A_Dataset(test_df, tokenizer, max_length=max_length)

In [28]:
del df, train_df, test_df

## Загрузка модели

In [29]:
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Downloading:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

In [30]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 2048)

### Заморозка весов

In [31]:
for n, p in model.named_parameters():
    if "transformer.h" in n:
        layer_num = int(n.split(".")[2])
        if "ln_" not in n and layer_num > 0 and layer_num < 23:
            p.requires_grad = False
            print("Freeze", n)

Freeze transformer.h.1.attn.attention.k_proj.weight
Freeze transformer.h.1.attn.attention.v_proj.weight
Freeze transformer.h.1.attn.attention.q_proj.weight
Freeze transformer.h.1.attn.attention.out_proj.weight
Freeze transformer.h.1.attn.attention.out_proj.bias
Freeze transformer.h.1.mlp.c_fc.weight
Freeze transformer.h.1.mlp.c_fc.bias
Freeze transformer.h.1.mlp.c_proj.weight
Freeze transformer.h.1.mlp.c_proj.bias
Freeze transformer.h.2.attn.attention.k_proj.weight
Freeze transformer.h.2.attn.attention.v_proj.weight
Freeze transformer.h.2.attn.attention.q_proj.weight
Freeze transformer.h.2.attn.attention.out_proj.weight
Freeze transformer.h.2.attn.attention.out_proj.bias
Freeze transformer.h.2.mlp.c_fc.weight
Freeze transformer.h.2.mlp.c_fc.bias
Freeze transformer.h.2.mlp.c_proj.weight
Freeze transformer.h.2.mlp.c_proj.bias
Freeze transformer.h.3.attn.attention.k_proj.weight
Freeze transformer.h.3.attn.attention.v_proj.weight
Freeze transformer.h.3.attn.attention.q_proj.weight
Freeze t

In [32]:
print_gpu_utilization()

GPU memory occupied: 5948 MB.


# Авторизумеся в wandb

In [33]:
import wandb

wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [34]:
run = wandb.init(project="QA specific domain", entity="myashka")

[34m[1mwandb[0m: Currently logged in as: [33mmyashka[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [35]:
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
%env WANDB_SILENT=true

env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all
env: WANDB_SILENT=true


# Trainer и обучение

In [36]:
model.config.use_cache = False

In [39]:
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/vkr_data/results",
    num_train_epochs=4,
    logging_steps=10,
    load_best_model_at_end=True,
    save_strategy="steps",
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=15,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    report_to="wandb",  # enable logging to W&B
    run_name="gpt_neo_android_api_change",
    gradient_checkpointing=True,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=lambda data: {
        "input_ids": torch.stack([f[0] for f in data]),
        "attention_mask": torch.stack([f[1] for f in data]),
        "labels": torch.stack([f[2] for f in data]),
    },
)

PyTorch: setting up devices
Using cuda_amp half precision backend


In [40]:
trainer.train()

***** Running training *****
  Num examples = 602
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 755
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss
1,0.515,0.561637
2,0.4934,0.567563
3,0.4226,0.579475
4,0.255,0.591197
5,0.231,0.60007


***** Running Evaluation *****
  Num examples = 107
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-151
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-151/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-151/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 107
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-302
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-302/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-302/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 107
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-453
Configuration saved in /content/drive/MyDrive/Colab Notebooks/v

TrainOutput(global_step=755, training_loss=0.49372725013076074, metrics={'train_runtime': 2095.5201, 'train_samples_per_second': 1.436, 'train_steps_per_second': 0.36, 'total_flos': 1.357496360779776e+16, 'train_loss': 0.49372725013076074, 'epoch': 5.0})

In [42]:
wandb.finish()

0,1
eval/loss,▁▂▄▆█
eval/runtime,▁▁▁█▁
eval/samples_per_second,███▁█
eval/steps_per_second,███▁█
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▁▂▃▅▆████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.61222
eval/runtime,17.445
eval/samples_per_second,6.134
eval/steps_per_second,1.548
train/epoch,5.0
train/global_step,755.0
train/learning_rate,0.0
train/loss,0.2347
train/total_flos,1.357496360779776e+16
train/train_loss,0.50562


# Evaluation

## Подгрузка нужной модели

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-746"
).to(device)

In [41]:
model.config.use_cache = True

In [42]:
rouge = load_metric("rouge")
bertscore = load("bertscore")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [43]:
test_predictions = wandb.Artifact("gpt_neo_eval", type="predictions")

text_table = wandb.Table(
    columns=[
        "title",
        "question",
        "generated_answer",
        "original_answer",
        "bert_precision",
        "bert_recall",
        "bert_f1",
        "rouge_score",
        "bleu_score",
    ]
)

## Функция генерации ответа на вопрос с promt

In [44]:
def generate_answer(model, tokenizer, deivce, question, title, temp=0):
    model.eval()

    question_len = len(f"Answer the question.\nQuestion: {question}\nAnswer:")

    text_to_answer = f"Answer the question.\nQuestion: {question}\nAnswer:"

    enc_text_to_answer = tokenizer(text_to_answer, return_tensors="pt").input_ids.to(
        device
    )

    generated_output = model.generate(
        enc_text_to_answer,
        do_sample=False,
        top_k=50,
        max_length=max_length,
        top_p=0.9,
        temperature=temp,
        num_return_sequences=0,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
    ).to("cpu")

    del enc_text_to_answer

    generated_q_a = tokenizer.decode(generated_output[0], skip_special_tokens=True)

    generated_a = generated_q_a[question_len:]

    return generated_a

In [45]:
print(test_dataset[1][4])

Say for instance you have an application with a landing page and you want this landing page to either show a log in button if your user is not logged in, or display the users name in a text view if the user is indeed logged in.Is it a must to create two slightly different layouts to inflate based on the condition of being logged in? Or is there a way to decide at runtime which layout component will be necessary given the state of the application?If the latter is true, how do you do this?


In [46]:
text = generate_answer(model, tokenizer, device, test_dataset[1][4], test_dataset[1][5])

In [47]:
text

' You can use a <code>View.setLayoutParams</code>, <i>or</i>, depending on what you need.  '

In [48]:
print(test_dataset[1][3])

If it's just as simple as showing a <code>TextView</code> instead of a <code>Button</code> why not just use <code>setVisibilty</code> on each of them depending on your situation. If you have a more complicated layout then I would suggest having two separate layouts and inflating the correct one. Or as @ataulm suggested use separate activities


In [49]:
bert_score = bertscore.compute(
    predictions=[text], references=[test_dataset[1][3]], lang="en"
)
bert_score

Could not locate the tokenizer configuration file, will try to use the model config instead.


Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-large/snapshots/5069d8a2a32a7df4c69ef9b56348be04152a2341/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.23.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

loading file vocab.json from cache at /root/.cache/huggingface/hub/models--roberta-large/snapshots/5069d8a2a32a7df4c69ef9b56348be04152a2341/vocab.json
loading file merges.txt from cache at /root/.cache/huggingface/hub/models--roberta-large/snapshots/5069d8a2a32a7df4c69ef9b56348be04152a2341/merges.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at None
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--roberta-large/snapshots/5069d8a2a32a7df4c69ef9b56348be04152a2341/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-large",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_

Downloading:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--roberta-large/snapshots/5069d8a2a32a7df4c69ef9b56348be04152a2341/pytorch_model.bin
All the weights of RobertaModel were initialized from the model checkpoint at roberta-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use RobertaModel for predictions without further training.


{'precision': [0.8968422412872314],
 'recall': [0.8532333374023438],
 'f1': [0.8744944334030151],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.23.1)'}

## Проверка качества модели

In [36]:
bleu_score = nltk.translate.bleu("what is weather?", "what is", weights=(1, 0, 0, 0))

In [37]:
bleu_score

1.0

In [51]:
def evaluate(model, test_dataset, tokenizer, device, text_table, wandb_run):
    model.eval()

    bleu_scores = []
    rouge_scores = []
    bert_scores = []
    bert_precisions = []
    bert_recalls = []
    bert_f1s = []

    for _, _, _, answer, question, title in tqdm(test_dataset):

        generated_a = generate_answer(model, tokenizer, device, question, title, 0.0)

        bert_score = bertscore.compute(
            predictions=[generated_a], references=[answer], lang="en"
        )

        bleu_score = sentence_bleu(answer, generated_a, weights=(1, 0, 0, 0))
        rouge_score = rouge.compute(predictions=[generated_a], references=[answer])[
            "rouge1"
        ].mid.fmeasure

        text_table.add_data(
            title,
            question,
            generated_a,
            answer,
            bert_score["precision"][0],
            bert_score["recall"][0],
            bert_score["f1"][0],
            rouge_score,
            bleu_score,
        )

    test_predictions.add(text_table, "gpt_neo_eval_v1")
    wandb_run.log_artifact(test_predictions)

    return (
        np.mean(bleu_scores),
        np.mean(rouge_scores),
        np.mean(bert_precisions),
        np.mean(bert_recalls),
        np.mean(bert_f1s),
    )

In [52]:
bleu_score, rouge_score, bert_precision, bert_recall, bert_f1 = evaluate(
    model, test_dataset, tokenizer, device, text_table, run
)

  0%|          | 0/107 [00:00<?, ?it/s]

In [53]:
print(bleu_score, rouge_score, bert_precision, bert_recall, bert_f1)

nan nan nan nan nan


In [54]:
wandb.finish()

VBox(children=(Label(value='5114.896 MB of 5114.896 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.…

0,1
eval/loss,▁▂▄▆█
eval/runtime,▁▁▁█▁
eval/samples_per_second,███▁█
eval/steps_per_second,███▁█
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/learning_rate,▁▂▃▅▆████▇▇▇▇▇▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▁▁▁
train/loss,█▇▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,0.60007
eval/runtime,17.221
eval/samples_per_second,6.213
eval/steps_per_second,1.568
train/epoch,5.0
train/global_step,755.0
train/learning_rate,0.0
train/loss,0.231
train/total_flos,1.357496360779776e+16
train/train_loss,0.49373
