# Общая информация
__Цель:__ сделать fine-tuning GPT neo

# Начальная инициализация

## Установка библиотек

In [2]:
#!pip install bert_score datasets rouge_score evaluate pynvml transformers wandb

## Подключение гугл диска

In [3]:
from google.colab import drive

drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pickle
import warnings

import nltk
import numpy as np
import pandas as pd
import torch
from datasets import load_metric
from evaluate import load
from torch.utils.data import Dataset, random_split
from tqdm.auto import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
                          EarlyStoppingCallback, Trainer, TrainingArguments)

warnings.filterwarnings("ignore")
from nltk.translate.bleu_score import sentence_bleu

In [5]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [6]:
print_gpu_utilization()

GPU memory occupied: 0 MB.


## Фиксация сидов

In [7]:
# зафиксируем random seed
torch.manual_seed(42)
np.random.seed(42)

# Данные

## Чтение данных

In [8]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/vkr_data/df.csv")

In [9]:
pd.set_option("display.max_colwidth", None)

In [10]:
# pd.set_option("display.max_colwidth", None)
# with open( f'../../data/filtered_df.p', 'rb') as f:
#     df = pickle.load(f)

## Разбиение на train test

In [11]:
df = df.sort_values("Q_date_open")
train_df, test_df = np.split(df, [int(0.85 * len(df))])

# Model part

In [12]:
device = torch.device("cuda") if torch.cuda.is_available else "cpu"

In [13]:
model_name = "EleutherAI/gpt-neo-1.3B"

## Загрузка токенизатора

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

Downloading:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [15]:
answers = df.A_Body
a_max_length = max([len(tokenizer.encode(answer)) for answer in answers])
print(f"Answers max length: {a_max_length}")

Answers max length: 1027


## Создание датасета

In [18]:
promt_before_question = """You will be asked a series of questions. For each question, you must either answer the question or decline to answer, in which case you must state that you have no comment\nQuestion: """
promt_before_answer = "\nAnswer:"

In [19]:
class Q_A_Dataset(Dataset):
    def __init__(self, df, tokenizer):
        self.input_ids = []
        self.attn_masks = []
        self.labels = []

        self.answers = []
        self.questions = []
        self.titles = []

        for _, row in df.iterrows():
            prep_text = (
                f"{promt_before_question}{row.Q_Title}{promt_before_answer}{row.A_Body}"
            )

            question_len = len(
                tokenizer(
                    f"{promt_before_question}{row.Q_Title}{promt_before_answer}",
                )["input_ids"]
            )

            encoding_dict = tokenizer(
                prep_text
                # , truncation=True, max_length=128, padding="max_length"
            )

            self.input_ids.append(torch.tensor(encoding_dict["input_ids"]))
            self.attn_masks.append(torch.tensor(encoding_dict["attention_mask"]))
            self.labels.append(torch.tensor(encoding_dict["input_ids"]))
            self.labels[-1][:question_len] = -100

            self.answers.append(row.A_Body)
            self.questions.append(row.Q_Body)
            self.titles.append(row.Q_Title)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return (
            self.input_ids[idx],
            self.attn_masks[idx],
            self.labels[idx],
            self.answers[idx],
            self.questions[idx],
            self.titles[idx],
        )

In [20]:
train_dataset = Q_A_Dataset(train_df, tokenizer)
test_dataset = Q_A_Dataset(test_df, tokenizer)

In [21]:
del df, train_df, test_df

In [22]:
def collate_batch(examples, tokenizer, input_type="input_ids"):

    # Tensorize if necessary.
    if isinstance(examples[0], (list, tuple, np.ndarray)):
        examples = [torch.tensor(e, dtype=torch.long) for e in examples]

    length_of_first = examples[0].size(0)

    # Check if padding is necessary.

    are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
    if are_tensors_same_length:
        return torch.stack(examples, dim=0)

    # If yes, check if we have a `pad_token`.
    if tokenizer._pad_token is None:
        raise ValueError(
            "You are attempting to pad samples but the tokenizer you are using"
            f" ({tokenizer.__class__.__name__}) does not have a pad token."
        )

    # Creating the full tensor and filling it with our data.
    max_length = max(x.size(0) for x in examples)

    if input_type == "input_ids":
        result = examples[0].new_full(
            [len(examples), max_length], tokenizer.pad_token_id
        )
        for i, example in enumerate(examples):
            if tokenizer.padding_side == "right":
                result[i, : example.shape[0]] = example
            else:
                result[i, -example.shape[0] :] = example
    elif input_type == "attention_mask":
        result = examples[0].new_full([len(examples), max_length], 0)
        for i, example in enumerate(examples):
            if tokenizer.padding_side == "right":
                result[i, : example.shape[0]] = example
            else:
                result[i, -example.shape[0] :] = example
    return result

## Загрузка модели

In [23]:
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

Downloading:   0%|          | 0.00/5.31G [00:00<?, ?B/s]

In [22]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 2048)

### Заморозка весов

In [23]:
for n, p in model.named_parameters():
    if "transformer.h" in n:
        layer_num = int(n.split(".")[2])
        if "ln_" not in n and layer_num > 0 and layer_num < 23:
            p.requires_grad = False
            print("Freeze", n)

Freeze transformer.h.1.attn.attention.k_proj.weight
Freeze transformer.h.1.attn.attention.v_proj.weight
Freeze transformer.h.1.attn.attention.q_proj.weight
Freeze transformer.h.1.attn.attention.out_proj.weight
Freeze transformer.h.1.attn.attention.out_proj.bias
Freeze transformer.h.1.mlp.c_fc.weight
Freeze transformer.h.1.mlp.c_fc.bias
Freeze transformer.h.1.mlp.c_proj.weight
Freeze transformer.h.1.mlp.c_proj.bias
Freeze transformer.h.2.attn.attention.k_proj.weight
Freeze transformer.h.2.attn.attention.v_proj.weight
Freeze transformer.h.2.attn.attention.q_proj.weight
Freeze transformer.h.2.attn.attention.out_proj.weight
Freeze transformer.h.2.attn.attention.out_proj.bias
Freeze transformer.h.2.mlp.c_fc.weight
Freeze transformer.h.2.mlp.c_fc.bias
Freeze transformer.h.2.mlp.c_proj.weight
Freeze transformer.h.2.mlp.c_proj.bias
Freeze transformer.h.3.attn.attention.k_proj.weight
Freeze transformer.h.3.attn.attention.v_proj.weight
Freeze transformer.h.3.attn.attention.q_proj.weight
Freeze t

In [24]:
print_gpu_utilization()

GPU memory occupied: 5948 MB.


# Авторизумеся в wandb

In [24]:
import wandb

wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [138]:
run = wandb.init(project="QA specific domain", entity="myashka")

In [26]:
%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
%env WANDB_SILENT=true

env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all
env: WANDB_SILENT=true


# Trainer и обучение

In [31]:
model.config.use_cache = False

In [32]:
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Colab Notebooks/vkr_data/results",
    num_train_epochs=5,
    logging_steps=10,
    load_best_model_at_end=True,
    save_strategy="steps",
    save_steps=30,
    evaluation_strategy="steps",
    eval_steps=15,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=100,
    weight_decay=0.01,
    report_to="wandb",  # enable logging to W&B
    run_name="gpt_neo_android_api_change",
    gradient_checkpointing=True,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    data_collator=lambda data: {
        "input_ids": collate_batch([f[0] for f in data], tokenizer),
        "attention_mask": collate_batch(
            [f[1] for f in data], tokenizer, "attention_mask"
        ),
        "labels": collate_batch([f[2] for f in data], tokenizer),
    },
)

Using cuda_amp half precision backend


In [33]:
trainer.train()

***** Running training *****
  Num examples = 602
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 755
  Number of trainable parameters = 208009216
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
15,3.7654,3.328843
30,2.0282,1.81856
45,2.0695,1.740983
60,1.7638,1.707453
75,1.8828,1.699657
90,1.5297,1.692748
105,1.8125,1.694179


***** Running Evaluation *****
  Num examples = 107
  Batch size = 4
***** Running Evaluation *****
  Num examples = 107
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-30
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-30/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-30/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 107
  Batch size = 4
***** Running Evaluation *****
  Num examples = 107
  Batch size = 4
Saving model checkpoint to /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-60
Configuration saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-60/config.json
Model weights saved in /content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-60/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 107
  Batch size = 4
***** Running Evaluation ****

KeyboardInterrupt: ignored

In [None]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▁▇▁▇▁▁▁▁▇▁▆▁▁▁▁█▁▁▁▁▆▁▆▁▁▁▁▇▁█▁▁
eval/samples_per_second,██▁█▁▇▇█▇▁▇▁▇▆█▇▁▇▇▇▇▁▇▁▆▇█▇▁▇▁▇▇
eval/steps_per_second,██▁█▁▇▇█▇▁▇▁▇▆█▇▁▇▇▇▇▁▇▁▆▇█▇▁▇▁▇▇
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▁▂▃▃▄▆▆▇█████▇▇▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▅▅▅▄▄▄▄▄
train/loss,█▇▃▃▃▃▂▂▃▂▂▂▃▂▂▂▂▁▂▁▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁

0,1
eval/loss,1.7786
eval/runtime,7.1639
eval/samples_per_second,14.936
eval/steps_per_second,3.769
train/epoch,3.28
train/global_step,495.0
train/learning_rate,2e-05
train/loss,1.1007


# Evaluation

## Подгрузка нужной модели

In [10]:
model = AutoModelForCausalLM.from_pretrained(
    "/content/drive/MyDrive/Colab Notebooks/vkr_data/results/checkpoint-90"
).to(device)

In [27]:
model.config.use_cache = True

In [28]:
rouge = load_metric("rouge")
bertscore = load("bertscore")

Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [139]:
test_predictions = wandb.Artifact(
    "base_android_api_usage_question_answer", type="run_table"
)

text_table = wandb.Table(
    columns=[
        "title",
        "question",
        "generated_answer",
        "original_answer",
        "bert_precision",
        "bert_recall",
        "bert_f1",
        "rouge_score",
        "bleu_score",
    ]
)

## Функция генерации ответа на вопрос с promt

In [147]:
promt_before_question = """Question: """
promt_before_answer = "\nAnswer:"

In [148]:
def generate_answer(model, tokenizer, deivce, question, title, temp=0):
    model.eval()

    question_len = len(f"{promt_before_question}{question}{promt_before_answer}")

    text_to_answer = f"{promt_before_question}{question}{promt_before_answer}"

    enc_text_to_answer = tokenizer(text_to_answer, return_tensors="pt").input_ids.to(
        device
    )

    generated_output = model.generate(
        enc_text_to_answer,
        do_sample=False,
        top_k=50,
        top_p=0.9,
        temperature=temp,
        num_return_sequences=0,
        no_repeat_ngram_size=2,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=a_max_length,
    ).to("cpu")

    del enc_text_to_answer

    generated_q_a = tokenizer.decode(generated_output[0], skip_special_tokens=True)

    generated_a = generated_q_a[question_len:]

    return generated_a

In [149]:
print(test_dataset[1][4])

Say for instance you have an application with a landing page and you want this landing page to either show a log in button if your user is not logged in, or display the users name in a text view if the user is indeed logged in.Is it a must to create two slightly different layouts to inflate based on the condition of being logged in? Or is there a way to decide at runtime which layout component will be necessary given the state of the application?If the latter is true, how do you do this?


In [150]:
text = generate_answer(model, tokenizer, device, test_dataset[1][4], test_dataset[1][5])

In [151]:
text

'\nYou can use the same layout for both the cases.\n\nA:\n\n   <LinearLayout\n  android:layout_width="match_parent"\nandroid:orientation="horizontal" >\n\n <TextView\n android :id="@+id/textView1" \n style="?android :attr/borderlessButtonStyle" />\n\n </Linet\n>\n</LinerLayout>\n\n'

In [152]:
print(test_dataset[1][3])

If it's just as simple as showing a <code>TextView</code> instead of a <code>Button</code> why not just use <code>setVisibilty</code> on each of them depending on your situation. If you have a more complicated layout then I would suggest having two separate layouts and inflating the correct one. Or as @ataulm suggested use separate activities


In [153]:
bert_score = bertscore.compute(
    predictions=[text], references=[test_dataset[1][3]], lang="en"
)
bert_score

{'precision': [0.7891825437545776],
 'recall': [0.8157892227172852],
 'f1': [0.8022653460502625],
 'hashcode': 'roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.24.0)'}

## Проверка качества модели

In [131]:
bleu_score = nltk.translate.bleu("what is weather?", "what is", weights=(1, 0, 0, 0))

In [132]:
bleu_score

1.0

In [154]:
def evaluate(model, test_dataset, tokenizer, device, text_table, wandb_run):
    model.eval()

    bleu_scores = []
    rouge_scores = []
    bert_precisions = []
    bert_recalls = []
    bert_f1s = []

    for _, _, _, answer, question, title in tqdm(test_dataset):

        generated_a = generate_answer(model, tokenizer, device, question, title, 0.0)

        bert_score = bertscore.compute(
            predictions=[generated_a], references=[answer], lang="en"
        )

        bleu_score = sentence_bleu(answer, generated_a, weights=(1, 0, 0, 0))
        rouge_score = rouge.compute(predictions=[generated_a], references=[answer])[
            "rouge1"
        ].mid.fmeasure

        text_table.add_data(
            title,
            question,
            generated_a,
            answer,
            bert_score["precision"][0],
            bert_score["recall"][0],
            bert_score["f1"][0],
            rouge_score,
            bleu_score,
        )

        bleu_scores.append(bleu_score)
        rouge_scores.append(rouge_score)
        bert_precisions.append(bert_score["precision"][0])
        bert_recalls.append(bert_score["recall"][0])
        bert_f1s.append(bert_score["f1"][0])

    test_predictions.add(text_table, "gpt_neo_eval_asking_promt_200")
    wandb_run.log_artifact(test_predictions)

    return (
        np.mean(bleu_scores),
        np.mean(rouge_scores),
        np.mean(bert_precisions),
        np.mean(bert_recalls),
        np.mean(bert_f1s),
    )

In [155]:
bleu_score, rouge_score, bert_precision, bert_recall, bert_f1 = evaluate(
    model, test_dataset, tokenizer, device, text_table, run
)

  0%|          | 0/107 [00:00<?, ?it/s]

In [156]:
print(bleu_score, rouge_score, bert_precision, bert_recall, bert_f1)

0.07287170083490525 0.16138384489567162 0.7872182782565322 0.8276405958371742 0.80613757683852


In [157]:
wandb.finish()

VBox(children=(Label(value='0.201 MB of 0.201 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…