# Общая информация:
__Задачи, решаемые в ноутбуке:__

1) Сделать оценку (evaluation) базовой GPT-Neo на случайной выборке из data part 4, включающей 500 экземпляров (категория API Usage, длина <= 200)

2) Отобрать экземпляры, имеющие наименьшее косинусное расстояние между Q_Title и Q_Body

3) Сделать оценку (evaluation) базовой модели на отобранной выборке

4) Сравнить результат

5) Сделать кластеризацию на основе эмбеддингов BERTOverflow

# Импорт библиотек

In [14]:
!wget https://raw.githubusercontent.com/Myashka/Diploma/master/Notebooks/Data/utils.py

--2022-11-11 15:25:44--  https://raw.githubusercontent.com/Myashka/Diploma/master/Notebooks/Data/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8256 (8.1K) [text/plain]
Saving to: ‘utils.py’


2022-11-11 15:25:45 (87.6 MB/s) - ‘utils.py’ saved [8256/8256]



In [7]:
#!pip install bert_score datasets rouge_score evaluate pynvml transformers wandb

In [3]:
# Mount Google Drive at /content/drive (Colab only); the dataset is read from there below.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [14]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", None)

from tqdm.auto import tqdm
import torch
from transformers import (AutoModelForTokenClassification, AutoTokenizer, AutoModelForCausalLM)

In [15]:
from utils import *

In [16]:
# Seed the PyTorch and NumPy global RNGs for repeatable runs.
torch.manual_seed(42)
np.random.seed(42)

# Чтение данных

In [17]:
# Load df_200.csv from Drive. The 'Unnamed: 0' column (presumably the index saved
# with the file) is dropped and the rows are renumbered from 0.
df = (
    pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/vkr_data/df_200.csv')
    .drop(columns=['Unnamed: 0'])
    .reset_index(drop=True)
)

In [18]:
# Number of rows of df used for evaluation.
samples = 150

In [19]:
# Draw `samples` random rows for evaluation and renumber them from 0.
# random_state pins the draw, so re-running this cell alone returns the same rows
# instead of depending on how far the global NumPy RNG has already advanced.
eval_df = df.sample(n=samples, random_state=42).reset_index(drop=True)

In [11]:
# Alternative selection: take the first `samples` rows of df instead of a random draw.
# NOTE(review): when the notebook is run top-to-bottom this overwrites the randomly
# sampled eval_df from the previous cell — confirm which selection is intended.
eval_df = df.head(samples)

In [4]:
# Use the GPU when one is present, otherwise fall back to the CPU.
# is_available must be CALLED: the bare function object is always truthy, which
# selected "cuda" even on machines without a GPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Загрузка BERTOverflow

In [None]:
# Load the BERTOverflow tokenizer and keep only the `.bert` encoder of the
# token-classification checkpoint (the classification head is not used).
bertoverflow_checkpoint = "lanwuwei/BERTOverflow_stackoverflow_github"
bert_tokenizer = AutoTokenizer.from_pretrained(bertoverflow_checkpoint)
bert_model = AutoModelForTokenClassification.from_pretrained(bertoverflow_checkpoint).bert

Some weights of BertForTokenClassification were not initialized from the model checkpoint at lanwuwei/BERTOverflow_stackoverflow_github and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def get_bert_embeddings(df, model, tokenizer, column, device, batch_size=100, max_length=512):
    """Embed every text in ``df[column]`` with the position-0 hidden state of ``model``.

    Texts are tokenized batch by batch, padded to the longest sequence of the
    batch and truncated to ``max_length`` tokens. The attention mask produced by
    the tokenizer is forwarded to the model so that padding positions do not
    influence the embeddings (previously the padded ids were fed without a mask).

    Args:
        df: DataFrame holding the texts.
        model: Encoder whose first output is the last hidden state of shape
            (batch, seq_len, hidden). It is moved to ``device`` and put in eval mode.
        tokenizer: Hugging Face-style tokenizer, callable on a list of strings.
        column: Name of the text column in ``df``.
        device: Device on which the forward passes run.
        batch_size: Number of texts per forward pass.
        max_length: Upper bound on the tokenized length of a text.

    Returns:
        A CPU tensor of shape (len(df), hidden); row i is the hidden state at
        position 0 (the [CLS] token for BERT tokenizers) of the i-th text.
    """
    model.to(device)
    model.eval()  # embeddings must not depend on dropout noise
    texts = df[column].tolist()
    if not texts:
        # Nothing to embed: return an empty matrix instead of failing on max([]).
        return torch.empty((0, model.config.hidden_size))

    batch_embeddings = []
    with torch.no_grad():
        for start in tqdm(range(0, len(texts), batch_size)):
            encoded = tokenizer(
                texts[start: start + batch_size],
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
            )
            # Only the current batch is placed on the device; unpacking the dict also
            # passes attention_mask (and token_type_ids, when present) to the model.
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
            last_hidden_state = model(**encoded)[0]
            batch_embeddings.append(last_hidden_state[:, 0, :].cpu())
    return torch.cat(batch_embeddings, dim=0)

In [None]:
# One embedding per row of df, computed from the question body text (column 'Q_Body').
question_emb = get_bert_embeddings(df, bert_model, bert_tokenizer, 'Q_Body', device)

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# One embedding per row of df, computed from the question title (column 'Q_Title').
title_emb = get_bert_embeddings(df, bert_model, bert_tokenizer, 'Q_Title', device)

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
# Row-wise cosine similarity between the body and title embeddings.
# Despite the variable name this is a similarity (larger = closer), not a distance.
cos_dist = torch.nn.functional.cosine_similarity(question_emb, title_emb, dim=1)

In [None]:
# Store the title/body scores as a plain float column, aligned with df's rows by position.
# list(cos_dist) yields 0-d tensors, which pandas keeps as Python objects: the column
# becomes object-dtype and is written to CSV as "tensor(...)" strings.
# NOTE: the values are cosine similarities (higher = title and body are closer).
df['t_q_dist'] = cos_dist.detach().cpu().numpy()

In [None]:
# Order rows by 't_q_dist' in descending order, so the largest scores come first.
# The original index labels are kept (no reset_index here).
df = df.sort_values('t_q_dist', ascending=False)

In [None]:
# NOTE(review): this writes back to the same path the data-loading cell reads, so the
# input file is replaced by the sorted frame with the extra 't_q_dist' column.
# The index is written too (no index=False); the loading cell drops it as 'Unnamed: 0'.
df.to_csv('/content/drive/MyDrive/Colab Notebooks/vkr_data/df_200.csv')

# Загрузка модели

In [5]:
# Load the GPT-Neo 1.3B tokenizer and causal LM. The tokenizer's pad token is set
# to its EOS token, and the model is moved to `device`.
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

In [20]:
# Prompt pieces passed to Q_A_Dataset and evaluator.evaluate.
# Alternative title-based prefix, currently disabled:
#   promt_1 = """You will be asked a question. For this question, you must answer by its title.\nTitle: """
promt_1, promt_2, promt_3 = '', 'Question: ', '\nAnswer:'
promts = [promt_1, promt_2, promt_3]

In [21]:
# Flags selecting which fields go into the prompt. evaluator.evaluate receives the same
# two variables, so they are passed here as well (instead of repeated literals) to keep
# the dataset and the evaluation in sync when a flag is changed.
use_title = False
use_question = True
dataset = Q_A_Dataset(eval_df, tokenizer, *promts, use_title=use_title, use_question=use_question)

# Инициализация в WandB

In [23]:
wandb.login()
run = wandb.init(project="QA specific domain", entity="myashka")

%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
%env WANDB_SILENT=true

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668578183331798, max=1.0…

env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all
env: WANDB_SILENT=true


In [24]:
# Artifact of type "run_table"; the evaluation table is added to it at the end of the notebook.
run_table_name = 'base_android_api_usage_question_answer_samples'
run_table = wandb.Artifact(run_table_name, type="run_table")

# Запуск evaluation

In [25]:
# Evaluator is presumably defined in utils.py (star import); it is given the artifact,
# the causal LM and its tokenizer.
evaluator = Evaluator(run_table, model, tokenizer)

In [None]:
# Run the evaluation over `dataset` with the same prompt pieces and flags used to build it.
# The call returns a results table plus six values unpacked as BLEU, ROUGE and BERTScore
# precision/recall/F1 (meaning inferred from the variable names — confirm against utils.py).
# NOTE(review): the saved output of this cell ends in a ValueError, i.e. the last recorded run did not complete.
eval_table, bleu, rouge, bert_pr, bert_rec, bert_f1 = evaluator.evaluate(dataset, *promts, use_title=use_title, use_question=use_question)

  0%|          | 0/100 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


ValueError: ignored

In [None]:
# Name under which the results table is added to the artifact.
# NOTE(review): the name says "100_samples_title", while this notebook sets samples = 150
# and use_title = False — confirm the name is up to date before logging.
eval_filename = 'base_good_100_samples_title'

In [None]:
# Add the results table to the artifact, log the artifact with the run, then finish the run.
run_table.add(eval_table, eval_filename)
run.log_artifact(run_table)
wandb.finish()