# Общая информация:
__Задачи, решаемые в ноутбуке:__

1) Сделать эвалуацию базовой GPT neo на случайно из data part 4, включающей 105 и 500 экземпляров (API Usage category, <=200 length)

2) Отобрать экземпляров, имеющих наименьшее косинусное расстояние Q_title и Q_Body

3) Сделать эвалуацию базовой модели на отобранной выборке

4) Сравнить результат

5) Сделать кластрезицию на основе BERToflow embedding

# Импорт библиотек

In [11]:
if 'google.colab' in str(get_ipython()):
    !pip install bert_score datasets rouge_score evaluate pynvml transformers wandb sentence-transformers
    
    !wget https://raw.githubusercontent.com/Myashka/Diploma/master/Notebooks/utils.py

    from utils import *

    from google.colab import drive
    drive.mount("/content/drive")
else:
    import sys
    sys.path.append(r"D:\vkr\Notebooks")

    from utils import *

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
--2022-11-16 23:25:11--  https://raw.githubusercontent.com/Myashka/Diploma/master/Notebooks/utils.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8246 (8.1K) [text/plain]
Saving to: ‘utils.py.2’


2022-11-16 23:25:11 (67.1 MB/s) - ‘utils.py.2’ saved [8246/8246]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_colwidth", None)

from tqdm.auto import tqdm
import torch
from transformers import (AutoModelForTokenClassification, AutoTokenizer, AutoModelForCausalLM)
from sentence_transformers import SentenceTransformer

In [2]:
torch.manual_seed(42)
np.random.seed(42)

# Чтение данных

In [5]:
df = pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/vkr_data/df_200.csv')
df = df.drop(columns=['Unnamed: 0'])
df = df.reset_index(drop=True)

In [6]:
df = df.sort_values('t_q_BERT_dist', ascending=False)
#df = df.sort_values('t_q_MPnet_dist', ascending=False)

In [7]:
samples = 105

In [None]:
eval_df = df.sample(samples)
eval_df = eval_df.reset_index(drop=True)

In [8]:
# GOOD samples
eval_df = df.head(samples)

In [None]:
# BAD samples
eval_df = df.tail(samples)

In [3]:
device = torch.device("cuda") if torch.cuda.is_available else "cpu"

# SO MPnet embeddings

In [None]:
def get_embeddings(df, model, tokenizer, column, device):
    model.to(device)
    batch_size = 100
    rows = df[f'{column}']
    max_length = max([len(tokenizer.encode(row)) for row in rows])
    if max_length > 512:
        max_length = 512
      
    embeddings = torch.empty((len(df), 768)).to(device)

    with torch.no_grad():
        for i in tqdm(range(0, len(embeddings), batch_size)):
            tokenized_batch = tokenizer.batch_encode_plus(rows[i: i+batch_size].tolist(), add_special_tokens=True, padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")

            tokenized_batch = tokenized_batch.to(device)
    
            last_hidden_states = model(**tokenized_batch)

            emb_batch = last_hidden_states[0][:,0,:]

            embeddings[i: i+batch_size] = emb_batch
    return embeddings.detach().cpu()

In [None]:
mp_tokenizer = AutoTokenizer.from_pretrained("flax-sentence-embeddings/stackoverflow_mpnet-base")
mp_model = AutoModelForTokenClassification.from_pretrained("flax-sentence-embeddings/stackoverflow_mpnet-base")
mp_model = mp_model.mpnet

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of MPNetForTokenClassification were not initialized from the model checkpoint at flax-sentence-embeddings/stackoverflow_mpnet-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
q_emb_mp = get_embeddings(df, mp_model, mp_tokenizer, 'Q_Body', device)

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
title_emb_mp = get_embeddings(df, mp_model, mp_tokenizer, 'Q_Title', device)

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
mp_cos_dist = torch.nn.functional.cosine_similarity(q_emb_mp, title_emb_mp, dim=1)

In [None]:
df['t_q_MPnet_dist'] = pd.Series(list(mp_cos_dist), index=df.index)

In [None]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/vkr_data/df_200.csv')

# BERToflow embeddings

In [None]:
bert_tokenizer = AutoTokenizer.from_pretrained("lanwuwei/BERTOverflow_stackoverflow_github")
bert_model = AutoModelForTokenClassification.from_pretrained("lanwuwei/BERTOverflow_stackoverflow_github")
bert_model = bert_model.bert

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.24k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/660k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/156 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/596M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at lanwuwei/BERTOverflow_stackoverflow_github and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
bert_question_emb = get_embeddings(df, bert_model, bert_tokenizer, 'Q_Body', device)

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
bert_title_emb = get_bert_embeddings(df, bert_model, bert_tokenizer, 'Q_Title', device)

  0%|          | 0/8 [00:00<?, ?it/s]

In [None]:
bert_cos_dist = torch.nn.functional.cosine_similarity(bert_question_emb, bert_title_emb, dim=1)

In [None]:
df['t_q_BERT_dist'] = pd.Series(list(bert_cos_dist), index=df.index)

In [None]:
df = df.sort_values('t_q_BERT_dist', ascending=False)

In [None]:
df.to_csv('/content/drive/MyDrive/Colab Notebooks/vkr_data/df_200.csv')

## Cosine similarity между BERT И MPnet embeddings

In [None]:
df

Unnamed: 0,Id,Q_date_open,Q_Score,Q_Title,Q_Body,A_Score,A_Body,Tag,Q_len,A_len,API_CHANGE,API_USAGE,CONCEPTUAL,DISCREPANCY,DOCUMENTATION,ERRORS,REVIEW,t_q_BERT_dist,t_q_MPnet_dist
695,11947640,2012-08-14 07:27:37+00:00,16,"how to use ""Select debug app"" and ""wait for debugger"" new feature in jelly bean?","all""Select debug app"" and ""wait for debugger"" are new feature in Jelly Bean, does someone know how to use these new feature? what do they do?Thanks",2.0,"It is a mystery to me also, but I'll share my experience. I cannot see that these options change the phone's behavior. Regardless of the settings I choose, the Galaxy Nexus behaves like my older phones that do not have the settings.""Waiting for debugger"" is a dialog that I see whenever I start running an app from Eclipse. It stays up for a while, and then my app starts running. This has been the case since I started Android development, and it still works that way on my old and new devices, regardless of the setting.If I click on ""Select Debug App"", it allows me to select an app from the already installed apps on my phone that allow debugging, which are the apps that I wrote myself.On my Galaxy Nexus it does not matter whether I select ""Nothing"" or one of my debuggable apps, I can still debug any app from Eclipse. I cannot even see a difference in time it takes to load an app over the ADB connection.","['android', 'android-4.2-jelly-bean']",28,179,0,1,0,0,0,0,0,tensor(0.9910),tensor(0.8872)
692,4397360,2010-12-09 10:46:15+00:00,0,android: how to access from activity A a non static public function of activity B at runtime?,Does anyone know in android how you can call a non static public function of an activity from another activity ?example : activity A want to access public function toto of activity Bthanks,1.0,"case 1: if you are executing yours activitys into an activitygroup, then you can obtain the interested activity instance from LocalActivityManager doing something like this YouractivitygetLocalactivityManager.getActivityyourActivityId.performYourMethod...Case 2: if you aren't running your activitys into an activitygroup, then you can use the activity B method through broadcast, is other way to perform inter activity communication.Cheer","['android', 'function', 'android-activity']",33,59,0,1,0,0,0,0,0,tensor(0.9900),tensor(0.7121)
651,17446460,2013-07-03 11:12:26+00:00,0,get how much time i have before the battery goes to 0% Android,Can i get how much time i have before the battery goes to 0%? Something using a CountDownTimer and making a stime of how much mAh the battery consume? Someone can help me to do it?,1.0,"Short answer: No.Long answer: Yes, you can.. But not that easily.. Because you don't know how much energy does the device consume at a time basis, you should first try to find that out and then calculate the remaining time using that data... As you might have guessed already, it might not be that accurate at first but the more you sample, the more accurate that data will be..So to conclude here is what you have to do:First when the user opens the app, because you don't have enough data about energy consumption, You should just display something like Calculating.... Here you should decide to have a sample time. That is something like derivation in math.. Let's say you have a sample time of 10 seconds. Your actual one should be bigger I guess. You should have a variable named something like level1.. Save the current level to it. Wait for the amount of sample time which is now 10 seconds.. Now save the current value to level2.. Save the level1 - level2 product to another variable like consumed.Now here is the magic. You know how much power has been consumed in 10 seconds That is the consumed variable.. From now on you can calculate the remaining time using that data. But remember it's still one 10 second. What if that 10 seconds user was playing a heavy game or even the device was on stand by? That is why it's not accurate. But until now.. What you can do is to add the consumed variable to an array named like sample_data. Let the process go on and sample the consumption every 10 seconds and add it to the array. The more data you get, the more accurate your calculation will be.Then you can easily calculate the time remaining using the average of samples in the array.. Easy Peasy :p","['android', 'countdowntimer', 'batterylevel']",36,313,0,1,0,0,0,0,0,tensor(0.9893),tensor(0.8360)
604,3306390,2010-07-22 06:46:42+00:00,8,How to send email with attachment using GmailSender in android,I want to know about how to send email with attachment using GmailSender in android.,3.0,"this is gmailsender,but i want to get flag when username and password enter by user and check from gmail db that user is valid or not.",['android'],15,26,0,1,0,0,0,0,0,tensor(0.9891),tensor(0.8438)
690,5906670,2011-05-06 04:08:36+00:00,0,"When playing an Android video, how do you get callbacks when the user changes the position?","If you have a VideoView hooked up to a MediaController, how do you get callbacks to know when the user uses the SeekBar?",1.0,"AFAIK, you don't ""know when the user uses the"" MediaController. You can write your own controller, with your own SeekBar, to manage the user experience.","['android', 'videoview', 'mediacontroller']",23,25,0,1,0,0,0,0,0,tensor(0.9889),tensor(0.6073)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50,5689700,2011-04-16 21:20:26+00:00,1,android destroy service,"How can a user destroy a service outside of the app. I made an app that the user can never ""Exit"" because the main activity disables the ""back"" button, but now my service notification can't be removed there will eventually be conditions on when it is shown, but not right now! I know that android manages the memory when the app is in the background like that, but if I really wanted to close that service, how would this be done?",1.0,You can use the Bindservice.The Service is stoped as the Bindservice.,"['android', 'service', 'destroy']",81,11,0,1,0,0,0,0,0,tensor(0.9288),tensor(0.7562)
486,6059740,2011-05-19 13:51:22+00:00,0,Problem with intent,"I have 2 different class... de first one MainSoup is the main class and this class extends activity.The second class View2 extends View.in View2 class is where i make my OnTouchEvent and my Canvas... I also have a frameLayout with 2 layout... in the first one i put multiple textViews.On top of this first layout i put the second one wich has nothing and here is where i draw with my Canvas and touch events. At this point everythings works just fine.The problems begin when i want to make an intent... I put the intent in de Main class MainSoup: Intent i = new Intentthis, org.me.androidsoup.MainSoup.class; startActivityi; but i dont know how to trigger it since the OnTouchEvent is in the View2 class.And if i try to put it in the View2 class, i have troubles with the startActivity line, It doesnt recognize it and tells me to create a method call startActivity.",3.0,startActivity is a method that requires a context it's actually a method defined by the Context class.Views have a method called getContext that will return the context attached to that view. You could use that for invoking the Intent.Hope it helps.,"['android', 'android-intent']",171,41,0,1,0,1,0,0,0,tensor(0.9284),tensor(0.6075)
10,14850270,2013-02-13 09:29:43+00:00,0,repaint in android,"There is a method in java called canvas.repaint, but there is no such method in Android. Can you please tell me how may I supposed to implement that method in Android? I want my current screen to be repainted/refreshed.I tried invalidate/postInvalidate but couldn't accomplish the same as I am creating views dynamically using JSON and not using .xml files.If I create my own repaint method then what should I write in that method?Thank you in advance.",2.0,"Try calling View.invalidate instead of View.refreshDrawableStateInvalidate will tell the view that all of the pixels in the view need to be redrawn, if you are only updating a smaller area of the view look into the invalidateRect overload for a performance boost.","['android', 'repaint']",76,42,0,1,0,0,0,0,0,tensor(0.9280),tensor(0.7781)
322,5325330,2011-03-16 12:33:40+00:00,4,TextView setText problem,"I have a TextView and my need is to add 10 characters in that TextView by code. But when characters are greater than 10, they should be display in TextView but with 8 character + .. of that charsequence. And when I want to read text of that TextView I must get full charsequence not 8 character + .. . For exampletv.settext""ASDFGHJKLQ"" // this is 10 characters long, so no rule requiredbut tv.settext""ASDFGHJKLOP"" // this is more than 10 characters then it should be display as ASDFGHJK.. in textView, but when I retrieve the value of textview, it should return ASDFGHJKLOP instead of ASDFGHJK.. , so how can it be done.That textView is row of a listview.",1.0,I see only one solution. But maybe there are another solutions.To store you string.If string's length is greater than 10 letters set text of your TextView with 12345678..And if you want to get right text you should take the value of the string and not of the textview.,"['android', 'textview']",119,48,0,1,0,0,0,0,0,tensor(0.9279),tensor(0.5178)


In [None]:
str_1 = 'What is the weather like'
str_2 = 'What is the weather like'

tok_1 = bert_tokenizer(str_1, add_special_tokens=True, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
tok_2 = bert_tokenizer(str_2, add_special_tokens=True, padding='max_length', max_length=512, truncation=True, return_tensors="pt")

tok_1 = tok_1.to(device)
tok_2 = tok_2.to(device)

emb_1 = bert_model(**tok_1)[0][:,0,:]
emb_2 = bert_model(**tok_2)[0][:,0,:]

In [None]:
torch.nn.functional.cosine_similarity(emb_1, emb_2, dim=1)

tensor([1.0000], device='cuda:0', grad_fn=<SumBackward1>)

# Загрузка модели

In [4]:
model_name = "EleutherAI/gpt-neo-1.3B"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [12]:
promt_1 = """You will be asked a question. For this question, you must answer by its title and question body.\nTitle: """
#promt_1 = ''
promt_2 = '\nQuestion body: '
#promt_2 = ''
promt_3 = '\nAnswer:'
promts = [promt_1, promt_2, promt_3]

use_title = True
use_question = True

In [13]:
dataset = Q_A_Dataset(eval_df, tokenizer, *promts, use_title=use_title, use_question=use_question)

# Инициализация в WandB

In [14]:
wandb.login()
run = wandb.init(project="QA specific domain", entity="myashka")

%env WANDB_LOG_MODEL=true
%env WANDB_WATCH=all
%env WANDB_SILENT=true

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmyashka[0m. Use [1m`wandb login --relogin`[0m to force relogin


env: WANDB_LOG_MODEL=true
env: WANDB_WATCH=all
env: WANDB_SILENT=true


In [15]:
run_table_name = 'base_android_api_usage_question_answer_samples'
run_table = wandb.Artifact(f"{run_table_name}", type="run_table")

# Запуск evaluation

In [16]:
evaluator = Evaluator(run_table, model, tokenizer)

In [None]:
eval_table, bleu, rouge, bert_pr, bert_rec, bert_f1 = evaluator.evaluate(dataset, *promts, use_title=use_title, use_question=use_question)

  0%|          | 0/105 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
eval_filename = 'base_bert_good_105_samples_title_question'

In [None]:
run_table.add(eval_table, f"{eval_filename}")
run.log_artifact(run_table)
wandb.finish()

In [None]:
print(bleu, rouge, bert_pr, bert_rec, bert_f1)

0.0807711943967461 0.13015668877684844 0.8122783655212039 0.8324493374143328 0.8214696532204038


In [None]:
print(bleu, rouge, bert_pr, bert_rec, bert_f1)

0.07735358066254513 0.11894398292346023 0.8047025003433228 0.8320511833429337 0.8173794143199921


In [None]:
print(bleu, rouge, bert_pr, bert_rec, bert_f1)

0.07419551054249134 0.13119578324627432 0.8018379024096898 0.8336302467754909 0.8167109506470817
