# Проверка качества

In [1]:
import os
import sys
import json
import warnings

from enum import Enum
from collections import defaultdict

# Extend the import path with the parent directory and old_configs/; the
# cells below import `src.*` and `gpt_config` from these locations.
sys.path.append('../')
sys.path.append('old_configs/')
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Evict a cached copy of the module so that a later import re-executes it.
try:
    sys.modules.pop('src.modeling.autocompletion')
except KeyError:
    # Nothing was cached yet. Only KeyError is expected from dict.pop, so
    # KeyboardInterrupt/SystemExit are no longer swallowed here.
    print('cant do')

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from catalyst.utils import load_checkpoint, unpack_checkpoint

from src.preprocessing.preprocessing import LexerBasedPreprocessor
from src.preprocessing.tokenization import SentencepieceTokenizerWrapper
from src.generation.generation_utils import (
    BiTokenScoresPostprocessor,
    TokenScoresPostprocessor,
    NextTokenChooser
)
from src.generation.autocompletion import AutocompletionModel, BiAutocompletionModel
from src.utils.metrics import reciprocal_rank, relevant_in_k

from src.modeling.gpt2_config_initializer import GPT2ConfigInitializer
from src.modeling.bi_gpt2_config_initializer import BiGPT2ConfigInitializer

cant do


In [2]:
# Device string the model is moved to later in the notebook.
CUDA_DEVICE = 'cuda:4'
# CUDA_LAUNCH_BLOCKING=1 asks CUDA to run kernel launches synchronously, which
# makes device-side errors show up at the call that caused them.
os.environ.update({'CUDA_LAUNCH_BLOCKING': '1'})

In [3]:
# Preprocessor setup: load the token list from JSON and hand it to the
# lexer-based preprocessor as its set of protected names.
DATA_DIR = '/mnt/data/popov/rcompletion/evaluation_data/'
with open('/home/popov/data/rcompletion_files/december2020_best_model/top_tokens_bigdata_021020.json', 'r') as tokens_file:
    # Deduplicate while loading; the preprocessor receives a set.
    top_tokens = set(json.load(tokens_file))
preprocesser = LexerBasedPreprocessor(protected_names=top_tokens)
# Keep a direct handle on the lexer the preprocessor owns.
lexer = preprocesser.lexer

In [4]:
# Tokenizer setup: wrap the SentencePiece model stored at a fixed path.
# NOTE(review): base_path is not referenced by any cell visible here; confirm
# whether it is still needed.
base_path = '/mnt/disk/shared/popov/data/rcompletion/bigdata_ver1/'
spm_model_path = '/mnt/data/porkhun/tokenizer/spm_cased_bpe_16.model'
tokenizer = SentencepieceTokenizerWrapper(spm_model_path)

In [5]:
# Score postprocessor built with temperature=1.
# NOTE(review): the cells visible below construct their own postprocessors
# (temperature=1.5, penalty_theta=0.5) and never read this one; confirm
# whether it is still needed.
score_postprocesser = TokenScoresPostprocessor(temperature=1)

In [6]:
class Model(Enum):
    """Tag for the model architecture selected from the config."""
    # Chosen when the config's TYPE_MODEL is 'GPT2'.
    GPT = 1
    # Chosen when the config's TYPE_MODEL is 'BiGPT2'.
    BiGPT = 2

In [7]:
from gpt_config import Config as gpt_config

# Supported TYPE_MODEL values mapped to (initializer class, Model tag).
_MODEL_REGISTRY = {
    'GPT2': (GPT2ConfigInitializer, Model.GPT),
    'BiGPT2': (BiGPT2ConfigInitializer, Model.BiGPT),
}

_registry_entry = _MODEL_REGISTRY.get(gpt_config.TYPE_MODEL)
if _registry_entry is None:
    # Any other TYPE_MODEL value is rejected outright.
    raise ValueError('Strange model type')
_initializer_cls, _model_tag = _registry_entry

# Build the model from the config, then record which architecture it is.
initializer = _initializer_cls(gpt_config)
model = initializer.init_model()
model_type = _model_tag

# Load a checkpoint into the model only when the config names one.
if gpt_config.CHECKPOINT_PATH:
    checkpoint = load_checkpoint(gpt_config.CHECKPOINT_PATH)
    unpack_checkpoint(checkpoint=checkpoint, model=model)

In [8]:
# model = model.student_model
# NOTE(review): the line above is intentionally left disabled; presumably it
# is for checkpoints that wrap a student model. Confirm before enabling.
# Call eval() on the model, then move the result to the chosen device.
model = model.eval().to(CUDA_DEVICE)

In [9]:
def create_df(directory):
    """Load a JSON-lines file into a DataFrame, one row per record.

    Args:
        directory: Path to the JSON-lines file. Despite the name this is a
            file path, not a directory. Every non-blank line must hold one
            JSON object.

    Returns:
        A ``pd.DataFrame`` with one row per record and one column per key.
        A record that lacks a key gets a missing value in that column,
        instead of shifting the values of later records.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8; do not depend on the locale's default encoding.
    with open(directory, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. an extra empty line at the end).
            if not line.strip():
                continue
            records.append(json.loads(line))
    # Building from records keeps rows aligned when key sets differ.
    return pd.DataFrame(records)

In [10]:
# Pick the evaluation file for the model type: the bidirectional model reads
# the file that also carries the right context.
if model_type == Model.BiGPT:
    events_file = 'extracted_events_with_right_context.json'
elif model_type == Model.GPT:
    events_file = 'extracted_events.json'
df = create_df(f'{DATA_DIR}{events_file}')
# Show the first rows as a quick sanity check of the loaded data.
df.head()

Unnamed: 0,url,before_cursor,after_cursor,after_cursor_token,group,prefix
0,https://api.github.com/repos/CenterForStatisti...,########################\n# ...,otlist,plotlist,f_key_argument,prefix
1,https://api.github.com/repos/VUW-FAIR/tic-pers...,"setwd(""/Users/mlr/OneDrive - Victoria Universi...",thod,method,f_key_argument,prefix
2,https://api.github.com/repos/jayhesselberth/gg...,"context(""ggsave"")\n\ntest_that(""ggsave creates...",cale,scale,f_key_argument,prefix
3,https://api.github.com/repos/gtesei/fast-furio...,require(xgboost)\nrequire(methods)\nlibrary(da...,op,drop,f_key_argument,prefix
4,https://api.github.com/repos/HadiEO/tropical_r...,require(ggplot2)\nrequire(tidyverse)\nrequire(...,ir,nir,f_key_argument,prefix


Для измерения зависимости от итерации:

In [11]:
# Value passed to the autocompletion model as input_lines_to_keep.
lines_to_keep = 100

# Constructor arguments shared by both autocompletion model classes.
params = dict(
    preprocessor=preprocesser,
    tokenizer=tokenizer,
    model=model,
    next_token_chooser=NextTokenChooser(do_sample=False),
    max_tokens_amount=5,
    num_beams=5,
    max_num_sequence_return=20,
    input_lines_to_keep=lines_to_keep,
)

# Both score postprocessors are configured with the same settings.
postprocessor_settings = {'temperature': 1.5, 'penalty_theta': 0.5}

# Pair each model type with its own wrapper and score postprocessor.
if model_type == Model.BiGPT:
    autocompletion_model = BiAutocompletionModel(
        score_postprocesser=BiTokenScoresPostprocessor(**postprocessor_settings),
        **params
    )
elif model_type == Model.GPT:
    autocompletion_model = AutocompletionModel(
        score_postprocesser=TokenScoresPostprocessor(**postprocessor_settings),
        **params
    )

In [None]:
%%time
# NOTE(review): these two dicts are not filled by any cell visible here; they
# are kept in case later cells rely on them.
all_real_outputs = dict()
all_model_outputs = dict()

# Parallel lists: model_outputs[j] holds the suggestions for the sample whose
# expected token is real_outputs[j]. Failed samples appear in neither list.
model_outputs = []
real_outputs = []
# Row indexes for which generation raised, plus the reason for each failure,
# so that skipped samples can be inspected afterwards.
bad_indexes = []
bad_index_errors = {}

# The model type is fixed for the whole run, so decide the input shape once.
needs_right_context = model_type == Model.BiGPT

# iterrows() is a generator without a length; pass the total explicitly so
# the progress bar can show completion and an ETA.
for i, elem in tqdm(df.iterrows(), total=len(df)):
    if needs_right_context:
        test_sample = (elem['before_cursor'], elem['right_context'])
    else:
        test_sample = elem['before_cursor']
    try:
        one_model_outputs = autocompletion_model.autocomplete_input(
            test_sample,
            # Rows marked 'prefix' pass 'always', all other rows 'never'.
            drop_last_word='always' if elem['prefix'] == 'prefix' else 'never',
        )
        one_real_output = elem['after_cursor_token']
    except Exception as exc:
        # Best-effort evaluation: skip the sample but remember why it failed.
        # Re-raise here instead when debugging a specific failure.
        bad_indexes.append(i)
        bad_index_errors[i] = repr(exc)
    else:
        # Append both together so the two lists stay aligned.
        model_outputs.append(one_model_outputs)
        real_outputs.append(one_real_output)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [None]:
# Number of samples whose autocompletion call raised and which were skipped.
len(bad_indexes)

In [None]:
# Score the suggestions against the expected tokens. Samples that raised
# during generation are absent from real_outputs/model_outputs, so they do
# not contribute to the averages below.
real_o = real_outputs
model_o = model_outputs

# One 0/1 list per sample: 1 where a suggestion equals the expected token.
relevances = []
for expected_token, suggestions in zip(real_o, model_o):
    relevances.append([int(candidate == expected_token) for candidate in suggestions])

# Per sample: relevant_in_k for k = 1..5 followed by reciprocal_rank.
key_metrics = []
for hits in relevances:
    if hits:
        row = [relevant_in_k(hits, k=k) for k in range(1, 6)]
        row.append(reciprocal_rank(hits))
    else:
        # No suggestions at all for this sample: zero on every metric.
        row = [0] * 6
    key_metrics.append(row)

# Column-wise mean over samples gives the six aggregate metrics.
key_metrics = np.array(key_metrics).mean(axis=0)
key_metrics