# Investigation of nearest neighbours performance

In [287]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [288]:
from torchtext.data import Dataset, Field
from torchtext import data
from datasets.CommitMessageGenerationDataset import CommitMessageGenerationDataset
from datasets.CodeChangesDataset import CodeChangesTokensDataset
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import torch
import os
from neural_editor.seq2seq.train_utils import rebatch
from sklearn.neighbors import NearestNeighbors
from neural_editor.seq2seq.experiments.BleuCalculation import BleuCalculation

In [289]:
# Device used below as `map_location` when loading the checkpoint and as the
# `device` of the torchtext iterator.
DEVICE = torch.device('cpu')

In [290]:
# Settings dictionary handed to the project's dataset loaders, to `rebatch`
# and to `BleuCalculation`. `load_dataset` later adds the dataset root paths.
config = {
    'TOKEN_MIN_FREQ': 1,
    'TOKENS_CODE_CHUNK_MAX_LEN': 121,
    'MSG_MAX_LEN': 30,
    'LOWER': True,
    'LOWER_COMMIT_MSG': True,
    'UNK_TOKEN': "<unk>",
    'PAD_TOKEN': "<pad>",
    'SOS_TOKEN': "<s>",
    'EOS_TOKEN': "</s>",
    'REPLACEMENT_TOKEN': 'замена',
    'DELETION_TOKEN': 'удаление',
    'ADDITION_TOKEN': 'добавление',
    'UNCHANGED_TOKEN': 'равенство',
    'PADDING_TOKEN': 'паддинг',
    'LEAVE_ONLY_CHANGED': True,
    # Reuse the notebook-wide DEVICE so the two settings cannot drift apart.
    'DEVICE': DEVICE,
    # Resolved against the working directory instead of a user-specific
    # absolute path. Assumes the notebook is run from the repository root,
    # which the project imports and the other relative paths in this notebook
    # also rely on — TODO confirm for other setups.
    'BLEU_PERL_SCRIPT_PATH': os.path.abspath(
        os.path.join('neural_editor', 'seq2seq', 'experiments', 'multi-bleu.perl')
    ),
}

In [291]:
def load_dataset(dataset_path: str, dataset_path_commit: str):
    """Load both project datasets and the raw commit messages stored next to them.

    Side effect: writes the two paths into the global ``config`` under
    ``'DATASET_ROOT'`` and ``'DATASET_ROOT_COMMIT'`` before the loaders are called.

    :param dataset_path: directory containing ``train``/``val``/``test`` sub-directories;
        stored as ``config['DATASET_ROOT']``.
    :param dataset_path_commit: directory with the same layout; stored as
        ``config['DATASET_ROOT_COMMIT']``.
    :return: ``(data, messages)`` where ``data[split]`` is the pair
        ``(CodeChangesTokensDataset split, CommitMessageGenerationDataset split)`` and
        ``messages[split]`` is a two-element list holding the stripped lines of
        ``msg.txt`` from ``dataset_path`` and from ``dataset_path_commit``, in that order.
    """
    splits = ('train', 'val', 'test')

    def read_messages(root: str, split: str):
        # One commit message per line; surrounding whitespace is dropped.
        with open(os.path.join(root, split, 'msg.txt'), mode='r', encoding='utf-8') as msg_file:
            return [line.strip() for line in msg_file]

    config['DATASET_ROOT'] = dataset_path
    config['DATASET_ROOT_COMMIT'] = dataset_path_commit

    # The field returned by the first loader is passed on to the second one.
    ne_train, ne_val, ne_test, diffs_field = CodeChangesTokensDataset.load_data(True, config)
    cmg_train, cmg_val, cmg_test, fields_commit = CommitMessageGenerationDataset.load_data(
        diffs_field, True, config)

    datasets_by_split = {
        'train': (ne_train, cmg_train),
        'val': (ne_val, cmg_val),
        'test': (ne_test, cmg_test),
    }

    messages_by_split = {split: [] for split in splits}
    # Outer loop over roots so every split lists dataset_path first, dataset_path_commit second.
    for root in (dataset_path, dataset_path_commit):
        for split in splits:
            messages_by_split[split].append(read_messages(root, split))

    return datasets_by_split, messages_by_split

In [293]:
# Common parent directory of the two dataset variants loaded below.
JIANG_FILTERED_PART_DATASET_PATH = '../embeddings-for-code-diffs-data/datasets/commit_message_generation/Jiang/filtered_dataset/partitioned/'
# Index 0 of every split comes from 'neural_editor', index 1 from 'commit_message_generator'.
JIANG_FILTERED_PART_DATA, JIANG_FILTERED_PART_MESSAGES = load_dataset(
    dataset_path=JIANG_FILTERED_PART_DATASET_PATH + 'neural_editor',
    dataset_path_commit=JIANG_FILTERED_PART_DATASET_PATH + 'commit_message_generator',
)

Data set sizes (number of sentence pairs):
train 10402
valid 1214
test 1160 

Max sequence length in tokens: 94 

First training example:
src: mmm a / changelog . md <nl> * storm - 464 : simulated time advanced after test cluster exits causes intermittent test failures <nl> * storm - 463 : added static version of metrics helpers for config <nl> * storm - 376 : add compression to serialization <nl> # # 0 . 9 . 2 - incubating <nl> * storm - 66 : send taskid on initial handshake <nl>
trg: ppp b / changelog . md <nl> * storm - 464 : simulated time advanced after test cluster exits causes intermittent test failures <nl> * storm - 463 : added static version of metrics helpers for config <nl> * storm - 376 : add compression to serialization <nl> * storm - 437 : enforce utf - 8 when multilang reads from stdin <nl> # # 0 . 9 . 2 - incubating <nl> * storm - 66 : send taskid on initial handshake <nl>
diff_alignment: замена замена добавление добавление добавление добавление добавление добавление д

In [294]:
def load_ne_model_from_experiment(experiment):
    """Load the ``model_best_on_validation_neural_editor.pt`` checkpoint of an experiment.

    :param experiment: identifier interpolated into the ``experiment_<id>`` directory name.
    :return: the object ``torch.load`` deserializes from the checkpoint, mapped onto ``DEVICE``.
    """
    checkpoint_path = (
        '../embeddings-for-code-diffs-data/'
        f'experiment_{experiment}/'
        'model_best_on_validation_neural_editor.pt'
    )
    # NOTE: torch.load unpickles the file, so only point this at trusted checkpoints.
    return torch.load(checkpoint_path, map_location=DEVICE)

In [295]:
# Checkpoint from the `experiment_107` directory; used for all feature extraction below.
MODEL_E107 = load_ne_model_from_experiment('107')



In [296]:
def extract_features(model, dataset):
    """Encode every example of ``dataset`` with ``model`` and return one feature row per example.

    For each batch the model's ``encode`` output is used: the first elements of
    ``edit_final`` and ``encoder_final`` are concatenated along the last dimension,
    and the last slice along the first dimension (presumably the top recurrent
    layer — TODO confirm) is taken as the feature vector of each example.

    Inference runs under ``torch.no_grad()`` and, if the model reports
    ``training == True``, temporarily in eval mode, so that no autograd graph is
    built and stochastic layers such as dropout cannot make the features vary
    between runs. The model's previous mode is restored afterwards.

    :param model: object exposing ``encode(batch)`` that returns
        ``(edit_final, encoder_output, encoder_final)``.
    :param dataset: torchtext dataset with a ``'src'`` field and ``src``/``trg``
        attributes on its examples.
    :return: ``np.ndarray`` with ``len(dataset)`` rows; row ``k`` belongs to the
        example whose id (as reported in ``batch.ids``) is ``k``.
    """
    from tqdm.auto import tqdm

    features = [None] * len(dataset)
    # Take the pad token from the shared config rather than repeating the literal.
    pad_index = dataset.fields['src'].vocab.stoi[config['PAD_TOKEN']]
    batch_iterator = data.Iterator(dataset, batch_size=64, train=False,
                                   shuffle=False,
                                   sort=False,
                                   sort_within_batch=True,
                                   sort_key=lambda x: (len(x.src), len(x.trg)),
                                   device=DEVICE)
    # Materialised up front so that tqdm knows the number of batches.
    batches = [rebatch(pad_index, batch, dataset, config) for batch in batch_iterator]

    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            for batch in tqdm(batches):
                edit_final, _encoder_output, encoder_final = model.encode(batch)
                combined = torch.cat((edit_final[0], encoder_final[0]), dim=-1)
                # .cpu() keeps this working if DEVICE is ever switched to a GPU.
                batch_features = combined[-1].detach().cpu().numpy()
                # Batches are sorted internally, so rows are written back by example id.
                for row, example_id in enumerate(batch.ids):
                    features[example_id] = batch_features[row]
    finally:
        if was_training:
            model.train()
    return np.array(features)

In [297]:
# Feature matrix of the training split; index 0 selects the CodeChangesTokensDataset
# member of the pair built in `load_dataset`.
X_TRAIN = extract_features(MODEL_E107, JIANG_FILTERED_PART_DATA['train'][0])

HBox(children=(IntProgress(value=0, max=163), HTML(value='')))




In [298]:
# Training commit messages: lines of train/msg.txt under the first dataset path (index 0).
Y_TRAIN = JIANG_FILTERED_PART_MESSAGES['train'][0]

In [299]:
# Sanity check: (number of training examples, feature dimension).
X_TRAIN.shape

(10402, 288)

In [300]:
# Exact (brute-force) 1-nearest-neighbour index over the training features.
NEAREST_NEIGHBOUR = NearestNeighbors(algorithm='brute', n_neighbors=1)
NEAREST_NEIGHBOUR.fit(X_TRAIN)

In [301]:
# Test features and messages are taken from index 1 (the CommitMessageGenerationDataset
# member / the second dataset path), whereas X_TRAIN and Y_TRAIN use index 0.
# NOTE(review): confirm that mixing the two dataset variants is intentional.
X_TEST = extract_features(MODEL_E107, JIANG_FILTERED_PART_DATA['test'][1])
# Not referenced by the cells visible below; BLEU is computed against the dataset object instead.
Y_TEST = JIANG_FILTERED_PART_MESSAGES['test'][1]

HBox(children=(IntProgress(value=0, max=19), HTML(value='')))




In [302]:
# For every test example, predict the lower-cased, whitespace-tokenised message of its
# nearest training neighbour. Each prediction is wrapped in a one-element list.
Y_PRED = [
    [Y_TRAIN[neighbour_ids[0]].lower().split()]
    for neighbour_ids in NEAREST_NEIGHBOUR.kneighbors(X_TEST)[1]
]

In [286]:
# NOTE(review): this cell's execution count (286) is lower than that of every cell above,
# so its saved output was produced from earlier kernel state and is not reproduced by
# running the notebook top to bottom.
BleuCalculation(config).conduct(Y_PRED, JIANG_FILTERED_PART_DATA['test'][1], 'NB dataset')

Start conducting BLEU calculation experiment for NB dataset...
b'BLEU = 18.92, 28.8/20.3/17.4/16.7 (BP=0.933, ratio=0.935, hyp_len=21382, ref_len=22871)\n'
Errors: b''


In [303]:
# BLEU of the nearest-neighbour predictions against the test split (index 1).
# NOTE(review): the label says 'E95' while the features come from MODEL_E107 — confirm the label.
BleuCalculation(config).conduct(Y_PRED, JIANG_FILTERED_PART_DATA['test'][1], 'NB dataset E95')

Start conducting BLEU calculation experiment for NB dataset E95...
b'BLEU = 40.31, 44.8/39.5/39.3/40.8 (BP=0.981, ratio=0.982, hyp_len=9176, ref_len=9348)\n'
Errors: b''


In [317]:
# Pair of arrays for the test set: NB[0] is used below as the distances to the nearest
# training point, NB[1] as the indices of those training points.
NB = NEAREST_NEIGHBOUR.kneighbors(X_TEST)

In [318]:
# Training indices of neighbours that lie at distance exactly 0 from a test example.
print(NB[1][NB[0] == 0])
# Replace the prediction of every such test example with a junk token, so that the
# following BLEU run shows how much of the score is due to exact feature matches.
# The distances and indices already stored in NB are reused instead of running the
# brute-force search again, and the original prediction is re-derived from Y_TRAIN
# so that re-running this cell prints the same messages rather than the placeholder
# written by an earlier run.
for i, (d, train_idx) in enumerate(zip(NB[0][:, 0], NB[1][:, 0])):
    if d == 0:
        print([Y_TRAIN[train_idx].lower().split()])
        Y_PRED[i] = [['gkldsflkjgdfncvbkjn2314234']]
        print(i, end=' ')

[ 3059  9485  7133  3974  5570  3065  4722  4536 10080 10309  8941]
[['_']]
13 [['_']]
125 [['_']]
208 [['_']]
312 [['_']]
345 [['_']]
502 [['_']]
765 [['_']]
779 [['_']]
781 [['_']]
864 [['_']]
911 

In [319]:
# Same BLEU evaluation as before, now with the zero-distance predictions replaced by the
# placeholder token in the preceding cell.
# NOTE(review): the label says 'E95' while the features come from MODEL_E107 — confirm the label.
BleuCalculation(config).conduct(Y_PRED, JIANG_FILTERED_PART_DATA['test'][1], 'NB dataset E95')

Start conducting BLEU calculation experiment for NB dataset E95...
b'BLEU = 39.66, 44.4/39.1/38.9/40.5 (BP=0.975, ratio=0.975, hyp_len=9118, ref_len=9348)\n'
Errors: b''


In [320]:
# Number of predictions, i.e. one per row of X_TEST.
len(Y_PRED)

1161