In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [41]:
import os
from pathlib import Path
import torch
from torchtext import data
from neural_editor.seq2seq.train import load_data, load_tufano_dataset
from neural_editor.seq2seq.config import load_config
from neural_editor.seq2seq.train_utils import greedy_decode, remove_eos, lookup_words, calculate_accuracy
from neural_editor.seq2seq.datasets.dataset_utils import take_part_from_dataset
from neural_editor.seq2seq.train_utils import rebatch
import numpy as np

In [3]:
# Directory holding the artifacts of the experiment under analysis (the cells
# below read 'config.pkl' and 'model_best_on_validation.pt' from it).
# The location can be overridden with the RESULTS_ROOT environment variable so
# the notebook is not tied to one machine; the original path stays the default.
RESULTS_ROOT = os.environ.get(
    'RESULTS_ROOT',
    '/home/mikhail/Documents/Development/embeddings-for-code-diffs-data/experiment_20',
)

In [4]:
# Restore the experiment configuration from 'config.pkl' inside RESULTS_ROOT.
# The first positional argument (False) is passed through unchanged; its
# meaning is defined by load_config.
CONFIG = load_config(False, Path(RESULTS_ROOT) / 'config.pkl')

In [5]:
# Load the three dataset splits together with the field whose vocabulary is
# used for token lookups in the cells below.
(train_dataset,
 val_dataset,
 test_dataset,
 diffs_field) = load_data(config=CONFIG, verbose=True)

Data set sizes (number of sentence pairs):
train 8793
valid 1100
test 1098 

First training example:
src: public void METHOD_1 ( ) { TYPE_1 VAR_1 = new TYPE_1 ( ) ; byte [ ] VAR_2 = TYPE_2 . METHOD_2 ( ) ; VAR_1 . METHOD_3 ( VAR_2 , 0 , VAR_2 . length ) ; org.junit.Assert.assertEquals ( STRING_1 , VAR_1 . METHOD_4 ( ) . get ( STRING_2 ) ) ; }
trg: public void METHOD_1 ( ) { TYPE_1 VAR_1 = new TYPE_1 ( ) ; byte [ ] VAR_2 = TYPE_2 . METHOD_2 ( ) ; VAR_1 . METHOD_3 ( VAR_2 , 0 , VAR_2 . length ) ; assertEquals ( STRING_1 , VAR_1 . METHOD_4 ( ) . get ( STRING_2 ) ) ; }
diff_alignment: замена
diff_prev: org.junit.Assert.assertEquals
diff_updated: assertEquals 

Most common words:
         )     194026
         (     193976
         .      92332
         ;      82530
   паддинг      54790
         ,      54180
         {      42036
         }      41868
     VAR_1      41750
  удаление      41184 

First 10 words:
00 <unk>
01 <pad>
02 <s>
03 </s>
04 )
05 (
06 .
07 ;
08 паддинг
09 , 

Special token indices

In [15]:
# Vocabulary indices of the special tokens whose names are stored in the config.
PAD_INDEX, SOS_INDEX, EOS_INDEX, UNK_INDEX = (
    diffs_field.vocab.stoi[CONFIG[token_key]]
    for token_key in ('PAD_TOKEN', 'SOS_TOKEN', 'EOS_TOKEN', 'UNK_TOKEN')
)

In [7]:
# Override the configured device so that the cells below run on the CPU.
# NOTE(review): this writes directly into the private `_CONFIG` attribute,
# presumably because the config object offers no public setter; confirm.
CONFIG._CONFIG['DEVICE'] = torch.device('cpu')

In [9]:
# Load the checkpoint 'model_best_on_validation.pt' from RESULTS_ROOT, remapping
# its tensors onto the configured device.
# NOTE: torch.load unpickles the file, which can execute arbitrary code; only
# load checkpoints that come from a trusted source.
MODEL = torch.load(os.path.join(RESULTS_ROOT, 'model_best_on_validation.pt'), map_location=CONFIG['DEVICE'])

In [50]:
def calculate_unk_tokens(dataset):
    """Count unknown-token occurrences in the source and target sides of ``dataset``.

    The dataset is walked one example per batch. For every example the number
    of positions equal to ``UNK_INDEX`` is counted in ``batch.src[0][0]`` and in
    ``batch.trg[0][0]``, and each sequence contributes ``len(sequence) - 2`` to
    the token total of its side. The resulting rates are printed and the raw
    counts are returned.

    A rate whose denominator is zero (for example for an empty dataset) is
    reported as 0.0 instead of raising.

    :param dataset: dataset accepted by ``data.Iterator`` whose batches expose
        ``src`` and ``trg`` attributes.
    :return: ``((src_unk, src_total), (trg_unk, trg_total),
        (examples_containing_unk, len(dataset)))``, all plain Python ints.
    """
    def ratio(count, total):
        # Guard against empty inputs: no tokens/examples means a rate of 0.0.
        return count / total if total else 0.0

    iterator = data.Iterator(dataset, batch_size=1, train=False,
                             repeat=False,
                             sort=False,
                             device=CONFIG['DEVICE'])
    # Plain ints throughout, so the code below never depends on the
    # accumulators having been turned into tensors by the loop.
    src_unk = 0
    trg_unk = 0
    examples_containing_unk = 0
    src_total = 0
    trg_total = 0
    for batch in iterator:
        # assumes batch.src / batch.trg are (tokens, lengths) pairs and that
        # tokens[0] is the single example of the batch — TODO confirm
        src_tokens = batch.src[0][0]
        trg_tokens = batch.trg[0][0]
        src_unk_here = int((src_tokens == UNK_INDEX).sum())
        trg_unk_here = int((trg_tokens == UNK_INDEX).sum())
        src_unk += src_unk_here
        trg_unk += trg_unk_here
        if src_unk_here + trg_unk_here > 0:
            examples_containing_unk += 1
        # The "- 2" presumably discounts two special tokens (e.g. start/end
        # markers) per sequence — TODO confirm against the field definition.
        src_total += len(src_tokens) - 2
        trg_total += len(trg_tokens) - 2
    dataset_size = len(dataset)
    print(f'Src prob: {ratio(src_unk, src_total)}')
    print(f'Trg prob: {ratio(trg_unk, trg_total)}')
    print(f'Total prob: {ratio(src_unk + trg_unk, src_total + trg_total)}')
    print(f'Examples with unk: {examples_containing_unk} / {dataset_size} = {ratio(examples_containing_unk, dataset_size)}')
    return (src_unk, src_total), (trg_unk, trg_total), (examples_containing_unk, dataset_size)

In [51]:
# <unk> statistics for test_dataset (the test split returned by load_data).
calculate_unk_tokens(test_dataset)

Src prob: 0.0001451835361869964
Trg prob: 0.00015166835187057634
Total prob: 0.000148355112688071
Examples with unk: 3 / 1098 = 0.00273224043715847


((12, 82654), (12, 79120), (3, 1098))

In [52]:
# <unk> statistics for val_dataset (the validation split returned by load_data).
calculate_unk_tokens(val_dataset)

Src prob: 0.00016737602218927836
Trg prob: 0.00018895257290420104
Total prob: 0.00017788246262934816
Examples with unk: 10 / 1100 = 0.00909090909090909


((14, 83644), (15, 79385), (10, 1100))

In [53]:
# <unk> statistics for train_dataset (the training split returned by load_data).
calculate_unk_tokens(train_dataset)

Src prob: 0.0
Trg prob: 0.0
Total prob: 0.0
Examples with unk: 0 / 8793 = 0.0


((0, 656447), (0, 628869), (0, 8793))

In [43]:
# Load the train/val/test splits of the four Tufano datasets whose paths are
# stored in the config, passing diffs_field and CONFIG to every call.
(tufano_bug_fixes_0_50_dataset_train,
 tufano_bug_fixes_0_50_dataset_val,
 tufano_bug_fixes_0_50_dataset_test) = load_tufano_dataset(
    CONFIG['TUFANO_BUG_FIXES_0_50_PATH'], diffs_field, CONFIG)

(tufano_bug_fixes_50_100_dataset_train,
 tufano_bug_fixes_50_100_dataset_val,
 tufano_bug_fixes_50_100_dataset_test) = load_tufano_dataset(
    CONFIG['TUFANO_BUG_FIXES_50_100_PATH'], diffs_field, CONFIG)

(tufano_code_changes_0_50_dataset_train,
 tufano_code_changes_0_50_dataset_val,
 tufano_code_changes_0_50_dataset_test) = load_tufano_dataset(
    CONFIG['TUFANO_CODE_CHANGES_0_50_PATH'], diffs_field, CONFIG)

(tufano_code_changes_50_100_dataset_train,
 tufano_code_changes_50_100_dataset_val,
 tufano_code_changes_50_100_dataset_test) = load_tufano_dataset(
    CONFIG['TUFANO_CODE_CHANGES_50_100_PATH'], diffs_field, CONFIG)

In [54]:
# <unk> statistics for the train split of the Tufano bug-fixes 0-50 dataset.
calculate_unk_tokens(tufano_bug_fixes_0_50_dataset_train)

Src prob: 0.014737728950073298
Trg prob: 0.015802480449206367
Total prob: 0.01524560801771845
Examples with unk: 15234 / 46680 = 0.3263496143958869


((21866, 1483675), (21383, 1353142), (15234, 46680))

In [55]:
# <unk> statistics for the validation split of the Tufano bug-fixes 0-50 dataset.
calculate_unk_tokens(tufano_bug_fixes_0_50_dataset_val)

Src prob: 0.014723781641921492
Trg prob: 0.015810137295467368
Total prob: 0.01524204741409525
Examples with unk: 1905 / 5835 = 0.3264781491002571


((2746, 186501), (2690, 170144), (1905, 5835))

In [56]:
# <unk> statistics for the test split of the Tufano bug-fixes 0-50 dataset.
calculate_unk_tokens(tufano_bug_fixes_0_50_dataset_test)

Src prob: 0.014143485416722877
Trg prob: 0.01542370878146791
Total prob: 0.014754537633802023
Examples with unk: 1918 / 5835 = 0.32870608397600687


((2621, 185315), (2610, 169220), (1918, 5835))

In [63]:
# <unk> statistics for the train split of the Tufano bug-fixes 50-100 dataset.
calculate_unk_tokens(tufano_bug_fixes_50_100_dataset_train)

Src prob: 0.014354252969446419
Trg prob: 0.01484665783896647
Total prob: 0.014598006636795877
Examples with unk: 25294 / 52364 = 0.48304178443205253


((56131, 3910409), (56913, 3833388), (25294, 52364))

In [64]:
# <unk> statistics for the validation split of the Tufano bug-fixes 50-100 dataset.
calculate_unk_tokens(tufano_bug_fixes_50_100_dataset_val)

Src prob: 0.014083816381196787
Trg prob: 0.014586544469902103
Total prob: 0.014332762264485672
Examples with unk: 3181 / 6546 = 0.48594561564314087


((6893, 489427), (7003, 480100), (3181, 6546))

In [65]:
# <unk> statistics for the test split of the Tufano bug-fixes 50-100 dataset.
calculate_unk_tokens(tufano_bug_fixes_50_100_dataset_test)

Src prob: 0.014637762099920333
Trg prob: 0.015096981857704205
Total prob: 0.014865329560379007
Examples with unk: 3149 / 6545 = 0.48113063407181056


((7129, 487028), (7223, 478440), (3149, 6545))

In [57]:
# <unk> statistics for the train split of the Tufano code-changes 0-50 dataset.
calculate_unk_tokens(tufano_code_changes_0_50_dataset_train)

Src prob: 0.0009798628931468759
Trg prob: 0.0012938189629918954
Total prob: 0.001129341572554869
Examples with unk: 337 / 8627 = 0.03906340558711024


((265, 270446), (318, 245784), (337, 8627))

In [59]:
# <unk> statistics for the validation split of the Tufano code-changes 0-50 dataset.
calculate_unk_tokens(tufano_code_changes_0_50_dataset_val)

Src prob: 0.0009474182851729038
Trg prob: 0.0013551927017914987
Total prob: 0.0011400905825394347
Examples with unk: 43 / 1079 = 0.03985171455050973


((32, 33776), (41, 30254), (43, 1079))

In [60]:
# <unk> statistics for the test split of the Tufano code-changes 0-50 dataset.
calculate_unk_tokens(tufano_code_changes_0_50_dataset_test)

Src prob: 0.0007237146827234831
Trg prob: 0.001156849513159163
Total prob: 0.000928985882460442
Examples with unk: 38 / 1077 = 0.035283194057567316


((25, 34544), (36, 31119), (38, 1077))

In [58]:
# <unk> statistics for the train split of the Tufano code-changes 50-100 dataset.
# NOTE(review): the recorded output is identical to the one recorded for
# train_dataset — presumably both are built from the same data; confirm.
calculate_unk_tokens(tufano_code_changes_50_100_dataset_train)

Src prob: 0.0
Trg prob: 0.0
Total prob: 0.0
Examples with unk: 0 / 8793 = 0.0


((0, 656447), (0, 628869), (0, 8793))

In [61]:
# <unk> statistics for the validation split of the Tufano code-changes 50-100 dataset.
calculate_unk_tokens(tufano_code_changes_50_100_dataset_val)

Src prob: 0.00016737602218927836
Trg prob: 0.00018895257290420104
Total prob: 0.00017788246262934816
Examples with unk: 10 / 1100 = 0.00909090909090909


((14, 83644), (15, 79385), (10, 1100))

In [62]:
# <unk> statistics for the test split of the Tufano code-changes 50-100 dataset.
calculate_unk_tokens(tufano_code_changes_50_100_dataset_test)

Src prob: 0.0001451835361869964
Trg prob: 0.00015166835187057634
Total prob: 0.000148355112688071
Examples with unk: 3 / 1098 = 0.00273224043715847


((12, 82654), (12, 79120), (3, 1098))