In [89]:
import os
import random
from itertools import zip_longest

from tqdm.auto import tqdm

In [90]:
def read_data(path):
    """Read aligned (prev, updated) line pairs from one dataset split.

    Opens ``prev.txt`` and ``updated.txt`` inside ``path`` (both as UTF-8)
    and walks them in lockstep, one line from each file per step.

    Args:
        path: Directory containing ``prev.txt`` and ``updated.txt``.

    Returns:
        A list of ``{'prev': ..., 'updated': ...}`` dicts holding the
        whitespace-stripped lines. Pairs in which either line is empty after
        stripping are skipped and reported on stdout.

    Raises:
        OSError: If either file cannot be opened.
    """
    data = []
    with open(os.path.join(path, 'prev.txt'), mode='r', encoding='utf-8') as prev, \
        open(os.path.join(path, 'updated.txt'), mode='r', encoding='utf-8') as updated:
        # zip() would stop silently at the shorter file; zip_longest lets us
        # notice that the two files do not have the same number of lines.
        for prev_line, updated_line in zip_longest(prev, updated):
            if prev_line is None or updated_line is None:
                # One file ran out first: the remaining lines have no partner,
                # so report the mismatch once and stop (same data as before).
                print(f'LINE COUNT MISMATCH IN DATASET: {path}')
                break
            prev_line, updated_line = prev_line.strip(), updated_line.strip()
            if prev_line != '' and updated_line != '':
                data.append({'prev': prev_line, 'updated': updated_line})
            else:
                print('EMPTY LINE IN DATASET')
    return data
            
def load_datasets(path):
    """Load the train, test and val splits found under ``path``.

    Each split is read with ``read_data`` from the sub-directory of ``path``
    that carries the split's name.

    Returns:
        A dict with the keys ``'train'``, ``'test'`` and ``'val'`` (in that
        order), each mapped to whatever ``read_data`` returns for that split.
    """
    split_names = ('train', 'test', 'val')
    return {
        split: read_data(os.path.join(path, split))
        for split in split_names
    }

In [91]:
def find_duplicates_fast_by_pred(train_dataset, test_dataset, pred, label):
    """Return the test elements whose key also occurs in the train data.

    Args:
        train_dataset: Iterable of elements to build the lookup set from.
        test_dataset: Iterable of elements to check against that set.
        pred: Callable mapping an element to a hashable comparison key.
        label: Text printed right after the progress header.

    Returns:
        A list, in ``test_dataset`` order, of every test element whose key is
        present among the train keys. The match count is printed to stdout.
    """
    print('\nFinding duplicates...')
    print(label)
    # Hash the train keys once so each test lookup is a set membership check.
    train_keys = {pred(sample) for sample in train_dataset}
    overlapping = []
    for sample in test_dataset:
        if pred(sample) in train_keys:
            overlapping.append(sample)
    print(f'Found {len(overlapping)} duplicates')
    return overlapping

def print_duplicates(duplicates):
    """Print each entry's 'prev' and 'updated' values, then a blank line.

    Args:
        duplicates: Iterable of mappings that have 'prev' and 'updated' keys.
    """
    for entry in duplicates:
        for field in ('prev', 'updated'):
            print(entry[field])
        # Blank separator line between consecutive entries.
        print()

def find_duplicates_fast(train_dataset, test_dataset):
    """Report train/test overlap under two matching criteria.

    Runs ``find_duplicates_fast_by_pred`` twice: first keyed on the 'prev'
    value alone, then on the ('prev', 'updated') pair. After each pass it
    prints 'Samples' followed by up to five randomly chosen matches.

    Args:
        train_dataset: Elements with 'prev' and 'updated' keys (reference set).
        test_dataset: Elements with 'prev' and 'updated' keys (checked set).
    """
    def by_prev(sample):
        return sample['prev']

    def by_prev_and_updated(sample):
        return (sample['prev'], sample['updated'])

    checks = (
        (by_prev, 'Prev matching'),
        (by_prev_and_updated, 'Prev and updated matching'),
    )
    for key_of, label in checks:
        matches = find_duplicates_fast_by_pred(train_dataset, test_dataset, key_of, label)
        print('Samples')
        # Cap the sample size at the number of matches so random.sample never
        # asks for more items than exist.
        shown = random.sample(matches, k=min(len(matches), 5))
        print_duplicates(shown)

In [92]:
def find_duplicates_for_dataset(dataset_root):
    """Load the dataset at ``dataset_root`` and report train/test duplicates.

    All splits are loaded via ``load_datasets``, but only the 'train' and
    'test' splits are compared here.

    Args:
        dataset_root: Directory passed straight to ``load_datasets``.
    """
    splits = load_datasets(dataset_root)
    print('train vs test')
    train_split = splits['train']
    test_split = splits['test']
    find_duplicates_fast(train_split, test_split)

In [93]:
def find_duplicates_for_datasets(dataset_roots):
    """Run ``find_duplicates_for_dataset`` on every root, with separators.

    For each root a 'NEW DATASET' header is printed before the check and a
    dashed divider line after it.

    Args:
        dataset_roots: Iterable of dataset root directories.
    """
    divider = '-------------------------------------------------------------'
    for root in dataset_roots:
        header = f'\nNEW DATASET: {root}'
        print(header)
        find_duplicates_for_dataset(root)
        print(divider)

In [94]:
# Dataset roots (relative paths) to scan for train/test duplicates.
# load_datasets reads the train/, test/ and val/ sub-directories of each root,
# and read_data expects a prev.txt and an updated.txt inside each of those.
DATASET_ROOTS_TO_CHECK = [
    '../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/0_50',
    '../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/50_100',
    '../embeddings-for-code-diffs-data/datasets/java/tufano_code_changes/0_50',
    '../embeddings-for-code-diffs-data/datasets/java/tufano_code_changes/50_100',
]

In [95]:
# Run the train-vs-test duplicate check on every root in DATASET_ROOTS_TO_CHECK.
find_duplicates_for_datasets(DATASET_ROOTS_TO_CHECK)


NEW DATASET: ../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/0_50
train vs test

Finding duplicates...
Prev matching
Found 0 duplicates
Samples

Finding duplicates...
Prev and updated matching
Found 0 duplicates
Samples
-------------------------------------------------------------

NEW DATASET: ../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/50_100
train vs test

Finding duplicates...
Prev matching
Found 0 duplicates
Samples

Finding duplicates...
Prev and updated matching
Found 0 duplicates
Samples
-------------------------------------------------------------

NEW DATASET: ../embeddings-for-code-diffs-data/datasets/java/tufano_code_changes/0_50
train vs test

Finding duplicates...
Prev matching
Found 13 duplicates
Samples
public void METHOD_1 ( boolean VAR_1 ) { this . VAR_2 = VAR_1 ; }
public void METHOD_1 ( boolean VAR_1 ) { VAR_2 = VAR_1 ; }

public void METHOD_1 ( ) { TYPE_1 . METHOD_2 ( ) ; }
public void METHOD_1 ( ) { METHOD_2 ( ) ; }

public v

In [96]:
# Same train-vs-test check for the tufano_bug_fixes/0_100 root, which is not
# listed in DATASET_ROOTS_TO_CHECK.
find_duplicates_for_dataset('../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/0_100')

train vs test

Finding duplicates...
Prev matching
Found 49 duplicates
Samples
public void METHOD_1 ( ) throws TYPE_1 { try { VAR_1 . METHOD_2 ( null ) ; VAR_2 . METHOD_3 ( VAR_3 , STRING_1 , STRING_1 ) ; } catch ( TYPE_2 VAR_4 ) { VAR_4 . METHOD_4 ( ) ; throw new TYPE_1 ( VAR_4 ) ; } }
public void METHOD_1 ( ) throws TYPE_1 { try { VAR_1 . METHOD_2 ( VAR_3 ) ; VAR_2 . METHOD_3 ( VAR_3 , STRING_1 , STRING_1 ) ; } catch ( TYPE_2 VAR_4 ) { VAR_4 . METHOD_4 ( ) ; throw new TYPE_1 ( VAR_4 ) ; } }

public void METHOD_1 ( boolean VAR_1 ) { if ( ( ( VAR_2 ) >= ( VAR_3 ) ) && ( ( VAR_4 ) <= ( VAR_3 ) ) ) { VAR_1 = true ; } else { VAR_1 = false ; } this . VAR_1 = VAR_1 ; }
public void METHOD_1 ( boolean VAR_1 ) { if ( ( ( VAR_2 ) <= ( VAR_3 ) ) && ( ( VAR_4 ) >= ( VAR_3 ) ) ) { VAR_1 = false ; } else { VAR_1 = true ; } this . VAR_1 = VAR_1 ; }

public void METHOD_1 ( ) { java.lang.String VAR_1 = STRING_1 ; java.lang.String VAR_2 = STRING_2 ; boolean VAR_3 = VAR_4 . METHOD_2 ( VAR_1 , VAR_2 ) ; 