In [51]:
import os
import random
from itertools import zip_longest

from tqdm.auto import tqdm

In [18]:
def read_data(path):
    """Read a parallel dataset from ``prev.txt`` and ``updated.txt`` in ``path``.

    The two files are read in lockstep, one line from each per example. Both
    lines are stripped of surrounding whitespace; a pair is kept only when
    both stripped lines are non-empty, otherwise a notice is printed and the
    pair is dropped.

    If the files have a different number of lines, reading stops at the end
    of the shorter file (as before) and a warning is printed, because the
    remaining lines have no counterpart to pair with.

    Args:
        path: Directory containing ``prev.txt`` and ``updated.txt``.

    Returns:
        A list of ``{'prev': str, 'updated': str}`` dicts in file order.

    Raises:
        OSError: If either file cannot be opened.
    """
    data = []
    with open(os.path.join(path, 'prev.txt'), mode='r', encoding='utf-8') as prev, \
        open(os.path.join(path, 'updated.txt'), mode='r', encoding='utf-8') as updated:
        # zip_longest (unlike zip) keeps going past the shorter file and pads
        # with None, which lets us notice a line-count mismatch.
        for prev_line, updated_line in zip_longest(prev, updated):
            if prev_line is None or updated_line is None:
                print('LINE COUNT MISMATCH BETWEEN prev.txt AND updated.txt')
                break
            prev_line, updated_line = prev_line.strip(), updated_line.strip()
            if prev_line != '' and updated_line != '':
                data.append({'prev': prev_line, 'updated': updated_line})
            else:
                print('EMPTY LINE IN DATASET')
    return data
            
def load_datasets(path):
    """Load the 'train', 'test' and 'val' splits found under ``path``.

    Each split is read with ``read_data`` from the sub-directory of ``path``
    that carries the split's name.

    Returns:
        A dict mapping each split name to the list returned by ``read_data``.
    """
    return {
        split: read_data(os.path.join(path, split))
        for split in ('train', 'test', 'val')
    }

In [39]:
# Load the train/test/val splits of the tufano_bug_fixes 0_50_copy dataset into memory.
TUFANO_BUG_FIXES_0_50 = load_datasets('../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/0_50_copy')

In [72]:
def find_duplicates_fast_by_pred(train_dataset, test_dataset, pred, label):
    """Return the test examples whose key also occurs among the train examples.

    Args:
        train_dataset: Iterable of examples used to build the set of known keys.
        test_dataset: Iterable of examples to look up in that set.
        pred: Callable mapping an example to a hashable key.
        label: Text printed as a heading for this search.

    Returns:
        The list of elements of ``test_dataset`` (in their original order)
        whose key is present in the train key set.
    """
    print('\nFinding duplicates...')
    print(label)
    # Hash the train keys once so every test lookup is a set membership test.
    seen_in_train = {pred(sample) for sample in train_dataset}
    duplicates = []
    for sample in test_dataset:
        if pred(sample) in seen_in_train:
            duplicates.append(sample)
    print(f'Found {len(duplicates)} duplicates')
    return duplicates

def print_duplicates(duplicates):
    """Print each example's 'prev' and 'updated' text, followed by a blank line."""
    for entry in duplicates:
        for field in ('prev', 'updated'):
            print(entry[field])
        print()

def find_duplicates_fast(train_dataset, test_dataset):
    """Report test examples that also occur in the train set.

    Two matching criteria are checked in turn: first the 'prev' text alone,
    then the ('prev', 'updated') pair. For each criterion the number of
    matches is printed, followed by a random sample of at most five of them.
    """
    def prev_key(sample):
        return sample['prev']

    def prev_and_updated_key(sample):
        return (sample['prev'], sample['updated'])

    criteria = (
        (prev_key, 'Prev matching'),
        (prev_and_updated_key, 'Prev and updated matching'),
    )
    for key_fn, label in criteria:
        matches = find_duplicates_fast_by_pred(train_dataset, test_dataset, key_fn, label)
        print('Samples')
        # Never ask random.sample for more items than there are matches.
        shown = min(len(matches), 5)
        print_duplicates(random.sample(matches, k=shown))

In [73]:
# Report test examples of the loaded dataset whose 'prev' (or 'prev' + 'updated') text also occurs in train.
find_duplicates_fast(TUFANO_BUG_FIXES_0_50['train'], TUFANO_BUG_FIXES_0_50['test'])


Finding duplicates...
Prev matching
Found 1 duplicates
Samples
public java.lang.String METHOD_1 ( ) { return new TYPE_1 ( STRING_1 ) . format ( VAR_1 [ ( ( VAR_1 . length ) - 1 ) ] . getTime ( ) ) ; }
public java.lang.String METHOD_1 ( ) { return new TYPE_1 ( STRING_1 ) . format ( VAR_1 [ ( ( type ) - 1 ) ] . getime ( ) ) ; }


Finding duplicates...
Prev and updated matching
Found 0 duplicates
Samples


In [74]:
def find_duplicates_for_dataset(dataset_root):
    """Load the dataset stored under ``dataset_root`` and report train/test duplicates."""
    splits = load_datasets(dataset_root)
    train_split, test_split = splits['train'], splits['test']
    print('train vs test')
    find_duplicates_fast(train_split, test_split)

In [75]:
# Run the train-vs-test duplicate check, loading the 0_50_copy dataset from its root directory.
find_duplicates_for_dataset('../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/0_50_copy')

train vs test

Finding duplicates...
Prev matching
Found 1 duplicates
Samples
public java.lang.String METHOD_1 ( ) { return new TYPE_1 ( STRING_1 ) . format ( VAR_1 [ ( ( VAR_1 . length ) - 1 ) ] . getTime ( ) ) ; }
public java.lang.String METHOD_1 ( ) { return new TYPE_1 ( STRING_1 ) . format ( VAR_1 [ ( ( type ) - 1 ) ] . getime ( ) ) ; }


Finding duplicates...
Prev and updated matching
Found 0 duplicates
Samples


In [86]:
def find_duplicates_for_datasets(dataset_roots):
    """Run the train/test duplicate report for every dataset root in turn.

    A header naming the root is printed before each report and a dashed
    separator line after it.
    """
    for root in dataset_roots:
        header = f'\nNEW DATASET: {root}'
        print(header)
        find_duplicates_for_dataset(root)
        print('-------------------------------------------------------------')

In [87]:
# Dataset root directories (relative paths) to scan for train/test duplicates.
DATASET_ROOTS_TO_CHECK = [
    '../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/0_50',
    '../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/50_100',
    '../embeddings-for-code-diffs-data/datasets/java/tufano_code_changes/0_50',
    '../embeddings-for-code-diffs-data/datasets/java/tufano_code_changes/50_100',
]

In [88]:
# Run the train-vs-test duplicate check on every root in DATASET_ROOTS_TO_CHECK.
find_duplicates_for_datasets(DATASET_ROOTS_TO_CHECK)


NEW DATASET: ../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/0_50
train vs test

Finding duplicates...
Prev matching
Found 0 duplicates
Samples

Finding duplicates...
Prev and updated matching
Found 0 duplicates
Samples
-------------------------------------------------------------

NEW DATASET: ../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/50_100
train vs test

Finding duplicates...
Prev matching
Found 0 duplicates
Samples

Finding duplicates...
Prev and updated matching
Found 0 duplicates
Samples
-------------------------------------------------------------

NEW DATASET: ../embeddings-for-code-diffs-data/datasets/java/tufano_code_changes/0_50
train vs test

Finding duplicates...
Prev matching
Found 13 duplicates
Samples
TYPE_1 < TYPE_2 > METHOD_1 ( TYPE_3 VAR_1 ) ;
TYPE_1 < TYPE_2 > get ( TYPE_3 VAR_1 ) ;

public void METHOD_1 ( boolean VAR_1 ) { this . VAR_2 = VAR_1 ; }
public void METHOD_1 ( boolean VAR_1 ) { VAR_2 = VAR_1 ; }

public TYPE_1 METH

In [85]:
# Run the train-vs-test duplicate check on the tufano_bug_fixes 0_100 dataset.
find_duplicates_for_dataset('../embeddings-for-code-diffs-data/datasets/java/tufano_bug_fixes/0_100')

train vs test

Finding duplicates...
Prev matching
Found 49 duplicates
Samples
public boolean METHOD_1 ( TYPE_1 node ) { if ( ( ( this . VAR_1 . x ) == ( node . x ) ) && ( ( this . VAR_1 . y ) <= ( node . y ) ) ) return true ; else return false ; }
public boolean METHOD_1 ( TYPE_1 node ) { if ( ( ( this . VAR_1 . x ) == ( node . x ) ) && ( ( this . VAR_1 . y ) == ( node . y ) ) ) return true ; else return false ; }

public static TYPE_1 METHOD_1 ( final float min , final float VAR_1 ) { return TYPE_2 . METHOD_2 ( new TYPE_3 ( VAR_1 , VAR_1 ) ) . msg ( ( ( ( ( STRING_1 + min ) + STRING_2 ) + VAR_1 ) + STRING_3 ) ) ; }
public static TYPE_1 METHOD_1 ( final float min , final float VAR_1 ) { return TYPE_2 . METHOD_2 ( new TYPE_3 ( min , VAR_1 ) ) . msg ( ( ( ( ( STRING_1 + min ) + STRING_2 ) + VAR_1 ) + STRING_3 ) ) ; }

public void METHOD_1 ( TYPE_1 VAR_1 , java.util.List < TYPE_1 > VAR_2 ) { super . METHOD_1 ( VAR_1 , VAR_2 ) ; VAR_3 . error ( STRING_1 , METHOD_2 ( ) ) ; METHOD_3 ( ) . M