In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sentence_transformers import InputExample
from sklearn.model_selection import train_test_split

In [170]:
# Locations of the full OpenOffice report dump, its duplicate-relation table,
# and the training split.
reports_file_path = 'data/br/openoffice/openoffice.csv'
relations_file_path = 'data/br/openoffice/openoffice_pairs.csv'

train_file_path = 'data/splits/openoffice/openoffice_train.csv'

# Reports are indexed by 'bug_id' and relations by 'issue_id', which is the
# layout generate_triplets() relies on for its `.index` / `.loc` lookups.
reports_openoffice = pd.read_csv(reports_file_path, index_col='bug_id')
relations_openoffice = pd.read_csv(relations_file_path, index_col='issue_id')

train = pd.read_csv(train_file_path, index_col='bug_id')

In [2]:
def _parse_duplicate_ids(raw):
    """Parse a ';'-separated string of bug ids into a list of ints.

    Returns an empty list when `raw` is not a string (for example NaN for a
    report without duplicates) or when any token is not a valid integer.
    """
    if not isinstance(raw, str):
        return []
    try:
        return [int(token) for token in raw.split(';')]
    except ValueError:
        return []


def generate_triplets(reports, relations, random_state=None):
    """Build (anchor, positive, negative) training triplets from bug reports.

    For every report that has an entry in `relations`, each listed duplicate
    that is also present in `reports` yields one triplet: the report's
    description (anchor), the duplicate's description (positive) and the
    description of a randomly sampled report that is neither the anchor nor
    one of its listed duplicates (negative). Each unordered pair of reports is
    used at most once.

    Parameters
    ----------
    reports : pandas.DataFrame
        Indexed by bug id; must contain a 'description' column.
    relations : pandas.DataFrame
        Indexed by bug id; the first column holds the duplicates of that
        report as a ';'-separated string of ids (non-string cells such as NaN
        mean "no duplicates").
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Source of randomness for negative sampling. An int seeds a fresh
        ``RandomState`` once, making the output reproducible. ``None`` (the
        default) keeps the previous behaviour of using numpy's global state.

    Returns
    -------
    list of InputExample
        One example per usable pair, with ``texts=[anchor, positive, negative]``.

    Notes
    -----
    Pairs are skipped when the anchor or positive description is not a string
    (a 'save' line is printed, as before), when a report lists itself as its
    own duplicate, or when no eligible negative exists. The last case
    previously caused an infinite loop.
    """
    # An int seed must become a stateful generator exactly once: handing the
    # same int to every `sample` call would return the same row forever.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    # Only reports with a textual description may serve as negatives, so the
    # sampling pool is restricted up front instead of rejecting them later.
    has_text = reports['description'].map(lambda text: isinstance(text, str)).astype(bool)
    negative_pool = reports.loc[has_text]
    negative_ids = set(negative_pool.index)

    seen_pairs = set()
    train_examples = []
    for index, value in reports.iterrows():
        if index not in relations.index:
            continue

        cells = relations.loc[index].values
        duplicates_id = _parse_duplicate_ids(cells[0] if len(cells) else None)

        # A negative must be neither the anchor nor any of its duplicates.
        excluded = {index, *duplicates_id}
        # Guarantees the rejection loop below terminates.
        can_sample_negative = len(negative_ids) > len(negative_ids & excluded)

        for dup_id in duplicates_id:
            # A self-reference would give a degenerate anchor == positive pair.
            if dup_id == index or dup_id not in reports.index:
                continue

            duplicates_pair = tuple(sorted([index, dup_id]))
            if duplicates_pair in seen_pairs:
                continue
            seen_pairs.add(duplicates_pair)

            positive = reports.loc[dup_id]
            if not (isinstance(value['description'], str)
                    and isinstance(positive['description'], str)):
                # Marks a pair skipped for missing text (message kept as-is).
                print('save')
                continue

            if not can_sample_negative:
                continue

            negative = negative_pool.sample(n=1, random_state=random_state).iloc[0]
            while negative.name in excluded:
                negative = negative_pool.sample(n=1, random_state=random_state).iloc[0]

            train_examples.append(
                InputExample(texts=[
                    value['description'],
                    positive['description'],
                    negative['description']
                ])
            )

    return train_examples

In [8]:
reports_openoffice.head(2)

Unnamed: 0_level_0,bug_severity,bug_status,component,creation_ts,delta_ts,description,dup_id,priority,product,resolution,short_desc,version
bug_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
13,trivial,CLOSED,code,2000-10-16 18:33:00 +0000,2003-12-06 14:52:32 +0000,"I need to see if this works, sorry.",[],P4,Calc,NOT_AN_ISSUE,Test bug: Cell color is wrong,605
14,trivial,CLOSED,Website general issues,2000-10-17 19:40:00 +0000,2006-02-07 22:23:55 +0000,it would be nice if the combination of OpenOff...,[],P3,Infrastructure,FIXED,openoffice.org issuezillla URL's display a hor...,current


In [149]:
reports_openoffice[0:5][['description', 'bug_status']].to_dict()

{'description': {13: 'I need to see if this works, sorry.',
  14: 'it would be nice if the combination of OpenOffice.org gif and "IssueZilla:" \ntext could be artfully combined with better matching alignment, color and font.\nI tried playing with vertical alignment and it didn\'t help. \nIt seemed bogus to spend much time trying to align the OOo gif with the text\nsince it would still look bad anyways in some other browser.\n\nI\'m hoping to get some graphics assistance to make this look better.\n\nNote, see http://www.openoffice.org/issues/editparams.cgi \'bannerhtml\'\nand \'blurbhtml\' parameters for where this information is set. It\'s not like we\ncan\'t easily set the HTML different, so anybody has a suggestion for',
  15: 'this task is just a test\nthis task is just athis task is just a test\n test\nathisathistthis task is just a test\nthis task is just athis task is just a test\n testthis task is justhis task is just a test\nthis task is just athis task is just a test\n test\na

In [19]:
relations_openoffice.head(2)

Unnamed: 0_level_0,duplicate
issue_id,Unnamed: 1_level_1
51549,52538;51703;51446;51937;51488;51722;51704;5145...
37805,37411;36327;37026;37521;37706;37663;36862;37707


In [7]:
relations_openoffice[0:5].to_dict()

{'duplicate': {51549: '52538;51703;51446;51937;51488;51722;51704;51452;51824;51604;51595;51591',
  37805: '37411;36327;37026;37521;37706;37663;36862;37707',
  88488: '54568;35616;111788;79244;51118;59483;63653',
  495: nan,
  17628: '21822;6259;20644;19045;19420;23937;34990;6036;34148;17730;51547;13621;43963;11096;39750;19888;28170;7390;4930;16832;5009;2109;23787;22256;25764;35249;35039;18593;18131;10188;39670;17941;4746;8873;18296;12823;13060;6008;5142;39496;17108;23807;20737;28844;28305;22223;19421;4693;21779;36239;7764;24155;15181;24263;19433;5008;27754;19422;9557;10306;34523;9646;14818;17742;13688;18534;20106;37566;23689;32302;21656'}}

In [12]:
# Toy fixture for sanity-checking generate_triplets(): five reports keyed by
# bug id, each with a short description.
reports_sim_dict = {
    'description': {
        1: "This is a report",
        2: "This is another report",
        3: "I need help with this",
        4: "I can't fix this",
        5: "What is happening?"
    }
}

# Duplicate relations for the toy reports, in the same ';'-separated string
# format as the real *_pairs.csv files. Report 5 has no entry of its own and
# only appears as a duplicate of report 3.
relations_sim_dict = {
    'duplicate': {
        1: '2',
        2: '1',
        3: '4;5',
        4: '3'
    }
}

In [20]:
# Turn the toy dicts into frames whose index names mirror the real CSVs
# ('bug_id' for reports, 'issue_id' for relations).
reports_sim = pd.DataFrame(reports_sim_dict).rename_axis('bug_id')

relations_sim = pd.DataFrame(relations_sim_dict).rename_axis('issue_id')

In [21]:
relations_sim

Unnamed: 0_level_0,duplicate
issue_id,Unnamed: 1_level_1
1,2
2,1
3,4;5
4,3


In [165]:
input_ex = generate_triplets(reports_sim, relations_sim)

In [166]:
len(input_ex)

3

In [167]:
input_ex[0].texts

['This is a report', 'This is another report', "I can't fix this"]

In [168]:
input_ex[1].texts

['I need help with this', "I can't fix this", 'This is another report']

In [169]:
input_ex[2].texts

['I need help with this', 'What is happening?', 'This is another report']

In [55]:
# Minimal fixture: reports 1 and 2 are duplicates of each other, which leaves
# report 3 as the only possible negative.
reports_sim_dict2 = dict(
    description={
        1: "This is a report",
        2: "This is another report",
        3: "I need help with this",
    },
)

relations_sim_dict2 = dict(
    duplicate={
        1: '2',
        2: '1',
    },
)

# Index names mirror the real CSVs ('bug_id' / 'issue_id').
reports_sim2 = pd.DataFrame(reports_sim_dict2).rename_axis('bug_id')

relations_sim2 = pd.DataFrame(relations_sim_dict2).rename_axis('issue_id')

In [183]:
test = generate_triplets(reports_sim2, relations_sim2)

In [184]:
test[0].texts

['This is a report', 'This is another report', 'I need help with this']

In [102]:
type(int(reports_sim2.loc[1].name))

int

In [103]:
for index, row in reports_sim2.iterrows():
    print(type(index))

<class 'int'>
<class 'int'>
<class 'int'>


In [171]:
len(train)

48054

In [173]:
trip = generate_triplets(train, relations_openoffice)

In [174]:
len(trip)

13996

In [3]:
# Eclipse: training-split reports (indexed by 'bug_id') and the project's
# duplicate relations (indexed by 'issue_id').
# Note: train_file_path / relations_file_path are reassigned by each of the
# per-project load cells, so their values depend on which cell ran last.
train_file_path = 'data/splits/eclipse/eclipse_train.csv'
relations_file_path = 'data/br/eclipse/eclipse_pairs.csv'

reports_train_eclipse = pd.read_csv(train_file_path, index_col='bug_id')
relations_eclipse = pd.read_csv(relations_file_path, index_col='issue_id')

In [4]:
# OpenOffice: training-split reports (indexed by 'bug_id') and the project's
# duplicate relations (indexed by 'issue_id').
# relations_openoffice is read from the same path as in the earlier
# exploration cell, so this rebinds that name to a freshly loaded frame.
train_file_path = 'data/splits/openoffice/openoffice_train.csv'
relations_file_path = 'data/br/openoffice/openoffice_pairs.csv'

reports_train_openoffice = pd.read_csv(train_file_path, index_col='bug_id')
relations_openoffice = pd.read_csv(relations_file_path, index_col='issue_id')

In [5]:
# Firefox: training-split reports (indexed by 'bug_id') and the project's
# duplicate relations (indexed by 'issue_id').
train_file_path = 'data/splits/firefox/firefox_train.csv'
relations_file_path = 'data/br/firefox/firefox_pairs.csv'

reports_train_firefox = pd.read_csv(train_file_path, index_col='bug_id')
relations_firefox = pd.read_csv(relations_file_path, index_col='issue_id')

In [7]:
# NetBeans: training-split reports (indexed by 'bug_id') and the project's
# duplicate relations (indexed by 'issue_id').
# NOTE(review): unlike the other projects, the train path has a leading './'
# and the relations file is named 'netbeans_pairs - Copia.csv', which looks
# like a manual copy rather than the canonical '<project>_pairs.csv'. Confirm
# this is the intended file before relying on the NetBeans triplets.
train_file_path = './data/splits/netbeans/netbeans_train.csv'
relations_file_path = 'data/br/netbeans/netbeans_pairs - Copia.csv'

reports_train_netbeans = pd.read_csv(train_file_path, index_col='bug_id')
relations_netbeans = pd.read_csv(relations_file_path, index_col='issue_id')

In [8]:
# Registry of per-project inputs, filled in by the next cell and iterated when
# generating and reloading triplets.
datasets = {}

In [9]:
# Register every project's training reports together with its duplicate
# relations. Insertion order (eclipse, openoffice, firefox, netbeans) is the
# order in which the later loops process and report them.
datasets.update({
    'eclipse': {
        'reports': reports_train_eclipse,
        'relations': relations_eclipse,
    },
    'openoffice': {
        'reports': reports_train_openoffice,
        'relations': relations_openoffice,
    },
    'firefox': {
        'reports': reports_train_firefox,
        'relations': relations_firefox,
    },
    'netbeans': {
        'reports': reports_train_netbeans,
        'relations': relations_netbeans,
    },
})

In [10]:
# Make sure the output directory exists; on a fresh checkout open(..., "wb")
# would otherwise fail with FileNotFoundError before anything is saved.
triplets_dir = 'data/splits/refactor_triplets'
os.makedirs(triplets_dir, exist_ok=True)

for key, value in datasets.items():

    # Generate triplets for the train dataset.
    # Negatives are sampled at random, so every run overwrites the pickles
    # with a different set of triplets.
    triplets = generate_triplets(value['reports'], value['relations'])
    print(key, len(triplets))

    # Save triplets into pickle file
    triplets_file_path = f'{triplets_dir}/{key}_triplets.pkl'
    with open(triplets_file_path, "wb") as f:
        pickle.dump(triplets, f)

    

eclipse 20550
openoffice 13996
save
save
save
save
save
save
save
save
save
save
save
save
save
save
save
firefox 37228
netbeans 22374


In [11]:
# Reload the pickled triplets for every registered project, keyed by project
# name, and print how many were read back.
train_triplets = {}

# Only `key` is used here; `value` is unpacked but never read in this loop.
for key, value in datasets.items():

    triplets_file_path = f'data/splits/refactor_triplets/{key}_triplets.pkl'

    # NOTE(review): pickle.load executes arbitrary code embedded in the file;
    # only load pickles produced by this notebook or another trusted source.
    with open(triplets_file_path, "rb") as f:
        loaded_input_examples = pickle.load(f)

    train_triplets[key] = loaded_input_examples

    print(key, len(loaded_input_examples))

eclipse 20550
openoffice 13996
firefox 37228
netbeans 22374


In [12]:
# Concatenate the per-project triplet lists into one flat list, in the
# iteration order of train_triplets.
all_triplets = []

for list_of_triplets in train_triplets.values():
    all_triplets.extend(list_of_triplets)

# Last expression of the cell: total number of triplets across all projects.
len(all_triplets)

94148