In [233]:
import re, string, unicodedata
import ast

import pandas as pd
import numpy as np

import gensim.downloader as api
from gensim.models.word2vec import Word2Vec

from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

from pathlib import Path

## Load Data

In [231]:
data = pd.read_csv('csv/preprocessed_short_data_eclipse2017.csv')
data

Unnamed: 0,id,resolution,short_desc,dup_list,root_id,disc_id,description
0,1,FIXE,"['usability', 'issue', 'external', 'editors']",[183],1.0,1.0,"['setup', 'project', 'contain', 'gif', 'resour..."
1,2,FIXE,"['open', 'repository', 'resources', 'doesnt', ...","[94, 9779, 15392]",2.0,2.0,"['open', 'repository', 'resource', 'always', '..."
2,7,WONT,"['team', 'api', 'movecopy', 'semantics', 'pres...",[42962],7.0,7.0,"['platform', 'able', 'notify', 'people', 'reso..."
3,10,FIXE,"['api', 'vcm', 'event', 'notification']",[121067],10.0,10.0,"['seem', 'need', 'vcm', 'event', 'notification..."
4,20,FIXE,"['workspace', 'file']",[40],20.0,20.0,"['think', 'would', 'useful', 'set', 'repo', 'c..."
...,...,...,...,...,...,...,...
992,6454,FIXE,"['option', 'show', 'package', 'hierarchically']","[10029, 24425]",6454.0,6454.0,"['fully', 'aware', 'java', 'package', 'arent',..."
993,6455,DUPL,"['npe', 'kill', 'javaindexer', 'thread']",,15262.0,6455.0,"['press', 'ctrl', 'shift', 'progress', 'inform..."
994,6457,FIXE,"['rename', 'project', 'change', 'project', 'lo...",[8445],6457.0,6457.0,"['create', 'project', 'nondefault', 'location'..."
995,6460,DUPL,"['find', 'dialog', 'box', 'obscure', 'result']",,5969.0,2422.0,"['use', 'findreplace', 'dialog', 'text', 'edit..."


In [153]:
def get_corpus(data):
    corpus = []
    for str_list in data['description'].tolist():
        word_lst = ast.literal_eval(str_list)
        corpus.append(word_lst)
    return corpus

## Train Word2Vec model

In [154]:
vec_size = 300
full_corpus = get_corpus(data)
# random
model_random = Word2Vec(full_corpus, vector_size=vec_size, min_count=0)

In [163]:
# pretrained
model_pretrained = Word2Vec(vector_size=vec_size, min_count=1)
model_pretrained.build_vocab(full_corpus)

In [133]:
pretrained = api.load("glove-wiki-gigaword-300")
tmp_file = get_tmpfile("pretrained_vectors.txt")
pretrained.save_word2vec_format(tmp_file)
init_vocab = [list(pretrained.key_to_index.keys())]

In [164]:
model_pretrained.build_vocab(init_vocab, update=True)
model_pretrained.wv.vectors_lockf = np.ones(len(model_pretrained.wv))
model_pretrained.wv.intersect_word2vec_format(tmp_file, binary=False, lockf=1.0)

In [166]:
model_pretrained.train(full_corpus, total_examples=len(full_corpus), epochs=100)

(5307912, 5308200)

In [167]:
model_pretrained.save("pretrained_word2vec")

In [251]:
# fine-tuned
docs = Path('docs/eclipse5.txt').read_text()
docs = ast.literal_eval(docs)
model_finetuned = Word2Vec(vector_size=vec_size, min_count=1)
model_finetuned.build_vocab([docs])
model_finetuned.build_vocab(full_corpus)

In [252]:
model_finetuned.build_vocab(init_vocab, update=True)
model_finetuned.wv.vectors_lockf = np.ones(len(model_finetuned.wv))
model_finetuned.wv.intersect_word2vec_format(tmp_file, binary=False, lockf=1.0)

In [253]:
model_finetuned.train([docs], total_examples=1, epochs=100)
model_finetuned.train(full_corpus, total_examples=len(full_corpus), epochs=100)

(5307941, 5308200)

In [254]:
model_finetuned.save('finetuned_word2vec')

## Bug Duplication Search

In [212]:
def get_doc_embedding(doc, model):
    result = np.zeros(vec_size)
    for word in doc:
        result += model[word]
    
    return result / len(doc)

def get_reports_embeddings(model, data):
    embeddings = []
    corpus = get_corpus(data)
    for report in corpus:
        embeddings.append(get_doc_embedding(report, model))
    return embeddings

def sim(vec1, vec2):
    return np.linalg.norm(vec1 - vec2)

def find_top_duplicate(bug_descr, embeddings, model, topn=10):
    min_id = 0
    min_sim = np.inf
    doc_emb = get_doc_embedding(bug_descr, model)
    sims = []
    for report_emb in embeddings:
        sims.append(sim(report_emb, doc_emb))
    sims = np.array(sims)
    sims = sims.argsort()[1:topn]
    return sims

## Evaluation of the Approach

In [168]:
test = pd.read_csv('csv/short_dupl2017.csv')
master = pd.read_csv('csv/short_master2017.csv')

In [232]:
master

Unnamed: 0,id,resolution,short_desc,dup_list,root_id,disc_id,description
0,1,FIXE,"['usability', 'issue', 'external', 'editors']",[183],1.0,1.0,"['setup', 'project', 'contain', 'gif', 'resour..."
1,2,FIXE,"['open', 'repository', 'resources', 'doesnt', ...","[94, 9779, 15392]",2.0,2.0,"['open', 'repository', 'resource', 'always', '..."
2,7,WONT,"['team', 'api', 'movecopy', 'semantics', 'pres...",[42962],7.0,7.0,"['platform', 'able', 'notify', 'people', 'reso..."
3,10,FIXE,"['api', 'vcm', 'event', 'notification']",[121067],10.0,10.0,"['seem', 'need', 'vcm', 'event', 'notification..."
4,20,FIXE,"['workspace', 'file']",[40],20.0,20.0,"['think', 'would', 'useful', 'set', 'repo', 'c..."
...,...,...,...,...,...,...,...
563,6437,FIXE,"['timeout', 'launch']","[6919, 49659]",6437.0,6437.0,"['eclipse', 'corneri', 'continue', 'experience..."
564,6454,FIXE,"['option', 'show', 'package', 'hierarchically']","[10029, 24425]",6454.0,6454.0,"['fully', 'aware', 'java', 'package', 'arent',..."
565,6455,DUPL,"['npe', 'kill', 'javaindexer', 'thread']",,15262.0,6455.0,"['press', 'ctrl', 'shift', 'progress', 'inform..."
566,6457,FIXE,"['rename', 'project', 'change', 'project', 'lo...",[8445],6457.0,6457.0,"['create', 'project', 'nondefault', 'location'..."


In [182]:
test

Unnamed: 0,id,resolution,short_desc,dup_list,root_id,disc_id,description
0,40,DUPL,"['need', 'connect', 'team', 'stream']",,20.0,20.0,"['would', 'like', 'able', 'connect', 'team', '..."
1,48,DUPL,"['make', 'sure', 'future', 'store', 'project',...",,22.0,22.0,"['project', 'reference', 'come', 'three', 'fla..."
2,61,DUPL,"['vcmmeta', 'show', 'change']",,60.0,60.0,"['user', 'user', 'install', 'drop', 'declipseu..."
3,94,DUPL,"['repositories', 'view', 'file', 'type', 'open...",,2.0,2.0,"['browse', 'file', 'repositories', 'view', 'tr..."
4,98,DUPL,['need'],,26.0,26.0,"['need', 'new', 'button', 'repo', 'connection'..."
...,...,...,...,...,...,...,...
424,6425,DUPL,"['motif', 'dropdown', 'inconsistencies']",,193318.0,5088.0,"['public', 'class', 'dropdowntest', 'public', ..."
425,6426,DUPL,"['automatic', 'version', 'number', 'contain', ...",,6149.0,6149.0,"['versioning', 'automatic', 'version', 'name',..."
426,6449,DUPL,"['error', 'close', 'workspace']",,5903.0,5674.0,"['close', 'workspace', 'get', 'message', 'some..."
427,6453,DUPL,"['npe', 'create', 'java', 'perspective']",,6273.0,5925.0,"['upgrade', 'problems', 'lose', 'jre', 'settin..."


In [225]:
def get_recall(data, test, model):
    embeddings = get_reports_embeddings(model.wv, data)
    test_size = len(test.index)
    test_corpus = get_corpus(test)
    TP = 0.
    for ind, descr in enumerate(test_corpus):
        dupl_ids = find_top_duplicate(descr, embeddings, model.wv, topn=100)
        val = 0.
        for dupl_id in dupl_ids:
            if data.iloc[dupl_id]['disc_id'] == test.iloc[ind]['disc_id']:
                val = 1.
                break
        TP += val
    return TP / test_size

In [255]:
print(f"RECALL random = {get_recall(data, test, model_random)}")
print(f"RECALL pretrained = {get_recall(data, test, model_pretrained)}")
print(f"RECALL fine-tuned = {get_recall(data, test, model_finetuned)}")

RECALL random = 0.34032634032634035
RECALL pretrained = 0.6153846153846154
RECALL fine-tuned = 0.6363636363636364
