In [1]:
import pandas as pd
import numpy as np
import re
import string
import math
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import sys

# Load pickle files

In [2]:
bug_reports_filename = './Output/allBugReports.pickle'
bug_reports_df = pd.read_pickle(bug_reports_filename)

source_code_filename = './Output/allSourceCodes.pickle'
source_code_df = pd.read_pickle(source_code_filename)

# Preprocessing

In [3]:
bug_reports_df.head()


Unnamed: 0_level_0,fix,text,fixdate,summary,description,project,average_precision
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
23,[org.springframework.data.rest.webmvc.reposito...,,2012-07-31 11:48:16,Searches which returns null gives NPE,A search which returns null gives NullPointerE...,DATAREST,0.0
21,[org.springframework.data.rest.webmvc.reposito...,,2012-07-31 11:49:25,paging with parameters,missing parameter findBy in href .prev and .ne...,DATAREST,0.0
41,[org.springframework.data.rest.webmvc.json.rep...,,2012-09-05 06:07:58,No update on objects,"When I try to update an object, the only think...",DATAREST,0.0
45,[org.springframework.data.rest.webmvc.entityto...,,2012-09-05 06:09:20,No IDs in href of objects displayed,Browsing to: http://localhost:8080/rest/sla re...,DATAREST,0.0
38,[org.springframework.data.rest.webmvc.reposito...,,2012-09-05 06:29:43,Listing relations with Accept:application/x-sp...,When loading a relation like this: \nhttp://lo...,DATAREST,0.0


In [4]:
source_code_df.head()

Unnamed: 0,filename,unprocessed_code,project
0,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015 the original author or a...,DATAREST
1,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015-2016 the original author...,DATAREST
2,main.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015-2016 the original author...,DATAREST
3,test.java.org.springframework.data.rest.webmvc...,package org.springframework.data.rest.webmvc;\...,DATAREST
4,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2014-2015 the original author...,DATAREST


In [3]:
# add the Java key words to the stop words
java_keywords = ["abstract", "assert**","assert", "boolean", "break", "byte", "case", "catch", "char", "const", "continue", "default", "do", "double", "else", "enum", "enum****" "extends", "final", "finally", "for", "goto","goto*", "if", "implements", "import", "instanceof", "int","interface", "long", "native", "new", "package", "private", "protected", "public", "return", "short", "static", "strictfp**","strictfp", "super", "switch", "synchornized", "this", "throw", "throws", "transient", "try", "void", "volatile", "while"]
java_operators = ["+", "-", "*", "/", "%", "+=", "-=", "*=", "/=", "++", "--", "==", "!=", "<", ">", "<=", ">=", ".", "[", "]", "(",")", "!", "~","instanceof", "<<", ">>", ">>>", "&", "^", "|", "&&", "||", "?", ":", "^=", "%=", "<<=", ">>=", ">>>=", "&="]
stop = java_keywords + java_operators
#contains english stop words, java keywords and java operators
STOP_WORDS = ENGLISH_STOP_WORDS.union(stop)

In [4]:
def split_camel_case(text):
  words = text.split(' ')
  result = []
  
  for word in words:
    if len(word):
      # regex from https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
      result = result + re.split("(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])", word)
   
  return ' '.join(result)

def remove_trailing_and_single_dots(text):
  words = text.split(' ')
  result = []
  for word in words:
    if len(word) and word[-1] == '.':
      word = word[:-1]
    if len(word) and word[0] == '.':
      word = word[1:]
    result.append(word)
  
  return ' '.join(result)

def remove_comments(text):
  # regex found at https://blog.ostermiller.org/finding-comments-in-source-code-using-regular-expressions/
  result = re.sub("/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/", '', text)
  return result

def remove_punctuation(text):
  punctuation = "/:;<=>?@[\]^_`{|}~!\"#$%&'()*+,-"
  result = re.sub('['+punctuation+']', ' ', text)
  result = remove_trailing_and_single_dots(result)
  result = re.sub("\s+", ' ', result)
  return result

def remove_new_lines(text):
  result = []
  for line in text.splitlines():
    if len(line) <= 0:
      continue
    result.append(line)
  result = ' '.join(result)
  result = re.sub('\s+', ' ', result)
  return result

def stem_words(text):
  stemmer = PorterStemmer()   #"english"
  words = text.split(' ')
  result = []
  for word in words:
      if word not in STOP_WORDS:
          result.append(stemmer.stem(word))
  result = ' '.join(result)
  return result

In [5]:
def clean_unprocessed_code(text):
    # Remove new line breaks
    result = remove_new_lines(text)

    # Remove comments
    result = remove_comments(result)

    # Remove punctuation
    result = remove_punctuation(result)

    # Remove camelCase but not TitleCase
    result = split_camel_case(result)

    # Lowercase
    result = result.lower()

    # Stem words
    result = stem_words(result)

    return result.strip()

def clean_bug_reports(text):
    if not text:
        return ''
    # Remove new line breaks
    result = remove_new_lines(text)

    # Remove punctuation
    result = remove_punctuation(result)

    # Remove camelCase but not TitleCase
    result = split_camel_case(result)

    # Lowercase
    result = result.lower()

    # Stem words
    # result = stem_words(result)

    return result.strip()
    

In [6]:
source_code_df['processed_code'] = source_code_df.unprocessed_code.apply(clean_unprocessed_code)

In [9]:
source_code_df.head()

Unnamed: 0,filename,unprocessed_code,project,processed_code
0,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015 the original author or a...,DATAREST,org.springframework.data.rest.webmvc.halbrows ...
1,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015-2016 the original author...,DATAREST,org.springframework.data.rest.webmvc.halbrows ...
2,main.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015-2016 the original author...,DATAREST,org.springframework.data.rest.webmvc.halbrows ...
3,test.java.org.springframework.data.rest.webmvc...,package org.springframework.data.rest.webmvc;\...,DATAREST,org.springframework.data.rest.webmvc org.hamcr...
4,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2014-2015 the original author...,DATAREST,org.springframework.data.rest.webmvc org.hamcr...


In [7]:
bug_reports_df['summary'] = bug_reports_df.summary.apply(clean_bug_reports)
bug_reports_df['description'] = bug_reports_df.description.apply(clean_bug_reports)
bug_reports_df["query"] = bug_reports_df["summary"] + bug_reports_df["description"]

In [11]:
bug_reports_df.head()

Unnamed: 0_level_0,fix,text,fixdate,summary,description,project,average_precision,query
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
23,[org.springframework.data.rest.webmvc.reposito...,,2012-07-31 11:48:16,searches which returns null gives npe,a search which returns null gives null pointer...,DATAREST,0.0,searches which returns null gives npea search ...
21,[org.springframework.data.rest.webmvc.reposito...,,2012-07-31 11:49:25,paging with parameters,missing parameter find by in href prev and nex...,DATAREST,0.0,paging with parametersmissing parameter find b...
41,[org.springframework.data.rest.webmvc.json.rep...,,2012-09-05 06:07:58,no update on objects,when i try to update an object the only think ...,DATAREST,0.0,no update on objectswhen i try to update an ob...
45,[org.springframework.data.rest.webmvc.entityto...,,2012-09-05 06:09:20,no i ds in href of objects displayed,browsing to http localhost 8080 rest sla resul...,DATAREST,0.0,no i ds in href of objects displayedbrowsing t...
38,[org.springframework.data.rest.webmvc.reposito...,,2012-09-05 06:29:43,listing relations with accept application x sp...,when loading a relation like this http localho...,DATAREST,0.0,listing relations with accept application x sp...


## Fix Filenames

In [16]:
def fix_sc_filename(filename):
    file = filename.split(".")
    file = '.'.join(file[2:])
    return file

def fix_bug_filename(files):
    result = []
    for file in files:
        path = file.split(".")
        path = '.'.join(path[-2:])
        result.append(path)
    return result

In [17]:
source_code_df['filename'] = source_code_df.filename.apply(fix_sc_filename)
# bug_reports_df['fix'] = bug_reports_df.fix.apply(fix_bug_filename)

In [21]:
print(source_code_df.filename.to_numpy()[0])

org.springframework.data.rest.webmvc.halbrowser.halbrowserintegrationtests.java


# VSM and rVSM calculations

In [8]:
projects = source_code_df.project.unique()
source_code_grouped = source_code_df.groupby(source_code_df.project)
bug_reports_grouped = bug_reports_df.groupby(bug_reports_df.project)

In [9]:
print(projects)

['DATAREST' 'CONFIGURATION' 'SEC' 'LDAP' 'IO' 'ELY' 'LANG' 'SPR'
 'DATACMNS' 'SOCIALFB' 'DATAMONGO' 'COLLECTIONS']


In [10]:
def calc_VSM(document, query):
    return cosine_similarity(query, document).flatten()

def normalize_terms(terms):
    x_min = min(terms)
    x_max = max(terms)
    for idx, x in enumerate(terms):
        normalized_term = (x - x_min)/(x_max - x_min)
        terms[idx] = normalized_term

def inverse_logit_function(terms):
    for idx, term in enumerate(terms):
        inverse_term = 1 / (1 + math.exp(-1 * term))
        terms[idx] = inverse_term

def calc_rVSM(document, query, gterms):
    result = calc_VSM(document, query)
    for i in range(len(result)):
        result[i] = gterms[i] * result[i]

    return result

def check_similarities(source_code, queries):
    scores_VSM = []
    scores_rVSM = []
    vect = TfidfVectorizer(min_df=1).fit(source_code['processed_code'])
    sc_vect = vect.transform(source_code['processed_code'])

    terms = []
    for doc in source_code['processed_code']:
        terms.append(len(set(doc.split(' '))))

    normalize_terms(terms)
    inverse_logit_function(terms)
    
    for query in queries['query']:
        qr_vect = vect.transform([query])
        score_VSM = calc_VSM(sc_vect, qr_vect)
        score_rVSM = calc_rVSM(sc_vect, qr_vect, terms)
        scores_VSM.append(score_VSM)
        scores_rVSM.append(score_rVSM)

    return scores_VSM, scores_rVSM

## Evaluation

In [50]:
# Input: 
#   scores: Array of scores created by cosine similarity function
# Output:
#   Array of score indexes sorted in descending order
def rank_scores(scores):
    sorted_scores = np.array(scores).argsort()
    return np.flip(sorted_scores)

# Map fix files to files in source code (if there is one)
# Input: 
#   fixes: Array of filenames representing the fixes of bugs
#   filenames:  Array of filenames representing the source code files
# Output:
#   result: Array of indexes that map the respective fix file to the file in the source code
def get_fix_index(fixes, filenames):
    result = []
    for fix in fixes:
        fix_index = -1
        for idx, filename in enumerate(filenames):
            if filename == fix:
                fix_index = idx
                break
        result.append(fix_index)
    return result


In [26]:
def get_fix_rank(fixes_idx, file_rankings):
    result = []
    for fix_idx in fixes_idx:
        fix_file_rank = sys.maxsize
        for idx, rank in enumerate(file_rankings):
            if fix_idx == rank:
                fix_file_rank = idx + 1
                break
        result.append(fix_file_rank)
    return result

# Given the ranks, get reciprocal of first one
def calc_reciprocal_rank(ranks):
    sorted_ranks = sorted(ranks)
    if sorted_ranks[0] != sys.maxsize:
        return 1/(sorted_ranks[0])
    return 0

def calc_mean_average_precision(ranks):
    sorted_ranks = sorted(ranks)
    pos_instances = 0
    result = 0
    for rank in sorted_ranks:
        if rank == sys.maxsize:
            break
        pos_instances += 1
        result += pos_instances/rank
        
    if pos_instances != 0: 
        return result/pos_instances
    else:
        return 0



In [31]:
sorted(get_fix_rank([0,1,2,0], [1,3,2]))

[1, 3, 9223372036854775807, 9223372036854775807]

In [46]:
def run_eval(sc_project_df, br_project_df, scores, my_dict):
    ranked_scores = []
    fix_index = []
    fix_rank = []
    reciprocal_rank = []
    average_precision = []

    idx = 0
    for _, row in br_project_df.iterrows():
        ranked_scores.append(rank_scores(scores[idx]))
        fix_index.append(get_fix_index(row.fix, sc_project_df.filename))
        fix_rank.append(get_fix_rank(dict_VSM['fix_index'][-1], dict_VSM['ranked_score'][-1]))
        reciprocal_rank.append(calc_reciprocal_rank(dict_VSM['fix_rank'][-1]))
        average_precision.append(calc_mean_average_precision(dict_VSM['fix_rank'][-1]))
        idx += 1
    
    my_dict['ranked_score'].append(ranked_scores)
    my_dict['fix_index'].append(fix_index)
    my_dict['fix_rank'].append(fix_rank)
    my_dict['reciprocal_rank'].append(reciprocal_rank)
    my_dict['average_precision'].append(average_precision)


In [44]:
def start_alg():
    # scores = {name: [] for name in ('VSM', 'rVSM')}
    dict_VSM = {name: [] for name in ('score', 'ranked_score', 'fix_index', 'fix_rank', 'reciprocal_rank', 'average_precision')}
    dict_rVSM = {name: [] for name in ('score', 'ranked_score', 'fix_index', 'fix_rank', 'reciprocal_rank', 'average_precision')}
    for proj in projects:
        sc_project_df = source_code_grouped.get_group(proj)
        br_project_df = bug_reports_grouped.get_group(proj)

        scores_VSM, scores_rVSM = check_similarities(sc_project_df, br_project_df)
        dict_VSM['score'].append(scores_VSM)
        dict_rVSM['score'].append(scores_rVSM)

        run_eval(sc_project_df, br_project_df, scores_VSM, dict_VSM)
        run_eval(sc_project_df, br_project_df, scores_rVSM, dict_rVSM)

    return dict_VSM, dict_rVSM

In [47]:
dict_VSM, dict_rVSM = start_alg()

In [18]:
br_project_df = bug_reports_grouped.get_group('DATAREST')
print(br_project_df.fix.to_numpy()[9])

['org.springframework.data.rest.core.domain.mongodb.mongodbrepositoryconfig.java'
 'org.springframework.data.rest.webmvc.mongodb.mongodbrepositoryconfig.java'
 'org.springframework.data.rest.webmvc.json.persistententityjackson2module.java'
 'org.springframework.data.rest.webmvc.mongodb.mongowebtests.java']


In [36]:
# val = dict_VSM['average_precision']
# print(val)

# Visualization

In [49]:
eval_VSM_df = pd.DataFrame(dict_VSM)
eval_VSM_df.head()

Unnamed: 0,score,ranked_score,fix_index,fix_rank,reciprocal_rank,average_precision
0,"[[0.11169228189905728, 0.15166935046642802, 0....","[[301, 242, 218, 5, 294, 234, 13, 299, 142, 13...","[[149], [-1, -1, 168, -1, -1, 165, -1, 149, -1...","[[9223372036854775807], [9223372036854775807],...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[[0.0655187827268614, 0.02904409766625469, 0.0...","[[31, 112, 5, 107, 20, 110, 111, 158, 122, 213...","[[38, 116, 5], [56, 153], [144, 58], [77, 58, ...","[[9223372036854775807], [9223372036854775807],...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[[0.19094948245197083, 0.030562823039443982, 0...","[[1137, 132, 684, 0, 240, 409, 982, 87, 1227, ...","[[-1], [-1], [-1, -1], [-1], [-1], [-1], [-1, ...","[[9223372036854775807], [9223372036854775807],...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[[0.008177140553281413, 0.008190493821366726, ...","[[287, 425, 235, 70, 427, 384, 345, 241, 138, ...","[[-1, 235], [-1, -1], [-1, 388], [253, 69], [2...","[[9223372036854775807], [9223372036854775807],...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[[0.1049864675464989, 0.011959058963577173, 0....","[[76, 18, 41, 0, 37, 42, 54, 32, 7, 53, 59, 64...","[[-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1],...","[[9223372036854775807], [9223372036854775807],...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [48]:
for key in dict_VSM:
    print(len(dict_VSM[key]))

12
12
12
12
12
12
