In [1]:
import pandas as pd
import numpy as np
import re
import string
import math
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Load pickle files

In [2]:
bug_reports_filename = './Output/allBugReports.pickle'
bug_reports_df = pd.read_pickle(bug_reports_filename)

source_code_filename = './Output/allSourceCodes.pickle'
source_code_df = pd.read_pickle(source_code_filename)

# Preprocessing

In [3]:
bug_reports_df.head()


Unnamed: 0_level_0,fix,text,fixdate,summary,description,project,average_precision
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
23,[org.springframework.data.rest.webmvc.reposito...,,2012-07-31 11:48:16,Searches which returns null gives NPE,A search which returns null gives NullPointerE...,DATAREST,0.0
21,[org.springframework.data.rest.webmvc.reposito...,,2012-07-31 11:49:25,paging with parameters,missing parameter findBy in href .prev and .ne...,DATAREST,0.0
41,[org.springframework.data.rest.webmvc.json.rep...,,2012-09-05 06:07:58,No update on objects,"When I try to update an object, the only think...",DATAREST,0.0
45,[org.springframework.data.rest.webmvc.entityto...,,2012-09-05 06:09:20,No IDs in href of objects displayed,Browsing to: http://localhost:8080/rest/sla re...,DATAREST,0.0
38,[org.springframework.data.rest.webmvc.reposito...,,2012-09-05 06:29:43,Listing relations with Accept:application/x-sp...,When loading a relation like this: \nhttp://lo...,DATAREST,0.0


In [4]:
source_code_df.head()

Unnamed: 0,filename,unprocessed_code,project
0,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015 the original author or a...,DATAREST
1,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015-2016 the original author...,DATAREST
2,main.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015-2016 the original author...,DATAREST
3,test.java.org.springframework.data.rest.webmvc...,package org.springframework.data.rest.webmvc;\...,DATAREST
4,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2014-2015 the original author...,DATAREST


In [5]:
# add the Java key words to the stop words
java_keywords = ["abstract", "assert**","assert", "boolean", "break", "byte", "case", "catch", "char", "const", "continue", "default", "do", "double", "else", "enum", "enum****" "extends", "final", "finally", "for", "goto","goto*", "if", "implements", "import", "instanceof", "int","interface", "long", "native", "new", "package", "private", "protected", "public", "return", "short", "static", "strictfp**","strictfp", "super", "switch", "synchornized", "this", "throw", "throws", "transient", "try", "void", "volatile", "while"]
java_operators = ["+", "-", "*", "/", "%", "+=", "-=", "*=", "/=", "++", "--", "==", "!=", "<", ">", "<=", ">=", ".", "[", "]", "(",")", "!", "~","instanceof", "<<", ">>", ">>>", "&", "^", "|", "&&", "||", "?", ":", "^=", "%=", "<<=", ">>=", ">>>=", "&="]
stop = java_keywords + java_operators
#contains english stop words, java keywords and java operators
STOP_WORDS = ENGLISH_STOP_WORDS.union(stop)

In [7]:
def split_camel_case(text):
  words = text.split(' ')
  result = []
  
  for word in words:
    if len(word):
      # regex from https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
      result = result + re.split("(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])", word)
   
  return ' '.join(result)

def remove_trailing_and_single_dots(text):
  words = text.split(' ')
  result = []
  for word in words:
    if len(word) and word[-1] == '.':
      word = word[:-1]
    if len(word) and word[0] == '.':
      word = word[1:]
    result.append(word)
  
  return ' '.join(result)

def remove_comments(text):
  # regex found at https://blog.ostermiller.org/finding-comments-in-source-code-using-regular-expressions/
  result = re.sub("/\*([^*]|[\r\n]|(\*+([^*/]|[\r\n])))*\*+/", '', text)
  return result

def remove_punctuation(text):
  punctuation = "/:;<=>?@[\]^_`{|}~!\"#$%&'()*+,-"
  result = re.sub('['+punctuation+']', ' ', text)
  result = remove_trailing_and_single_dots(result)
  result = re.sub("\s+", ' ', result)
  return result

def remove_new_lines(text):
  result = []
  for line in text.splitlines():
    if len(line) <= 0:
      continue
    result.append(line)
  result = ' '.join(result)
  result = re.sub('\s+', ' ', result)
  return result

def stem_words(text):
  stemmer = PorterStemmer()   #"english"
  words = text.split(' ')
  result = []
  for word in words:
      if word not in STOP_WORDS:
          result.append(stemmer.stem(word))
  result = ' '.join(result)
  return result

In [8]:
def clean_unprocessed_code(text):
    # Remove new line breaks
    result = remove_new_lines(text)

    # Remove comments
    result = remove_comments(result)

    # Remove punctuation
    result = remove_punctuation(result)

    # Remove camelCase but not TitleCase
    result = split_camel_case(result)

    # Lowercase
    result = result.lower()

    # Stem words
    result = stem_words(result)

    return result.strip()

def clean_bug_reports(text):
    if not text:
        return ''
    # Remove new line breaks
    result = remove_new_lines(text)

    # Remove punctuation
    result = remove_punctuation(result)

    # Remove camelCase but not TitleCase
    result = split_camel_case(result)

    # Lowercase
    result = result.lower()

    # Stem words
    # result = stem_words(result)

    return result.strip()
    

In [9]:
source_code_df['processed_code'] = source_code_df.unprocessed_code.apply(clean_unprocessed_code)

In [10]:
source_code_df.head()

Unnamed: 0,filename,unprocessed_code,project,processed_code
0,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015 the original author or a...,DATAREST,org.springframework.data.rest.webmvc.halbrows ...
1,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015-2016 the original author...,DATAREST,org.springframework.data.rest.webmvc.halbrows ...
2,main.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2015-2016 the original author...,DATAREST,org.springframework.data.rest.webmvc.halbrows ...
3,test.java.org.springframework.data.rest.webmvc...,package org.springframework.data.rest.webmvc;\...,DATAREST,org.springframework.data.rest.webmvc org.hamcr...
4,test.java.org.springframework.data.rest.webmvc...,/*\n * Copyright 2014-2015 the original author...,DATAREST,org.springframework.data.rest.webmvc org.hamcr...


In [11]:
bug_reports_df['summary'] = bug_reports_df.summary.apply(clean_bug_reports)
bug_reports_df['description'] = bug_reports_df.description.apply(clean_bug_reports)
bug_reports_df["query"] = bug_reports_df["summary"] + bug_reports_df["description"]

In [12]:
bug_reports_df.head()

Unnamed: 0_level_0,fix,text,fixdate,summary,description,project,average_precision,query
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
23,[org.springframework.data.rest.webmvc.reposito...,,2012-07-31 11:48:16,searches which returns null gives npe,a search which returns null gives null pointer...,DATAREST,0.0,searches which returns null gives npea search ...
21,[org.springframework.data.rest.webmvc.reposito...,,2012-07-31 11:49:25,paging with parameters,missing parameter find by in href prev and nex...,DATAREST,0.0,paging with parametersmissing parameter find b...
41,[org.springframework.data.rest.webmvc.json.rep...,,2012-09-05 06:07:58,no update on objects,when i try to update an object the only think ...,DATAREST,0.0,no update on objectswhen i try to update an ob...
45,[org.springframework.data.rest.webmvc.entityto...,,2012-09-05 06:09:20,no i ds in href of objects displayed,browsing to http localhost 8080 rest sla resul...,DATAREST,0.0,no i ds in href of objects displayedbrowsing t...
38,[org.springframework.data.rest.webmvc.reposito...,,2012-09-05 06:29:43,listing relations with accept application x sp...,when loading a relation like this http localho...,DATAREST,0.0,listing relations with accept application x sp...


## Fix Filenames

In [13]:
def fix_sc_filename(filename):
    file = filename.split(".")
    file = '.'.join(file[-2:])
    return file

def fix_bug_filename(files):
    result = []
    for file in files:
        path = file.split(".")
        path = '.'.join(path[-2:])
        result.append(path)
    return result

In [14]:
source_code_df['filename'] = source_code_df.filename.apply(fix_sc_filename)
bug_reports_df['fix'] = bug_reports_df.fix.apply(fix_bug_filename)

# VSM and rVSM calculations

In [15]:
projects = source_code_df.project.unique()
source_code_grouped = source_code_df.groupby(source_code_df.project)
bug_reports_grouped = bug_reports_df.groupby(bug_reports_df.project)

In [25]:
def calc_VSM(document, query):
    return cosine_similarity(query, document).flatten()

def normalize_terms(terms):
    x_min = min(terms)
    x_max = max(terms)
    for idx, x in enumerate(terms):
        normalized_term = (x - x_min)/(x_max - x_min)
        terms[idx] = normalized_term

def inverse_logit_function(terms):
    for idx, term in enumerate(terms):
        inverse_term = 1 / (1 + math.exp(-1 * term))
        terms[idx] = inverse_term

def calc_rVSM(document, query, gterms):
    result = calc_VSM(document, query)
    for i in range(len(result)):
        result[i] = gterms[i] * result[i]

    return result

def check_similarities(source_code, queries):
    scores_VSM = []
    scores_rVSM = []
    vect = TfidfVectorizer(min_df=1).fit(source_code['processed_code'])
    sc_vect = vect.transform(source_code['processed_code'])

    terms = []
    for doc in source_code['processed_code']:
        terms.append(len(set(doc.split(' '))))

    normalize_terms(terms)
    inverse_logit_function(terms)
    
    for query in queries['query']:
        qr_vect = vect.transform([query])
        score_VSM = calc_VSM(sc_vect, qr_vect)
        score_rVSM = calc_rVSM(sc_vect, qr_vect, terms)
        scores_VSM.append(score_VSM)
        scores_rVSM.append(score_rVSM)

    return scores_VSM, scores_rVSM

## Evaluation

In [30]:
# Order the scores in descending order
def rank_scores(scores):
    for score in scores:
        sorted_scores = np.array(score).argsort()
        return np.flip(sorted_scores)

# Map fix files to files in source code (if there is one)
def get_fix_index(fixes, filenames):
    result = []
    for fix in fixes:
        fix_index = -1
        for idx, filename in enumerate(filenames):
            if filename == fix:
                fix_index = idx
                break
        result.append(fix_index)
    return result
            
            



In [29]:
val = rank_scores(dict_VSM['score'][0])
print(val)

[301 242 218   5 294 234  13 299 142 134 168 237 162 165 335 280   4 155
 289 215 201 285 211 164 127  11 172 133  29 174 244 269 186 194   2 205
  20 173 161 158 198  24  12 182 176 147  94 183 222 145 279 311 319   1
 312 318 321 320 324 317 314 313 203 144 329  26  91 160 286 316  56 323
 245  57   9 315 281 224 232 154  28 306 330  63 178 143 156 149  10 119
 216   0  15  52  23  55 223 220 177 175 192 180  17 210  21 322 328  27
 190 199 255 300  61 113 259  53  43  37 152 212  98 100 231 296 258 126
  25  22 150 167 132  16 225  67 125 148 283 264 151 239  19  54 226 227
  42  51  18  87 251 248 105  82 214 261  89 179 240 270 260 263  88  34
 169 277 171  81 102 106 188 292 197 153 146  73  40 166 181  93  75  90
 107   8  79 233 209 141 256 254 157 266 170  69   6  31 136 159  92   3
 112 327 137 278 120   7 104  45  71 288  14 297  44  86 121 122  66 191
 304 238 228  83  99 246  59 101 236 124 262 108  48 187 271 213 267  68
  96 184 295 250  41 111 115 298 268 293 110  97  7

In [23]:
def start_alg():
    # scores = {name: [] for name in ('VSM', 'rVSM')}
    dict_VSM = {name: [] for name in ('score', 'fix_index')}
    dict_rVSM = {name: [] for name in ('score', 'fix_index')}
    for proj in projects:
        sc_project_df = source_code_grouped.get_group(proj)
        br_project_df = bug_reports_grouped.get_group(proj)

        scores_VSM, scores_rVSM = check_similarities(sc_project_df, br_project_df)
        dict_VSM['score'].append(scores_VSM)
        dict_rVSM['score'].append(scores_rVSM)

        # for index, row in br_project_df.iterrows():
        #     dict_VSM['fix_index'].append(get_fix_rank(row.fix, dict_VSM['score'][index]))
        #     dict_rVSM['fix_index'].append(get_fix_rank(row.fix, dict_rVSM['score'][index]))

    return dict_VSM, dict_rVSM

In [None]:
def start_eval():
    NUM_PROJECTS = len(projects)
    for i in range(NUM_PROJECTS):
        project_scores_VSM = scores['VSM'][i]
        project_scores_rVSM = scores['rVSM'][i]

        

In [27]:
dict_VSM, dict_rVSM = start_alg()