In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import pickle
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk


In [2]:
# Remove the stop words from the preprocessed_description column using nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# PreProcessing

## 1- Convert the text to lower case

In [3]:
def convert_lower_case(data):
    """
    Convert the input data to a string and return its lowercase representation.

    Parameters:
    data (any): The input data to be converted to lowercase.

    Returns:
    str: The lowercase string representation of the input data.

    Example:
    >>> convert_lower_case("HELLO")
    'hello'
    >>> convert_lower_case(12345)
    '12345'
    """  
        
    return str(data).lower()

## 2- Remove punctuations from the text

In [4]:
def remove_punctuation(data):
    """
    Remove punctuation from the input data and replace it with a space.

    Parameters:
    data (str or np.ndarray): The input data from which punctuation will be removed.
                              It can be a string or a numpy array of strings.

    Returns:
    str or np.ndarray: The input data with punctuation replaced by spaces. The return type
                       matches the input type (str for string input, np.ndarray for numpy array input).

    Example:
    >>> remove_punctuation("Hello, World!")
    'Hello  World '
    >>> remove_punctuation(np.array(["Hello, World!", "Goodbye, World!"]))
    array(['Hello  World ', 'Goodbye  World '], dtype='<U15')
    """
    
    symbols = "!\"#$%&()*+-./:,;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


## 3- Remove Apostrophes from the text

In [5]:
def remove_apostrophe(data):
    """
    Remove all apostrophes from the input data.

    Parameters:
    data (str or np.ndarray): The input data from which apostrophes will be removed.
                              It can be a string or a numpy array of strings.

    Returns:
    str or np.ndarray: The input data with apostrophes removed. The return type
                       matches the input type (str for string input, np.ndarray for numpy array input).

    Example:
    >>> remove_apostrophe("It's a beautiful day!")
    'Its a beautiful day!'
    >>> remove_apostrophe(np.array(["It's a beautiful day!", "You're amazing!"]))
    array(['Its a beautiful day!', 'Youre amazing!'], dtype='<U20')
    """
    
    return np.char.replace(data, "'", "")

## 4- Remove Stopwords from the Text

In [6]:
def remove_stopwords(data):
    """
    Remove stopwords from the input text data.

    Args:
    - data (str or pandas.Series): Input text data from which stopwords are to be removed.

    Returns:
    - str or pandas.Series: Processed text data with stopwords removed.
    """

    # If `data` is a string, tokenize it and remove stopwords.
    if isinstance(data, str):
        return ' '.join([word for word in data.split() if word not in stop_words])
    
    # If `data` is a pandas.Series, apply the lambda function to each element.
    elif isinstance(data, pd.Series):
        return data.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    else:
        raise ValueError("Unsupported data type. Expected string or pandas.Series.")


## 5- Remove the numbers from the text

In [7]:
def remove_numbers(data):
    """
    Remove all numbers from the input data.

    Parameters:
    data (str): The input data from which numbers will be removed. It can be any data type,
                but it will be converted to a string.

    Returns:
    str: The input data with numbers removed.

    Example:
    >>> remove_numbers("There are 123 apples")
    'There are  apples'
    >>> remove_numbers("4567 Elm Street")
    ' Elm Street'
    """
    
    return re.sub(r'\d+', '', str(data))

## 6- Remove the Single Characters from the text

In [8]:
def remove_single_characters(tokens):
    """
    Remove single-character tokens from the input list of tokens.

    Parameters:
    tokens (list of str): The input list of tokens from which single-character tokens will be removed.

    Returns:
    str: A string containing the tokens that have more than one character, separated by spaces.

    Example:
    >>> remove_single_characters(['a', 'hello', 'b', 'world'])
    ' hello world'
    >>> remove_single_characters(['I', 'am', 'a', 'GPT'])
    ' am GPT'
    """
    
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

## 7- Preform Lemmatization

In [9]:
def lemmatization(data):
    """
    Perform lemmatization on the input text data.

    Parameters:
    data (str): The input text data to be lemmatized.

    Returns:
    str: The lemmatized text.

    Example:
    >>> lemmatization("The striped bats are hanging on their feet for best")
    'The striped bat are hanging on their foot for best'
    """
    
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

## Perform Preprocessing

In [10]:
def preprocess(data):
    """
    Perform text preprocessing on the input data.

    Args:
    - data (str): Input text data to be preprocessed.

    Returns:
    - str: Processed text after applying the following steps:
      1. Convert all characters to lowercase.
      2. Remove punctuation marks.
      3. Remove apostrophes.
      4. Remove numerical digits.
      5. Lemmatize words to their base form.
    """
    
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    data = remove_stopwords(data)
    return data

## Example of performing preprocessing

In [11]:
bug_report = [
    """Description:
  Exception is thrown out when link a label to another label.

Test Date:
20080102

Test Build:
BIRT2.3.0 Daily Build: v20080102-1021

Step to reproduce:

1. Creat a label, set text as "label"
2. Set it's bookmark as "label"
3. Add another label "dde"and set its hyperlink as "internal bookmark", enter "label".
4. Preview report.

Error log:
org.eclipse.birt.report.service.api.ReportServiceException: Error happened while running the report; nested exception is: org.mozilla.javascript.EcmaError: ReferenceError: "label" is not defined. (#1) 
Show Exception Stack Trace 
Hide Exception Stack Trace  
Stack Trace:
 
org.mozilla.javascript.ScriptRuntime.constructError(ScriptRuntime.java:3350)
org.mozilla.javascript.ScriptRuntime.constructError(ScriptRuntime.java:3340)
org.mozilla.javascript.ScriptRuntime.notFoundError(ScriptRuntime.java:3413)
org.mozilla.javascript.ScriptRuntime.name(ScriptRuntime.java:1612)
org.mozilla.javascript.gen.c770._c0(:1)
org.mozilla.javascript.gen.c770.call()
org.mozilla.javascript.ContextFactory.doTopCall(ContextFactory.java:393)
org.mozilla.javascript.ScriptRuntime.doTopCall(ScriptRuntime.java:2834)
org.mozilla.javascript.gen.c770.call()
org.mozilla.javascript.gen.c770.exec()
org.eclipse.birt.core.script.ScriptContext.eval(ScriptContext.java:230)
org.eclipse.birt.core.script.ScriptContext.eval(ScriptContext.java:204)
org.eclipse.birt.report.engine.executor.ExecutionContext.evaluate(ExecutionContext.java:594)
org.eclipse.birt.report.engine.executor.ReportItemExecutor.evaluate(ReportItemExecutor.java:275)
org.eclipse.birt.report.engine.executor.ReportItemExecutor.processBookmark(ReportItemExecutor.java:295)
org.eclipse.birt.report.engine.executor.LabelItemExecutor.execute(LabelItemExecutor.java:63)
org.eclipse.birt.report.engine.internal.executor.dup.SuppressDuplicateItemExecutor.execute(SuppressDuplicateItemExecutor.java:42)
org.eclipse.birt.report.engine.internal.executor.wrap.WrappedReportItemExecutor.execute(WrappedReportItemExecutor.java:45)
org.eclipse.birt.report.engine.internal.executor.l18n.LocalizedReportItemExecutor.execute(LocalizedReportItemExecutor.java:33)
org.eclipse.birt.report.engine.layout.html.HTMLBlockStackingLM.layoutNodes(HTMLBlockStackingLM.java:63)
org.eclipse.birt.report.engine.layout.html.HTMLPageLM.layout(HTMLPageLM.java:85)
org.eclipse.birt.report.engine.layout.html.HTMLReportLayoutEngine.layout(HTMLReportLayoutEngine.java:106)
org.eclipse.birt.report.engine.api.impl.RunAndRenderTask.doRun(RunAndRenderTask.java:138)
org.eclipse.birt.report.engine.api.impl.RunAndRenderTask.run(RunAndRenderTask.java:66)
org.eclipse.birt.report.service.ReportEngineService.runAndRenderReport(Unknown Source)
org.eclipse.birt.report.service.ReportEngineService.runAndRenderReport(Unknown Source)
org.eclipse.birt.report.service.BirtViewerReportService.runAndRenderReport(Unknown Source)
org.eclipse.birt.report.service.actionhandler.BirtGetPageAllActionHandler.__execute(Unknown Source)
org.eclipse.birt.report.service.actionhandler.AbstractBaseActionHandler.execute(Unknown Source)
org.eclipse.birt.report.soapengine.processor.AbstractBaseDocumentProcessor.__executeAction(Unknown Source)
org.eclipse.birt.report.soapengine.processor.AbstractBaseComponentProcessor.executeAction(Unknown Source)
org.eclipse.birt.report.soapengine.processor.BirtDocumentProcessor.handleGetPageAll(Unknown Source)
sun.reflect.GeneratedMethodAccessor97.invoke(Unknown Source)
sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
java.lang.reflect.Method.invoke(Unknown Source)
org.eclipse.birt.report.soapengine.processor.AbstractBaseComponentProcessor.process(Unknown Source)
org.eclipse.birt.report.soapengine.endpoint.BirtSoapBindingImpl.getUpdatedObjects(Unknown Source)
sun.reflect.GeneratedMethodAccessor96.invoke(Unknown Source)
sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
java.lang.reflect.Method.invoke(Unknown Source)
org.apache.axis.providers.java.RPCProvider.invokeMethod(RPCProvider.java:397)
org.apache.axis.providers.java.RPCProvider.processMessage(RPCProvider.java:186)
org.apache.axis.providers.java.JavaProvider.invoke(JavaProvider.java:323)
org.apache.axis.strategies.InvocationStrategy.visit(InvocationStrategy.java:32)
org.apache.axis.SimpleChain.doVisiting(SimpleChain.java:118)
org.apache.axis.SimpleChain.invoke(SimpleChain.java:83)
org.apache.axis.handlers.soap.SOAPService.invoke(SOAPService.java:453)
org.apache.axis.server.AxisServer.invoke(AxisServer.java:281)
org.apache.axis.transport.http.AxisServlet.doPost(AxisServlet.java:699)
org.eclipse.birt.report.servlet.BirtSoapMessageDispatcherServlet.doPost(Unknown Source)
javax.servlet.http.HttpServlet.service(HttpServlet.java:616)
org.apache.axis.transport.http.AxisServletBase.service(AxisServletBase.java:327)
javax.servlet.http.HttpServlet.service(HttpServlet.java:689)
org.eclipse.equinox.http.registry.internal.ServletManager$ServletWrapper.service(ServletManager.java:180)
org.eclipse.equinox.http.servlet.internal.ServletRegistration.handleRequest(ServletRegistration.java:90)
org.eclipse.equinox.http.servlet.internal.ProxyServlet.processAlias(ProxyServlet.java:111)
org.eclipse.equinox.http.servlet.internal.ProxyServlet.service(ProxyServlet.java:59)
javax.servlet.http.HttpServlet.service(HttpServlet.java:689)
org.eclipse.equinox.http.jetty.internal.HttpServerManager$InternalHttpServiceServlet.service(HttpServerManager.java:270)
org.mortbay.jetty.servlet.ServletHolder.handle(ServletHolder.java:428)
org.mortbay.jetty.servlet.ServletHandler.dispatch(ServletHandler.java:677)
org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:568)
org.mortbay.http.HttpContext.handle(HttpContext.java:1530)
org.mortbay.http.HttpContext.handle(HttpContext.java:1482)
org.mortbay.http.HttpServer.service(HttpServer.java:909)
org.mortbay.http.HttpConnection.service(HttpConnection.java:820)
org.mortbay.http.HttpConnection.handleNext(HttpConnection.java:986)
org.mortbay.http.HttpConnection.handle(HttpConnection.java:837)
org.mortbay.http.SocketListener.handleConnection(SocketListener.java:245)
org.mortbay.util.ThreadedServer.handle(ThreadedServer.java:357)
org.mortbay.util.ThreadPool$PoolThread.run(ThreadPool.java:534)"""
]

In [12]:
# Define the directory paths
directories = [
    ("..\\new_dataset - Copy\\eclipse", "..\\new_dataset - Copy\\eclipse"),
    ("..\\new_dataset - Copy\\firefox", "..\\new_dataset - Copy\\firefox"), 
]


In [13]:
# Example usage:
preprocessed_text = preprocess(bug_report[0])
print(preprocessed_text)

description exception thrown link label another label test date test build birt daily build v step reproduce creat label set text label set bookmark label add another label dde set hyperlink internal bookmark enter label preview report error log org eclipse birt report service api reportserviceexception error happened running report nested exception org mozilla javascript ecmaerror referenceerror label defined show exception stack trace hide exception stack trace stack trace org mozilla javascript scriptruntime constructerror scriptruntime java org mozilla javascript scriptruntime constructerror scriptruntime java org mozilla javascript scriptruntime notfounderror scriptruntime java org mozilla javascript scriptruntime name scriptruntime java org mozilla javascript gen c c org mozilla javascript gen c call org mozilla javascript contextfactory dotopcall contextfactory java org mozilla javascript scriptruntime dotopcall scriptruntime java org mozilla javascript gen c call org mozilla ja

## Loading the data set

In [14]:
def read_data(directory):
    """
    Read each bug ID and its corresponding duplicates, and make them as pairs.
    If the bug has no duplicates, then make -1 as the duplicate indicating no duplicate.

    Args:
    directory (str): Directory containing the CSV files.

    Returns:
    list: List of tuples containing bug ID, its description, and the descriptions of its duplicates.
    """
    bugs = []

    for file in os.listdir(directory):
        if file.endswith(".csv"):

            print("Reading", file, "...")

            df = pd.read_csv(os.path.join(directory, file))

            for i in range(len(df)):
                print(i)
                bug_id = df.iloc[i]["bug_id"]
                duplicates = df.iloc[i]["dup_id"]
                bug_id_des = df.iloc[i]["preprocessed_description"]

                # Make the bug_id and bug_id_des as a tuple
                bug_id = (bug_id, bug_id_des)

                if duplicates == "[]" or duplicates == "nan" or duplicates == "":
                    duplicates = -1
                # if the duplicates are nan float value, then make it as -1
                elif isinstance(duplicates, float) and np.isnan(duplicates):
                    duplicates = -1
                else:
                    # Extract the numbers from the duplicates string
                    # Get the preprocessed_description of each duplicate and make it as a tuple by iterating through the duplicates
                    duplicates = duplicates[1:-1]
                    duplicates = [int(d) for d in duplicates.split(",")]
                    duplicates = [(d, df[df["bug_id"] == d].iloc[0]["preprocessed_description"]) for d in duplicates]

                bugs.append((bug_id, duplicates))
    # print the bugs that has duplicates only
    '''
    for bug in bugs:
        if bug[1] != -1:
            print(bug)     
    '''
    return bugs

In [15]:
# test the function
# itrate over the directories and make the data in one list
bugs = []
for directory in directories:
    bugs.extend(read_data(directory[0]))

# print the first 5 bugs
for i in range(5):
    print(bugs[i])

# print the length of the bugs
print(len(bugs))

Reading eclipse_small_new.csv ...
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268


In [16]:
# Write the bugs to a text file with UTF-8 encoding
with open("bugs.txt", "w", encoding="utf-8") as f:
    for bug in bugs:
        f.write(str(bug) + "\n")


In [17]:
# write function to load the bugs from the text file
def load_bugs(file):
    """
    Load the bugs from the text file.

    Args:
    file (str): The file containing the bugs.

    Returns:
    list: List of tuples containing bug ID, its description, and the descriptions of its duplicates.
    """
    bugs = []

    with open(file, "r") as f:
        for line in f:
            bug = eval(line.strip())
            bugs.append(bug)

    return bugs


In [18]:
# load the bugs from the text file
bugs = load_bugs("bugs.txt")

# print the first 5 bugs
for i in range(5):
    print(bugs[i])

# print the length of the bugs
print(len(bugs))

((214065, 'description regression group toc created automatically preview web viewer pdf build number v step reproduce new table binding dataset new group table specify group toc preview web viewer pdf expected result default toc group actual result empty group toc created automatically preview error log'), -1)
((214070, 'output column page data set editor used resultsetcolumnhandle however cached resultsetcolumnhandle output column list could cause invalidation status resultsetcolumnhandle later better retrieve resultsetcolumnhandle request instead caching list'), -1)
((214068, 'description regression failed preview chart viewer example deploy environment build number v step reproduce deploy chart viewer sample war tomcat jboss preview http localhost chart viewer sample index jsp page expected result preview ok actual result exception occurs error log java lang noclassdeffounderror org apache batik transcoder errorhandler java lang class forname native method java lang class forname c

In [19]:
# make a function to make each bug_id and its preprocessed_description as key-value pair in dictionary
def make_dict(bugs):
    """
    Make each bug ID and its preprocessed description as a key-value pair in a dictionary.

    Args:
    bugs (list): List of tuples containing bug ID and its description.

    Returns:
    dict: Dictionary containing bug ID as key and its description as value.
    """
    bugs_dict = {}
    count = 0
    for bug in bugs:
        print(count)
        count += 1
        bugs_dict[bug[0][0]] = bug[0][1]

    return bugs_dict

In [20]:
# test the function
bug_pairs = make_dict(bugs)

# print the first 5 bug_pairs
for i, (k, v) in enumerate(bug_pairs.items()):
    print(k, v)
    if i == 5:
        break

# print the length of the bug_pairs
print(len(bug_pairs))


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [21]:
# write the bug_pairs to a text file
with open("bug_pairs.txt", "w") as f:
    for k, v in bug_pairs.items():
        f.write(str(k) + ": " + v + "\n")

In [22]:
# write function to load the bug_pairs from the text file
def load_bug_pairs(file):
    """
    Load the bug pairs from the text file.

    Args:
    file (str): The file containing the bug pairs.

    Returns:
    dict: Dictionary containing bug ID as key and its description as value.
    """
    bug_pairs = {}

    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(": ")
            bug_pairs[int(parts[0])] = parts[1]

    return bug_pairs

In [23]:
# load the bug_pairs from the text file
bug_pairs = load_bug_pairs("bug_pairs.txt")

# print the first 5 bug_pairs
for i, (k, v) in enumerate(bug_pairs.items()):
    print(k, v)
    if i == 5:
        break

# print the length of the bug_pairs
print(len(bug_pairs))

214065 description regression group toc created automatically preview web viewer pdf build number v step reproduce new table binding dataset new group table specify group toc preview web viewer pdf expected result default toc group actual result empty group toc created automatically preview error log
214070 output column page data set editor used resultsetcolumnhandle however cached resultsetcolumnhandle output column list could cause invalidation status resultsetcolumnhandle later better retrieve resultsetcolumnhandle request instead caching list
214068 description regression failed preview chart viewer example deploy environment build number v step reproduce deploy chart viewer sample war tomcat jboss preview http localhost chart viewer sample index jsp page expected result preview ok actual result exception occurs error log java lang noclassdeffounderror org apache batik transcoder errorhandler java lang class forname native method java lang class forname class java org eclipse birt

In [24]:
# make a function to take each bug_id and its duplicates and make them as pairs in a dictionary
# if there is duplicates, then make the the value of the pair as 1
# if there is no duplicates, then make the value of the pair as 0 and choose a random duplicate
# input example: ((214452, 'field method start underscore set grouping enabled preference problem get code assist code completion member'), [(214466, 'created attachment code assist grouping screenshot group completion option setting code assist turned suggest function suggest example preg match preg match preg replace callback preg replace htmlspecialchars decode htmlspecialchars look like skip name handling parent structure')])
# output example: {[214452, 214466]: 1}
# input example: ((214452, 'field method start underscore set grouping enabled preference problem get code assist code completion member'), -1)
# output example: {[214452, 214466]: 0}
import random

def make_pairs_dict(bugs):
    """
    Create a dictionary of bug pairs with values indicating if they are duplicates (1) or not (0).

    Args:
    bugs (list): List of tuples containing bug IDs, descriptions, and duplicates.

    Returns:
    dict: Dictionary of bug pairs with duplicate indicators.
    """
    pairs_dict = {}

    all_bug_ids = [bug[0][0] for bug in bugs]

    count = 0
    for bug in bugs:
        print(count)
        count += 1
        bug_id = bug[0][0]
        duplicates = bug[1]

        if duplicates != -1:
            for dup in duplicates:
                pair = (bug_id, dup[0])
                pairs_dict[pair] = 1
        else:
            # Choose a random bug ID for the non-duplicate pair
            random_bug_id = random.choice([id for id in all_bug_ids if id != bug_id])
            pair = (bug_id, random_bug_id)
            pairs_dict[pair] = 0

    return pairs_dict


In [25]:
# test the function
pairs = make_pairs_dict(bugs)

# print the first 5 pairs
for i, (k, v) in enumerate(pairs.items()):
    print(k, v)
    if i == 5:
        break

# print the length of the pairs
print(len(pairs))

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [26]:
# write the pairs to a text file
with open("pairs.txt", "w") as f:
    for k, v in pairs.items():
        f.write(str(k) + ": " + str(v) + "\n")

In [27]:
# write function to load the pairs from the text file
def load_pairs(file):
    """
    Load the pairs from the text file.

    Args:
    file (str): The file containing the pairs.

    Returns:
    dict: Dictionary of bug pairs with duplicate indicators.
    """
    pairs = {}

    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(": ")
            pair = eval(parts[0])
            duplicate = int(parts[1])
            pairs[pair] = duplicate

    return pairs


In [28]:
# load the pairs from the text file
pairs = load_pairs("pairs.txt")

# print the first 5 pairs
for i, (k, v) in enumerate(pairs.items()):
    print(k, v)
    if i == 5:
        break

# print the length of the pairs
print(len(pairs))

(214065, 214784) 0
(214070, 214972) 0
(214068, 214069) 1
(214072, 214988) 0
(214071, 214171) 0
(214069, 214068) 1
923


In [29]:
# make a function to split the data into train and test
def split_data(pairs, test_size=0.2):
    """
    Split the data into training and testing sets.

    Args:
    pairs (dict): Dictionary of bug pairs with duplicate indicators.
    test_size (float): Size of the testing set.

    Returns:
    tuple: Training and testing sets.
    """
    pairs_list = list(pairs.keys())
    labels = list(pairs.values())

    pairs_train, pairs_test, labels_train, labels_test = train_test_split(
        pairs_list, labels, test_size=test_size, random_state=44
    )

    return pairs_train, pairs_test, labels_train, labels_test


In [30]:
# test the function
train, test, labels_train, labels_test = split_data(pairs)
print(train[:3])
print(labels_train[:3])
print(test[:3])
print(labels_test[:3])

# print the number of train and test data
print(len(train))
print(len(test))

[(214600, 215035), (214785, 214272), (214984, 214203)]
[0, 0, 0]
[(214265, 214422), (214410, 214953), (214582, 214470)]
[0, 0, 0]
738
185


In [31]:

def extract_features(train, labels_train, bug_pairs):
    
    # Iterate over the training pairs
    features = []
    labels = []
    for pair, label in zip(train, labels_train):
        
        bug1 = bug_pairs[pair[0]]
        bug2 = bug_pairs[pair[1]]

        # Combine the descriptions of the two bugs
        combined = bug1 + " " + bug2

        features.append(combined)
        labels.append(label)

    return features, labels

In [32]:
# test the function
features, labels = extract_features(train, labels_train, bug_pairs)
print(features[:3])
print(labels[:3])


['bring target platform materializer build platform update site build id step reproduce built eclipse cdt installed ptp src downloaded plugins put eclipse install try launch simple ptp job get error note orte seems launched ok one machine white box entry com ibm jdge concurrency message execution error stack java lang nullpointerexception org eclipse ptp internal core pprocess outputdirpath pprocess java org eclipse ptp internal core pprocess deleteoutputfiles pprocess java org eclipse ptp internal core modelmanager newjob modelmanager java org eclipse ptp internal core modelmanager run modelmanager java org eclipse ptp launch internal parallellaunchconfigurationdelegate runtimeapplication parallellaunchconfigurationdelegate java org eclipse ptp launch internal parallellaunchconfigurationdelegate launch parallellaunchconfigurationdelegate java org eclipse ptp tau performance internal performancelaunchsteps performlaunch performancelaunchsteps java org eclipse ptp tau performance intern

## TF-IDF from sklearn

In [35]:
# make a function to extract the features from the text using tfidf
# We have each pair of bug and its duplicate concatenated as a single string and the label indicating if they are duplicates 1 or not 0
# I want to try all n-gram ranges till (6,7)
#ngram_ranges = [(1, 1), (1, 2), (1, 3) , (1,4) , (1,5) , (1,6) , (1,7) , (1,8) , (1,9) , (1,10) ,(1,11) , (1,12) , (1,13) , (1,14) , (1,15)
#                , (2,2) , (2,3) , (2,4) , (2,5) , (2,6) , (2,7) , (2,8) , (2,9) , (2,10) , (2,11) , (2,12) , (2,13) , (2,14) , (2,15) ]

def extract_tfidf_features(features, labels, ngram_range=(1, 1)):
    """
    Extract TF-IDF features from the text.

    Args:
    features (list): List of text data.
    labels (list): List of labels.
    ngram_range (tuple): Range of n-grams.

    Returns:
    tuple: TF-IDF features and labels.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(features)
    y = np.array(labels)

    return X, y, vectorizer

In [36]:
# function to save the vectorizer and the features
def save_features(X, y, vectorizer, file):
    """
    Save the features and labels to a file.

    Args:
    X (np.ndarray): Array of features.
    y (np.ndarray): Array of labels.
    vectorizer: The vectorizer used to extract features.
    file (str): File to save the features and labels.
    """
    with open(file, "wb") as f:
        pickle.dump((X, y, vectorizer), f)

In [37]:
#ngram_ranges = [(1, 1), (1, 2), (1, 3) , (1,4) , (1,5) , (1,6) , (1,7) , (1,8) , (1,9) , (1,10) ,(1,11) , (1,12) , (1,13) , (1,14) , (1,15)
#                , (2,2) , (2,3) , (2,4) , (2,5) , (2,6) , (2,7) , (2,8) , (2,9) , (2,10) , (2,11) , (2,12) , (2,13) , (2,14) , (2,15) ]
# iterate over the ngram_ranges and extract the features and save the vectorizer

'''
ngram_ranges = [(1, 1), (1, 2), (1, 3) , (1,4) , (1,5) , (1,6) , (1,7) 
                , (2,2) , (2,3) , (2,4) , (2,5) , (2,6) , (2,7)
                , (3,3) , (3,4) , (3,5) , (3,6) , (3,7)
                , (4,4) , (4,5) , (4,6) , (4,7)
                , (5,5) , (5,6) , (5,7)
                , (6,6) , (6,7) ]
'''

ngram_ranges = [(1, 1), (1, 2), (1, 3) ]

for ngram_range in ngram_ranges:
    X, y, vectorizer = extract_tfidf_features(features, labels, ngram_range=ngram_range)
    save_features(X, y, vectorizer, "features_" + str(ngram_range) + ".pkl")

In [38]:
# make a function to extract the features from the test data
def extract_test_features(test, bug_pairs):
    """
    Extract features from the test data.

    Args:
    test (list): List of test data.
    bug_pairs (dict): Dictionary of bug pairs.

    Returns:
    list: List of test features.
    """
    features = []
    for pair in test:
        bug1 = bug_pairs[pair[0]]
        bug2 = bug_pairs[pair[1]]

        combined = bug1 + " " + bug2
        features.append(combined)

    return features


In [39]:
# test the function
test_features = extract_test_features(test, bug_pairs)
print(test_features[:3])


['open http help eclipse org help index jsp painful locate specific topic content list sorted alphabetically bug added eclipse feed planet eclipse im longer blogging eclipse personal blog http litrik blogspot com search label eclipse instead eclipse related post appear new company blog please update feed url http feed feedburner com norio eclipse earliest convenience thanks litrik de roy', 'download eclipse sdk orbit orbit extract eclipse sdk one orbit create new hello world plug add dependency org apache batik transcoder bundle additionally add one class import org apache batik transcoder transcoder import org apache batik transcoder svgsvg svgtranscoder transcoder transcoder new svgtranscoder everything compiles happy open property plug look java build path expand org apache batik transcoder plug dependency see access rule defined org apache batik transcoder course exported package org apache batik transcoder orbit extracted plug longer compiles expanding org apache batik transcoder 

In [40]:
# make a function to extract the features from the test data using the vectorizer
def extract_tfidf_test_features(vectorizer, test_features):
    """
    Extract TF-IDF features from the test data.

    Args:
    vectorizer: TF-IDF vectorizer.
    test_features (list): List of test features.

    Returns:
    array: Test features.
    """
    X = vectorizer.transform(test_features)

    return X


In [41]:
# Iterate over the ngram_ranges and extract the test features
for ngram_range in ngram_ranges:
    with open("features_" + str(ngram_range) + ".pkl", "rb") as f:
        X, y, vectorizer = pickle.load(f)

    test_features = extract_test_features(test, bug_pairs)
    X_test = extract_tfidf_test_features(vectorizer, test_features)

    # Save the test features
    save_features(X_test, y, vectorizer, "test_features_" + str(ngram_range) + ".pkl")

In [42]:
# Load the test features and labels
def load_features(file):
    """
    Load the features and labels from a file.

    Args:
    file (str): File containing the features and labels.

    Returns:
    tuple: Features, labels, and vectorizer.
    """
    with open(file, "rb") as f:
        X, y, vectorizer = pickle.load(f)

    return X, y, vectorizer
    

In [43]:
# function to get the description of the bug from the file given the id
def get_bug_description(file, bug_id):
    """
    Get the description of a bug given its ID.

    Args:
    file (str): File containing the bug descriptions.
    bug_id (int): ID of the bug.

    Returns:
    str: Description of the bug.
    """
    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(": ")
            if int(parts[0]) == bug_id:
                return parts[1]

    return None

In [44]:
# Iterate over the files of the directories and make dictionary of the bug_id and its description

def make_bug_dict(directory):
    """
    Make a dictionary of bug IDs and their descriptions.

    Args:
    directory (str): Directory containing the bug descriptions.

    Returns:
    dict: Dictionary of bug IDs and descriptions.
    """
    bug_dict = {}
    for file in os.listdir(directory):
            if file.endswith(".csv"):

                print("Reading", file, "...")

                df = pd.read_csv(os.path.join(directory, file))

                for i in range(len(df)):
                    print(i)
                    bug_id = df.iloc[i]["bug_id"]
                    bug_id_des = df.iloc[i]["description"]
                    bug_dict[bug_id] = bug_id_des

    return bug_dict

In [45]:
# Iterate over the directories and make a dictionary of the bug_id and its description
directories = [
    ("..\\new_dataset - Copy\\eclipse", "..\\new_dataset - Copy\\eclipse"),
    ("..\\new_dataset - Copy\\firefox", "..\\new_dataset - Copy\\firefox"), 
]

bug_dict = {}
for directory in directories:
    bug_dict.update(make_bug_dict(directory[1]))

# print the first 5 bug_dict
for i, (k, v) in enumerate(bug_dict.items()):
    print(k, v)
    if i == 5:
        break

# print the length of the bug_dict
print(len(bug_dict))

Reading eclipse_small_new.csv ...
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268


In [46]:
# Write the bug dictionary to a text file
with open("bug_dict.txt", "w", encoding="utf-8") as f:
    for k, v in bug_dict.items():
        f.write(str(k) + ": " + v + "\n")

In [47]:
# Load the bug dictionary from the text file
def load_bug_dict(file):
    """
    Load the bug dictionary from the text file.

    Args:
    file (str): File containing the bug dictionary.

    Returns:
    dict: Dictionary of bug IDs and descriptions.
    """
    bug_dict = {}

    with open(file, "r") as f:
        for line in f:
            parts = line.strip().split(": ")
            bug_dict[int(parts[0])] = parts[1]

    return bug_dict


In [48]:
# Load each vectorizer and the features and make predictions
# make predictions using cosine similarity between each example of test data and the whole training data
# then take the top 10 similar examples and get their descriptions

# Load the test features and labels
for ngram_range in ngram_ranges:
    print("N-gram range:", ngram_range)
    X_test, y_test, vectorizer = load_features("test_features_" + str(ngram_range) + ".pkl")
    X_train, y_train, vectorizer = load_features("features_" + str(ngram_range) + ".pkl")

    # Calculate the cosine similarity between the test and training data
    similarities = cosine_similarity(X_test, X_train)

    # Get the top 10 most similar training examples for each test example
    top_similar = np.argsort(similarities, axis=1)[:, -3:]

    # Get the descriptions of the top similar examples
    for i, indices in enumerate(top_similar):
        print("Test example id:", test[i])
        for j, index in enumerate(indices):
            print("Similar example similarity:", similarities[i][index])
            print("Similar example id:", train[index])
            print("Similar example description:", features[index])
        print()
        

N-gram range: (1, 1)
Test example id: (214265, 214422)
Similar example similarity: 0.2617225829284653
Similar example id: (214356, 214677)
Similar example description: download stats show driver example appear candidate build stable build see attached hi wa wondering getting blog aggregated planeteclipse really blogged eclipse presenting eclipsecon thought may start little public blog eclipse doe aggregator work tagging post eclipse e use existing blog mixed non eclipse stuff dont really want start second blog dont thanks http markmelvin blogspot com
Similar example similarity: 0.3147296085611003
Similar example id: (214759, 214862)
Similar example description: hi ive decided move eclipse related blogging relation seam hibernate group blog eclipse related feed http relation servlets feed atom seam feedid represents eclipse tagged posting http relation blogger max process could tell get name changed max rydahl andersen plus get eclipse circle replaced look like picture max rydahl anders

In [49]:
# If I have new bug description , I want to perform the same preprocessing steps and then extract the features using the vectorizer
# then make predictions using cosine similarity between the new bug description and the whole training data
# then take the top 5 similar examples and get their descriptions

# Preprocess the new bug description
new_bug_description = """Created attachment 86296
Code Assist (grouping) screenshot

When "Group completion options" setting in Code Assist have been turned on it suggest only some functions.

It suggest only (for example): 
* preg_match_all (not preg_match)
* preg_replace_callback (not preg_replace) 
* htmlspecialchars_decode (not htmlspecialchars)

Looks like it skips some names handling them as parent in structure(?)
"""
new_bug_description = preprocess(new_bug_description)

# Extract features from the new bug description
new_bug_features = vectorizer.transform([new_bug_description])

# Calculate the cosine similarity between the new bug description and the training data
similarities = cosine_similarity(new_bug_features, X_train)

# Get the top 5 most similar training examples for the new bug description
# make the maximum similarity as the first element in the array
top_similar = np.argsort(similarities, axis=1)[:, -3:]

print("New bug description:", new_bug_description)

# Get the descriptions of the top similar examples
for i in range(top_similar.shape[1]):
    index = top_similar[0, i]
    print("Similar example similarity:", similarities[0, index])

    # To get the similar bug description, use bug_dict to get the description of the bug_id
    bug_id = train[index][0]
    bug_description = bug_dict[bug_id]
    print("Similar example description:", bug_description)

    print()
    
    

New bug description: created attachment code assist grouping screenshot group completion option setting code assist turned suggest function suggest example preg match preg match preg replace callback preg replace htmlspecialchars decode htmlspecialchars look like skip name handling parent structure
Similar example similarity: 0.8146182110123652
Similar example description: Only variables definitions contained in imperative operationa are 
propagated into environments.
Block expression var declarations should be supported too.


if(true)  then {
   var i : Integer := 10;
   i <> null; -- error is not resolved

} endif

Similar example similarity: 0.8757583198131241
Similar example description: Created attachment 86296
Code Assist (grouping) screenshot

When "Group completion options" setting in Code Assist have been turned on it suggest only some functions.

It suggest only (for example): 
* preg_match_all (not preg_match)
* preg_replace_callback (not preg_replace) 
* htmlspecialchars_d