In [44]:
import pandas as pd
import os
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [45]:
# Remove the stop words from the preprocessed_description column using nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [46]:
# Define the directory paths
directories = [
    ("..\\new_dataset\\eclipse", "..\\new_dataset\\eclipse"),
    ("..\\new_dataset\\firefox", "..\\new_dataset\\firefox"),
]

# PreProcessing

## 1- Convert the text to lower case

In [47]:
def convert_lower_case(data):
    """
    Convert the input data to a string and return its lowercase representation.

    Parameters:
    data (any): The input data to be converted to lowercase.

    Returns:
    str: The lowercase string representation of the input data.

    Example:
    >>> convert_lower_case("HELLO")
    'hello'
    >>> convert_lower_case(12345)
    '12345'
    """  
        
    return str(data).lower()

## 2- Remove punctuations from the text

In [48]:
def remove_punctuation(data):
    """
    Remove punctuation from the input data and replace it with a space.

    Parameters:
    data (str or np.ndarray): The input data from which punctuation will be removed.
                              It can be a string or a numpy array of strings.

    Returns:
    str or np.ndarray: The input data with punctuation replaced by spaces. The return type
                       matches the input type (str for string input, np.ndarray for numpy array input).

    Example:
    >>> remove_punctuation("Hello, World!")
    'Hello  World '
    >>> remove_punctuation(np.array(["Hello, World!", "Goodbye, World!"]))
    array(['Hello  World ', 'Goodbye  World '], dtype='<U15')
    """
    
    symbols = "!\"#$%&()*+-./:,;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


## 3- Remove Apostrophes from the text

In [49]:
def remove_apostrophe(data):
    """
    Remove all apostrophes from the input data.

    Parameters:
    data (str or np.ndarray): The input data from which apostrophes will be removed.
                              It can be a string or a numpy array of strings.

    Returns:
    str or np.ndarray: The input data with apostrophes removed. The return type
                       matches the input type (str for string input, np.ndarray for numpy array input).

    Example:
    >>> remove_apostrophe("It's a beautiful day!")
    'Its a beautiful day!'
    >>> remove_apostrophe(np.array(["It's a beautiful day!", "You're amazing!"]))
    array(['Its a beautiful day!', 'Youre amazing!'], dtype='<U20')
    """
    
    return np.char.replace(data, "'", "")

## 4- Remove Stopwords from the Text

In [50]:
def remove_stopwords(data):
    """
    Remove stopwords from the input text data.

    Args:
    - data (str or pandas.Series): Input text data from which stopwords are to be removed.

    Returns:
    - str or pandas.Series: Processed text data with stopwords removed.
    """

    # If `data` is a string, tokenize it and remove stopwords.
    if isinstance(data, str):
        return ' '.join([word for word in data.split() if word not in stop_words])
    
    # If `data` is a pandas.Series, apply the lambda function to each element.
    elif isinstance(data, pd.Series):
        return data.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    else:
        raise ValueError("Unsupported data type. Expected string or pandas.Series.")


## 5- Remove the numbers from the text

In [51]:
def remove_numbers(data):
    """
    Remove all numbers from the input data.

    Parameters:
    data (str): The input data from which numbers will be removed. It can be any data type,
                but it will be converted to a string.

    Returns:
    str: The input data with numbers removed.

    Example:
    >>> remove_numbers("There are 123 apples")
    'There are  apples'
    >>> remove_numbers("4567 Elm Street")
    ' Elm Street'
    """
    
    return re.sub(r'\d+', '', str(data))

## 6- Remove the Single Characters from the text

In [52]:
def remove_single_characters(tokens):
    """
    Remove single-character tokens from the input list of tokens.

    Parameters:
    tokens (list of str): The input list of tokens from which single-character tokens will be removed.

    Returns:
    str: A string containing the tokens that have more than one character, separated by spaces.

    Example:
    >>> remove_single_characters(['a', 'hello', 'b', 'world'])
    ' hello world'
    >>> remove_single_characters(['I', 'am', 'a', 'GPT'])
    ' am GPT'
    """
    
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

## 7- Preform Lemmatization

In [53]:
def lemmatization(data):
    """
    Perform lemmatization on the input text data.

    Parameters:
    data (str): The input text data to be lemmatized.

    Returns:
    str: The lemmatized text.

    Example:
    >>> lemmatization("The striped bats are hanging on their feet for best")
    'The striped bat are hanging on their foot for best'
    """
    
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

## Perform Preprocessing

In [54]:
def preprocess(data):
    """
    Perform text preprocessing on the input data.

    Args:
    - data (str): Input text data to be preprocessed.

    Returns:
    - str: Processed text after applying the following steps:
      1. Convert all characters to lowercase.
      2. Remove punctuation marks.
      3. Remove apostrophes.
      4. Remove numerical digits.
      5. Lemmatize words to their base form.
    """
    
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    data = remove_stopwords(data)
    return data

## Example of performing preprocessing

In [55]:
bug_report = [
    """Description:
  Exception is thrown out when link a label to another label.

Test Date:
20080102

Test Build:
BIRT2.3.0 Daily Build: v20080102-1021

Step to reproduce:

1. Creat a label, set text as "label"
2. Set it's bookmark as "label"
3. Add another label "dde"and set its hyperlink as "internal bookmark", enter "label".
4. Preview report.

Error log:
org.eclipse.birt.report.service.api.ReportServiceException: Error happened while running the report; nested exception is: org.mozilla.javascript.EcmaError: ReferenceError: "label" is not defined. (#1) 
Show Exception Stack Trace 
Hide Exception Stack Trace  
Stack Trace:
 
org.mozilla.javascript.ScriptRuntime.constructError(ScriptRuntime.java:3350)
org.mozilla.javascript.ScriptRuntime.constructError(ScriptRuntime.java:3340)
org.mozilla.javascript.ScriptRuntime.notFoundError(ScriptRuntime.java:3413)
org.mozilla.javascript.ScriptRuntime.name(ScriptRuntime.java:1612)
org.mozilla.javascript.gen.c770._c0(:1)
org.mozilla.javascript.gen.c770.call()
org.mozilla.javascript.ContextFactory.doTopCall(ContextFactory.java:393)
org.mozilla.javascript.ScriptRuntime.doTopCall(ScriptRuntime.java:2834)
org.mozilla.javascript.gen.c770.call()
org.mozilla.javascript.gen.c770.exec()
org.eclipse.birt.core.script.ScriptContext.eval(ScriptContext.java:230)
org.eclipse.birt.core.script.ScriptContext.eval(ScriptContext.java:204)
org.eclipse.birt.report.engine.executor.ExecutionContext.evaluate(ExecutionContext.java:594)
org.eclipse.birt.report.engine.executor.ReportItemExecutor.evaluate(ReportItemExecutor.java:275)
org.eclipse.birt.report.engine.executor.ReportItemExecutor.processBookmark(ReportItemExecutor.java:295)
org.eclipse.birt.report.engine.executor.LabelItemExecutor.execute(LabelItemExecutor.java:63)
org.eclipse.birt.report.engine.internal.executor.dup.SuppressDuplicateItemExecutor.execute(SuppressDuplicateItemExecutor.java:42)
org.eclipse.birt.report.engine.internal.executor.wrap.WrappedReportItemExecutor.execute(WrappedReportItemExecutor.java:45)
org.eclipse.birt.report.engine.internal.executor.l18n.LocalizedReportItemExecutor.execute(LocalizedReportItemExecutor.java:33)
org.eclipse.birt.report.engine.layout.html.HTMLBlockStackingLM.layoutNodes(HTMLBlockStackingLM.java:63)
org.eclipse.birt.report.engine.layout.html.HTMLPageLM.layout(HTMLPageLM.java:85)
org.eclipse.birt.report.engine.layout.html.HTMLReportLayoutEngine.layout(HTMLReportLayoutEngine.java:106)
org.eclipse.birt.report.engine.api.impl.RunAndRenderTask.doRun(RunAndRenderTask.java:138)
org.eclipse.birt.report.engine.api.impl.RunAndRenderTask.run(RunAndRenderTask.java:66)
org.eclipse.birt.report.service.ReportEngineService.runAndRenderReport(Unknown Source)
org.eclipse.birt.report.service.ReportEngineService.runAndRenderReport(Unknown Source)
org.eclipse.birt.report.service.BirtViewerReportService.runAndRenderReport(Unknown Source)
org.eclipse.birt.report.service.actionhandler.BirtGetPageAllActionHandler.__execute(Unknown Source)
org.eclipse.birt.report.service.actionhandler.AbstractBaseActionHandler.execute(Unknown Source)
org.eclipse.birt.report.soapengine.processor.AbstractBaseDocumentProcessor.__executeAction(Unknown Source)
org.eclipse.birt.report.soapengine.processor.AbstractBaseComponentProcessor.executeAction(Unknown Source)
org.eclipse.birt.report.soapengine.processor.BirtDocumentProcessor.handleGetPageAll(Unknown Source)
sun.reflect.GeneratedMethodAccessor97.invoke(Unknown Source)
sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
java.lang.reflect.Method.invoke(Unknown Source)
org.eclipse.birt.report.soapengine.processor.AbstractBaseComponentProcessor.process(Unknown Source)
org.eclipse.birt.report.soapengine.endpoint.BirtSoapBindingImpl.getUpdatedObjects(Unknown Source)
sun.reflect.GeneratedMethodAccessor96.invoke(Unknown Source)
sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
java.lang.reflect.Method.invoke(Unknown Source)
org.apache.axis.providers.java.RPCProvider.invokeMethod(RPCProvider.java:397)
org.apache.axis.providers.java.RPCProvider.processMessage(RPCProvider.java:186)
org.apache.axis.providers.java.JavaProvider.invoke(JavaProvider.java:323)
org.apache.axis.strategies.InvocationStrategy.visit(InvocationStrategy.java:32)
org.apache.axis.SimpleChain.doVisiting(SimpleChain.java:118)
org.apache.axis.SimpleChain.invoke(SimpleChain.java:83)
org.apache.axis.handlers.soap.SOAPService.invoke(SOAPService.java:453)
org.apache.axis.server.AxisServer.invoke(AxisServer.java:281)
org.apache.axis.transport.http.AxisServlet.doPost(AxisServlet.java:699)
org.eclipse.birt.report.servlet.BirtSoapMessageDispatcherServlet.doPost(Unknown Source)
javax.servlet.http.HttpServlet.service(HttpServlet.java:616)
org.apache.axis.transport.http.AxisServletBase.service(AxisServletBase.java:327)
javax.servlet.http.HttpServlet.service(HttpServlet.java:689)
org.eclipse.equinox.http.registry.internal.ServletManager$ServletWrapper.service(ServletManager.java:180)
org.eclipse.equinox.http.servlet.internal.ServletRegistration.handleRequest(ServletRegistration.java:90)
org.eclipse.equinox.http.servlet.internal.ProxyServlet.processAlias(ProxyServlet.java:111)
org.eclipse.equinox.http.servlet.internal.ProxyServlet.service(ProxyServlet.java:59)
javax.servlet.http.HttpServlet.service(HttpServlet.java:689)
org.eclipse.equinox.http.jetty.internal.HttpServerManager$InternalHttpServiceServlet.service(HttpServerManager.java:270)
org.mortbay.jetty.servlet.ServletHolder.handle(ServletHolder.java:428)
org.mortbay.jetty.servlet.ServletHandler.dispatch(ServletHandler.java:677)
org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:568)
org.mortbay.http.HttpContext.handle(HttpContext.java:1530)
org.mortbay.http.HttpContext.handle(HttpContext.java:1482)
org.mortbay.http.HttpServer.service(HttpServer.java:909)
org.mortbay.http.HttpConnection.service(HttpConnection.java:820)
org.mortbay.http.HttpConnection.handleNext(HttpConnection.java:986)
org.mortbay.http.HttpConnection.handle(HttpConnection.java:837)
org.mortbay.http.SocketListener.handleConnection(SocketListener.java:245)
org.mortbay.util.ThreadedServer.handle(ThreadedServer.java:357)
org.mortbay.util.ThreadPool$PoolThread.run(ThreadPool.java:534)"""
]

In [56]:
# Example usage:
preprocessed_text = preprocess(bug_report[0])
print(preprocessed_text)


description exception thrown link label another label test date test build birt daily build v step reproduce creat label set text label set bookmark label add another label dde set hyperlink internal bookmark enter label preview report error log org eclipse birt report service api reportserviceexception error happened running report nested exception org mozilla javascript ecmaerror referenceerror label defined show exception stack trace hide exception stack trace stack trace org mozilla javascript scriptruntime constructerror scriptruntime java org mozilla javascript scriptruntime constructerror scriptruntime java org mozilla javascript scriptruntime notfounderror scriptruntime java org mozilla javascript scriptruntime name scriptruntime java org mozilla javascript gen c c org mozilla javascript gen c call org mozilla javascript contextfactory dotopcall contextfactory java org mozilla javascript scriptruntime dotopcall scriptruntime java org mozilla javascript gen c call org mozilla ja

## TF-IDF

In [57]:
def vectorize_data(X_train, X_test):
    """
    Vectorizes the text data using TF-IDF.

    Args:
    X_train (DataFrame): Training DataFrame.
    X_test (DataFrame): Testing DataFrame.

    Returns:
    tuple: Transformed training and testing data, and the vectorizer.

    Example:
    X_train, X_test = split_data(df)
    X_train_tfidf, X_test_tfidf, vectorizer = vectorize_data(X_train, X_test)
    """

    # Vectorize the text data using TF-IDF
    vectorizer = TfidfVectorizer()

    # Drop rows with missing values
    X_train = X_train.dropna(subset=['preprocessed_description'])
    X_test = X_test.dropna(subset=['preprocessed_description'])

    # Fit and transform the training data
    X_train_tfidf = vectorizer.fit_transform(X_train['preprocessed_description'])
    X_test_tfidf = vectorizer.transform(X_test['preprocessed_description'])


    return X_train_tfidf, X_test_tfidf, vectorizer


## Split the data to train and test

In [58]:
def split_data(df):
    """
    Splits the data into training and testing sets.

    Args:
    df (DataFrame): The input DataFrame.

    Returns:
    tuple: Training and testing DataFrames.

    Example:
    df = pd.read_csv("data/source/dir1/example.csv")
    X_train, X_test = split_data(df)
    """

    # Split the data into training and testing sets
    X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)

    # print the number of traning examples and testing examples
    print("Number of training examples: ", X_train.shape[0])
    print("Number of testing examples: ", X_test.shape[0])
        
    return X_train, X_test


## Reading the files

In [59]:
def read_and_print_csv(source_dir, file_name):
    """
    Reads a CSV file and prints the first 5 rows of the 'preprocessed_description' column.

    Args:
    source_dir (str): Source directory of the CSV file.
    file_name (str): Name of the CSV file.

    Returns:
    DataFrame: The loaded DataFrame from the CSV file.

    Example:
    source_dir = "data/source/dir1"
    file_name = "example.csv"
    df = read_and_print_csv(source_dir, file_name)
    """

    # Read the CSV file
    file_path = os.path.join(source_dir, file_name)
    df = pd.read_csv(file_path)

    # print the name of the file
    print("File name: ", file_name)
    
    return df


## preprocess all dataset

In [75]:
def process_duplicates(X_train, X_test, X_train_tfidf, X_test_tfidf, threshold):
    """
    Processes and predicts duplicates for test data based on a similarity threshold.

    Args:
    X_train (DataFrame): Training DataFrame.
    X_test (DataFrame): Testing DataFrame.
    X_train_tfidf (sparse matrix): TF-IDF transformed training data.
    X_test_tfidf (sparse matrix): TF-IDF transformed testing data.
    threshold (float): Similarity threshold for predicting duplicates.

    Returns:
    tuple: Dictionaries of predicted and true duplicates.

    Example:
    predicted_duplicates_dict, true_duplicates_dict = process_duplicates(X_train, X_test, X_train_tfidf, X_test_tfidf, 0.9)
    """

    # Intialize the dictionaries
    predicted_duplicates_dict = {}

    # print the similarity threshold
    print("Similarity Threshold:", threshold)

    # Iterate over the test data
    for i in range(X_test_tfidf.shape[0]):

        print("Test example:", i)
        
        # Get the bug_id of the test data
        test_bug_id = X_test['bug_id'].values[i]

        # Initialize the lists
        predicted_duplicates_dict[test_bug_id] = []
        
        # Calculate the cosine similarity between the test exapmle and the whole training set
        similarity = cosine_similarity(X_test_tfidf[i], X_train_tfidf)
        bug_ids = X_train['bug_id'].values[np.where(similarity >= threshold)[1]]

        # If there are no duplicates, add -1 to the list
        if len(bug_ids) == 0:
            bug_ids = [-1]

        # If there are duplicates, add them to the list
        if not np.isnan(test_bug_id):
            predicted_duplicates_dict[test_bug_id].extend(bug_ids)

            dup_ids = X_test['dup_id'].values[i]
            dup_ids = dup_ids.split(',')
            dup_ids = [dup_id.replace('[', '').replace(']', '').replace(' ', '') for dup_id in dup_ids]

            if '' in dup_ids:
                dup_ids.remove('')

            if dup_ids == ['[]']:
                dup_ids = [-1]
            dup_ids = [int(dup_id) for dup_id in dup_ids]

            if len(dup_ids) == 0:
                dup_ids = [-1]

            # Iterate over each duplicate in the predicted duplicates dictionary and get the description of each duplicate
            for dup_id in bug_ids:
                if dup_id != -1:
                    dup_description = X_train[X_train['bug_id'] == dup_id]['description'].values[0]
                    print("Duplicate bug_id:", dup_id)
                    print("Duplicate description:", dup_description)
                else:
                    print("No duplicates found")
                                    
    return predicted_duplicates_dict


## Different Similarities

In [76]:
def process_file(source_dir, target_dir, file_name):
    """
    Processes a given CSV file by reading, splitting, vectorizing, processing duplicates, and evaluating predictions.

    Args:
    source_dir (str): Source directory of the CSV file.
    target_dir (str): Target directory for saving processed results.
    file_name (str): Name of the CSV file.

    Example:
    source_dir = "data/source/dir1"
    target_dir = "data/target/dir1"
    file_name = "example.csv"
    process_file(source_dir, target_dir, file_name)
    """

    # Read the CSV file
    df = read_and_print_csv(source_dir, file_name)

    # Split the data into training and testing sets
    X_train, X_test = split_data(df)

    # Change the X_test here
    X_test = X_test.head(1)

    # Vectorize the text data using TF-IDF
    X_train_tfidf, X_test_tfidf, vectorizer = vectorize_data(X_train, X_test)

    similarity_thresholds = [0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6, 0.55, 0.5]

    # Process duplicates for the test data based on a similarity threshold
    for threshold in similarity_thresholds:

        # Process duplicates
        predicted_duplicates_dict = process_duplicates(X_train, X_test, X_train_tfidf, X_test_tfidf, threshold)

        # Print the predicted and true duplicates
        print("Predicted Duplicates:", predicted_duplicates_dict)
        


In [77]:
def iterate_directories(directories):
    """
    Iterates over each directory and file in the provided list of directories.
    Processes each CSV file found in the directories.

    Args:
    directories (list): List of tuples, each containing a source directory and a target directory.

    Example:
    directories = [("path/to/source/dir1", "path/to/target/dir1"), 
                   ("path/to/source/dir2", "path/to/target/dir2")]
    iterate_directories(directories)
    """

    # Iterate over the directories
    for source_dir, target_dir in directories:

        # Iterate over the files in the source directory
        for file_name in os.listdir(source_dir):
        
            # process only CSV files
            if file_name.endswith(".csv"):
                process_file(source_dir, target_dir, file_name)


In [78]:
# Example usage
iterate_directories(directories)

File name:  eclipse_small_new.csv
Number of training examples:  696
Number of testing examples:  174
Similarity Threshold: 0.95
Test example: 0
No duplicates found
Predicted Duplicates: {214519: [-1]}
Similarity Threshold: 0.9
Test example: 0
No duplicates found
Predicted Duplicates: {214519: [-1]}
Similarity Threshold: 0.85
Test example: 0
No duplicates found
Predicted Duplicates: {214519: [-1]}
Similarity Threshold: 0.8
Test example: 0
No duplicates found
Predicted Duplicates: {214519: [-1]}
Similarity Threshold: 0.75
Test example: 0
No duplicates found
Predicted Duplicates: {214519: [-1]}
Similarity Threshold: 0.7
Test example: 0
No duplicates found
Predicted Duplicates: {214519: [-1]}
Similarity Threshold: 0.65
Test example: 0
No duplicates found
Predicted Duplicates: {214519: [-1]}
Similarity Threshold: 0.6
Test example: 0
No duplicates found
Predicted Duplicates: {214519: [-1]}
Similarity Threshold: 0.55
Test example: 0
No duplicates found
Predicted Duplicates: {214519: [-1]}
Sim