In [1]:
import pandas as pd
import os
import numpy as np
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [2]:
# Remove the stop words from the preprocessed_description column using nltk
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Define the directory paths
directories = [
    ("..\\new_dataset\\eclipse", "..\\new_dataset\\eclipse"),
    ("..\\new_dataset\\eclipse_test", "..\\new_dataset\\eclipse_test"),
    ("..\\new_dataset\\firefox", "..\\new_dataset\\firefox"),
]

In [4]:
def convert_lower_case(data):
    """
    Convert the input data to a string and return its lowercase representation.

    Parameters:
    data (any): The input data to be converted to lowercase.

    Returns:
    str: The lowercase string representation of the input data.

    Example:
    >>> convert_lower_case("HELLO")
    'hello'
    >>> convert_lower_case(12345)
    '12345'
    """  
        
    return str(data).lower()

In [5]:
def remove_punctuation(data):
    """
    Remove punctuation from the input data and replace it with a space.

    Parameters:
    data (str or np.ndarray): The input data from which punctuation will be removed.
                              It can be a string or a numpy array of strings.

    Returns:
    str or np.ndarray: The input data with punctuation replaced by spaces. The return type
                       matches the input type (str for string input, np.ndarray for numpy array input).

    Example:
    >>> remove_punctuation("Hello, World!")
    'Hello  World '
    >>> remove_punctuation(np.array(["Hello, World!", "Goodbye, World!"]))
    array(['Hello  World ', 'Goodbye  World '], dtype='<U15')
    """
    
    symbols = "!\"#$%&()*+-./:,;<=>?@[\]^_`{|}~\n"
    for i in symbols:
        data = np.char.replace(data, i, ' ')

    return str(data)


In [6]:
def remove_apostrophe(data):
    """
    Remove all apostrophes from the input data.

    Parameters:
    data (str or np.ndarray): The input data from which apostrophes will be removed.
                              It can be a string or a numpy array of strings.

    Returns:
    str or np.ndarray: The input data with apostrophes removed. The return type
                       matches the input type (str for string input, np.ndarray for numpy array input).

    Example:
    >>> remove_apostrophe("It's a beautiful day!")
    'Its a beautiful day!'
    >>> remove_apostrophe(np.array(["It's a beautiful day!", "You're amazing!"]))
    array(['Its a beautiful day!', 'Youre amazing!'], dtype='<U20')
    """
    
    return np.char.replace(data, "'", "")

In [7]:
def remove_stopwords(data):
    """
    Remove stopwords from the input text data.

    Args:
    - data (str or pandas.Series): Input text data from which stopwords are to be removed.

    Returns:
    - str or pandas.Series: Processed text data with stopwords removed.
    """

    # If `data` is a string, tokenize it and remove stopwords.
    if isinstance(data, str):
        return ' '.join([word for word in data.split() if word not in stop_words])
    
    # If `data` is a pandas.Series, apply the lambda function to each element.
    elif isinstance(data, pd.Series):
        return data.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

    else:
        raise ValueError("Unsupported data type. Expected string or pandas.Series.")


In [8]:
def remove_numbers(data):
    """
    Remove all numbers from the input data.

    Parameters:
    data (str): The input data from which numbers will be removed. It can be any data type,
                but it will be converted to a string.

    Returns:
    str: The input data with numbers removed.

    Example:
    >>> remove_numbers("There are 123 apples")
    'There are  apples'
    >>> remove_numbers("4567 Elm Street")
    ' Elm Street'
    """
    
    return re.sub(r'\d+', '', str(data))

In [9]:
def remove_single_characters(tokens):
    """
    Remove single-character tokens from the input list of tokens.

    Parameters:
    tokens (list of str): The input list of tokens from which single-character tokens will be removed.

    Returns:
    str: A string containing the tokens that have more than one character, separated by spaces.

    Example:
    >>> remove_single_characters(['a', 'hello', 'b', 'world'])
    ' hello world'
    >>> remove_single_characters(['I', 'am', 'a', 'GPT'])
    ' am GPT'
    """
    
    new_text = ""
    for w in tokens:
        if len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [10]:
def lemmatization(data):
    """
    Perform lemmatization on the input text data.

    Parameters:
    data (str): The input text data to be lemmatized.

    Returns:
    str: The lemmatized text.

    Example:
    >>> lemmatization("The striped bats are hanging on their feet for best")
    'The striped bat are hanging on their foot for best'
    """
    
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(data)
    data = remove_single_characters(tokens)
    lemmatized_output = ' '.join([lemmatizer.lemmatize(word) for word in tokens])
    return lemmatized_output

In [11]:
def preprocess(data):
    """
    Perform text preprocessing on the input data.

    Args:
    - data (str): Input text data to be preprocessed.

    Returns:
    - str: Processed text after applying the following steps:
      1. Convert all characters to lowercase.
      2. Remove punctuation marks.
      3. Remove apostrophes.
      4. Remove numerical digits.
      5. Lemmatize words to their base form.
    """
    
    data = convert_lower_case(data)
    data = remove_punctuation(data)
    data = remove_apostrophe(data)
    data = remove_numbers(data)
    data = lemmatization(data)
    data = remove_stopwords(data)
    return data

In [12]:
bug_report = [
    """Description:
  Exception is thrown out when link a label to another label.

Test Date:
20080102

Test Build:
BIRT2.3.0 Daily Build: v20080102-1021

Step to reproduce:

1. Creat a label, set text as "label"
2. Set it's bookmark as "label"
3. Add another label "dde"and set its hyperlink as "internal bookmark", enter "label".
4. Preview report.

Error log:
org.eclipse.birt.report.service.api.ReportServiceException: Error happened while running the report; nested exception is: org.mozilla.javascript.EcmaError: ReferenceError: "label" is not defined. (#1) 
Show Exception Stack Trace 
Hide Exception Stack Trace  
Stack Trace:
 
org.mozilla.javascript.ScriptRuntime.constructError(ScriptRuntime.java:3350)
org.mozilla.javascript.ScriptRuntime.constructError(ScriptRuntime.java:3340)
org.mozilla.javascript.ScriptRuntime.notFoundError(ScriptRuntime.java:3413)
org.mozilla.javascript.ScriptRuntime.name(ScriptRuntime.java:1612)
org.mozilla.javascript.gen.c770._c0(:1)
org.mozilla.javascript.gen.c770.call()
org.mozilla.javascript.ContextFactory.doTopCall(ContextFactory.java:393)
org.mozilla.javascript.ScriptRuntime.doTopCall(ScriptRuntime.java:2834)
org.mozilla.javascript.gen.c770.call()
org.mozilla.javascript.gen.c770.exec()
org.eclipse.birt.core.script.ScriptContext.eval(ScriptContext.java:230)
org.eclipse.birt.core.script.ScriptContext.eval(ScriptContext.java:204)
org.eclipse.birt.report.engine.executor.ExecutionContext.evaluate(ExecutionContext.java:594)
org.eclipse.birt.report.engine.executor.ReportItemExecutor.evaluate(ReportItemExecutor.java:275)
org.eclipse.birt.report.engine.executor.ReportItemExecutor.processBookmark(ReportItemExecutor.java:295)
org.eclipse.birt.report.engine.executor.LabelItemExecutor.execute(LabelItemExecutor.java:63)
org.eclipse.birt.report.engine.internal.executor.dup.SuppressDuplicateItemExecutor.execute(SuppressDuplicateItemExecutor.java:42)
org.eclipse.birt.report.engine.internal.executor.wrap.WrappedReportItemExecutor.execute(WrappedReportItemExecutor.java:45)
org.eclipse.birt.report.engine.internal.executor.l18n.LocalizedReportItemExecutor.execute(LocalizedReportItemExecutor.java:33)
org.eclipse.birt.report.engine.layout.html.HTMLBlockStackingLM.layoutNodes(HTMLBlockStackingLM.java:63)
org.eclipse.birt.report.engine.layout.html.HTMLPageLM.layout(HTMLPageLM.java:85)
org.eclipse.birt.report.engine.layout.html.HTMLReportLayoutEngine.layout(HTMLReportLayoutEngine.java:106)
org.eclipse.birt.report.engine.api.impl.RunAndRenderTask.doRun(RunAndRenderTask.java:138)
org.eclipse.birt.report.engine.api.impl.RunAndRenderTask.run(RunAndRenderTask.java:66)
org.eclipse.birt.report.service.ReportEngineService.runAndRenderReport(Unknown Source)
org.eclipse.birt.report.service.ReportEngineService.runAndRenderReport(Unknown Source)
org.eclipse.birt.report.service.BirtViewerReportService.runAndRenderReport(Unknown Source)
org.eclipse.birt.report.service.actionhandler.BirtGetPageAllActionHandler.__execute(Unknown Source)
org.eclipse.birt.report.service.actionhandler.AbstractBaseActionHandler.execute(Unknown Source)
org.eclipse.birt.report.soapengine.processor.AbstractBaseDocumentProcessor.__executeAction(Unknown Source)
org.eclipse.birt.report.soapengine.processor.AbstractBaseComponentProcessor.executeAction(Unknown Source)
org.eclipse.birt.report.soapengine.processor.BirtDocumentProcessor.handleGetPageAll(Unknown Source)
sun.reflect.GeneratedMethodAccessor97.invoke(Unknown Source)
sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
java.lang.reflect.Method.invoke(Unknown Source)
org.eclipse.birt.report.soapengine.processor.AbstractBaseComponentProcessor.process(Unknown Source)
org.eclipse.birt.report.soapengine.endpoint.BirtSoapBindingImpl.getUpdatedObjects(Unknown Source)
sun.reflect.GeneratedMethodAccessor96.invoke(Unknown Source)
sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
java.lang.reflect.Method.invoke(Unknown Source)
org.apache.axis.providers.java.RPCProvider.invokeMethod(RPCProvider.java:397)
org.apache.axis.providers.java.RPCProvider.processMessage(RPCProvider.java:186)
org.apache.axis.providers.java.JavaProvider.invoke(JavaProvider.java:323)
org.apache.axis.strategies.InvocationStrategy.visit(InvocationStrategy.java:32)
org.apache.axis.SimpleChain.doVisiting(SimpleChain.java:118)
org.apache.axis.SimpleChain.invoke(SimpleChain.java:83)
org.apache.axis.handlers.soap.SOAPService.invoke(SOAPService.java:453)
org.apache.axis.server.AxisServer.invoke(AxisServer.java:281)
org.apache.axis.transport.http.AxisServlet.doPost(AxisServlet.java:699)
org.eclipse.birt.report.servlet.BirtSoapMessageDispatcherServlet.doPost(Unknown Source)
javax.servlet.http.HttpServlet.service(HttpServlet.java:616)
org.apache.axis.transport.http.AxisServletBase.service(AxisServletBase.java:327)
javax.servlet.http.HttpServlet.service(HttpServlet.java:689)
org.eclipse.equinox.http.registry.internal.ServletManager$ServletWrapper.service(ServletManager.java:180)
org.eclipse.equinox.http.servlet.internal.ServletRegistration.handleRequest(ServletRegistration.java:90)
org.eclipse.equinox.http.servlet.internal.ProxyServlet.processAlias(ProxyServlet.java:111)
org.eclipse.equinox.http.servlet.internal.ProxyServlet.service(ProxyServlet.java:59)
javax.servlet.http.HttpServlet.service(HttpServlet.java:689)
org.eclipse.equinox.http.jetty.internal.HttpServerManager$InternalHttpServiceServlet.service(HttpServerManager.java:270)
org.mortbay.jetty.servlet.ServletHolder.handle(ServletHolder.java:428)
org.mortbay.jetty.servlet.ServletHandler.dispatch(ServletHandler.java:677)
org.mortbay.jetty.servlet.ServletHandler.handle(ServletHandler.java:568)
org.mortbay.http.HttpContext.handle(HttpContext.java:1530)
org.mortbay.http.HttpContext.handle(HttpContext.java:1482)
org.mortbay.http.HttpServer.service(HttpServer.java:909)
org.mortbay.http.HttpConnection.service(HttpConnection.java:820)
org.mortbay.http.HttpConnection.handleNext(HttpConnection.java:986)
org.mortbay.http.HttpConnection.handle(HttpConnection.java:837)
org.mortbay.http.SocketListener.handleConnection(SocketListener.java:245)
org.mortbay.util.ThreadedServer.handle(ThreadedServer.java:357)
org.mortbay.util.ThreadPool$PoolThread.run(ThreadPool.java:534)"""
]

In [13]:
# Example usage:
preprocessed_text = preprocess(bug_report[0])
print(preprocessed_text)


description exception thrown link label another label test date test build birt daily build v step reproduce creat label set text label set bookmark label add another label dde set hyperlink internal bookmark enter label preview report error log org eclipse birt report service api reportserviceexception error happened running report nested exception org mozilla javascript ecmaerror referenceerror label defined show exception stack trace hide exception stack trace stack trace org mozilla javascript scriptruntime constructerror scriptruntime java org mozilla javascript scriptruntime constructerror scriptruntime java org mozilla javascript scriptruntime notfounderror scriptruntime java org mozilla javascript scriptruntime name scriptruntime java org mozilla javascript gen c c org mozilla javascript gen c call org mozilla javascript contextfactory dotopcall contextfactory java org mozilla javascript scriptruntime dotopcall scriptruntime java org mozilla javascript gen c call org mozilla ja

In [14]:
# Iterate over each directory
for source_dir, target_dir in directories:

    # Iterate over each file in the directory
    for file_name in os.listdir(source_dir):
        print(file_name)

        # Check if the file is a CSV file        
        if file_name.endswith(".csv"):
            
            # Load the CSV file
            df = pd.read_csv(os.path.join(source_dir, file_name))
            
            # Check if the DataFrame is empty (end of file reached)
            if df.empty:
                print("End of file reached for:", file_name)
                continue
            
            # Preprocess the bug report
            df['preprocessed_description'] = df['description'].apply(preprocess)

            # Save the preprocessed DataFrame to the existing CSV file in new column named 'preprocessed_description'
            df.to_csv(os.path.join(target_dir, file_name), index=False)
            print("Preprocessing done for:", file_name)
        else:
            print("Not a CSV file:", file_name)
            continue
    print("Preprocessing done for all files in:", source_dir)
print("Preprocessing done for all files in all directories")


eclipse_new.csv
Preprocessing done for: eclipse_new.csv
eclipse_small_new.csv
Preprocessing done for: eclipse_small_new.csv
Preprocessing done for all files in: ..\new_dataset\eclipse
eclipse_test_new.csv
Preprocessing done for: eclipse_test_new.csv
Preprocessing done for all files in: ..\new_dataset\eclipse_test
firefox_new.csv
Preprocessing done for: firefox_new.csv
Preprocessing done for all files in: ..\new_dataset\firefox
Preprocessing done for all files in all directories
