In [94]:
import re
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split


In [95]:
# Define the directory paths
directories = [
    ("..\\new_dataset\\eclipse"),
    #("..\\new_dataset\\firefox", "..\\BOW_dataset\\firefox"),
    #("..\\new_dataset\\netbeans", "..\\BOW_dataset\\netbeans"),
    #("..\\new_dataset\\openoffice", "..\\BOW_dataset\\openoffice")
]


In [96]:
# Function to preprocess bug description text
def preprocess_bug_description(description):

    # convert description to string
    description = str(description)
    
    # Convert to lowercase
    description = description.lower()

    # Remove punctuation and special characters
    description = re.sub(r'[^a-zA-Z0-9\s]', '', description)

    # Remove numbers
    description = re.sub(r'\b\d+\b', '', description)

    return description

In [97]:
# Preprocess Bug Reports
def preprocess_bug_reports(bug_reports):
    preprocessed_reports = []
    for report in bug_reports:
        # Implement your preprocessing steps here
        preprocessed_report = preprocess_bug_description(report)
        preprocessed_reports.append(preprocessed_report)
    return preprocessed_reports

In [98]:
def calculate_tfidf_similarity(issue_desc, duplicate_desc, tfidf_vectorizer):
    # Transform bug descriptions into TF-IDF vectors
    tfidf_issue = tfidf_vectorizer.transform([issue_desc])
    tfidf_duplicate = tfidf_vectorizer.transform([duplicate_desc])

    # Compute cosine similarity
    similarity = cosine_similarity(tfidf_issue, tfidf_duplicate)[0][0]
    return similarity

In [99]:
# Function to get bug descriptions based on bug IDs
def get_bug_description(bug_id, file_data):    
    
    return file_data[file_data['bug_id'] == bug_id]['description'].values[0]


In [100]:
# Initialize the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

def train_model(train_data, bug_reports_file):
    # Function to split duplicates based on ;
    def split_and_join(duplicates):
        return duplicates.split(';')

    # Apply the function to split duplicates and store them in a new variable
    train_data['duplicate'] = train_data['duplicate'].apply(split_and_join)

    # print each issue_id and its duplicates
    for index, row in train_data.iterrows():
        print(row['issue_id'], row['duplicate'])
    
    # Fit the TF-IDF vectorizer on the training data
    train_descriptions = [get_bug_description(issue_id, bug_reports_file) for issue_id in train_data['issue_id']]
    tfidf_vectorizer.fit(train_descriptions)

    print("1: ", preprocess_bug_description(get_bug_description(row['issue_id'], bug_reports_file)))
    print("12154: ",row['duplicate'] )
    
    print ("111: ", get_bug_description(row['duplicate'], bug_reports_file))
    print("2: ", preprocess_bug_description(get_bug_description(row['duplicate'], bug_reports_file)))

    # Compute TF-IDF similarity for training pairs
    train_data['tfidf_similarity'] = train_data.apply(lambda row: calculate_tfidf_similarity(
        preprocess_bug_description(get_bug_description(row['issue_id'], bug_reports_file)),
        preprocess_bug_description(get_bug_description(row['duplicate'], bug_reports_file)),
        tfidf_vectorizer
    ), axis=1)

    return train_data


In [101]:
def tfidf_similarity(bug1, bug2):

    # Apply TF-IDF to bug reports
    tfidf_matrix = tfidf_vectorizer.fit_transform([bug1, bug2])
    
    # Calculate cosine similarity
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

    return similarity[0][0]

In [102]:
def test_model(test_data, bug_reports_file):
    # Define a lambda function to compute TF-IDF similarity for each pair of bug reports
    compute_similarity = lambda row: tfidf_similarity(get_bug_description(row['issue_id'], bug_reports_file), 
                                                      get_bug_description(row['duplicate_id'], bug_reports_file))

    # Apply the lambda function to each row in the test data DataFrame to compute TF-IDF similarity
    test_data['tfidf_similarity'] = test_data.apply(compute_similarity, axis=1)

    return test_data


In [103]:
# Iterate over each directory
for source_dir in directories:
    # Iterate over each file in the directory
    for file_name in os.listdir(source_dir):
        # Check if the file is a CSV file and contains "_new_similarity" in its name
        if file_name.endswith(".csv") and "_pairs" in file_name:
            # Load the CSV file with similarity data
            pairs_data = pd.read_csv(os.path.join(source_dir, file_name))
            print("Pairs file loaded:", file_name)

            # Load the original bug reports file
            original_file_name = file_name.replace("_pairs.csv", "_new.csv")
            bug_reports_file = pd.read_csv(os.path.join(source_dir, original_file_name))
            print("Original bug reports file loaded:", original_file_name)

            # Split the data into training and testing sets (80% for training, 20% for testing)
            train_data, test_data = train_test_split(pairs_data, test_size=0.2, random_state=42)

            # Train the model
            train_data = train_model(train_data, bug_reports_file)

            # Test the model
            test_data = test_model(test_data, bug_reports_file)

            # Print the updated DataFrame with TF-IDF similarity scores
            print("Updated training data:", train_data.head())
            print("Updated testing data:", test_data.head())


Pairs file loaded: eclipse_small_pairs.csv
Original bug reports file loaded: eclipse_small_new.csv
214429 ['214074']
214577 ['214074']
214297 ['214074']
214328 ['214074']
214988 ['214989']
214855 ['214074']
214713 ['214074']
214748 ['214074']
215052 ['214074']
214505 ['214074']
214089 ['214080']
214173 ['214074']
214755 ['214744']
214876 ['214074']
214197 ['214074']
214551 ['214074']
214837 ['214074']
215022 ['214074']
214687 ['214074']
214524 ['214074']
214118 ['214074']
214930 ['214074']
214209 ['214074']
214576 ['214074']
214289 ['214074']
214097 ['214074']
214691 ['214074']
215018 ['214074']
214309 ['214074']
214633 ['214074']
214563 ['214074']
214669 ['214074']
214772 ['214074']
214445 ['214451']
214631 ['214074']
214296 ['214074']
214500 ['214074']
214632 ['214074']
214135 ['214074']
214580 ['214578']
214235 ['214074']
214366 ['214074']
214836 ['214074']
214153 ['214074']
214219 ['214074']
214976 ['214074']
214764 ['214074']
214862 ['214759']
214897 ['214074']
214806 ['214074']
2

ValueError: ('Lengths must match to compare', (878,), (1,))