In [1]:
import pandas as pd
import os
import numpy as np


### 1. Clean up the data files

In [2]:
# Define the directory paths
directories = [
    ("..\\dataset\\eclipse", "..\\new_dataset\\eclipse"),
    #("..\\dataset\\eclipse_test", "..\\new_dataset\\eclipse_test"),
    #("..\\dataset\\firefox", "..\\new_dataset\\firefox"),
    #("..\\dataset\\netbeans", "..\\new_dataset\\netbeans"),
    #("..\\dataset\\openoffice", "..\\new_dataset\\openoffice")
]

In [3]:
# Iterate over each directory
for source_dir, target_dir in directories:
    # Iterate over each file in the directory
    for file_name in os.listdir(source_dir):
        print(file_name)
        # Check if the file is a CSV file        
        if file_name.endswith(".csv") and "pairs" not in file_name:
            # Load the CSV file
            df = pd.read_csv(os.path.join(source_dir, file_name))
            
            # Check if the DataFrame is empty (end of file reached)
            if df.empty:
                print("End of file reached for:", file_name)
                continue
            
            # Create a new DataFrame with four columns
            new_df = pd.DataFrame(columns=["bug_id", "bug_severity", "description", "priority"])

            # Populate the new DataFrame with data from the loaded DataFrame
            new_df["bug_id"] = df["bug_id"]
            # if the file is firefox, skip the bug_severity column
            if "firefox" in source_dir:
                new_df["bug_severity"] = ""
            else:
                new_df["bug_severity"] = df["bug_severity"]
            new_df["description"] = df["description"]
            new_df["priority"] = df["priority"]

            # Save the new DataFrame to a new CSV file in the target directory
            new_file_path = os.path.join(target_dir, file_name.replace(".csv", "_new.csv"))
            new_df.to_csv(new_file_path, index=False)


eclipse_small.csv
eclipse_small_pairs.csv


### 2. Clean up the data pairs files

In [19]:
# Define the directory paths
directories = [
    ("..\\dataset\\eclipse", "..\\new_dataset\\eclipse"),
    ("..\\dataset\\firefox", "..\\new_dataset\\firefox"),
    ("..\\dataset\\netbeans", "..\\new_dataset\\netbeans"),
    ("..\\dataset\\openoffice", "..\\new_dataset\\openoffice")
]

In [22]:
# Iterate over each directory
for source_dir, target_dir in directories:
    # Iterate over each file in the directory
    for file_name in os.listdir(source_dir):
        print(file_name)
        # Check if the file is a CSV file and contains "pairs" in its name
        if file_name.endswith(".csv") and "pairs" in file_name:
            # Load the CSV file
            df = pd.read_csv(os.path.join(source_dir, file_name))
            
            # Check if the DataFrame is empty (end of file reached)
            if df.empty:
                print("End of file reached for:", file_name)
                continue
            
            # Filter out rows where the "duplicate" column is empty
            df = df.dropna(subset=['duplicate'])
            
            # Create a new DataFrame with two columns
            new_df = pd.DataFrame(columns=["issue_id", "duplicate"])

            # Populate the new DataFrame with data from the loaded DataFrame
            new_df["issue_id"] = df["issue_id"]
            new_df["duplicate"] = df["duplicate"]

            # Save the new DataFrame to a new CSV file in the target directory
            new_file_path = os.path.join(target_dir, file_name.replace(".csv", "_new.csv"))
            new_df.to_csv(new_file_path, index=False)

eclipse.csv
eclipse_pairs.csv
eclipse_small.csv
eclipse_small_pairs.csv
firefox.csv
firefox_pairs.csv
netbeans-Copia.csv
netbeans.csv
netbeans_pairs-Copia.csv
netbeans_pairs.csv
openoffice.csv
openoffice_pairs.csv


In [9]:
# Define the directory paths
directories = [
    ("..\\new_dataset\\eclipse", "..\\BOW_dataset\\eclipse"),
    ("..\\new_dataset\\firefox", "..\\BOW_dataset\\firefox"),
    ("..\\new_dataset\\netbeans", "..\\BOW_dataset\\netbeans"),
    ("..\\new_dataset\\openoffice", "..\\BOW_dataset\\openoffice")
]


In [18]:
# Function to find the index of a column in a CSV file
def find_column_index(csv_file_path, column_name):
    with open(csv_file_path, 'r') as file:
        header = file.readline().strip().split(',')  # Read the first line (header) and split it by commas
        try:
            return header.index(column_name)  # Find the index of the column name
        except ValueError:
            return None  # Return None if the column name is not found


In [24]:
# Iterate over each directory
for source_dir, target_dir in directories:
    # Iterate over each file in the directory
    for file_name in os.listdir(source_dir):
        # Check if the file is a CSV file and not contains "pairs" in its name
        if file_name.endswith(".csv") and "pairs" not in file_name:
            # Load the CSV file
            new_data = pd.read_csv(os.path.join(source_dir, file_name))
            print("Pairs file loaded:", file_name)

            # Get the corresponding new CSV file name
            new_file_name = file_name.replace(".csv", "_similarity.csv")
            new_file_path = os.path.join(target_dir, new_file_name)

            # Check if the new file exists
            if os.path.exists(new_file_path):
                # Load the new CSV file
                bug_reports_file = pd.read_csv(new_file_path)
                print("Original file loaded:", new_file_name)

                # Check if the DataFrame is empty (end of file reached)
                if new_data.empty:
                    print("End of file reached for:", file_name)
                    continue

                # Create an empty DataFrame to store results of bug_severity and priority similarity columns
                results_df = pd.DataFrame(columns=["issue_id", "duplicate_id", "similarity", "bug_severity_similarity", "priority_similarity"])

                # Iterate through each pair of bug reports
                for index, row in bug_reports_file.iterrows():
                    issue_id = row['issue_id']
                    duplicate_id = row['duplicate_id']

                    # Find the index of bug_severity and priority columns in the source DataFrame
                    bug_severity_index = find_column_index(new_file_path, 'bug_severity')
                    priority_index = find_column_index(new_file_path, 'priority')

                    # Find the bug severity and priority of issue_id and duplicate_id
                    issue_bug_severity = np.array(new_data.loc[new_data['bug_id'] == issue_id, 'bug_severity'])[0]
                    duplicate_bug_severity = np.array(new_data.loc[new_data['bug_id'] == duplicate_id, new_data.columns[bug_severity_index]])[0]

                    issue_priority = np.array(df.loc[df['bug_id'] == issue_id, df.columns[priority_index]])[0]
                    duplicate_priority = np.array(df.loc[df['bug_id'] == duplicate_id, df.columns[priority_index]])[0]

                    # Calculate bug severity and priority similarity
                    bug_severity_similarity = 1 if issue_bug_severity == duplicate_bug_severity else 0
                    priority_similarity = 1 if issue_priority == duplicate_priority else 0
                    
                    # Append the results to the DataFrame
                    results_df = results_df.append({"issue_id": issue_id, "duplicate_id": duplicate_id,
                                                    "similarity": row['similarity'], 
                                                    "bug_severity_similarity": bug_severity_similarity, 
                                                    "priority_similarity": priority_similarity}, ignore_index=True)

                # Save results DataFrame to the existing CSV file
                results_df.to_csv(new_file_path, index=False)
                print("Results saved to:", new_file_name)
            else:
                print("New CSV file not found for:", new_file_name)

Pairs file loaded: eclipse_new.csv
Original file loaded: eclipse_new_similarity.csv


IndexError: index 0 is out of bounds for axis 0 with size 0