In [1]:
file_path = 'mozilla_merged_file.csv'

# Pull every physical line of the file into a list and take its length.
# This counts text lines (the header line included), not parsed CSV records.
with open(file_path, 'r') as file:
    line_count = len(file.readlines())

print("Number of lines in the original file:", line_count)


Number of lines in the original file: 89662


In [2]:
import csv

file_path = 'mozilla_merged_file.csv'

# Open the CSV for reading. newline='' hands line-ending handling to the csv
# module (as its documentation requires), and the explicit utf-8 encoding
# matches the default pandas.read_csv uses for this same file, so the result
# does not depend on the platform's locale.
with open(file_path, 'r', newline='', encoding='utf-8') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)

    # The first record is the header row. Fall back to an empty list so an
    # empty file reports "no columns" instead of raising StopIteration.
    column_names = next(csv_reader, [])

print("Column names in the CSV file:", column_names)


Column names in the CSV file: ['Bug ID', 'Type', 'Summary', 'Product', 'Component', 'Assignee', 'Status', 'Resolution', 'Updated']


In [3]:
import pandas as pd

file_path = 'mozilla_merged_file.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Keep only rows whose 'Assignee' is not the literal value 'nobody'
# (presumably the tracker's "unassigned" placeholder -- confirm against the
# export). Rows with a missing Assignee compare as != 'nobody' and are kept.
df = df[df['Assignee'] != 'nobody']

# Write the filtered DataFrame to a new CSV file, without the index column
df.to_csv('filtered_file.csv', index=False)


In [5]:
file_path = 'filtered_file.csv'

# Pull every physical line of the file into a list and take its length.
# This counts text lines (the header line included), not parsed CSV records.
with open(file_path, 'r') as file:
    line_count = len(file.readlines())

print("Number of lines after filtering:", line_count)

Number of lines after filtering: 9124


In [1]:
import csv
import re

def remove_short_summary_columns(input_file, output_file, word_threshold=10, encoding='utf-8'):
  """
  Copy a CSV file, keeping only rows whose cleaned 'Summary' is long enough.

  Despite the function name, this filters *rows*, not columns. For every data
  row the 'Summary' field is cleaned in this order:

    1. hyperlinks are removed,
    2. every character that is neither a word character nor whitespace is removed,
    3. line breaks are replaced by a single space each.

  The row is written to the output (with the cleaned summary in place of the
  original) only if the cleaned summary has at least `word_threshold`
  whitespace-separated words. The header row is always copied unchanged.

  Rows that are too short to contain a 'Summary' field (for example blank
  lines) are skipped. If the input file is empty, an empty output file is
  produced.

  Args:
      input_file (str): Path to the input CSV file.
      output_file (str): Path to the output CSV file.
      word_threshold (int, optional): Minimum number of words required in the
          cleaned 'Summary' for a row to be kept. Defaults to 10.
      encoding (str, optional): Text encoding used for both files. Defaults to
          'utf-8', which is what pandas' to_csv writes by default.

  Raises:
      ValueError: If the header row has no 'Summary' column.
  """
  # Hyperlinks have to be stripped *before* punctuation: once ':' and '/' are
  # gone a URL can no longer be recognised and its remains would be counted as
  # a word. The quantifier is greedy so the whole URL is consumed, not just
  # its first character. The protocol (http/https) is optional.
  url_pattern = re.compile(r"(?:https?:)?//\S+")
  special_char_pattern = re.compile(r"[^\w\s]")
  # Handles Windows (\r\n), old Mac (\r) and Unix (\n) line breaks alike.
  line_break_pattern = re.compile(r"\r\n|\r|\n")

  with open(input_file, 'r', newline='', encoding=encoding) as infile, \
       open(output_file, 'w', newline='', encoding=encoding) as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    # Read the header row; an empty input simply yields an empty output
    header_row = next(reader, None)
    if header_row is None:
      return

    # Identify the index of the 'Summary' column (ValueError if absent)
    summary_index = header_row.index('Summary')

    # Write the header row to the output file
    writer.writerow(header_row)

    # Process each data row
    for row in reader:
      # Blank or truncated rows have no summary at all, so they cannot
      # meet the threshold; skip them instead of raising IndexError.
      if len(row) <= summary_index:
        continue

      # Clean the 'Summary' field
      clean_summary = url_pattern.sub("", row[summary_index])
      clean_summary = special_char_pattern.sub("", clean_summary)
      clean_summary = line_break_pattern.sub(" ", clean_summary)

      # Include row only if the cleaned summary meets the word threshold
      if len(clean_summary.split()) >= word_threshold:
        row[summary_index] = clean_summary  # Update row with cleaned summary
        writer.writerow(row)

# Run the cleaner: read 'filtered_file.csv' and write the surviving rows
# (with cleaned summaries) to 'filtered_file_2.csv', using the default threshold
input_file, output_file = 'filtered_file.csv', 'filtered_file_2.csv'
remove_short_summary_columns(input_file=input_file, output_file=output_file)

print(f"CSV file processed. Result saved in: {output_file}")

CSV file processed. Result saved in: filtered_file_2.csv


In [2]:
file_path = 'filtered_file_2.csv'

# Pull every physical line of the file into a list and take its length.
# This counts text lines (the header line included), not parsed CSV records.
with open(file_path, 'r') as file:
    line_count = len(file.readlines())

print("Number of lines after filtering:", line_count)

Number of lines after filtering: 3823
