In [3]:
import os
import pandas as pd

**Loading the data**

In [2]:
# Get the current working directory & Construct the relative path to train.csv
current_dir = os.getcwd() 
relative_path = os.path.join('..', 'data', 'kaggle_dataset','kaggle_dataset.csv')

In [4]:
kaggle_dataset = pd.read_csv(os.path.join(current_dir, relative_path))

**Exploring the data**

In [5]:
print(kaggle_dataset.shape)

(118643, 3)


In [6]:
print(kaggle_dataset.columns)

Index(['owner', 'issue_title', 'description'], dtype='object')


In [7]:
kaggle_dataset

Unnamed: 0,owner,issue_title,description
0,amit@chromium.org,"Scrolling with some scroll mice (touchpad, etc...",Product Version : <see about:version>URLs...
1,jon@chromium.org,Proxy causes some or all network requests to fail,Product Version : 0.2.149.27 (1583)URLs (...
2,pfeldman@chromium.org,"Web inspector button ""dock to main window"" doe...",Product Version : chrome beta 1URLs (if a...
3,jon@chromium.org,Habari admin interface is not rendered correctly,Product Version : 0.2.149.27 (1583)URLs (...
4,pkasting@chromium.org,Maximize on second larger monitor not working,Product Version : 0.2.149.27URLs (if appl...
...,...,...,...
118638,navabi@chromium.org,Launch clank_qa recipes to the waterfall,We had git trouble
118639,bulach@chromium.org,data race in ThreadWatcherListTest,r255322 is culprithttp://build.chromium.org/p/...
118640,pfeldman@chromium.org,window.console object should not be configurable,Recently sites have begun replacing window.con...
118641,ernstm@chromium.org,Windows GPU bots failing on multiple tests,All Windows GPU bots are failing a variety of ...


In [8]:
print(kaggle_dataset.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118643 entries, 0 to 118642
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   owner        118643 non-null  object
 1   issue_title  118642 non-null  object
 2   description  118642 non-null  object
dtypes: object(3)
memory usage: 2.7+ MB
None


**Delete duplicate data [if exist]**

In [9]:
kaggle_dataset.duplicated().any()

True

In [10]:
kaggle_dataset = kaggle_dataset.drop_duplicates()

In [11]:
kaggle_dataset.duplicated().any()

False

In [12]:
print(kaggle_dataset.shape)

(118587, 3)


**generates descriptive statistics of the data**

In [13]:
# generates descriptive statistics of the data
# include="O": This parameter is specifying that you want to include columns with object data type 
kaggle_dataset.describe(include = "O")

Unnamed: 0,owner,issue_title,description
count,118587,118586,118586
unique,2564,118012,116371
top,estade@chromium.org,Skia image rebaseline,See the link to graphs below.
freq,1249,38,1398


In [14]:
# compute the number of unique values for each column in the DataFrame training_data.
kaggle_dataset.nunique()

owner            2564
issue_title    118012
description    116371
dtype: int64

In [None]:
'''
number of unique titles is less than number of rows
so there are duplicate titles
remove rows of duplicate titles
'''

**Remove duplicate titles**

In [15]:
kaggle_dataset = kaggle_dataset.drop_duplicates(subset='issue_title')

In [16]:
print(kaggle_dataset.shape)

(118013, 3)


In [18]:
# compute the number of unique values for each column in the DataFrame training_data.
kaggle_dataset.nunique()

owner            2563
issue_title    118012
description    115812
dtype: int64

**generates descriptive statistics of the data**

In [17]:
# generates descriptive statistics of the data
# include="O": This parameter is specifying that you want to include columns with object data type 
kaggle_dataset.describe(include = "O")

Unnamed: 0,owner,issue_title,description
count,118013,118012,118012
unique,2563,118012,115812
top,estade@chromium.org,"Scrolling with some scroll mice (touchpad, etc...",See the link to graphs below.
freq,1243,1,1396


**Show the number of nulls in each column**

In [19]:
# Print the number of null values in each column in the training data
print(kaggle_dataset.isnull().sum())

owner          0
issue_title    1
description    1
dtype: int64


**Remove rows with null values**

In [20]:
# remove rows with missing values from the dataset
kaggle_dataset.dropna(inplace=True)
print(kaggle_dataset.isnull().sum())

owner          0
issue_title    0
description    0
dtype: int64


**Save cleaned data**

In [21]:
relative_path = os.path.join('..', 'data','kaggle_dataset' ,'cleaned_kaggle_dataset.csv')
kaggle_dataset.to_csv(os.path.join(current_dir, relative_path), index=False) # exclude the DataFrame index from being saved to the CSV file.

Get the number of lines in the dataset

In [49]:
file_path = 'classifier_data_0.csv'

# Open the file in read mode
with open(file_path, 'r') as file:
    # Use a loop to iterate over each line in the file
    line_count = sum(1 for line in file)

print("Number of lines in the original file:", line_count)


Number of lines in the original file: 118644


Get the columns of the dataset

In [51]:
import csv

file_path = 'classifier_data_0.csv'

# Open the CSV file in read mode
with open(file_path, 'r') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)
    
    # Read the first row of the CSV file
    column_names = next(csv_reader)

print("Column names in the classifier_data_0 file:", column_names)


Column names in the classifier_data_0 file: ['owner', 'issue_title', 'description']


Get the number of unique owners in the dataset

In [52]:
import csv

def get_unique_owner_count(csv_file):
    unique_owners = set()
    with open(csv_file, 'rb') as file:
        try:
            filtered_lines = (line.decode('utf-8-sig') for line in file if b'\x00' not in line)
            csv_reader = csv.DictReader(filtered_lines)
            for row in csv_reader:
                unique_owners.add(row['owner'])
        except Exception as e:
            print("Error occurred:", e)
    return len(unique_owners)

csv_file = 'classifier_data_0.csv'
unique_owner_count = get_unique_owner_count(csv_file)
print("Number of unique owners in classifier_data_0 file:", unique_owner_count)

Number of unique owners in classifier_data_0 file: 2564


Show 5 elements in the description column

In [53]:
import csv

def print_description_elements(csv_file, num_elements=5):
    with open(csv_file, 'r', encoding='utf-8-sig') as file:
        csv_reader = csv.DictReader(file)
        print(f"First {num_elements} elements of the 'description' column:")
        for row in csv_reader:
            print(row['description'])
            num_elements -= 1
            if num_elements == 0:
                break

csv_file = 'classifier_data_0.csv'
print_description_elements(csv_file, num_elements=5)

First 5 elements of the 'description' column:
Product Version      : <see about:version>URLs (if applicable) :0.2.149.27Other browsers tested: Firefox / IEAdd OK or FAIL after other browsers where you have tested this issue:Safari 3:    Firefox 3: OK         IE 7:OKWhat steps will reproduce the problem?1. Open any webpage on compaq 6715s running vista.2. Try scrolling with the touchpad3. Scrolling down will work , but up will not.What is the expected result?The page to scroll up.What happens instead?The page doesn't move.Please provide any additional information below. Attach a screenshot if possible.Only a minor bug. 
Product Version      : 0.2.149.27 (1583)URLs (if applicable) : http://www.igoogle.com,http://code.google.com/p/chromiumOther browsers tested:Add OK or FAIL after other browsers where you have tested this issue:Safari 3:    Firefox 3: OK         IE 7: OKWhat steps will reproduce the problem?1. Load http://www.igoogle.com/ (or any other google account page)2. Click the Sig

Show 5 elements in the title column

In [54]:
import csv

def print_title_elements(csv_file, num_elements=5):
    with open(csv_file, 'r', encoding='utf-8-sig') as file:
        csv_reader = csv.DictReader(file)
        print(f"First {num_elements} elements of the 'title' column:")
        for row in csv_reader:
            print(row['issue_title'])
            num_elements -= 1
            if num_elements == 0:
                break

csv_file = 'classifier_data_0.csv'
print_title_elements(csv_file, num_elements=5)

First 5 elements of the 'title' column:
Scrolling with some scroll mice (touchpad, etc.) scrolls down but not up
Proxy causes some or all network requests to fail
Web inspector button "dock to main window" does nothing
Habari admin interface is not rendered correctly
Maximize on second larger monitor not working


In [1]:
import csv
def filter_non_empty_rows(input_csv, output_csv):
    with open(input_csv, 'r', encoding='utf-8-sig') as file:
        # reads all the lines from the file into a list.
        lines = file.readlines()
        # Filter out lines containing NUL characters
        lines = [line for line in lines if '\0' not in line]
    # Open the output CSV file for writing
    with open(output_csv, 'w', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.writer(output_file)
        csv_writer.writerow(['owner', 'issue_title'])  # Write header
        
        # iterates over each line from the filtered list of lines.
        for line in lines:
            row = line.strip().split(',')
            if row[0] and row[1]:  # Check if both columns are not empty
                csv_writer.writerow(row)

input_csv_file = 'classifier_data_0.csv'
output_csv_file = 'output_csv_file.csv'

filter_non_empty_rows(input_csv_file, output_csv_file)
print(f"Filtered rows saved to '{output_csv_file}'")

Filtered rows saved to 'output_csv_file.csv'


In [6]:
import csv


def filter_non_empty_rows(input_csv, output_csv, delimiter=','):
    """
    Filters rows from a CSV file where the first two columns are not empty 
    and saves them to another CSV file.

    Args:
        input_csv (str): Path to the input CSV file.
        output_csv (str): Path to the output CSV file.
        delimiter (str, optional): Delimiter used in the CSV file (default is comma ',').
    """

    with open(input_csv, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()
        # Filter out lines containing NUL characters
        lines = [line for line in lines if '\0' not in line]

    with open(output_csv, 'w', newline='', encoding='utf-8') as output_file:
        csv_writer = csv.writer(output_file, delimiter=delimiter)

        # Write header row if it exists in the input file 
        # (assuming the first line is the header)
        header_row = next(csv.reader(open(input_csv, 'r', encoding='utf-8-sig'), delimiter=delimiter))
        if header_row:
            csv_writer.writerow(header_row)

        for line in lines:
            row = line.strip().split(delimiter)  # Split based on delimiter

            # Check if the first two columns are not empty
            if row[0] and row[1]:
                csv_writer.writerow(row)

input_csv_file = 'classifier_data_0.csv'
output_csv_file = 'output_csv_file.csv'

filter_non_empty_rows(input_csv_file, output_csv_file)
print(f"Filtered rows saved to '{output_csv_file}'")

Filtered rows saved to 'output_csv_file.csv'


In [7]:
file_path = 'output_csv_file.csv'

# Open the file in read mode
with open(file_path, 'r') as file:
    # Use a loop to iterate over each line in the file
    line_count = sum(1 for line in file)

print("Number of lines in the filtered file:", line_count)

Number of lines in the filtered file: 118567


In [8]:
import csv

file_path = 'output_csv_file.csv'

# Open the CSV file in read mode
with open(file_path, 'r') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)
    
    # Read the first row of the CSV file
    column_names = next(csv_reader)

print("Column names in the CSV filtered_csv_file:", column_names)

Column names in the CSV filtered_csv_file: ['owner', 'issue_title', 'description']


In [9]:
import csv
from collections import Counter

def filter_rows_by_owner_count(input_csv, output_csv, min_owner_count):
    """
    Filters rows from a CSV file based on the minimum occurrence count of an owner.

    Args:
        input_csv (str): Path to the input CSV file.
        output_csv (str): Path to the output CSV file where filtered rows will be saved.
        min_owner_count (int): Minimum number of times an owner must appear in the input file to be included in the output.
    """

    # Count occurrences of each owner
    owner_counts = Counter()

    with open(input_csv, 'r', encoding='utf-8-sig') as file:
        csv_reader = csv.reader(file)
        header = next(csv_reader)  # Skip header
        for row in csv_reader:
            if len(row) == len(header):  # Ensure the row has the correct number of elements
                owner_counts[row[0]] += 1  # Assuming 'owner' is the first column

    # Filter rows with owners that occur at least min_owner_count times
    filtered_rows = []

    with open(input_csv, 'r', encoding='utf-8-sig') as file:
        csv_reader = csv.reader(file)
        header = next(csv_reader)  # Skip header
        filtered_rows.append(header)
        for row in csv_reader:
            if len(row) == len(header) and owner_counts[row[0]] >= min_owner_count:
                filtered_rows.append(row)

    # Write filtered rows to the output CSV file (including all columns)
    with open(output_csv, 'w', newline='', encoding='utf-8') as file:
        csv_writer = csv.writer(file)
        csv_writer.writerows(filtered_rows)  # Write all rows, not just specific columns

input_csv_file = 'output_csv_file.csv'
output_csv_file = 'filtered_csv_file.csv'
min_owner_count = 5  # Minimum count of occurrences for an owner to be kept

filter_rows_by_owner_count(input_csv_file, output_csv_file, min_owner_count)
print(f"Filtered rows saved to '{output_csv_file}'")

Filtered rows saved to 'filtered_csv_file.csv'


In [10]:
import csv

file_path = 'filtered_csv_file.csv'

# Open the CSV file in read mode
with open(file_path, 'r') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)
    
    # Read the first row of the CSV file
    column_names = next(csv_reader)

print("Column names in the CSV filtered_csv_file:", column_names)

Column names in the CSV filtered_csv_file: ['owner', 'issue_title', 'description']


In [14]:
import csv

file_path = 'classifier_data_20.csv'
# Open the CSV file in read mode
with open(file_path, 'r') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)
    
    # Read the first row of the CSV file
    column_names = next(csv_reader)

print("Column names in the CSV classifier_data_20 file:", column_names)

Column names in the CSV classifier_data_20 file: ['owner', 'issue_title', 'description']


In [11]:
file_path = 'filtered_csv_file.csv'

# Open the file in read mode
with open(file_path, 'r') as file:
    # Use a loop to iterate over each line in the file
    line_count = sum(1 for line in file)

print("Number of lines in the filtered file:", line_count)

Number of lines in the filtered file: 35619


In [60]:
import csv

file_path = 'deep_data.csv'

# Open the CSV file in read mode
with open(file_path, 'r') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)
    
    # Read the first row of the CSV file
    column_names = next(csv_reader)

print("Column names in the CSV deep_data file:", column_names)

Column names in the CSV deep_data file: ['id', 'issue_id', 'issue_title', 'reported_time', 'owner', 'description']


In [15]:
import csv

def merge_csv_files(input_files, output_file):
    # Open the output CSV file in write mode
    with open(output_file, 'w', newline='', encoding='utf-8') as output_csv:
        csv_writer = csv.writer(output_csv)

        # Iterate over each input file
        for input_file in input_files:
            # Open the input CSV file
            with open(input_file, 'r', newline='', encoding='utf-8') as input_csv:
                # Filter out lines containing NUL characters
                filtered_lines = (line for line in input_csv if '\0' not in line)
                csv_reader = csv.reader(filtered_lines)
                
                # Write the rows from the input file to the output file
                csv_writer.writerows(csv_reader)

# List of input CSV files to merge
input_files = ['filtered_csv_file.csv', 'classifier_data_5.csv', 'classifier_data_10.csv', 'classifier_data_20.csv']

# Output CSV file where merged content will be written
output_file = 'merged_file.csv'

# Merge the CSV files
merge_csv_files(input_files, output_file)

print(f"Merged files saved to '{output_file}'")

Merged files saved to 'merged_file.csv'


In [16]:
import csv

file_path = 'merged_file.csv'

# Open the CSV file in read mode
with open(file_path, 'r') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)
    
    # Read the first row of the CSV file
    column_names = next(csv_reader)

print("Column names in the CSV file:", column_names)

Column names in the CSV file: ['owner', 'issue_title', 'description']


In [17]:
file_path = 'merged_file.csv'

# Open the file in read mode
with open(file_path, 'r') as file:
    # Use a loop to iterate over each line in the file
    line_count = sum(1 for line in file)

print("Number of lines in the merged file:", line_count)

Number of lines in the merged file: 377053


In [21]:
import csv
import re


def merge_filter_save_csv(input_csv, output_csv, min_word_count=10):
    """
    Merges 'issue_title' and 'description' columns into a new 'Summary' column,
    removes special characters, newlines, and hyperlinks from the Summary,
    filters rows where the Summary has at least min_word_count words,
    and saves the results to a new CSV file.

    Args:
        input_csv (str): Path to the input CSV file.
        output_csv (str): Path to the output CSV file.
        min_word_count (int, optional): Minimum word count for Summary column. Defaults to 10.
    """

    with open(input_csv, 'r', newline='') as infile, open(output_csv, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        # Read header row
        header = next(reader)

        # Identify indices of 'issue_title' and 'description' columns
        title_index = header.index('issue_title')
        desc_index = header.index('description')

        # Update header with 'Summary'
        header.insert(header.index('description') + 1, 'Summary')  # Insert before 'description'
        del header[desc_index]  # Remove 'description'
        del header[title_index]  # Remove 'issue_title'
        writer.writerow(header)

        # Define regular expressions for special characters, newlines, and hyperlinks
        special_char_pattern = r"[^\w\s]"
        url_pattern = r"(http|https)?://[^\s]+?"  # Matches URLs with optional protocol (http/https)

        # Process data rows
        for row in reader:
            summary = row[title_index] + " " + row[desc_index]  # Merge text with space

            # Clean the Summary text
            clean_summary = re.sub(special_char_pattern, "", summary)
            clean_summary = clean_summary.replace('\n', ' ')  # Replace newline with space
            clean_summary = re.sub(url_pattern, "", clean_summary)  # Remove hyperlinks

            word_count = len(clean_summary.split())

            if word_count >= min_word_count:
                row.insert(desc_index + 1, clean_summary)  # Insert cleaned Summary before 'description'
                del row[desc_index]  # Remove 'description'
                del row[title_index]  # Remove 'issue_title'
                writer.writerow(row)

# Example usage
input_csv_file = "merged_file.csv"
output_csv_file = "filtered_merged_cleaned_output.csv"

merge_filter_save_csv(input_csv_file, output_csv_file)
print(f"Filtered, merged, and cleaned CSV saved to: {output_csv_file}")

Filtered, merged, and cleaned CSV saved to: filtered_merged_cleaned_output.csv


In [22]:
import csv

file_path = 'merged_file_after_filteration.csv'

# Open the CSV file in read mode
with open(file_path, 'r') as file:
    # Create a CSV reader object
    csv_reader = csv.reader(file)
    
    # Read the first row of the CSV file
    column_names = next(csv_reader)

print("Column names in the CSV file:", column_names)

Column names in the CSV file: ['owner', 'Summary']


In [1]:
file_path = 'merged_file_after_filteration.csv'

# Open the file in read mode
with open(file_path, 'r') as file:
    # Use a loop to iterate over each line in the file
    line_count = sum(1 for line in file)

print("Number of lines in the merged file after filteration:", line_count)

FileNotFoundError: [Errno 2] No such file or directory: 'merged_file_after_filteration.csv'

In [24]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('merged_file_after_filteration.csv')

# Calculate the split indices
total_rows = len(df)
split_indices = [0] + [total_rows * i // 4 for i in range(1, 4)] + [total_rows]

# Split the DataFrame into four parts and save each part to a separate CSV file
for i in range(4):
    start_index = split_indices[i]
    end_index = split_indices[i+1]
    df_part = df.iloc[start_index:end_index]
    df_part.to_csv(f'merged_file_part_{i+1}.csv', index=False)
