In [1]:
import os
import pandas as pd
import re

**Loading and merging the data**

In [2]:
# Directory holding the CSV files to merge, relative to the current working directory
directory_path = os.path.join('..', 'data', 'mozilla_dataset', 'dataset_before_merge')

# Build the full path of every '.csv' file in that directory.
# 'os.listdir' returns entries in arbitrary order, so the names are sorted first to make
# the row order of the merged DataFrame reproducible across machines and runs.
file_paths = [
    os.path.join(directory_path, file)
    for file in sorted(os.listdir(directory_path))
    if file.endswith('.csv')
]

# Read each CSV file into its own DataFrame
dataframes = [pd.read_csv(file) for file in file_paths]

# Stack all DataFrames into a single one; 'ignore_index=True' builds a fresh 0..n-1 index
mozilla_dataset = pd.concat(dataframes, ignore_index=True)

**Exploring the data**

In [3]:
# Show the dimensions of 'mozilla_dataset' as a (rows, columns) tuple
print(mozilla_dataset.shape)

(89661, 9)


In [6]:
# Show the column labels of 'mozilla_dataset' (an Index object)
print(mozilla_dataset.columns)

Index(['Bug ID', 'Type', 'Summary', 'Product', 'Component', 'Assignee',
       'Status', 'Resolution', 'Updated'],
      dtype='object')


In [7]:
# Display 'mozilla_dataset' as the cell's output.
# The rendering is truncated (the middle rows are elided), which gives a quick look
# at the raw merged data before any cleaning is applied.
mozilla_dataset

Unnamed: 0,Bug ID,Type,Summary,Product,Component,Assignee,Status,Resolution,Updated
0,1124005,defect,CSS cached in style editor,DevTools,Style Editor,nobody,UNCONFIRMED,---,2023-09-14 13:00:12
1,1312250,defect,"Display non default properties before others, ...",DevTools,DOM,nobody,UNCONFIRMED,---,2024-02-22 23:15:00
2,1447966,enhancement,Add option to temporarily disable CSS features,DevTools,Inspector: Rules,nobody,UNCONFIRMED,---,2022-10-11 14:25:10
3,1457607,defect,"Firefox 59+ version developer tool freeze, can...",DevTools,Console,nobody,UNCONFIRMED,---,2022-10-11 14:32:37
4,1503225,defect,ASAN build - UI freezes when switching from Ne...,DevTools,Console,nobody,UNCONFIRMED,---,2022-10-11 15:09:19
...,...,...,...,...,...,...,...,...,...
89656,1810575,task,Investigate mdn_yari.page_v1 schema errors,Data Platform and Tools,General,nobody,REOPENED,---,2024-01-19 11:21:30
89657,1822095,task,Review buildid SQL queries,Data Platform and Tools,Glean Platform,nobody,REOPENED,---,2023-05-05 05:18:28
89658,1844886,defect,Airflow task bqetl_main_summary .client_probe_...,Data Platform and Tools,General,nobody,REOPENED,---,2023-12-15 10:33:38
89659,1879863,defect,structured missing columns in `firefox_desktop...,Data Platform and Tools,General,nobody,REOPENED,---,2024-02-23 13:04:20


In [8]:
# Show a concise summary of 'mozilla_dataset': the non-null count and dtype of each
# column, plus the memory usage of the DataFrame.
# 'info()' writes the summary to stdout itself and returns None, so it is called directly;
# wrapping it in 'print()' would append a stray "None" line to the output.
mozilla_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89661 entries, 0 to 89660
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Bug ID      89661 non-null  int64 
 1   Type        89661 non-null  object
 2   Summary     89661 non-null  object
 3   Product     89661 non-null  object
 4   Component   89661 non-null  object
 5   Assignee    89661 non-null  object
 6   Status      89661 non-null  object
 7   Resolution  89661 non-null  object
 8   Updated     89661 non-null  object
dtypes: int64(1), object(8)
memory usage: 6.2+ MB
None


**Check for duplicate rows [delete them if any exist]**

In [9]:
# Check whether 'mozilla_dataset' contains any duplicated rows.
# 'duplicated()' flags each row that repeats an earlier row;
# 'any()' collapses those flags into a single boolean.
mozilla_dataset.duplicated().any()

False

**Generate descriptive statistics of the data**

In [10]:
# Generate descriptive statistics (count, unique, top, freq) for the object-dtype columns.
# include="O" selects the object columns; at this point the frame also holds the
# int64 'Bug ID' column, which is left out of this summary.
mozilla_dataset.describe(include = "O")

Unnamed: 0,Type,Summary,Product,Component,Assignee,Status,Resolution,Updated
count,89661,89661,89661,89661,89661,89661,89661,89661
unique,3,89555,94,824,1003,4,1,77179
top,defect,ISPDB Database Entries,Core,General,nobody,NEW,---,2023-02-09 07:40:14
freq,55676,12,10000,12951,80538,59910,89661,128


In [11]:
# Count the distinct values in each column of 'mozilla_dataset'.
# The result is a Series indexed by column name, useful for spotting
# constant columns and identifier-like columns.
mozilla_dataset.nunique()

Bug ID        89661
Type              3
Summary       89555
Product          94
Component       824
Assignee       1003
Status            4
Resolution        1
Updated       77179
dtype: int64

**Show the unique values for type column**

In [12]:
# Collect the distinct values of the 'Type' column as an array
type_unique_values = mozilla_dataset['Type'].unique()

# Print that array
print(f"Unique values in the 'Type' column: {type_unique_values}")

Unique values in the 'Type' column: ['defect' 'enhancement' 'task']


**Show the unique values for status column**

In [13]:
# Collect the distinct values of the 'Status' column as an array
status_unique_values = mozilla_dataset['Status'].unique()

# Print that array
print(f"Unique values in the 'Status' column: {status_unique_values}")

Unique values in the 'Status' column: ['UNCONFIRMED' 'NEW' 'ASSIGNED' 'REOPENED']


In [14]:
'''
Bug ID is unique for each row ==> remove it
Resolution is the same for all rows ==> remove it
Type, status, product, component, updated don't affect our task
'''

"\nBug ID is unique for each column ==> remove it\nResolution is the same for all rows ==> remove it\nType, status, product, component, updated don't affect our task\n"

**Drop unnecessary columns**

In [4]:
# Columns that are not needed downstream: 'Bug ID' is unique per row, 'Resolution' holds a
# single value, and the remaining ones are not used for the task.
columns_to_drop = [
    'Bug ID',
    'Type',
    'Product',
    'Component',
    'Status',
    'Resolution',
    'Updated',
]

# Build a new DataFrame without those columns and rebind 'mozilla_dataset' to it
mozilla_dataset = mozilla_dataset.drop(columns_to_drop, axis='columns')

In [5]:
# Show the column labels that remain in 'mozilla_dataset' after the drop
print(mozilla_dataset.columns)

Index(['Summary', 'Assignee'], dtype='object')


**Generate descriptive statistics of the remaining columns**

In [6]:
# Generate descriptive statistics (count, unique, top, freq) for the remaining columns.
# include="O" restricts the summary to object-dtype columns.
mozilla_dataset.describe(include = "O")

Unnamed: 0,Summary,Assignee
count,89661,89661
unique,89555,1003
top,ISPDB Database Entries,nobody
freq,12,80538


**Show the number of nulls in each column**

In [21]:
# Print the number of missing values (NaN) in each column of 'mozilla_dataset'.
# 'isnull()' marks the missing cells and 'sum()' counts them per column.
print(mozilla_dataset.isnull().sum())

Summary     0
Assignee    0
dtype: int64


**Print the minimum number of occurrences of any value in the Assignee column**

In [22]:
# Find how often the rarest assignee appears.
# 'value_counts()' counts the rows per distinct 'Assignee' value; 'min()' takes the smallest count.
min_occurrences = mozilla_dataset['Assignee'].value_counts().min()

# Print the smallest per-assignee row count
print(f"The minimum number of occurrences in owner column is {min_occurrences}")

The minimum number of occurrences in owner column is 1


**Filter the dataset so that each Assignee occurs at least 5 times**

In [7]:
# Count how many rows each distinct 'Assignee' value has.
# The column label must be exactly 'Assignee'; a misspelled label raises KeyError.
value_counts = mozilla_dataset['Assignee'].value_counts()

# Keep only the rows whose assignee appears at least 5 times:
# 'value_counts[value_counts >= 5].index' holds the sufficiently frequent assignees and
# 'isin(...)' builds the boolean row mask from it.
mozilla_dataset = mozilla_dataset[mozilla_dataset['Assignee'].isin(value_counts[value_counts >= 5].index)]

In [8]:
# Re-check how often the rarest assignee appears now that infrequent assignees were removed.
# 'value_counts()' counts the rows per distinct 'Assignee' value; 'min()' takes the smallest count.
min_occurrences = mozilla_dataset['Assignee'].value_counts().min()

# The message names the filtering step so this line can be told apart from the
# identical check that runs before the filter.
print(f"The minimum number of occurrences in owner column after filteration is {min_occurrences}")

The minimum number of occurrences in owner column after filteration is 5


In [9]:
# Show the dimensions of 'mozilla_dataset' as a (rows, columns) tuple after the frequency filter
print(mozilla_dataset.shape)

(88483, 2)


In [29]:
# Count the distinct values in each remaining column of 'mozilla_dataset'.
# The result is a Series indexed by column name.
mozilla_dataset.nunique()

Summary     88381
Assignee      304
dtype: int64

**Show the values of Assignee column**

In [30]:
# Gather the distinct values of the 'Assignee' column as an array
assignee_unique_values = mozilla_dataset['Assignee'].unique()

# Emit a header line, then one assignee per line
print("Unique values in the 'Assignee' column:")
for assignee_value in assignee_unique_values:
    print(assignee_value)

Unique values in the 'Assignee' column:
nobody
hmanilla
jimb
poirot.alex
gl
jdescottes
jwatt
daisuke
ayeddi
emilio
kaie
nchevobbe
gijskruitbosch+bugs
continuation
csabou
dlee
martin
arai.unmht
mcs
bugzeeeeee
jboek
kkaya
m_kato
zmckenney
dschubert
mtighe
rob
jcristau
timeless
cdenizet
tthibaud
gregp
jackyzy823
nalexander
bugzillamozillaorg_serge_20140323
hikezoe.birchill
bwerth
krosylight
valentin.gosu
gsvelto
haftandilian
general
gwatson
jmuizelaar
lissyx+mozillians
mozilla
sdowne
chutten
dmosedale
emcminn
gsuntop
halemu
jprickett
mathieu
mconley
nsauermann
pdahiya
scunnane
edilee
enordin
imani
jteow
mcheang
mstriemer
nrishel
nsharpley
rhelmer
sgalich
standard8
teshaq
cgeorgiu
enndeakin
mpohle
ssachdev
choller
jhugman
jneuberger
joschmidt
afranchuk
hsohaney
jules
pmcmanis
ajvincent
brosa
padenot
tomica
wdurand
aryx.bugmail
mh+mozilla
neil
dao+bmo
rpierzina
lgreco
mak
mtigley
bugzilla
itiel_yn8
sphink
mstange.moz
jmaher
zeid
afinder
aglavic
gmierz2
bacasandrei
tom
wptsync
gbrown
james
k

**Remove rows where the 'Assignee' column has the value 'nobody'**

In [10]:
# Keep only the rows whose 'Assignee' differs from the literal 'nobody'.
# 'ne' is the element-wise "not equal" comparison and yields the boolean row mask.
mozilla_dataset = mozilla_dataset[mozilla_dataset['Assignee'].ne('nobody')]

In [11]:
# Show the dimensions of 'mozilla_dataset' as a (rows, columns) tuple after removing 'nobody' rows
print(mozilla_dataset.shape)

(7945, 2)


**Clean the text in the Summary column**

In [12]:
def preprocess_summary(summary):
    """
    Clean a bug summary string.

    The steps run in this order:
      1. delete every character that is neither a word character nor whitespace,
      2. replace each newline with a single space,
      3. delete every token that starts with 'http' and has at least one more
         non-whitespace character.

    Because step 1 runs first, a URL reaches step 3 already stripped of its
    punctuation (e.g. 'http://a.com/b' has become 'httpacomb') and is removed
    in that form.

    Parameters:
    summary (str): The summary string to clean.

    Returns:
    str: The cleaned summary string.
    """
    # Step 1: strip punctuation and other special characters (underscores count as word characters)
    without_special_chars = re.sub(r'[^\w\s]', '', summary)

    # Step 2: join the lines with a space so the text ends up on a single line
    single_line = ' '.join(without_special_chars.split('\n'))

    # Step 3: drop the tokens that begin with 'http'
    return re.sub(r'http\S+', '', single_line)

In [13]:
# Clean the 'Summary' column of 'mozilla_dataset'.
# 'apply(preprocess_summary)' runs the cleaning function on every summary.
# 'mozilla_dataset' was produced by boolean indexing, so assigning to one of its columns in
# place can raise SettingWithCopyWarning; 'assign' returns a new DataFrame with the replaced
# column instead, and the name is rebound to that result.
mozilla_dataset = mozilla_dataset.assign(Summary=mozilla_dataset['Summary'].apply(preprocess_summary))

**Filter the dataset by the number of words in Summary column**

In [14]:
def filter_by_word_count(df, min_word_count):
    """
    Keep the rows of 'df' whose 'Summary' has at least 'min_word_count' words.

    Words are the whitespace-separated tokens of the summary.

    Parameters:
    df (DataFrame): The input DataFrame; it must contain a 'Summary' column.
    min_word_count (int): The minimum number of words a summary needs to be kept.

    Returns:
    DataFrame: The rows of 'df' that meet the word-count threshold, with their original index labels.
    """
    # Number of whitespace-separated tokens in each summary
    words_per_summary = df['Summary'].str.split().str.len()

    # Boolean mask marking the summaries that are long enough
    long_enough = words_per_summary >= min_word_count

    return df[long_enough]

In [15]:
# Summaries with fewer whitespace-separated words than this are discarded
min_word_count = 5

# Apply the word-count filter and rebind 'mozilla_dataset' to the filtered result
mozilla_dataset = filter_by_word_count(df=mozilla_dataset, min_word_count=min_word_count)

In [16]:
# Show the dimensions of 'mozilla_dataset' as a (rows, columns) tuple after the word-count filter
print(mozilla_dataset.shape)

(7291, 2)


**Save cleaned data**

In [17]:
# Get the current working directory
current_dir = os.getcwd() 

# Path of the output CSV file, relative to the current working directory
relative_path = os.path.join('..', 'data','mozilla_dataset' ,'cleaned_mozilla_dataset.csv')
# Save the DataFrame 'mozilla_dataset' to a CSV file at that path.
# The relative path is joined onto the working directory to form the target location.
mozilla_dataset.to_csv(os.path.join(current_dir, relative_path), index=False) # index=False keeps the DataFrame index out of the CSV file

In [None]:
'''
When exporting a DataFrame to CSV with the index included (index=True, which is the default), 
Pandas includes the index values as an additional column in the CSV file. 
If this index column is not needed for analysis or is redundant (especially if the index is just an integer sequence), 
excluding it can make the CSV file cleaner and more aligned with typical CSV data formats.
'''