**Imports**

In [27]:
import os
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

**Download necessary NLTK data**

In [28]:
# Tokenizer models used by word_tokenize.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt models from the 'punkt_tab'
# resource instead; without it word_tokenize raises LookupError on a fresh
# environment. On older releases this download simply reports that the
# package is unknown and returns False, without raising.
nltk.download('punkt_tab')
# English stop-word list used by the stop-word removal step.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

**Initialize the stemmer and stop words**

In [29]:
# English stop words held in a set so membership checks are constant-time.
stop_words = set(stopwords.words('english'))

# A single Porter stemmer instance shared by the stemming step.
stemmer = PorterStemmer()

**Load the data**

In [2]:
# Read the three pre-cleaned bug-report exports (paths are relative to the
# notebook's working directory).
kaggle_data_set, mozilla_data_set, eclipse_data_set = map(
    pd.read_csv,
    (
        'cleaned_kaggle_dataset.csv',
        'cleaned_mozilla_dataset.csv',
        'cleaned_eclipse_dataset.csv',
    ),
)

**Show the data**

In [3]:
kaggle_data_set

Unnamed: 0,owner,Summary
0,amit@chromium.org,Scrolling with some scroll mice touchpad etc s...
1,jon@chromium.org,Proxy causes some or all network requests to f...
2,pfeldman@chromium.org,Web inspector button dock to main window does ...
3,jon@chromium.org,Habari admin interface is not rendered correct...
4,pkasting@chromium.org,Maximize on second larger monitor not working ...
...,...,...
116090,navabi@chromium.org,Launch clank_qa recipes to the waterfall We ha...
116091,bulach@chromium.org,data race in ThreadWatcherListTest r255322 is ...
116092,pfeldman@chromium.org,windowconsole object should not be configurabl...
116093,ernstm@chromium.org,Windows GPU bots failing on multiple tests All...


In [4]:
mozilla_data_set

Unnamed: 0,Summary,Assignee
0,Allow inspecting a logged DOM nodes tree vs th...,hmanilla
1,Display response for multipart content type,hmanilla
2,3934 463 damp customprettyprintjsdebuggerrelo...,hmanilla
3,Devtools go into infinite loop after creating ...,jimb
4,Allow hotreloading stylesheets for temporary i...,poirot.alex
...,...,...
7286,Run event timestamp experiment on Android,jrediger
7287,Speed up table deploys in bqetl_artifact_deplo...,ascholtz
7288,Implement Glean Server Knobs for mobile products,brosa
7289,Airflow task bqetl_firefox_ioschecks__fail_fir...,kignasiak


In [5]:
eclipse_data_set

Unnamed: 0,Assignee,Summary
0,sphinx-inbox,Usage of HashSet and HashMap does not preserve...
1,sphinx-inbox,Add support for scheduling rules in the dynami...
2,sphinx-inbox,Class loading problem during dynamic workflow ...
3,sphinx-inbox,BasicTransactionalFormEditor addPages method i...
4,sphinx-inbox,Make Class loading in BasicWorkflowRunnerOpera...
...,...,...
247992,kmunir,Change action for Filters should be removed
247993,mober.at+eclipse,RSE Project Filters show up as ViewFilterRSETe...
247994,mober.at+eclipse,typo on RSE Tutorials help page
247995,mober.at+eclipse,Need to update the year of copyright in Univer...


**Rename 'owner' column in kaggle dataset to be 'Assignee'**

In [6]:
# Rebind to the renamed frame instead of mutating in place: the result is the
# same, but the cell no longer silently changes an object that an earlier
# cell already displayed, and it stays safe to re-run.
kaggle_data_set = kaggle_data_set.rename(columns={'owner': 'Assignee'})

In [7]:
kaggle_data_set

Unnamed: 0,Assignee,Summary
0,amit@chromium.org,Scrolling with some scroll mice touchpad etc s...
1,jon@chromium.org,Proxy causes some or all network requests to f...
2,pfeldman@chromium.org,Web inspector button dock to main window does ...
3,jon@chromium.org,Habari admin interface is not rendered correct...
4,pkasting@chromium.org,Maximize on second larger monitor not working ...
...,...,...
116090,navabi@chromium.org,Launch clank_qa recipes to the waterfall We ha...
116091,bulach@chromium.org,data race in ThreadWatcherListTest r255322 is ...
116092,pfeldman@chromium.org,windowconsole object should not be configurabl...
116093,ernstm@chromium.org,Windows GPU bots failing on multiple tests All...


**Reorder the columns of the kaggle and eclipse datasets to be 'Summary' then 'Assignee'**

In [9]:
# Select the columns by name rather than swapping positions 0 and 1.
# A positional swap flips the order back if the cell is run a second time;
# naming the target order makes the cell idempotent.
cols = ['Summary', 'Assignee']

# Reorder the DataFrame
kaggle_data_set = kaggle_data_set[cols]

In [11]:
kaggle_data_set

Unnamed: 0,Summary,Assignee
0,Scrolling with some scroll mice touchpad etc s...,amit@chromium.org
1,Proxy causes some or all network requests to f...,jon@chromium.org
2,Web inspector button dock to main window does ...,pfeldman@chromium.org
3,Habari admin interface is not rendered correct...,jon@chromium.org
4,Maximize on second larger monitor not working ...,pkasting@chromium.org
...,...,...
116090,Launch clank_qa recipes to the waterfall We ha...,navabi@chromium.org
116091,data race in ThreadWatcherListTest r255322 is ...,bulach@chromium.org
116092,windowconsole object should not be configurabl...,pfeldman@chromium.org
116093,Windows GPU bots failing on multiple tests All...,ernstm@chromium.org


In [10]:
# Select the columns by name rather than swapping positions 0 and 1.
# A positional swap flips the order back if the cell is run a second time;
# naming the target order makes the cell idempotent.
cols = ['Summary', 'Assignee']

# Reorder the DataFrame
eclipse_data_set = eclipse_data_set[cols]

In [12]:
eclipse_data_set

Unnamed: 0,Summary,Assignee
0,Usage of HashSet and HashMap does not preserve...,sphinx-inbox
1,Add support for scheduling rules in the dynami...,sphinx-inbox
2,Class loading problem during dynamic workflow ...,sphinx-inbox
3,BasicTransactionalFormEditor addPages method i...,sphinx-inbox
4,Make Class loading in BasicWorkflowRunnerOpera...,sphinx-inbox
...,...,...
247992,Change action for Filters should be removed,kmunir
247993,RSE Project Filters show up as ViewFilterRSETe...,mober.at+eclipse
247994,typo on RSE Tutorials help page,mober.at+eclipse
247995,Need to update the year of copyright in Univer...,mober.at+eclipse


**Merge the 3 datasets into one dataset**

In [13]:
# Stack the three sources row-wise; ignore_index=True discards the per-source
# row labels and numbers the combined rows from 0.
dataset = pd.concat(
    objs=[kaggle_data_set, eclipse_data_set, mozilla_data_set],
    ignore_index=True,
)

In [14]:
dataset.shape

(371383, 2)

**Check if there are duplicates**

In [15]:
dataset.duplicated().any()

True

**Drop duplicate rows**

In [16]:
# Drop rows that are exact duplicates across all columns, keeping the first
# occurrence. The surviving rows keep their original index labels.
# NOTE(review): this runs before clean_text is applied to 'Summary', so rows
# that only become identical after cleaning are not removed here.
dataset = dataset.drop_duplicates()

In [17]:
dataset.duplicated().any()

False

In [18]:
dataset.shape

(369741, 2)

**Remove sources of noise**

In [20]:
def clean_text(text):
    """Strip hyperlinks, line breaks and special characters from a summary.

    Parameters
    ----------
    text : str
        Raw summary text.

    Returns
    -------
    str
        The text with URLs removed, line breaks and tabs turned into spaces,
        and every character other than ASCII letters, digits and spaces
        dropped.
    """
    # Remove hyperlinks. Require an actual scheme ("http://", "https://") or a
    # "www." prefix at a word boundary so that ordinary words which merely
    # start with "http"/"www" (e.g. "httpclient", "https") are kept.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

    # Turn line breaks and other whitespace control characters into a space.
    # Deleting them outright (as was done for '\r') or leaving them to the
    # special-character pass below (tabs) glues neighbouring words together.
    text = re.sub(r'\r\n|[\r\n\t\f\v]', ' ', text)

    # Remove special characters (keep only alphanumeric characters and spaces)
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)

    return text

In [21]:
# Clean every summary element-wise and store the result back in 'Summary'.
dataset['Summary'] = dataset['Summary'].map(clean_text)

**Show the number of nulls in each column**

In [22]:
# Count missing values per column of the merged dataset and print the totals.
# Summaries that are empty strings are not counted as missing here.
null_counts = dataset.isna().sum()
print(null_counts)

Summary     0
Assignee    0
dtype: int64


In [23]:
'''
There is no nulls
'''

'\nThere is no nulls\n'

**Remove rows with fewer than 10 words in their 'Summary' column**

In [24]:
def filter_by_word_count(df, min_word_count):
    """Keep only the rows whose 'Summary' has at least ``min_word_count`` words.

    Words are counted by splitting 'Summary' on whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string 'Summary' column.
    min_word_count : int
        Minimum number of whitespace-separated words a summary must contain.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, with their original index
        labels preserved.
    """
    word_counts = df['Summary'].str.split().str.len()
    # Return an explicit copy: the notebook adds new columns to the result,
    # and doing that on a boolean-indexed slice can trigger pandas'
    # SettingWithCopyWarning.
    return df[word_counts >= min_word_count].copy()

In [25]:
# Summaries shorter than this many words are discarded.
min_word_count = 10

# Keep only the rows whose 'Summary' reaches the minimum word count.
dataset = filter_by_word_count(df=dataset, min_word_count=min_word_count)

In [26]:
dataset.shape

(199302, 2)

**Apply Tokenization on 'Summary' column**

In [30]:
def tokenize_summary(text):
    """Split a summary string into tokens using NLTK's ``word_tokenize``.

    Parameters
    ----------
    text : str
        Summary text to tokenize.

    Returns
    -------
    list
        The tokens produced by ``word_tokenize`` for ``text``.
    """
    tokens = word_tokenize(text)
    return tokens

In [31]:
# Tokenize each summary and keep the token lists in a new 'Summary_Tokens' column.
dataset['Summary_Tokens'] = dataset['Summary'].map(tokenize_summary)

In [32]:
dataset

Unnamed: 0,Summary,Assignee,Summary_Tokens
0,Scrolling with some scroll mice touchpad etc s...,amit@chromium.org,"[Scrolling, with, some, scroll, mice, touchpad..."
1,Proxy causes some or all network requests to f...,jon@chromium.org,"[Proxy, causes, some, or, all, network, reques..."
2,Web inspector button dock to main window does ...,pfeldman@chromium.org,"[Web, inspector, button, dock, to, main, windo..."
3,Habari admin interface is not rendered correct...,jon@chromium.org,"[Habari, admin, interface, is, not, rendered, ..."
4,Maximize on second larger monitor not working ...,pkasting@chromium.org,"[Maximize, on, second, larger, monitor, not, w..."
...,...,...,...
371370,Remove manual page load events from the Glean ...,brosa,"[Remove, manual, page, load, events, from, the..."
371372,redirects to main AMO homepage instead of rev...,wezhou,"[redirects, to, main, AMO, homepage, instead, ..."
371373,Add git shortref in deployment messages to clo...,sven,"[Add, git, shortref, in, deployment, messages,..."
371376,sign testpilot and system addons for Fx 58 wit...,u581815,"[sign, testpilot, and, system, addons, for, Fx..."


**Remove stop words**

In [33]:
# Initialize the stop words
# NOTE(review): the same assignment already appears in the earlier
# "Initialize the stemmer and stop words" cell; repeating it here is harmless
# but redundant.
stop_words = set(stopwords.words('english'))

In [34]:
def remove_stopwords(tokens):
    """Return the tokens that are not stop words.

    The comparison is case-insensitive (each token is lower-cased before the
    lookup in the module-level ``stop_words`` set), but the tokens that are
    kept retain their original casing and order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one summary.

    Returns
    -------
    list of str
        The tokens whose lower-cased form is not in ``stop_words``.
    """
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

In [35]:
# Drop stop words from each token list; results go to 'Summary_Cleaned'.
dataset['Summary_Cleaned'] = dataset['Summary_Tokens'].map(remove_stopwords)

In [36]:
dataset

Unnamed: 0,Summary,Assignee,Summary_Tokens,Summary_Cleaned
0,Scrolling with some scroll mice touchpad etc s...,amit@chromium.org,"[Scrolling, with, some, scroll, mice, touchpad...","[Scrolling, scroll, mice, touchpad, etc, scrol..."
1,Proxy causes some or all network requests to f...,jon@chromium.org,"[Proxy, causes, some, or, all, network, reques...","[Proxy, causes, network, requests, fail, Produ..."
2,Web inspector button dock to main window does ...,pfeldman@chromium.org,"[Web, inspector, button, dock, to, main, windo...","[Web, inspector, button, dock, main, window, n..."
3,Habari admin interface is not rendered correct...,jon@chromium.org,"[Habari, admin, interface, is, not, rendered, ...","[Habari, admin, interface, rendered, correctly..."
4,Maximize on second larger monitor not working ...,pkasting@chromium.org,"[Maximize, on, second, larger, monitor, not, w...","[Maximize, second, larger, monitor, working, P..."
...,...,...,...,...
371370,Remove manual page load events from the Glean ...,brosa,"[Remove, manual, page, load, events, from, the...","[Remove, manual, page, load, events, Glean, De..."
371372,redirects to main AMO homepage instead of rev...,wezhou,"[redirects, to, main, AMO, homepage, instead, ...","[redirects, main, AMO, homepage, instead, revi..."
371373,Add git shortref in deployment messages to clo...,sven,"[Add, git, shortref, in, deployment, messages,...","[Add, git, shortref, deployment, messages, clo..."
371376,sign testpilot and system addons for Fx 58 wit...,u581815,"[sign, testpilot, and, system, addons, for, Fx...","[sign, testpilot, system, addons, Fx, 58, PKCS..."


**Apply stemming**

In [37]:
def stem_tokens(tokens):
    """Stem every token with the module-level ``stemmer``.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one summary.

    Returns
    -------
    list
        One stem per input token, in the same order.
    """
    return list(map(stemmer.stem, tokens))

In [38]:
# Stem each cleaned token list; results go to 'Summary_Stemmed'.
dataset['Summary_Stemmed'] = dataset['Summary_Cleaned'].map(stem_tokens)

In [39]:
dataset

Unnamed: 0,Summary,Assignee,Summary_Tokens,Summary_Cleaned,Summary_Stemmed
0,Scrolling with some scroll mice touchpad etc s...,amit@chromium.org,"[Scrolling, with, some, scroll, mice, touchpad...","[Scrolling, scroll, mice, touchpad, etc, scrol...","[scroll, scroll, mice, touchpad, etc, scroll, ..."
1,Proxy causes some or all network requests to f...,jon@chromium.org,"[Proxy, causes, some, or, all, network, reques...","[Proxy, causes, network, requests, fail, Produ...","[proxi, caus, network, request, fail, product,..."
2,Web inspector button dock to main window does ...,pfeldman@chromium.org,"[Web, inspector, button, dock, to, main, windo...","[Web, inspector, button, dock, main, window, n...","[web, inspector, button, dock, main, window, n..."
3,Habari admin interface is not rendered correct...,jon@chromium.org,"[Habari, admin, interface, is, not, rendered, ...","[Habari, admin, interface, rendered, correctly...","[habari, admin, interfac, render, correctli, p..."
4,Maximize on second larger monitor not working ...,pkasting@chromium.org,"[Maximize, on, second, larger, monitor, not, w...","[Maximize, second, larger, monitor, working, P...","[maxim, second, larger, monitor, work, product..."
...,...,...,...,...,...
371370,Remove manual page load events from the Glean ...,brosa,"[Remove, manual, page, load, events, from, the...","[Remove, manual, page, load, events, Glean, De...","[remov, manual, page, load, event, glean, debu..."
371372,redirects to main AMO homepage instead of rev...,wezhou,"[redirects, to, main, AMO, homepage, instead, ...","[redirects, main, AMO, homepage, instead, revi...","[redirect, main, amo, homepag, instead, review..."
371373,Add git shortref in deployment messages to clo...,sven,"[Add, git, shortref, in, deployment, messages,...","[Add, git, shortref, deployment, messages, clo...","[add, git, shortref, deploy, messag, cloudop, ..."
371376,sign testpilot and system addons for Fx 58 wit...,u581815,"[sign, testpilot, and, system, addons, for, Fx...","[sign, testpilot, system, addons, Fx, 58, PKCS...","[sign, testpilot, system, addon, fx, 58, pkcs7..."


**Save the 'Summary_Stemmed' and 'Assignee' columns to a CSV file**

In [40]:
# Destination file and the columns (in output order) to persist.
output_file = 'dataset_after_preprocessing.csv'
columns_to_save = ['Summary_Stemmed', 'Assignee']

# Write only the selected columns, without the row index.
# NOTE(review): 'Summary_Stemmed' holds Python lists, which presumably end up
# in the CSV as their string representation - confirm that downstream readers
# parse them back into lists.
dataset.to_csv(output_file, columns=columns_to_save, index=False)