**Imports**

In [1]:
import os
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

**Download necessary NLTK data**

In [2]:
# Fetch the 'punkt' tokenizer data used by nltk.tokenize.word_tokenize
# (nltk.download only reports "already up-to-date" when the package is present locally)
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead — confirm against the installed version
nltk.download('punkt')

# Fetch the stopwords corpus that is read later through stopwords.words('english')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/donia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/donia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Initialize the stemmer and stop words**

In [3]:
# Create a single PorterStemmer instance; stem_tokens() reuses it for every token
stemmer = PorterStemmer()

# Build a set of NLTK's English stopwords so remove_stopwords() can do fast membership tests
# (the same assignment is repeated in the "Remove stop words" section further down)
stop_words = set(stopwords.words('english'))

**Load the data**

In [4]:
# Read the cleaned Kaggle dataset from the CSV file into a Pandas DataFrame
# (displayed below with the columns 'owner' and 'Summary')
kaggle_data_set = pd.read_csv('../data/kaggle_dataset/cleaned_kaggle_dataset.csv')

# Read the cleaned Mozilla dataset from the CSV file into a Pandas DataFrame
# (displayed below with the columns 'Summary' and 'Assignee')
mozilla_data_set = pd.read_csv('../data/mozilla_dataset/cleaned_mozilla_dataset.csv')

# Read the cleaned Eclipse dataset from the CSV file into a Pandas DataFrame
# (displayed below with the columns 'Assignee' and 'Summary')
eclipse_data_set = pd.read_csv('../data/eclipse_dataset/cleaned_eclipse_dataset.csv')

**Show the data**

In [5]:
# Display the DataFrame 'kaggle_data_set' exactly as loaded (before renaming or reordering columns)
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
kaggle_data_set 

Unnamed: 0,owner,Summary
0,amit@chromium.org,Scrolling with some scroll mice touchpad etc s...
1,jon@chromium.org,Proxy causes some or all network requests to fail
2,pfeldman@chromium.org,Web inspector button dock to main window does ...
3,jon@chromium.org,Habari admin interface is not rendered correctly
4,pkasting@chromium.org,Maximize on second larger monitor not working
...,...,...
100042,cyrusm@chromium.org,Bypassing policies set by removing battery can...
100043,mlchan@chromium.org,Chrome OS Eureka setup UI is not a 100 visibl...
100044,lottie@chromium.org,Broken Link on Web Store Upload Page
100045,mario.pr...@samsung.com,Weird overlapping of text Friend observed in C...


In [6]:
# Display the DataFrame 'mozilla_data_set' exactly as loaded
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
mozilla_data_set

Unnamed: 0,Summary,Assignee
0,Allow inspecting a logged DOM nodes tree vs th...,hmanilla
1,Display response for multipart content type,hmanilla
2,3934 463 damp customprettyprintjsdebuggerrelo...,hmanilla
3,Devtools go into infinite loop after creating ...,jimb
4,Allow hotreloading stylesheets for temporary i...,poirot.alex
...,...,...
7286,Run event timestamp experiment on Android,jrediger
7287,Speed up table deploys in bqetl_artifact_deplo...,ascholtz
7288,Implement Glean Server Knobs for mobile products,brosa
7289,Airflow task bqetl_firefox_ioschecks__fail_fir...,kignasiak


In [7]:
# Display the DataFrame 'eclipse_data_set' exactly as loaded (before reordering columns)
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
eclipse_data_set

Unnamed: 0,Assignee,Summary
0,sphinx-inbox,Usage of HashSet and HashMap does not preserve...
1,sphinx-inbox,Add support for scheduling rules in the dynami...
2,sphinx-inbox,Class loading problem during dynamic workflow ...
3,sphinx-inbox,BasicTransactionalFormEditor addPages method i...
4,sphinx-inbox,Make Class loading in BasicWorkflowRunnerOpera...
...,...,...
247992,kmunir,Change action for Filters should be removed
247993,mober.at+eclipse,RSE Project Filters show up as ViewFilterRSETe...
247994,mober.at+eclipse,typo on RSE Tutorials help page
247995,mober.at+eclipse,Need to update the year of copyright in Univer...


**Rename 'owner' column in kaggle dataset to be 'Assignee'**

In [5]:
# Rename the 'owner' column of 'kaggle_data_set' to 'Assignee' so the Kaggle frame uses
# the same column names as the Mozilla and Eclipse frames.
# rename() returns a new DataFrame, so the result is assigned back instead of mutating the
# frame with inplace=True. Re-running the cell is a harmless no-op because rename() ignores
# labels that are not present.
kaggle_data_set = kaggle_data_set.rename(columns={'owner': 'Assignee'})

In [9]:
# Display the DataFrame 'kaggle_data_set' to confirm that 'owner' now appears as 'Assignee'
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
kaggle_data_set 

Unnamed: 0,Assignee,Summary
0,amit@chromium.org,Scrolling with some scroll mice touchpad etc s...
1,jon@chromium.org,Proxy causes some or all network requests to fail
2,pfeldman@chromium.org,Web inspector button dock to main window does ...
3,jon@chromium.org,Habari admin interface is not rendered correctly
4,pkasting@chromium.org,Maximize on second larger monitor not working
...,...,...
100042,cyrusm@chromium.org,Bypassing policies set by removing battery can...
100043,mlchan@chromium.org,Chrome OS Eureka setup UI is not a 100 visibl...
100044,lottie@chromium.org,Broken Link on Web Store Upload Page
100045,mario.pr...@samsung.com,Weird overlapping of text Friend observed in C...


**Reorder the columns in the kaggle and eclipse datasets to be 'Summary' then 'Assignee'**

In [6]:
# Desired leading column order, chosen by label rather than by position.
# Swapping list positions 0 and 1 would swap the columns back whenever the cell is
# re-run; selecting by name keeps the cell idempotent and raises a KeyError if either
# expected column is missing.
leading = ['Summary', 'Assignee']

# Keep any additional columns after the two leading ones, in their existing order
cols = leading + [col for col in kaggle_data_set.columns if col not in leading]

# Reorder the DataFrame 'kaggle_data_set' based on the explicit list of column names
kaggle_data_set = kaggle_data_set[cols]

In [11]:
# Display the DataFrame 'kaggle_data_set' to confirm the column order is now 'Summary', 'Assignee'
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
kaggle_data_set 

Unnamed: 0,Summary,Assignee
0,Scrolling with some scroll mice touchpad etc s...,amit@chromium.org
1,Proxy causes some or all network requests to fail,jon@chromium.org
2,Web inspector button dock to main window does ...,pfeldman@chromium.org
3,Habari admin interface is not rendered correctly,jon@chromium.org
4,Maximize on second larger monitor not working,pkasting@chromium.org
...,...,...
100042,Bypassing policies set by removing battery can...,cyrusm@chromium.org
100043,Chrome OS Eureka setup UI is not a 100 visibl...,mlchan@chromium.org
100044,Broken Link on Web Store Upload Page,lottie@chromium.org
100045,Weird overlapping of text Friend observed in C...,mario.pr...@samsung.com


In [7]:
# Desired leading column order, chosen by label rather than by position.
# Swapping list positions 0 and 1 would swap the columns back whenever the cell is
# re-run; selecting by name keeps the cell idempotent and raises a KeyError if either
# expected column is missing.
leading = ['Summary', 'Assignee']

# Keep any additional columns after the two leading ones, in their existing order
cols = leading + [col for col in eclipse_data_set.columns if col not in leading]

# Reorder the DataFrame 'eclipse_data_set' based on the explicit list of column names
eclipse_data_set = eclipse_data_set[cols]

In [13]:
# Display the DataFrame 'eclipse_data_set' to confirm the column order is now 'Summary', 'Assignee'
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
eclipse_data_set

Unnamed: 0,Summary,Assignee
0,Usage of HashSet and HashMap does not preserve...,sphinx-inbox
1,Add support for scheduling rules in the dynami...,sphinx-inbox
2,Class loading problem during dynamic workflow ...,sphinx-inbox
3,BasicTransactionalFormEditor addPages method i...,sphinx-inbox
4,Make Class loading in BasicWorkflowRunnerOpera...,sphinx-inbox
...,...,...
247992,Change action for Filters should be removed,kmunir
247993,RSE Project Filters show up as ViewFilterRSETe...,mober.at+eclipse
247994,typo on RSE Tutorials help page,mober.at+eclipse
247995,Need to update the year of copyright in Univer...,mober.at+eclipse


**Merge the 3 datasets into one dataset**

In [32]:
# Stack the DataFrames 'kaggle_data_set', 'eclipse_data_set', and 'mozilla_data_set' row-wise into a single DataFrame 'dataset'
# pd.concat aligns the frames on their column labels; ignore_index=True discards the per-frame
# indexes and numbers the combined rows with one fresh index
dataset = pd.concat([kaggle_data_set, eclipse_data_set, mozilla_data_set], ignore_index=True)

In [33]:
# Show the shape of the combined DataFrame 'dataset' right after concatenation
# 'shape' is a (number_of_rows, number_of_columns) tuple
dataset.shape

(355335, 2)

**Check if there are duplicates**

In [34]:
# Check whether 'dataset' contains any fully duplicated rows
# 'duplicated()' marks each row that repeats an earlier row (comparing all columns)
# 'any()' collapses those marks into a single True/False answer
dataset.duplicated().any()

True

**Drop duplicate rows**

In [35]:
# Remove duplicate rows from the DataFrame 'dataset'
# 'dataset.drop_duplicates()' compares all columns and returns a new frame without the repeated rows
# The index is not reset here, so the surviving rows keep the labels they had after concatenation
dataset = dataset.drop_duplicates()

In [36]:
# Re-check for fully duplicated rows now that drop_duplicates() has been applied
# 'duplicated()' marks each row that repeats an earlier row (comparing all columns)
# 'any()' collapses those marks into a single True/False answer
dataset.duplicated().any()

False

In [37]:
# Show the shape of 'dataset' after duplicate rows were removed
# 'shape' is a (number_of_rows, number_of_columns) tuple
dataset.shape

(353694, 2)

**Remove sources of noise**

In [38]:
def clean_text(text):
    """
    Clean and preprocess text data by removing hyperlinks, newlines, and special characters.

    Parameters:
    ----------
    text : str
        The input text to be cleaned. Missing values (None / NaN, as pandas produces
        for empty CSV cells) are treated as empty text, and any other non-string
        value is converted with str() before cleaning.

    Returns:
    -------
    str
        Cleaned text with hyperlinks, newlines, and special characters removed.
        Special characters are deleted rather than replaced by a space, so
        'use-after-free' becomes 'useafterfree'.
    """
    # Guard against non-string cells: re.sub raises TypeError on a float NaN
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Remove hyperlinks from the text using regular expressions.
    # 'http\S+' already covers 'https...' links, and no '^'/'$' anchors are used,
    # so neither a separate 'https' alternative nor re.MULTILINE is needed.
    text = re.sub(r'http\S+|www\S+', '', text)

    # Replace newline and carriage return characters with spaces
    text = text.replace('\n', ' ').replace('\r', ' ')

    # Remove special characters (keep only alphanumeric characters and spaces)
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)

    return text

In [39]:
# Run 'clean_text' on every value of the 'Summary' column and store the cleaned text back in that column
# (the original, uncleaned summaries are overwritten)
dataset['Summary'] = dataset['Summary'].apply(clean_text)

**Show the number of nulls in each column**

In [40]:
# Print the number of null values in each column of the DataFrame 'dataset'
# 'isnull()' flags every missing cell and 'sum()' totals those flags per column
print(dataset.isnull().sum())

Summary     0
Assignee    0
dtype: int64


In [None]:
# Observation from the null count above: no column contains null values.
# Enforce it with an assertion so a re-run on changed data fails loudly instead of
# relying on a free-text note that is never checked.
assert not dataset.isnull().values.any(), 'Unexpected null values found in dataset'

'\nThere is no nulls\n'

**Remove rows with fewer than 10 words in their 'Summary' column**

In [41]:
def filter_by_word_count(df, min_word_count, column='Summary'):
    """
    Filter rows in a DataFrame based on the word count in a text column.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame containing the text column to filter on.
    min_word_count : int
        The minimum number of whitespace-separated words required to retain a row.
    column : str, optional
        Name of the text column to measure. Defaults to 'Summary'.

    Returns:
    --------
    pandas.DataFrame
        An independent copy holding only the rows whose text meets the word count
        criteria. The original index labels of the retained rows are preserved.
    """
    # Split on whitespace and count the pieces; a missing value yields NaN,
    # and NaN >= min_word_count is False, so such rows are dropped
    word_counts = df[column].str.split().str.len()

    # Return a copy so that adding columns to the result later neither touches 'df'
    # nor triggers pandas' SettingWithCopyWarning
    return df[word_counts >= min_word_count].copy()

In [44]:
# Minimum number of whitespace-separated words a 'Summary' must contain to be kept
min_word_count = 10

# Keep only the rows of 'dataset' whose 'Summary' has at least min_word_count words
# (word counts are taken on the text already cleaned by clean_text)
dataset = filter_by_word_count(dataset, min_word_count)

In [45]:
# Show the shape of 'dataset' after short summaries were filtered out
# 'shape' is a (number_of_rows, number_of_columns) tuple
dataset.shape

(119919, 2)

**Apply Tokenization on 'Summary' column**

In [46]:
def tokenize_summary(text):
    """
    Lowercase a piece of text and split it into word tokens with NLTK.

    Parameters:
    -----------
    text : str
        The text to break into tokens.

    Returns:
    --------
    list
        The tokens produced by nltk's word_tokenize for the lowercased text.
    """
    # Normalise case first so that tokens compare equal regardless of capitalisation
    lowered = text.lower()

    # Hand the lowercased text to NLTK's word tokenizer
    tokens = word_tokenize(lowered)
    return tokens

In [47]:
# Run 'tokenize_summary' on every 'Summary' value and store the resulting token lists
# in a new column 'Summary_Tokens' (the 'Summary' column itself is left unchanged)
dataset['Summary_Tokens'] = dataset['Summary'].apply(tokenize_summary)

In [48]:
# Display the DataFrame 'dataset' to inspect the new 'Summary_Tokens' column
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
dataset

Unnamed: 0,Summary,Assignee,Summary_Tokens
0,Scrolling with some scroll mice touchpad etc s...,amit@chromium.org,"[scrolling, with, some, scroll, mice, touchpad..."
19,Add checks for items in the download panel in ...,achuith@chromium.org,"[add, checks, for, items, in, the, download, p..."
20,Useafterfree by navigating out a document duri...,tkent@chromium.org,"[useafterfree, by, navigating, out, a, documen..."
24,Cannot add an address properly in the AutoFil...,sky@chromium.org,"[can, not, add, an, address, properly, in, the..."
25,libxmlgyp should define LIBXMLSTATIC for itsel...,wtc@chromium.org,"[libxmlgyp, should, define, libxmlstatic, for,..."
...,...,...,...
355322,Remove manual page load events from the Glean ...,brosa,"[remove, manual, page, load, events, from, the..."
355324,redirects to main AMO homepage instead of rev...,wezhou,"[redirects, to, main, amo, homepage, instead, ..."
355325,Add git shortref in deployment messages to clo...,sven,"[add, git, shortref, in, deployment, messages,..."
355328,sign testpilot and system addons for Fx 58 wit...,u581815,"[sign, testpilot, and, system, addons, for, fx..."


**Remove stop words**

In [49]:
# Initialize a set of stop words using NLTK's stopwords for the English language
# (repeats the identical assignment made in the "Initialize the stemmer and stop words" section)
stop_words = set(stopwords.words('english'))

In [50]:
def remove_stopwords(tokens, stopword_set=None):
    """
    Remove stopwords and punctuation from a list of tokens.

    Parameters:
    -----------
    tokens : list
        A list of tokens (words) from which stopwords and punctuation will be removed.
    stopword_set : collection of str, optional
        The words to discard. When omitted, the notebook-level 'stop_words' set is used.

    Returns:
    --------
    list
        The tokens, in their original order, without stopwords and without tokens
        that consist solely of punctuation characters (empty tokens are dropped too).
    """
    # Look up the notebook-level set only when no explicit collection is supplied
    excluded = stop_words if stopword_set is None else stopword_set

    # string.punctuation is a str, so 'word in string.punctuation' would be a substring
    # test that lets multi-character tokens such as '...' or '``' through.
    # Checking each character against a set removes every punctuation-only token.
    punctuation_chars = set(string.punctuation)

    return [
        word
        for word in tokens
        if word not in excluded
        and not all(char in punctuation_chars for char in word)
    ]

In [51]:
# Run 'remove_stopwords' on every token list in 'Summary_Tokens' and store the filtered
# lists in a new column 'Summary_Cleaned'
dataset['Summary_Cleaned'] = dataset['Summary_Tokens'].apply(remove_stopwords)

In [52]:
# Display the DataFrame 'dataset' to inspect the new 'Summary_Cleaned' column
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
dataset

Unnamed: 0,Summary,Assignee,Summary_Tokens,Summary_Cleaned
0,Scrolling with some scroll mice touchpad etc s...,amit@chromium.org,"[scrolling, with, some, scroll, mice, touchpad...","[scrolling, scroll, mice, touchpad, etc, scrolls]"
19,Add checks for items in the download panel in ...,achuith@chromium.org,"[add, checks, for, items, in, the, download, p...","[add, checks, items, download, panel, browser,..."
20,Useafterfree by navigating out a document duri...,tkent@chromium.org,"[useafterfree, by, navigating, out, a, documen...","[useafterfree, navigating, document, form, val..."
24,Cannot add an address properly in the AutoFil...,sky@chromium.org,"[can, not, add, an, address, properly, in, the...","[add, address, properly, autofill, options, di..."
25,libxmlgyp should define LIBXMLSTATIC for itsel...,wtc@chromium.org,"[libxmlgyp, should, define, libxmlstatic, for,...","[libxmlgyp, define, libxmlstatic, direct, depe..."
...,...,...,...,...
355322,Remove manual page load events from the Glean ...,brosa,"[remove, manual, page, load, events, from, the...","[remove, manual, page, load, events, glean, de..."
355324,redirects to main AMO homepage instead of rev...,wezhou,"[redirects, to, main, amo, homepage, instead, ...","[redirects, main, amo, homepage, instead, revi..."
355325,Add git shortref in deployment messages to clo...,sven,"[add, git, shortref, in, deployment, messages,...","[add, git, shortref, deployment, messages, clo..."
355328,sign testpilot and system addons for Fx 58 wit...,u581815,"[sign, testpilot, and, system, addons, for, fx...","[sign, testpilot, system, addons, fx, 58, pkcs..."


**Apply stemming**

In [53]:
def stem_tokens(tokens):
    """
    Reduce every token in a list to its stem.

    Parameters:
    -----------
    tokens : list
        The tokens (words) to stem.

    Returns:
    --------
    list
        A new list holding the stem of each token, in the same order.
    """
    stemmed = []
    for token in tokens:
        # Stem with the notebook-level Porter stemmer
        stemmed.append(stemmer.stem(token))
    return stemmed

In [54]:
# Run 'stem_tokens' on every token list in 'Summary_Cleaned' and store the stemmed
# lists in a new column 'Summary_Stemmed'
dataset['Summary_Stemmed'] = dataset['Summary_Cleaned'].apply(stem_tokens)

In [55]:
# Display the DataFrame 'dataset' to inspect the new 'Summary_Stemmed' column
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
dataset

Unnamed: 0,Summary,Assignee,Summary_Tokens,Summary_Cleaned,Summary_Stemmed
0,Scrolling with some scroll mice touchpad etc s...,amit@chromium.org,"[scrolling, with, some, scroll, mice, touchpad...","[scrolling, scroll, mice, touchpad, etc, scrolls]","[scroll, scroll, mice, touchpad, etc, scroll]"
19,Add checks for items in the download panel in ...,achuith@chromium.org,"[add, checks, for, items, in, the, download, p...","[add, checks, items, download, panel, browser,...","[add, check, item, download, panel, browser, t..."
20,Useafterfree by navigating out a document duri...,tkent@chromium.org,"[useafterfree, by, navigating, out, a, documen...","[useafterfree, navigating, document, form, val...","[useafterfre, navig, document, form, valid, me..."
24,Cannot add an address properly in the AutoFil...,sky@chromium.org,"[can, not, add, an, address, properly, in, the...","[add, address, properly, autofill, options, di...","[add, address, properli, autofil, option, dial..."
25,libxmlgyp should define LIBXMLSTATIC for itsel...,wtc@chromium.org,"[libxmlgyp, should, define, libxmlstatic, for,...","[libxmlgyp, define, libxmlstatic, direct, depe...","[libxmlgyp, defin, libxmlstat, direct, depend]"
...,...,...,...,...,...
355322,Remove manual page load events from the Glean ...,brosa,"[remove, manual, page, load, events, from, the...","[remove, manual, page, load, events, glean, de...","[remov, manual, page, load, event, glean, debu..."
355324,redirects to main AMO homepage instead of rev...,wezhou,"[redirects, to, main, amo, homepage, instead, ...","[redirects, main, amo, homepage, instead, revi...","[redirect, main, amo, homepag, instead, review..."
355325,Add git shortref in deployment messages to clo...,sven,"[add, git, shortref, in, deployment, messages,...","[add, git, shortref, deployment, messages, clo...","[add, git, shortref, deploy, messag, cloudop, ..."
355328,sign testpilot and system addons for Fx 58 wit...,u581815,"[sign, testpilot, and, system, addons, for, fx...","[sign, testpilot, system, addons, fx, 58, pkcs...","[sign, testpilot, system, addon, fx, 58, pkcs7..."


**Join tokens into a single string**

In [56]:
def join_tokens(tokens):
    """
    Concatenate tokens into one space-separated string.

    Parameters:
    -----------
    tokens : list
        The tokens (words) to combine.

    Returns:
    --------
    str
        All tokens in order, with a single space between neighbours
        (an empty list gives an empty string).
    """
    separator = ' '
    joined = separator.join(tokens)
    return joined

In [57]:
# Run 'join_tokens' on every token list in 'Summary_Stemmed' and store the resulting
# space-separated string in a new column 'processed_summary'
dataset['processed_summary'] = dataset['Summary_Stemmed'].apply(join_tokens)

In [58]:
# Display the DataFrame 'dataset' to inspect the new 'processed_summary' column
# For a long frame the notebook renders a truncated view: the first rows, an ellipsis row, and the last rows
dataset

Unnamed: 0,Summary,Assignee,Summary_Tokens,Summary_Cleaned,Summary_Stemmed,processed_summary
0,Scrolling with some scroll mice touchpad etc s...,amit@chromium.org,"[scrolling, with, some, scroll, mice, touchpad...","[scrolling, scroll, mice, touchpad, etc, scrolls]","[scroll, scroll, mice, touchpad, etc, scroll]",scroll scroll mice touchpad etc scroll
19,Add checks for items in the download panel in ...,achuith@chromium.org,"[add, checks, for, items, in, the, download, p...","[add, checks, items, download, panel, browser,...","[add, check, item, download, panel, browser, t...",add check item download panel browser test
20,Useafterfree by navigating out a document duri...,tkent@chromium.org,"[useafterfree, by, navigating, out, a, documen...","[useafterfree, navigating, document, form, val...","[useafterfre, navig, document, form, valid, me...",useafterfre navig document form valid messag s...
24,Cannot add an address properly in the AutoFil...,sky@chromium.org,"[can, not, add, an, address, properly, in, the...","[add, address, properly, autofill, options, di...","[add, address, properli, autofil, option, dial...",add address properli autofil option dialog box
25,libxmlgyp should define LIBXMLSTATIC for itsel...,wtc@chromium.org,"[libxmlgyp, should, define, libxmlstatic, for,...","[libxmlgyp, define, libxmlstatic, direct, depe...","[libxmlgyp, defin, libxmlstat, direct, depend]",libxmlgyp defin libxmlstat direct depend
...,...,...,...,...,...,...
355322,Remove manual page load events from the Glean ...,brosa,"[remove, manual, page, load, events, from, the...","[remove, manual, page, load, events, glean, de...","[remov, manual, page, load, event, glean, debu...",remov manual page load event glean debug ping ...
355324,redirects to main AMO homepage instead of rev...,wezhou,"[redirects, to, main, amo, homepage, instead, ...","[redirects, main, amo, homepage, instead, revi...","[redirect, main, amo, homepag, instead, review...",redirect main amo homepag instead review tool ...
355325,Add git shortref in deployment messages to clo...,sven,"[add, git, shortref, in, deployment, messages,...","[add, git, shortref, deployment, messages, clo...","[add, git, shortref, deploy, messag, cloudop, ...",add git shortref deploy messag cloudop slack bot
355328,sign testpilot and system addons for Fx 58 wit...,u581815,"[sign, testpilot, and, system, addons, for, fx...","[sign, testpilot, system, addons, fx, 58, pkcs...","[sign, testpilot, system, addon, fx, 58, pkcs7...",sign testpilot system addon fx 58 pkcs7 sha256...


**Filter the dataset so that every assignee occurs at least 5 times**

In [59]:
# Find how many rows the least frequent assignee has before filtering
# 'value_counts()' tallies the rows per 'Assignee' value and 'min()' picks the smallest tally
min_occurrences = dataset['Assignee'].value_counts().min()

# Print the result
print(f"The minimum number of occurrences in Assignee column is {min_occurrences}")

The minimum number of occurrences in Assignee column is 1


In [60]:
# Minimum number of rows an assignee must have to stay in the dataset
# (named instead of hard-coded so the threshold is visible and easy to tune)
min_assignee_occurrences = 5

# Calculate the occurrences of each value in the 'Assignee' column
value_counts = dataset['Assignee'].value_counts()

# Collect the assignees that reach the threshold
frequent_assignees = value_counts[value_counts >= min_assignee_occurrences].index

# Filter the dataset to include only rows whose 'Assignee' is one of those assignees
dataset = dataset[dataset['Assignee'].isin(frequent_assignees)]

In [61]:
# Re-compute how many rows the least frequent assignee has, now that rare assignees were filtered out
# 'value_counts()' tallies the rows per 'Assignee' value and 'min()' picks the smallest tally
min_occurrences = dataset['Assignee'].value_counts().min()

# Print the result
print(f"The minimum number of occurrences in Assignee column is {min_occurrences}")

The minimum number of occurrences in Assignee column is 5


In [62]:
# Show the shape of 'dataset' after rare assignees were filtered out
# 'shape' is a (number_of_rows, number_of_columns) tuple
dataset.shape

(117381, 6)

**Save the 'Summary_Stemmed', 'processed_summary', and 'Assignee' columns to a CSV file**

In [64]:
# Columns that make up the final, preprocessed dataset
columns_to_save = ['Summary_Stemmed', 'processed_summary', 'Assignee']

# Location of the output file, expressed relative to the working directory
relative_path = os.path.join('..', 'data', 'dataset_after_preprocessing.csv')

# Anchor the relative location at the directory the notebook is running from
current_dir = os.getcwd()
output_path = os.path.join(current_dir, relative_path)

# Write only the selected columns; index=False leaves the DataFrame index out of the file
final_columns = dataset[columns_to_save]
final_columns.to_csv(output_path, index=False)