In [1]:
import os
import pandas as pd
import re

**Loading and merging the data**

In [2]:
# Directory containing the CSV files to merge
directory_path = os.path.join('..', 'data', 'eclipse_dataset', 'dataset_before_merge')

# Collect every CSV file in the directory. os.listdir() returns entries in
# arbitrary order, so the names are sorted to keep the merged row order
# reproducible across machines and runs.
file_paths = [
    os.path.join(directory_path, file)
    for file in sorted(os.listdir(directory_path))
    if file.endswith('.csv')
]

# Read each CSV file into its own DataFrame
dataframes = [pd.read_csv(file) for file in file_paths]

# Stack all DataFrames into a single one, renumbering the index from 0
eclipse_dataset = pd.concat(dataframes, ignore_index=True)

**Exploring the data**

In [44]:
# Report the (rows, columns) shape of the dataset at this point
print(eclipse_dataset.shape)

(333667, 8)


In [45]:
# List the column labels of the dataset
print(eclipse_dataset.columns)

Index(['Bug ID', 'Product', 'Component', 'Assignee', 'Status', 'Resolution',
       'Summary', 'Changed'],
      dtype='object')


In [46]:
# Display the DataFrame; the rendered output shows only the first and last rows
eclipse_dataset

Unnamed: 0,Bug ID,Product,Component,Assignee,Status,Resolution,Summary,Changed
0,419458,Sphinx,Core,sphinx-inbox,UNCONFIRMED,---,Usage of HashSet and HashMap does not preserve...,2021-08-24 08:47:27
1,430531,Sphinx,Core,sphinx-inbox,UNCONFIRMED,---,ModelLoadManager forces System.gc(),2021-08-24 08:48:08
2,447379,Sphinx,Core,sphinx-inbox,UNCONFIRMED,---,Add support for scheduling rules in the dynami...,2021-08-24 08:47:19
3,447395,Sphinx,Core,sphinx-inbox,UNCONFIRMED,---,Class loading problem during dynamic workflow ...,2021-08-24 08:47:38
4,448041,Sphinx,Navigator & Editor Sockets,sphinx-inbox,UNCONFIRMED,---,[BasicTransactionalFormEditor] addPages() meth...,2021-08-24 08:47:56
...,...,...,...,...,...,...,...,...
333662,150112,Target Management,RSE,kmunir,CLOSED,FIXED,[api] Dead code: SystemNewConnectionWizard,2008-08-13 13:16:59
333663,153629,Target Management,RSE,mober.at+eclipse,CLOSED,FIXED,"RSE Project Filters show up as ""%ViewFilter.RS...",2006-11-23 06:53:01
333664,160778,Target Management,RSE,mober.at+eclipse,CLOSED,FIXED,"typo on RSE ""Tutorials"" help page",2008-08-13 13:08:32
333665,190930,Target Management,RSE,mober.at+eclipse,CLOSED,FIXED,Need to update the year of copyright in Univer...,2012-05-23 17:32:27


In [47]:
# Summarise column dtypes and non-null counts. DataFrame.info() writes the
# summary itself and returns None, so it is called directly; wrapping it in
# print() only adds a stray "None" line to the output.
eclipse_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333667 entries, 0 to 333666
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Bug ID      333667 non-null  int64 
 1   Product     333667 non-null  object
 2   Component   333667 non-null  object
 3   Assignee    333667 non-null  object
 4   Status      333667 non-null  object
 5   Resolution  333667 non-null  object
 6   Summary     333664 non-null  object
 7   Changed     333667 non-null  object
dtypes: int64(1), object(7)
memory usage: 20.4+ MB
None


**Check for duplicate rows**

In [48]:
# True if at least one row repeats an earlier row across all columns
# (this only checks for duplicates; nothing is removed here)
eclipse_dataset.duplicated().any()

False

**generates descriptive statistics of the data**

In [49]:
# Generate descriptive statistics (count, unique, top, freq) for the data.
# include="O" restricts the summary to columns with object dtype.
eclipse_dataset.describe(include = "O")

Unnamed: 0,Product,Component,Assignee,Status,Resolution,Summary,Changed
count,333667,333667,333667,333667,333667,333664,333667
unique,222,778,2820,7,8,327017,305148
top,JDT,Core,webmaster,RESOLVED,FIXED,DVT34: broken link in documentation,2005-09-27 09:12:53
freq,63266,54768,9369,171494,201858,105,504


In [50]:
# Compute the number of unique values in each column of eclipse_dataset.
eclipse_dataset.nunique()

Bug ID        333667
Product          222
Component        778
Assignee        2820
Status             7
Resolution         8
Summary       327017
Changed       305148
dtype: int64

**Show the unique values for Status column**

In [51]:
# Collect the distinct values of the 'Status' column and print them
status_unique_values = pd.unique(eclipse_dataset['Status'])
print(f"Unique values in the 'Status' column: {status_unique_values}")

Unique values in the 'Status' column: ['UNCONFIRMED' 'NEW' 'ASSIGNED' 'REOPENED' 'RESOLVED' 'VERIFIED' 'CLOSED']


**Show the unique values for Resolution column**

In [52]:
# Collect the distinct values of the 'Resolution' column and print them
resolution_unique_values = pd.unique(eclipse_dataset['Resolution'])
print(f"Unique values in the 'Resolution' column: {resolution_unique_values}")

Unique values in the 'Resolution' column: [' ---' 'FIXED' 'WONTFIX' 'INVALID' 'NOT_ECLIPSE' 'WORKSFORME' 'DUPLICATE'
 'MOVED']


**Remove rows with 'Resolution' is 'DUPLICATE' or 'INVALID'**

In [3]:
# Discard rows whose 'Resolution' is either 'DUPLICATE' or 'INVALID',
# using a single membership mask instead of two successive filters
eclipse_dataset = eclipse_dataset[~eclipse_dataset['Resolution'].isin(['DUPLICATE', 'INVALID'])]

In [54]:
'''
Bug ID is unique for each column ==> remove it
Resolution, Status, Product, Component, Changed don't affect our task
'''

"\nBug ID is unique for each column ==> remove it\nResolution, Status, Product, Component, Changed don't affect our task\n"

**Drop unnecessary columns**

In [4]:
# Labels of the columns to remove from the dataset
columns_to_drop = ['Bug ID', 'Resolution', 'Status', 'Product', 'Component', 'Changed']

# Remove those labels along the column axis; the remaining columns are kept
eclipse_dataset = eclipse_dataset.drop(labels=columns_to_drop, axis='columns')

In [56]:
# List the column labels that remain in the dataset
print(eclipse_dataset.columns)

Index(['Assignee', 'Summary'], dtype='object')


**generates descriptive statistics of the data**

In [57]:
# Generate descriptive statistics (count, unique, top, freq) for the data.
# include="O" restricts the summary to columns with object dtype.
eclipse_dataset.describe(include = "O")

Unnamed: 0,Assignee,Summary
count,291373,291371
unique,2779,286938
top,webmaster,DVT34: broken link in documentation
freq,8603,71


**Show the number of nulls in each column**

In [58]:
# Count the missing values per column of eclipse_dataset and print the totals
print(eclipse_dataset.isna().sum())

Assignee    0
Summary     2
dtype: int64


**Remove rows with null values**

In [5]:
# Remove rows that contain any missing value. The result is rebound to the
# name instead of using inplace=True, so the cell does not mutate a frame
# that earlier cells produced and can be chained or re-run safely.
eclipse_dataset = eclipse_dataset.dropna()

# Confirm that no missing values remain in any column
print(eclipse_dataset.isnull().sum())

Assignee    0
Summary     0
dtype: int64


**Print the minimum number of occurrences of a value in the Assignee column**

In [6]:
# Size of the smallest group of rows sharing the same 'Assignee' value
min_occurrences = eclipse_dataset.groupby('Assignee').size().min()
print(f"The minimum number of occurrences in owner column is {min_occurrences}")

The minimum number of occurrences in owner column is 1


**Filter the dataset so that each Assignee occurs at least 5 times**

In [7]:
# Number of rows for each distinct value of the 'Assignee' column
value_counts = eclipse_dataset['Assignee'].value_counts()

# Look up every row's assignee count and keep only the rows whose assignee
# appears at least 5 times in the dataset
eclipse_dataset = eclipse_dataset[eclipse_dataset['Assignee'].map(value_counts) >= 5]

In [8]:
# Size of the smallest group of rows sharing the same 'Assignee' value
min_occurrences = eclipse_dataset.groupby('Assignee').size().min()
print(f"The minimum number of occurrences in owner column after filteration is {min_occurrences}")

The minimum number of occurrences in owner column after filteration is 5


In [9]:
# Report the (rows, columns) shape of the dataset at this point
print(eclipse_dataset.shape)

(289359, 2)


In [10]:
# Compute the number of unique values in each column of eclipse_dataset.
eclipse_dataset.nunique()

Assignee      1623
Summary     284967
dtype: int64

**Show the values of Assignee column**

In [65]:
# Print every distinct value of the 'Assignee' column, one per line
assignee_unique_values = pd.unique(eclipse_dataset['Assignee'])
print("Unique values in the 'Assignee' column:")
for assignee_value in assignee_unique_values:
    print(assignee_value)


Unique values in the 'Assignee' column:
sphinx-inbox
a.gurov
ali.akar82
cdt-build-inbox
ddoerr
KetanPadegaonkar
khu
nicolaspeifer
sedlund
soa-inbox
stem.core-inbox
stem.simulation-inbox
stem.ui-inbox
stephaneberle9
swtbot-inbox
balazs.grill
christian.k.2510
christian.thoens
jhkauf
mattadav
werner.keil
mariot.chauvin
igor.burilo
alexei.goncharov
aprsac
apupier
john.cortell
laurent.redor
lorenzo.bettini
malaperle
michael.keppler
mistria
patrick.tasse
pierre-charles.david
sbouchet
twolf
wellmann.hannes1
ymesika
idydieng
jthomas119
webdaford
alexander.fedorov
eclipse
emo
ohf.stem-inbox
robert.kiss
statet-inbox
sw
mmt-qvt.operational-inbox
qvtd-inbox
adolfosbh
alexander.igdalov
arcanefoam
christopher.gerking
dvorak.radek
engine.qvtr-inbox
ivan
mknauer
mmt-atl.web-inbox
parser.qvtr-inbox
rap-inbox
rap.incubator-inbox
serg.boyko2011
ui.qvtr-inbox
austin.riddle
rsternberg
b.muskalla
fr.appel
ruediger.herrmann
tbuschto
a_mergey
bjorn.freeman-benson
dominik.ebert
m2t.core-inbox
ralf.zahn
stefan.

**clean the text in the summary column**

In [11]:
def preprocess_summary(summary):
    """Clean a bug-report summary string.

    Steps, in order:
      1. Remove hyperlinks: an http:// or https:// prefix and everything up
         to the next whitespace.
      2. Remove every character that is neither a word character nor
         whitespace (underscores count as word characters and are kept).
      3. Replace newlines with spaces.

    Hyperlinks are removed first, while the "://" separator is still present,
    and are matched by their scheme. Stripping punctuation first and then
    deleting anything that starts with "http" also deleted ordinary words such
    as "httpclient".

    Args:
        summary: The summary text as a str.

    Returns:
        The cleaned string. Whitespace is not collapsed, so a removed link can
        leave consecutive spaces behind.
    """
    # Remove hyperlinks (must run before punctuation is stripped)
    summary = re.sub(r'https?://\S+', '', summary)
    # Remove special characters
    summary = re.sub(r'[^\w\s]', '', summary)
    # Replace newlines with spaces
    summary = summary.replace('\n', ' ')
    return summary

In [12]:
# Clean every value of the 'Summary' column. assign() returns a new DataFrame
# with the column replaced, which avoids pandas' SettingWithCopyWarning that
# direct column assignment can raise on a frame produced by boolean filtering.
eclipse_dataset = eclipse_dataset.assign(
    Summary=eclipse_dataset['Summary'].apply(preprocess_summary)
)

**Filter the dataset by the number of words in Summary column**

In [13]:
def filter_by_word_count(df, min_word_count):
    """Return the rows of ``df`` whose 'Summary' has enough words.

    Words are the whitespace-separated tokens of the 'Summary' column; a row
    is kept when its token count is at least ``min_word_count``.
    """
    words_per_summary = df['Summary'].str.split().str.len()
    has_enough_words = words_per_summary >= min_word_count
    return df.loc[has_enough_words]

In [14]:
# Minimum number of words a summary must contain to be kept
min_word_count = 5

# Keep only the rows whose 'Summary' reaches that word count
eclipse_dataset = filter_by_word_count(df=eclipse_dataset, min_word_count=min_word_count)

In [15]:
# Report the (rows, columns) shape of the dataset at this point
print(eclipse_dataset.shape)

(247997, 2)


**Save cleaned data**

In [16]:
# Build the output location from the working directory and a relative path
current_dir = os.getcwd()
relative_path = os.path.join('..', 'data', 'eclipse_dataset', 'cleaned_eclipse_dataset.csv')

# Write the dataset as CSV; index=False leaves the DataFrame index out of the file
eclipse_dataset.to_csv(path_or_buf=os.path.join(current_dir, relative_path), index=False)