In [1]:
import pandas as pd
import os
import json
import csv
import re, datetime
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, timedelta

from lifelines import KaplanMeierFitter

from global_functions import *

### Information

Input files:
/raw_data/all_eclipse_bugs_full.csv

Output files:
/data_processing/bugs_info.csv

In [2]:
eclipse_bugs_all_file = '.'+os.sep+'raw_data'+os.sep+'all_eclipse_bugs_full.zip'
bugs_info_file = 'data_processing'+os.sep+'bugs_info.zip'

In [3]:
#Load data
#use os.sep to get the separator based on the operating system so as not to have issues running the script in other operating systems
df = pd.read_csv(eclipse_bugs_all_file,index_col=False,compression='zip',dtype={'version':str})
df = fetch_minimal_columns(df)
df.head(n=2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,Product,version,resolution,status,severity,creation_time,priority
0,475361,Platform,4.5,FIXED,RESOLVED,normal,2015-08-19 10:50:25,P3
1,475365,Platform,4.6,FIXED,RESOLVED,minor,2015-08-19 11:34:37,P3


In [4]:
print('Initial dataset size:'+str(len(df.id.unique())))

Initial dataset size:208862


In [5]:
#Versions of e4
df[df['Product']=='e4'].version.unique()

array(['unspecified', '0.9', '1.0', '0.12', '0.2', '0.13', '0.14', '0.15',
       '0.11', '0.16', '0.17', '0.18'], dtype=object)

In [6]:
print('e4 bugs='+str(len(df[df['Product']=='e4'])))

e4 bugs=3782


In [7]:
print('Incubator bugs='+str(len(df[df['Product']=='Incubator'])))

Incubator bugs=49


In [8]:
#Versions of Incubator
df[df['Product']=='e4'].version.unique()

array(['unspecified', '0.9', '1.0', '0.12', '0.2', '0.13', '0.14', '0.15',
       '0.11', '0.16', '0.17', '0.18'], dtype=object)

In [9]:
#Filter 1: Excluding e4 and inclubator products
bef = len(df)
df = df[df['Product']!='e4']
df = df[df['Product']!='Incubator']

after = len(df)
print('Removed E4 & Incubator bugs:'+str(bef-after))
print('Dataset size:'+str(len(df)))

Removed E4 & Incubator bugs:3831
Dataset size:205031


In [10]:
df.severity.unique()

array(['normal', 'minor', 'enhancement', 'major', 'trivial', 'critical',
       'blocker'], dtype=object)

In [11]:
#Filter 2: Excluding enhancements
bef = len(df)
df = df[df['severity']!='enhancement']
after = len(df)
print('Removed enhancement bugs:'+str(bef-after))
print('Dataset size:'+str(len(df)))

Removed enhancement bugs:27799
Dataset size:177232


In [12]:
print('Eclipse products:'+str(len(df.Product.unique())))
print('Eclipse products:'+str(df.Product.unique().tolist()))

print('Eclipse releases:'+str(len(df.version.unique())))
print('Eclipse releases:'+str(sorted(df.version.unique().tolist())))

Eclipse products:4
Eclipse products:['Platform', 'JDT', 'PDE', 'Equinox']
Eclipse releases:70
Eclipse releases:['1.0', '2.0', '2.0.1', '2.0.2', '2.1', '2.1.1', '2.1.2', '2.1.3', '3.0', '3.0.1', '3.0.2', '3.1', '3.1.1', '3.1.2', '3.10.0 Luna', '3.10.1 Luna', '3.2', '3.2.1', '3.2.2', '3.3', '3.3.1', '3.3.2', '3.4', '3.4.1', '3.4.2', '3.5', '3.5.1', '3.5.2', '3.6', '3.6.1', '3.6.2', '3.7', '3.7.1', '3.7.2', '3.8', '3.8.0 Juno', '3.8.1', '3.8.1 Juno', '3.8.2', '3.8.2 Juno', '3.9.0 Kepler', '4.0', '4.1', '4.10', '4.11', '4.2', '4.2.1', '4.2.2', '4.3', '4.3.1', '4.3.2', '4.4', '4.4.1', '4.4.2', '4.5', '4.5.0 Mars', '4.5.1', '4.5.2', '4.6', '4.6.0 Neon', '4.7', '4.7.0 Oxygen', '4.7.1', '4.7.1a', '4.7.2', '4.7.3', '4.8', '4.8.0 Photon', '4.9', 'unspecified']


# Unspecified bug information

In [13]:
df_un=df[df['version']=='unspecified']
print('Number of unspecified bugs:'+str(len(df_un.id.unique())))
print('Products with unspecified bugs:'+str(df_un.Product.unique().tolist()))

Number of unspecified bugs:3237
Products with unspecified bugs:['Equinox']


In [14]:
for prod in df_un.Product.unique().tolist():
    perc = len(df[(df['Product']==prod)&(df['version']=='unspecified')])/len(df[df['Product']==prod])
    print('Number of unspecified bugs for product:'+str(len(df[(df['Product']==prod)&(df['version']=='unspecified')])))
    print('Number of bugs for product:'+str(len(df[df['Product']==prod])))
    print('Percentage of unspecified bugs in '+prod+':'+str(perc))
    print('========')

Number of unspecified bugs for product:3237
Number of bugs for product:12610
Percentage of unspecified bugs in Equinox:0.256701030927835


In [15]:
#Filter 3: remove bugs with unspecified version
print('Dataset size (before filtering):'+str(len(df.id.unique())))
print('Number of unspecified bugs:'+str(len(df[df['version']=='unspecified'].id.unique())))
df = df[df['version']!='unspecified']

print('Dataset size (after filtering unspecified):'+str(len(df.id.unique())))

Dataset size (before filtering):177232
Number of unspecified bugs:3237
Dataset size (after filtering unspecified):173995


In [16]:
#products after filtering
print('Eclipse products:'+str(df.Product.unique()))

Eclipse products:['Platform' 'JDT' 'PDE' 'Equinox']


In [17]:
#Get the closest minor release of eclipse
df['release'] = df.apply(lambda x : str(assign_to_closest_minor(x['version'])), axis=1)
df.head(n=2)

Unnamed: 0,id,Product,version,resolution,status,severity,creation_time,priority,release
0,475361,Platform,4.5,FIXED,RESOLVED,normal,2015-08-19 10:50:25,P3,4.5
1,475365,Platform,4.6,FIXED,RESOLVED,minor,2015-08-19 11:34:37,P3,4.6


In [18]:
df.release.unique()

array(['4.5', '4.6', '4.4', '3.7', '4.2', '4.3', '3.6', '4.8', '3.1',
       '4.7', '4.11', '4.10', '3.4', '3.5', '3.3', '4.0', '3.2', '3.0',
       '2.0', '2.1', '1.0', '4.1', '4.9', '3.10', '3.9'], dtype=object)

In [19]:
#Filter 4: Filter out releases before 3.0 and 4.11
exclude_releases = list()
for release in df.release.unique():
    if is_smaller_release(release,'3.0') or is_larger_release(release,'4.10'):
        exclude_releases.append(release)
#exclude_releases.append('4.11')
exclude_releases

['4.11', '2.0', '2.1', '1.0']

In [20]:
bef = len(df)
print('Dataset size (before removing releases smaller than 3.0 or larger than 4.10):'+str(len(df.id.unique())))
df = df[~df['release'].isin(exclude_releases)]
after = len(df)
print('Bugs removed in excluded releases:'+str(bef-after))
print('Dataset size:'+str(len(df.id.unique())))

Dataset size (before removing releases smaller than 3.0 or larger than 4.10):173995
Bugs removed in excluded releases:32985
Dataset size:141010


In [22]:
official_releases = ['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.8', '4.2', '4.3', '4.4', '4.5', '4.6', '4.7', '4.8', '4.9', '4.10']
to_exclude = [x for  x in df.release.unique().tolist() if x not in official_releases]
to_exclude

['4.0', '4.1', '3.10', '3.9']

In [23]:
bef = len(df.id.unique())
df = df[~df['release'].isin(to_exclude)]
after = len(df.id.unique())
print('Removed bugs with not in official releases:'+str(bef-after))

Removed bugs with not in official releases:2565


In [24]:
print('Final dataset size:'+str(len(df.id.unique())))

Final dataset size:138445


In [25]:
#export the file to the processing data folder
df.to_csv(bugs_info_file,index=False,compression='zip')