In [1]:
import pandas as pd
import os
import json
import csv
import re, datetime
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, timedelta

from lifelines import KaplanMeierFitter

from global_functions import *

### Information

Input files:
data_processing/bugs_info.csv
data_processing/bug_history.csv
raw_data/all_eclipse_bugs_full.csv
raw_data/bug_history.csv


Output files:
data/bugs_info.csv
data/bugs_full.csv (full information of the final dataset)
data/bug_history.csv (injected history of the final dataset)
data/bug_history_orig.csv (original history without injection of the final dataset)
data/bugs_versions.csv (the different versions a bug was assigned to)

In [2]:
bugs_processed_info_file = 'data_processing'+os.sep+'bugs_info.zip'
bugs_processed_history_file = '.'+os.sep+'data_processing'+os.sep+'bug_history.zip'
bugs_original_history_file = '.'+os.sep+'raw_data'+os.sep+'bug_history.zip'
bugs_original_info_file = '.'+os.sep+'raw_data'+os.sep+'all_eclipse_bugs_full.zip'

bugs_final_info_file = 'data'+os.sep+'bugs_info.zip'
bugs_final_full_file = 'data'+os.sep+'bugs_full.zip'
bugs_history_final_file = 'data'+os.sep+'bug_history.zip'
bugs_history_final_file_non_injected = 'data'+os.sep+'bug_history_orig.zip'
bug_versions_file = 'data'+os.sep+'bugs_versions.csv'

In [3]:
relase_creation_ts_all = get_release_dates()

In [4]:
#get data in the preprocessing phase
df_bugs = pd.read_csv(bugs_processed_info_file,index_col=False,compression='zip',
                     dtype={'release':str,'version':str})
df_bugs = trans_to_datetime(df_bugs,['creation_time',
                             'first_resolved_date','last_resolved_date',
                             'first_fixed_date','last_fixed_date',
                             'first_assignment_date','last_assignment_date'])
df_bugs.head()

Unnamed: 0,id,release,Product,creation_time,version,resolution,is_assigned,is_resolved,is_fixed,first_assignment_date,last_assignment_date,first_resolved_date,last_resolved_date,first_fixed_date,last_fixed_date
0,475361,4.5,Platform,2015-08-19 10:50:25,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03
1,475365,4.6,Platform,2015-08-19 11:34:37,4.6,FIXED,0,1,1,NaT,NaT,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30
2,475370,4.5,Platform,2015-08-19 12:09:06,4.5,DUPLICATE,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT
3,475379,4.5,Platform,2015-08-19 13:39:12,4.5,,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT
4,475407,4.5,Platform,2015-08-19 17:06:10,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25


In [5]:
df_bugs.version.unique()

array(['4.5', '4.6', '4.4.2', '4.4.1', '4.5.1', '3.7.2', '3.8.2', '4.4',
       '3.7.1', '4.3', '4.2', '4.2.1', '3.6.2', '4.3.2', '3.6', '4.8',
       '3.1', '4.2.2', '4.7', '3.8', '3.7', '4.3.1', '4.7.1', '4.10',
       '3.4.1', '3.4', '3.5', '3.3.2', '3.3', '3.2.2', '3.2', '3.3.1',
       '3.0.2', '3.4.2', '3.2.1', '3.5.1', '3.1.2', '3.1.1', '3.0',
       '3.0.1', '3.5.2', '3.6.1', '4.7.1a', '4.5.2', '4.9', '3.8.1',
       '4.7.2', '4.7.3', '3.8.0 Juno', '4.5.0 Mars', '4.6.0 Neon',
       '4.7.0 Oxygen', '3.8.2 Juno', '4.8.0 Photon', '3.8.1 Juno'],
      dtype=object)

In [6]:
df_bugs.release.unique()

array(['4.5', '4.6', '4.4', '3.7', '4.2', '4.3', '3.6', '4.8', '3.1',
       '4.7', '4.10', '3.4', '3.5', '3.3', '3.2', '3.0', '4.9'],
      dtype=object)

In [8]:
print('Dataset size:'+str(len(df_bugs.id.unique())))

Dataset size:138445


In [9]:
#get the difference between the first and last date of assignment
df_bugs['time_assign_diff']=df_bugs.apply(
    lambda x: (
        x['last_assignment_date']-x['first_assignment_date']).days 
    if x['last_assignment_date'] is not None and x['first_assignment_date'] is not None 
    else None 
    ,axis=1
)

#get the difference between the first and last date of resolution
df_bugs['time_resolve_diff']=df_bugs.apply(
    lambda x: (
        x['last_resolved_date']-x['first_resolved_date']).days 
    if x['last_resolved_date'] is not None and x['first_resolved_date'] is not None 
    else None 
    ,axis=1
)

#get the difference between the first and last date of fixes
df_bugs['time_fix_diff']=df_bugs.apply(
    lambda x: (
        x['last_fixed_date']-x['first_fixed_date']).days 
    if x['last_fixed_date'] is not None and x['first_fixed_date'] is not None 
    else None 
    ,axis=1
)

df_bugs.head()

Unnamed: 0,id,release,Product,creation_time,version,resolution,is_assigned,is_resolved,is_fixed,first_assignment_date,last_assignment_date,first_resolved_date,last_resolved_date,first_fixed_date,last_fixed_date,time_assign_diff,time_resolve_diff,time_fix_diff
0,475361,4.5,Platform,2015-08-19 10:50:25,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03,,0.0,0.0
1,475365,4.6,Platform,2015-08-19 11:34:37,4.6,FIXED,0,1,1,NaT,NaT,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30,,0.0,0.0
2,475370,4.5,Platform,2015-08-19 12:09:06,4.5,DUPLICATE,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT,,,
3,475379,4.5,Platform,2015-08-19 13:39:12,4.5,,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT,,,
4,475407,4.5,Platform,2015-08-19 17:06:10,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25,,0.0,0.0


In [10]:
assigned_df = df_bugs[~df_bugs['time_assign_diff'].isna()]
print('Bugs assigned only once:',len(assigned_df[assigned_df['first_assignment_date']==assigned_df['last_assignment_date']]))
print('Bugs assigned only once or assigned more than once but on the same day:',len(assigned_df[assigned_df['time_assign_diff']==0]))
print('Bugs assigned more than one time in more than one days:',len(assigned_df[assigned_df['time_assign_diff']>0]))

Bugs assigned only once: 49304
Bugs assigned only once or assigned more than once but on the same day: 49810
Bugs assigned more than one time in more than one days: 2570


In [11]:
resolved_df = df_bugs[~df_bugs['time_resolve_diff'].isna()]
print('Bugs resolved only once:',len(resolved_df[resolved_df['first_assignment_date']==resolved_df['last_assignment_date']]))
print('Bugs resolved only once or resolved more than once but on the same day:',len(resolved_df[resolved_df['time_assign_diff']==0]))
print('Bugs resolved more than one time in more than one days:',len(resolved_df[resolved_df['time_assign_diff']>0]))

Bugs resolved only once: 42402
Bugs resolved only once or resolved more than once but on the same day: 42887
Bugs resolved more than one time in more than one days: 2264


In [12]:
fixed_df = df_bugs[~df_bugs['time_fix_diff'].isna()]
print('Bugs fixed only once:',len(fixed_df[fixed_df['first_assignment_date']==fixed_df['last_assignment_date']]))
print('Bugs fixed only once or fixed more than once but on the same day:',len(fixed_df[fixed_df['time_assign_diff']==0]))
print('Bugs fixed more than one time in more than one days:',len(fixed_df[fixed_df['time_assign_diff']>0]))

Bugs fixed only once: 31909
Bugs fixed only once or fixed more than once but on the same day: 32327
Bugs fixed more than one time in more than one days: 1794


In [13]:
if not os.path.exists(bugs_history_final_file):
    bug_history = pd.read_csv(bugs_processed_history_file,index_col=False,compression='zip')
    bug_history = bug_history[bug_history['id'].isin(df_bugs.id.unique().tolist())]
    bug_history.to_csv(bugs_history_final_file,index=False,compression='zip')
else:
    bug_history = pd.read_csv(bugs_history_final_file,index_col=False,compression='zip')
bug_history.head()

Unnamed: 0,added,id,removed,what,when,who
0,platform-help-inbox@eclipse.org,100001,pde-ui-inbox@eclipse.org,assigned_to,2005-06-14T16:16:00Z,wassim.melhem@gmail.com
1,Help,100001,UI,component,2005-06-14T16:16:00Z,wassim.melhem@gmail.com
2,Platform,100001,PDE,product,2005-06-14T16:16:00Z,wassim.melhem@gmail.com
3,konradk@ca.ibm.com,100001,,cc,2005-06-14T17:17:31Z,konradk@ca.ibm.com
4,dejan@ca.ibm.com,100001,platform-help-inbox@eclipse.org,assigned_to,2005-06-14T17:17:31Z,konradk@ca.ibm.com


In [14]:
#Transfer the original files to the data folder with only the final dataset bugs
if not os.path.exists(bugs_final_full_file):
    df_bugs_orig = pd.read_csv(bugs_original_info_file,index_col=False,compression='zip')
    df_bugs_orig = df_bugs_orig[df_bugs_orig['id'].isin(df_bugs.id.unique().tolist())]
    df_bugs_orig.to_csv(bugs_final_full_file,index=False,compression='zip')
else:
    df_bugs_orig = pd.read_csv(bugs_final_full_file,index_col=False,compression='zip')

df_bugs_orig.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,alias,assigned_to,assigned_to_detail.email,assigned_to_detail.id,assigned_to_detail.name,assigned_to_detail.real_name,blocks,cc,cc_detail,classification,...,qa_contact_detail.real_name,resolution,see_also,severity,status,summary,target_milestone,url,version,whiteboard
0,[],Lars.Vogel@vogella.com,Lars.Vogel@vogella.com,31345,Lars.Vogel@vogella.com,Lars Vogel,[],"['Lars.Vogel@vogella.com', 'simon.scholz@vogel...","[{'id': 31345, 'email': 'Lars.Vogel@vogella.co...",Eclipse,...,,FIXED,"['https://git.eclipse.org/r/54078', 'https://g...",normal,RESOLVED,Remove redundant type arguments (1.7 or highte...,4.6 M2,,4.5,
1,[],simon.scholz@vogella.com,simon.scholz@vogella.com,140999,simon.scholz@vogella.com,Simon Scholz,[],['Lars.Vogel@vogella.com'],"[{'name': 'Lars.Vogel@vogella.com', 'real_name...",Eclipse,...,,FIXED,"['https://git.eclipse.org/r/54086', 'https://g...",minor,RESOLVED,[Model Editor] The ModelEditor should use a De...,4.6 M2,,4.6,
2,[],platform-swt-inbox@eclipse.org,platform-swt-inbox@eclipse.org,2206,platform-swt-inbox@eclipse.org,Platform-SWT-Inbox,[],"['daniel_megert@ch.ibm.com', 'ericwill@redhat....","[{'real_name': 'Dani Megert', 'name': 'daniel_...",Eclipse,...,,DUPLICATE,[],minor,CLOSED,Initial scrollbar position incorrect,---,,4.5,
3,[],Platform-UI-Inbox@eclipse.org,Platform-UI-Inbox@eclipse.org,2169,Platform-UI-Inbox@eclipse.org,Platform-UI-Inbox,[],[],[],Eclipse,...,,,[],normal,NEW,Perpective overlay the welcome screen,---,,4.5,
5,[],platform-releng-inbox@eclipse.org,platform-releng-inbox@eclipse.org,2217,platform-releng-inbox@eclipse.org,Platform-Releng-Inbox,[],"['mistria@redhat.com', 'tjwatson@us.ibm.com']","[{'real_name': 'Mickael Istria', 'name': 'mist...",Eclipse,...,,FIXED,[],normal,RESOLVED,rt.equinox.bundles-Gerrit has been running fo...,---,,4.5,


In [15]:
if not os.path.exists(bugs_history_final_file_non_injected):
    bugs_history_orig  = pd.read_csv(bugs_original_history_file,index_col=False,compression='zip')
    bugs_history_orig = bugs_history_orig[bugs_history_orig['id'].isin(df_bugs.id.unique().tolist())]
    bugs_history_orig.to_csv(bugs_history_final_file_non_injected,index=False,compression='zip')
else:
    bugs_history_orig = pd.read_csv(bugs_history_final_file_non_injected,index_col=False,compression='zip')
    
bugs_history_orig.head()

Unnamed: 0,added,id,removed,what,when,who
0,platform-help-inbox@eclipse.org,100001,pde-ui-inbox@eclipse.org,assigned_to,2005-06-14T16:16:00Z,wassim.melhem@gmail.com
1,Help,100001,UI,component,2005-06-14T16:16:00Z,wassim.melhem@gmail.com
2,Platform,100001,PDE,product,2005-06-14T16:16:00Z,wassim.melhem@gmail.com
3,konradk@ca.ibm.com,100001,,cc,2005-06-14T17:17:31Z,konradk@ca.ibm.com
4,dejan@ca.ibm.com,100001,platform-help-inbox@eclipse.org,assigned_to,2005-06-14T17:17:31Z,konradk@ca.ibm.com


# "version_origin" to store the first release that the bug appeared

In [16]:
#create a new dataframe about a but present in different releases
bug_versions = pd.DataFrame()

In [17]:
bug_versions = df_bugs[['id','version']]
bug_versions['source'] = 'version field'
bug_versions.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,id,version,source
0,475361,4.5,version field
1,475365,4.6,version field
2,475370,4.5,version field
3,475379,4.5,version field
4,475407,4.5,version field


In [18]:
df_all = pd.read_csv(bugs_processed_history_file,index_col=False,compression='zip')
df_all[df_all['id'].isin(df_bugs.id.unique())]
df_all=trans_to_datetime(df_all,['when'])
df_all

Unnamed: 0,added,id,removed,what,when,who
0,platform-help-inbox@eclipse.org,100001,pde-ui-inbox@eclipse.org,assigned_to,2005-06-14 16:16:00,wassim.melhem@gmail.com
1,Help,100001,UI,component,2005-06-14 16:16:00,wassim.melhem@gmail.com
2,Platform,100001,PDE,product,2005-06-14 16:16:00,wassim.melhem@gmail.com
3,konradk@ca.ibm.com,100001,,cc,2005-06-14 17:17:31,konradk@ca.ibm.com
4,dejan@ca.ibm.com,100001,platform-help-inbox@eclipse.org,assigned_to,2005-06-14 17:17:31,konradk@ca.ibm.com
5,3.1 RC3,100001,---,target_milestone,2005-06-14 20:26:08,konradk@ca.ibm.com
6,P2,100001,P3,priority,2005-06-14 20:44:53,dejan@ca.ibm.com
7,RESOLVED,100001,NEW,status,2005-06-14 21:26:05,dejan@ca.ibm.com
8,FIXED,100001,,resolution,2005-06-14 21:26:05,dejan@ca.ibm.com
9,david_audel@fr.ibm.com,100002,jdt-core-inbox@eclipse.org,assigned_to,2006-10-06 19:42:16,Olivier_Thomann@ca.ibm.com


In [19]:
bug_versions

Unnamed: 0,id,version,source
0,475361,4.5,version field
1,475365,4.6,version field
2,475370,4.5,version field
3,475379,4.5,version field
4,475407,4.5,version field
5,475427,4.5,version field
6,475471,4.6,version field
7,475478,4.6,version field
8,475479,4.6,version field
9,475480,4.5,version field


In [20]:
version_buglist=df_all[df_all['what']=='version'].id.unique()
version_bugs=df_all[df_all['id'].isin(version_buglist)]
only_version_bugs=version_bugs[version_bugs['what']=='version']

version_bugs_grouped=only_version_bugs[['id','removed','when']].rename(index=str,columns={'removed':'version'})

version_bugs_grouped['source'] = 'bug history'

bug_versions = bug_versions.append(version_bugs_grouped,ignore_index=True)

version_bugs_grouped=only_version_bugs[['id','added','when']].rename(index=str,columns={'added':'version'})
version_bugs_grouped['source'] = 'bug history'

bug_versions = bug_versions.append(version_bugs_grouped,ignore_index=True)

bug_versions = bug_versions.drop_duplicates()
bug_versions.head()


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,id,source,version,when
0,475361,version field,4.5,NaT
1,475365,version field,4.6,NaT
2,475370,version field,4.5,NaT
3,475379,version field,4.5,NaT
4,475407,version field,4.5,NaT


In [21]:
#if a version exists both because of the version field and the bug history, only keep the version field entry
bug_versions = bug_versions.sort_values(by=['id','version','source'])
bug_versions = bug_versions.drop_duplicates(['id','version'],keep='last')
bug_versions

Unnamed: 0,id,source,version,when
139848,2382,bug history,2.0,2009-11-02 17:04:45
132067,2382,version field,3.6,NaT
140887,2802,bug history,2.0,2006-12-07 14:30:43
18483,2802,version field,3.3,NaT
142176,3091,bug history,2.0,2004-05-17 16:22:02
132068,3091,version field,3.0,NaT
142257,3109,bug history,2.0,2012-12-31 14:55:50
132069,3109,version field,3.8.0 Juno,NaT
146835,4745,bug history,2.0,2017-06-28 14:29:39
33037,4745,version field,4.8,NaT


In [22]:
bug_versions.version.unique()

array(['2.0', '3.6', '3.3', '3.0', '3.8.0 Juno', '4.8', '3.2', '3.1',
       'unspecified', '2.1.2', '3.5', '2.1', '2.0.2', '1.0', '1.1',
       '2.1.1', '1.2', '3.1.2', '3.8', '3.0.1', '3.4', '2.1.3', '1.1.3',
       '2.0.1', '3.7', '1.0.0', '3.0.2', '4.0', '3.1.1', '4.7', '0.7',
       '3.2.1', '2.2', '4.4', '4.5.1', '3.2.2', '4.1', '4.4.2', '1.3.0',
       '1.5', '1.0.1', '3.4.1', '1.0.2', '4.2', '2.0.0', '3.3.1', '0.7.1',
       '0.9', '4.2.2', '2.1.0', '3.6.1', '4.3', '0.2', '0.6', '0.8',
       '4.5', '1.2.1', '1.5.2', '3.3.2', '1.5.3', '0.1.3', '1.5.4',
       '4.0.1', '4.6', '3.4.2', '4.0.2', '2.4.0', '1.5.5', 'dev', '2.3.0',
       '2.3', '5.0', '4.0.3', '3.5.1', '1.3.1', '3.0.3', '2.5.0', '3.6.2',
       '2.3.1', '1.0.3', '3.5.2', '5.0.1', '2008-Ganymede', '0.1', '6.0',
       'DD 1.1', '5.0.2', '3.0.0', '1.6.4', '0.9.2', '1.7', '1.1.0',
       '3.7.2', '1.2.0', 'Europa', 'Galileo', '1.7.1', '1.8', '4.2.1',
       '3.8.1', '0.8.0', '6.0.2', '2.6.0', '0 DD 1.1', '3.8.2', '7.0'

In [23]:
#filter 1: fix names that we know the number for, e.g., Galileo
bug_versions.loc[bug_versions['version']=='Europa','version'] = '3.3'
bug_versions.loc[bug_versions['version']=='2008-Ganymede','version'] = '3.4'
bug_versions.loc[bug_versions['version']=='Galileo','version'] = '3.5'


In [24]:
#filter 2: 
bug_versions = bug_versions[bug_versions['version'].str.find('.')>=0]
bug_versions.head()


Unnamed: 0,id,source,version,when
139848,2382,bug history,2.0,2009-11-02 17:04:45
132067,2382,version field,3.6,NaT
140887,2802,bug history,2.0,2006-12-07 14:30:43
18483,2802,version field,3.3,NaT
142176,3091,bug history,2.0,2004-05-17 16:22:02


In [25]:
bug_versions.version.unique()

array(['2.0', '3.6', '3.3', '3.0', '3.8.0 Juno', '4.8', '3.2', '3.1',
       '2.1.2', '3.5', '2.1', '2.0.2', '1.0', '1.1', '2.1.1', '1.2',
       '3.1.2', '3.8', '3.0.1', '3.4', '2.1.3', '1.1.3', '2.0.1', '3.7',
       '1.0.0', '3.0.2', '4.0', '3.1.1', '4.7', '0.7', '3.2.1', '2.2',
       '4.4', '4.5.1', '3.2.2', '4.1', '4.4.2', '1.3.0', '1.5', '1.0.1',
       '3.4.1', '1.0.2', '4.2', '2.0.0', '3.3.1', '0.7.1', '0.9', '4.2.2',
       '2.1.0', '3.6.1', '4.3', '0.2', '0.6', '0.8', '4.5', '1.2.1',
       '1.5.2', '3.3.2', '1.5.3', '0.1.3', '1.5.4', '4.0.1', '4.6',
       '3.4.2', '4.0.2', '2.4.0', '1.5.5', '2.3.0', '2.3', '5.0', '4.0.3',
       '3.5.1', '1.3.1', '3.0.3', '2.5.0', '3.6.2', '2.3.1', '1.0.3',
       '3.5.2', '5.0.1', '0.1', '6.0', 'DD 1.1', '5.0.2', '3.0.0',
       '1.6.4', '0.9.2', '1.7', '1.1.0', '3.7.2', '1.2.0', '1.7.1', '1.8',
       '4.2.1', '3.8.1', '0.8.0', '6.0.2', '2.6.0', '0 DD 1.1', '3.8.2',
       '7.0', '3.7.1', '4.3.1', '4.3.2', '3.0.5', '2.2.1', '8.0', '0.5',

In [26]:
#fix abnormal values because we might miss closest minor releases
for versioninit in bug_versions.version.unique().tolist():
    version = versioninit
    if ' ' in version:
        if '.' not in version[:version.find(' ')]:
            continue
        else:
            version = version[:version.find(' ')]
            bug_versions.loc[bug_versions['version']==versioninit,'version'] = version

bug_versions.version.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


array(['2.0', '3.6', '3.3', '3.0', '3.8.0', '4.8', '3.2', '3.1', '2.1.2',
       '3.5', '2.1', '2.0.2', '1.0', '1.1', '2.1.1', '1.2', '3.1.2',
       '3.8', '3.0.1', '3.4', '2.1.3', '1.1.3', '2.0.1', '3.7', '1.0.0',
       '3.0.2', '4.0', '3.1.1', '4.7', '0.7', '3.2.1', '2.2', '4.4',
       '4.5.1', '3.2.2', '4.1', '4.4.2', '1.3.0', '1.5', '1.0.1', '3.4.1',
       '1.0.2', '4.2', '2.0.0', '3.3.1', '0.7.1', '0.9', '4.2.2', '2.1.0',
       '3.6.1', '4.3', '0.2', '0.6', '0.8', '4.5', '1.2.1', '1.5.2',
       '3.3.2', '1.5.3', '0.1.3', '1.5.4', '4.0.1', '4.6', '3.4.2',
       '4.0.2', '2.4.0', '1.5.5', '2.3.0', '2.3', '5.0', '4.0.3', '3.5.1',
       '1.3.1', '3.0.3', '2.5.0', '3.6.2', '2.3.1', '1.0.3', '3.5.2',
       '5.0.1', '0.1', '6.0', 'DD 1.1', '5.0.2', '3.0.0', '1.6.4',
       '0.9.2', '1.7', '1.1.0', '3.7.2', '1.2.0', '1.7.1', '1.8', '4.2.1',
       '3.8.1', '0.8.0', '6.0.2', '2.6.0', '0 DD 1.1', '3.8.2', '7.0',
       '3.7.1', '4.3.1', '4.3.2', '3.0.5', '2.2.1', '8.0', '0.5', '7.0

In [27]:
bug_versions['release'] = bug_versions.apply(lambda x: assign_to_closest_minor(str(x['version'])),axis=1)
bug_versions['release'] = bug_versions['release'].astype(str)
bug_versions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,id,source,version,when,release
139848,2382,bug history,2.0,2009-11-02 17:04:45,2.0
132067,2382,version field,3.6,NaT,3.6
140887,2802,bug history,2.0,2006-12-07 14:30:43,2.0
18483,2802,version field,3.3,NaT,3.3
142176,3091,bug history,2.0,2004-05-17 16:22:02,2.0
132068,3091,version field,3.0,NaT,3.0
142257,3109,bug history,2.0,2012-12-31 14:55:50,2.0
132069,3109,version field,3.8.0,NaT,4.2
146835,4745,bug history,2.0,2017-06-28 14:29:39,2.0
33037,4745,version field,4.8,NaT,4.8


In [28]:
bug_versions.head()

Unnamed: 0,id,source,version,when,release
139848,2382,bug history,2.0,2009-11-02 17:04:45,2.0
132067,2382,version field,3.6,NaT,3.6
140887,2802,bug history,2.0,2006-12-07 14:30:43,2.0
18483,2802,version field,3.3,NaT,3.3
142176,3091,bug history,2.0,2004-05-17 16:22:02,2.0


In [29]:
print('Current releases:'+str(sorted(bug_versions.release.unique().tolist())))

Current releases:['0 DD 0.9', '0 DD 1.1', '0.1', '0.11', '0.12', '0.13', '0.14', '0.15', '0.17', '0.2', '0.21', '0.22', '0.24', '0.3', '0.5', '0.6', '0.7', '0.8', '0.9', '1.0', '1.1', '1.10', '1.2', '1.3', '1.4', '1.5', '1.6', '1.7', '1.8', '1.9', '2.0', '2.1', '2.11', '2.12', '2.2', '2.3', '2.4', '2.5', '2.6', '2.7', '2.8', '2.9', '3.0', '3.1', '3.10', '3.11', '3.13', '3.14', '3.16', '3.18', '3.19', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.9', '4.0', '4.1', '4.10', '4.11', '4.2', '4.3', '4.4', '4.5', '4.6', '4.7', '4.8', '4.9', '5.0', '5.1', '5.2', '5.9', '6.0', '7.0', '8.0', '8.1', '8.2', '8.3', '8.4', '8.5', '8.6', '8.7', '8.8', '9.0', '9.2', '9.3', '9.4', '9.5', 'DD 1.1']


In [30]:
#filter 3: exclude the releases not in our study
bug_versions = bug_versions[bug_versions['release'].isin(relase_creation_ts_all)]
print('After filtering releases:'+str(sorted(bug_versions.release.unique().tolist())))

After filtering releases:['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '4.10', '4.11', '4.2', '4.3', '4.4', '4.5', '4.6', '4.7', '4.8', '4.9']


In [31]:
sbugs = bug_versions[bug_versions['source']!='version field'].id.unique().tolist()

In [32]:
print('Bugs with multiple versions:'+str(len(sbugs)))

Bugs with multiple versions:4063


In [34]:
#counter1 counts the number of bugs with a major version chagne
counter1=0
for bid in sbugs:
    try:
        version_field_release = bug_versions[(bug_versions['id']==bid )& (bug_versions['source']=='version field')].release.values.tolist()[0]
    except:
        continue
    history_values = bug_versions[(bug_versions['id']==bid )& (bug_versions['source']=='bug history')].release.values.tolist()
    
    major_change = [x for x in history_values if x!=version_field_release]
    if len(major_change)>0:
        counter1+=1
print('counter1='+str(counter1))

counter1=3621


In [35]:
df_bugs[df_bugs['id'].isin(sbugs)].groupby('resolution')['id'].nunique().reset_index()

Unnamed: 0,resolution,id
0,DUPLICATE,595
1,FIXED,1538
2,INVALID,195
3,MOVED,1
4,NOT_ECLIPSE,104
5,WONTFIX,197
6,WORKSFORME,386


In [38]:
#measure the number of bugs that were resolved in multiple major releases
resolved_sbugs = df_bugs[(df_bugs['id'].isin(sbugs)) & (df_bugs['is_resolved']==1)].id.unique()
resolution

In [45]:
bug_history[bug_history['id']==39447]

Unnamed: 0,added,id,removed,what,when,who
580419,major,39447,normal,severity,2003-06-27T23:43:47Z,chris@scmbb.ulb.ac.be
580420,jdt-ui-inbox@eclipse.org,39447,jdt-core-inbox@eclipse.org,assigned_to,2003-06-30T10:16:52Z,philippe_mulet@fr.ibm.com
580421,UI,39447,Core,component,2003-06-30T10:16:52Z,philippe_mulet@fr.ibm.com
580422,erich_gamma@ch.ibm.com,39447,jdt-ui-inbox@eclipse.org,assigned_to,2003-07-07T09:22:30Z,erich_gamma@ch.ibm.com
580423,erimurph@pfc.cfs.nrcan.gc.ca,39447,,cc,2005-12-08T18:41:20Z,erimurph@pfc.cfs.nrcan.gc.ca
580424,jdt-ui-inbox@eclipse.org,39447,erich_gamma@ch.ibm.com,assigned_to,2006-05-24T09:48:23Z,erich_gamma@ch.ibm.com
580425,3.2,39447,3.0,version,2006-05-24T09:48:23Z,erich_gamma@ch.ibm.com
580426,RESOLVED,39447,NEW,status,2006-05-24T20:43:42Z,martinae@microsoft.com
580427,LATER,39447,,resolution,2006-05-24T20:43:42Z,martinae@microsoft.com
580428,"[Junit] When run as JUnit Test, Specific Class...",39447,"When run as JUnit Test, Specific Class Name ca...",summary,2006-05-24T20:43:42Z,martinae@microsoft.com


In [92]:
multiple_resolutions_candidates = (
    bug_history[(bug_history['id'].isin(resolved_sbugs)) & 
                (bug_history['what']=='status') & 
                (bug_history['added']=='RESOLVED')]
)
multiple_resolutions_candidates = multiple_resolutions_candidates.groupby('id')['what'].count().reset_index()
multiple_resolutions_candidates = multiple_resolutions_candidates[multiple_resolutions_candidates['what']>1].id.unique().tolist()

threats = list()
for bid in multiple_resolutions_candidates:
    #to be a threat, we need to have 1 resolved, 1 version change and 1 more resolved event
    bhistory = bug_history[bug_history['id']==bid]
    to_test = pd.DataFrame()
    to_test = to_test.append(bhistory[bhistory['what']=='version'],ignore_index=True)
    to_test = to_test.append(bhistory[(bhistory['what']=='status') & (bhistory['added']=='RESOLVED')],ignore_index=True)
    to_test['when'] = pd.to_datetime(to_test['when'])
    to_test = to_test.sort_values(by='when')
    
    init_version = sort_df(bug_versions[bug_versions['id']==bid],'release')['release'].values.tolist()[0]
    
    version_change_found = False
    changed_major = False
    res1_found = False
    res2_found = False
    for index,row in to_test.iterrows():
        if row['added']=='RESOLVED' and not version_change_found:
            res1_found = True
            continue
        if row['what']=='version' and res1_found:
            new_version = str(row['added'])
            try:
                new_release = assign_to_closest_minor(new_version)
            except:
                continue
            if new_release!=init_version:
                version_change_found = True
                continue
        if res1_found and version_change_found and row['added']=='RESOLVED':
            res2_found = True
            break
    if res1_found and version_change_found and res2_found:
        threats.append(bid)

    
print('Bugs that are resolved in multiple releases (resolved, changed version and re-resolved):'+str(len(threats)))

Bugs that are resolved in multiple releases (resolved, changed version and re-resolved):98


In [93]:
if not os.path.exists(bug_versions_file):
    bug_versions.to_csv(bug_versions_file,index=False,compression='zip')

In [94]:
def get_tap(value):
    tap = tuple(int(i) for i in value.split('.'))
    return tap

def ag_function_min(values):
    values = values.unique().tolist()
    if len(values)>1:
        values = [get_tap(x) for x in values]
        values = sorted(values)
        min_val = values[0]
        return str(min_val[0])+'.'+str(min_val[1])
    else:
        return values[0]
    
def ag_function_max(values):
    values = values.unique().tolist()
    if len(values)>1:
        values = [get_tap(x) for x in values]
        values = sorted(values,reverse=True)
        max_val = values[0]
        return str(max_val[0])+'.'+str(max_val[1])
    else:
        return values[0]

min_releases = (
    bug_versions
    .groupby(by=['id'])
    .agg({'release':ag_function_min})
    .reset_index()
    .rename(index=str,columns={'release':'min_release'})
)
max_releases = (
    bug_versions
    .groupby(by=['id'])
    .agg({'release':ag_function_max})
    .reset_index()
    .rename(index=str,columns={'release':'max_release'})
)

if 'min_release' not in df_bugs.columns:
    df_bugs = pd.merge(df_bugs,min_releases,on=['id'],how='left')
    
if 'max_release' not in df_bugs.columns:
    df_bugs = pd.merge(df_bugs,max_releases,on=['id'],how='left')

In [None]:
#the bugs with not version coming from the bug history will have None values in the min and max release columns
for index,row in df_bugs[df_bugs['min_release'].isna()].iterrows():
    df_bugs.loc[index,'min_release'] = row['release']
    
for index,row in df_bugs[df_bugs['max_release'].isna()].iterrows():
    df_bugs.loc[index,'max_release'] = row['release']


In [95]:
print('Bugs in more than one release:'+str(len(df_bugs[df_bugs['min_release']!=df_bugs['max_release']])))
perc = len(df_bugs[df_bugs['min_release']!=df_bugs['max_release']])/len(df_bugs)
print('Percentage:'+str(perc))

Bugs in more than one release:3621
Percentage:0.026154790711112715


In [96]:
#create a temp dataframe to calculate this
dftemp = df_bugs
dftemp['min_release_before_version_field'] = dftemp.apply(
    lambda x :
    is_larger_release(x['release'],x['min_release']),
    axis=1
)
print('Bugs started in earlier releases (compared to the verion field value):'+str(len(dftemp[dftemp['min_release_before_version_field']==True])))

Bugs started in earlier releases (compared to the verion field value):1709


In [97]:
dftemp['min_release_after_version_field'] = dftemp.apply(
    lambda x :
    is_larger_release(x['min_release'],x['release']),
    axis=1
)
print('Bugs started in subsequent releases (compared to the verion field value):'+str(len(dftemp[dftemp['min_release_after_version_field']==True])))

Bugs started in subsequent releases (compared to the verion field value):0


In [98]:
#EXPORT FILES
df_bugs.to_csv(bugs_final_info_file,index=False,compression='zip')