In [1]:
import pandas as pd
import os
import json
import csv
import re, datetime
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, timedelta

from lifelines import KaplanMeierFitter

from global_functions import *

### Information

Input files:
data_processing/bugs_info.csv
data_processing/bug_history.csv
raw_data/all_eclipse_bugs_full.csv
raw_data/bug_history.csv


Output files:
data/bugs_info.csv
data/bugs_full.csv (full information of the final dataset)
data/bug_history.csv (injected history of the final dataset)
data/bug_history_orig.csv (original history without injection of the final dataset)
data/bugs_versions.csv (the different versions a bug was assigned to)

In [2]:
bugs_processed_info_file = 'data_processing'+os.sep+'bugs_info.csv'
bugs_processed_history_file = '.'+os.sep+'data_processing'+os.sep+'bug_history.csv'
bugs_original_history_file = '.'+os.sep+'raw_data'+os.sep+'bug_history.csv'
bugs_original_info_file = '.'+os.sep+'raw_data'+os.sep+'all_eclipse_bugs_full.csv'

bugs_final_info_file = 'data'+os.sep+'bugs_info.csv'
bugs_final_full_file = 'data'+os.sep+'bugs_full.csv'
bugs_history_final_file = 'data'+os.sep+'bug_history.csv'
bugs_history_final_file_non_injected = 'data'+os.sep+'bug_history_orig.csv'
bug_versions_file = 'data'+os.sep+'bugs_versions.csv'

In [3]:
relase_creation_ts_all = get_release_dates()
relase_creation_ts_all

{'3.0': '2004-06-25 00:00:00',
 '3.1': '2005-06-28 00:00:00',
 '3.2': '2006-06-29 00:00:00',
 '3.3': '2007-06-28 00:00:00',
 '3.4': '2008-06-25 00:00:00',
 '3.5': '2009-06-24 00:00:00',
 '3.6': '2010-06-23 00:00:00',
 '3.7': '2011-06-22 00:00:00',
 '3.8': '2012-06-27 00:00:00',
 '4.2': '2012-06-27 00:00:00',
 '4.3': '2013-06-26 20:00:00',
 '4.4': '2014-06-25 12:15:00',
 '4.5': '2015-06-24 20:00:00',
 '4.6': '2016-06-22 11:00:00',
 '4.7': '2017-06-28 09:50:00',
 '4.8': '2018-06-27 00:00:00',
 '4.9': '2018-09-19 00:00:00',
 '4.10': '2018-12-19 00:00:00'}

In [4]:
#get data in the preprocessing phase
df_bugs = pd.read_csv(bugs_processed_info_file,index_col=False,
                     dtype={'release':str,'version':str})
df_bugs = trans_to_datetime(df_bugs,['creation_time',
                             'first_resolved_date','last_resolved_date',
                             'first_fixed_date','last_fixed_date',
                             'first_assignment_date','last_assignment_date'])
df_bugs.head()

Unnamed: 0,id,release,Product,creation_time,version,resolution,is_assigned,is_resolved,is_fixed,first_assignment_date,last_assignment_date,first_resolved_date,last_resolved_date,first_fixed_date,last_fixed_date
0,475361,4.5,Platform,2015-08-19 10:50:25,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03
1,475365,4.6,Platform,2015-08-19 11:34:37,4.6,FIXED,0,1,1,NaT,NaT,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30
2,475370,4.5,Platform,2015-08-19 12:09:06,4.5,DUPLICATE,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT
3,475379,4.5,Platform,2015-08-19 13:39:12,4.5,,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT
4,475407,4.5,Platform,2015-08-19 17:06:10,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25


In [5]:
df_bugs['release'] = df_bugs.apply(lambda x: assign_to_closest_minor(str(x['version'])),axis=1)

In [6]:
df_bugs.version.unique()

array(['4.5', '4.6', '4.4.2', '4.4.1', '4.5.1', '3.7.2', '3.8.2', '4.4',
       '3.7.1', '4.3', '4.2', '4.2.1', '3.6.2', '4.3.2', '3.6', '4.8',
       '3.1', '4.2.2', '4.7', '3.8', '3.7', '4.3.1', '4.7.1', '4.10',
       '3.4.1', '3.4', '3.5', '3.3.2', '4.0', '3.3', '3.2.2', '3.2',
       '3.3.1', '3.0.2', '3.4.2', '3.2.1', '3.5.1', '3.1.2', '3.1.1',
       '3.0', '3.0.1', '4.1', '3.5.2', '3.6.1', '4.7.1a', '4.5.2', '4.9',
       '3.8.1', '4.7.2', '4.7.3', '3.8.0 Juno', '4.5.0 Mars',
       '3.10.0 Luna', '4.6.0 Neon', '3.10.1 Luna', '4.7.0 Oxygen',
       '3.9.0 Kepler', '3.8.2 Juno', '4.8.0 Photon', '3.8.1 Juno'],
      dtype=object)

In [7]:
df_bugs.release.unique()

array(['4.5', '4.6', '4.4', '3.7', '4.2', '4.3', '3.6', '4.8', '3.1',
       '4.7', '4.10', '3.4', '3.5', '3.3', '4.0', '3.2', '3.0', '4.1',
       '4.9', '3.10', '3.9'], dtype=object)

In [8]:
bef = len(df_bugs.id.unique())
print('Dataset size before:'+str(bef))
print('Removed releases because they are out of scope:'+str(df_bugs[~df_bugs['release'].isin(relase_creation_ts_all)].release.unique().tolist()))
df_bugs = df_bugs[df_bugs['release'].isin(relase_creation_ts_all)]
after = len(df_bugs.id.unique())
print('Removed bugs because they were not in the target releases:'+str(bef-after))
print('Dataset size:'+str(len(df_bugs.id.unique())))

Dataset size before:141010
Removed releases because they are out of scope:['4.0', '4.1', '3.10', '3.9']
Removed bugs because they were not in the target releases:2565
Dataset size:138445


In [9]:
#get the difference between the first and last date of assignment
df_bugs['time_assign_diff']=df_bugs.apply(
    lambda x: (
        x['last_assignment_date']-x['first_assignment_date']).days 
    if x['last_assignment_date'] is not None and x['first_assignment_date'] is not None 
    else None 
    ,axis=1
)

#get the difference between the first and last date of resolution
df_bugs['time_resolve_diff']=df_bugs.apply(
    lambda x: (
        x['last_resolved_date']-x['first_resolved_date']).days 
    if x['last_resolved_date'] is not None and x['first_resolved_date'] is not None 
    else None 
    ,axis=1
)

#get the difference between the first and last date of fixes
df_bugs['time_fix_diff']=df_bugs.apply(
    lambda x: (
        x['last_fixed_date']-x['first_fixed_date']).days 
    if x['last_fixed_date'] is not None and x['first_fixed_date'] is not None 
    else None 
    ,axis=1
)

df_bugs.head()

Unnamed: 0,id,release,Product,creation_time,version,resolution,is_assigned,is_resolved,is_fixed,first_assignment_date,last_assignment_date,first_resolved_date,last_resolved_date,first_fixed_date,last_fixed_date,time_assign_diff,time_resolve_diff,time_fix_diff
0,475361,4.5,Platform,2015-08-19 10:50:25,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03,,0.0,0.0
1,475365,4.6,Platform,2015-08-19 11:34:37,4.6,FIXED,0,1,1,NaT,NaT,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30,,0.0,0.0
2,475370,4.5,Platform,2015-08-19 12:09:06,4.5,DUPLICATE,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT,,,
3,475379,4.5,Platform,2015-08-19 13:39:12,4.5,,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT,,,
4,475407,4.5,Platform,2015-08-19 17:06:10,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25,,0.0,0.0


In [10]:
assigned_df = df_bugs[~df_bugs['time_assign_diff'].isna()]
print('Bugs assigned only once:',len(assigned_df[assigned_df['first_assignment_date']==assigned_df['last_assignment_date']]))
print('Bugs assigned only once or assigned more than once but on the same day:',len(assigned_df[assigned_df['time_assign_diff']==0]))
print('Bugs assigned more than one time in more than one days:',len(assigned_df[assigned_df['time_assign_diff']>0]))

Bugs assigned only once: 47999
Bugs assigned only once or assigned more than once but on the same day: 48604
Bugs assigned more than one time in more than one days: 3776


In [11]:
resolved_df = df_bugs[~df_bugs['time_resolve_diff'].isna()]
print('Bugs resolved only once:',len(resolved_df[resolved_df['first_assignment_date']==resolved_df['last_assignment_date']]))
print('Bugs resolved only once or resolved more than once but on the same day:',len(resolved_df[resolved_df['time_assign_diff']==0]))
print('Bugs resolved more than one time in more than one days:',len(resolved_df[resolved_df['time_assign_diff']>0]))

Bugs resolved only once: 41466
Bugs resolved only once or resolved more than once but on the same day: 42045
Bugs resolved more than one time in more than one days: 3106


In [12]:
fixed_df = df_bugs[~df_bugs['time_fix_diff'].isna()]
print('Bugs fixed only once:',len(fixed_df[fixed_df['first_assignment_date']==fixed_df['last_assignment_date']]))
print('Bugs fixed only once or fixed more than once but on the same day:',len(fixed_df[fixed_df['time_assign_diff']==0]))
print('Bugs fixed more than one time in more than one days:',len(fixed_df[fixed_df['time_assign_diff']>0]))

Bugs fixed only once: 31170
Bugs fixed only once or fixed more than once but on the same day: 31678
Bugs fixed more than one time in more than one days: 2443


In [13]:
bug_history = pd.read_csv(bugs_processed_history_file,index_col=False)
bug_history = bug_history[bug_history['id'].isin(df_bugs.id.unique().tolist())]
bug_history.to_csv(bugs_history_final_file,index=False)

In [14]:
#Transfer the original files to the data folder with only the final dataset bugs
df_bugs_orig = pd.read_csv(bugs_original_info_file,index_col=False)
df_bugs_orig = df_bugs_orig[df_bugs_orig['id'].isin(df_bugs.id.unique().tolist())]
df_bugs_orig.to_csv(bugs_final_full_file,index=False)

bugs_history_orig  = pd.read_csv(bugs_original_history_file,index_col=False)
bugs_history_orig = bugs_history_orig[bugs_history_orig['id'].isin(df_bugs.id.unique().tolist())]
bugs_history_orig.to_csv(bugs_history_final_file_non_injected,index=False)

  interactivity=interactivity, compiler=compiler, result=result)


# "version_origin" to store the first release that the bug appeared

In [15]:
#create a new dataframe about a but present in different releases
bug_versions = pd.DataFrame()

In [17]:
bug_versions = df_bugs[['id','version']]
bug_versions['source'] = 'version field'
bug_versions.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,id,version,source
0,475361,4.5,version field
1,475365,4.6,version field
2,475370,4.5,version field
3,475379,4.5,version field
4,475407,4.5,version field


In [18]:
df_all = pd.read_csv(bugs_processed_history_file,index_col=False)
df_all[df_all['id'].isin(df_bugs.id.unique())]
df_all=trans_to_datetime(df_all,['when'])
df_all

Unnamed: 0,added,id,removed,what,when,who
0,platform-help-inbox@eclipse.org,100001,pde-ui-inbox@eclipse.org,assigned_to,2005-06-14 16:16:00,wassim.melhem@gmail.com
1,Help,100001,UI,component,2005-06-14 16:16:00,wassim.melhem@gmail.com
2,Platform,100001,PDE,product,2005-06-14 16:16:00,wassim.melhem@gmail.com
3,konradk@ca.ibm.com,100001,,cc,2005-06-14 17:17:31,konradk@ca.ibm.com
4,dejan@ca.ibm.com,100001,platform-help-inbox@eclipse.org,assigned_to,2005-06-14 17:17:31,konradk@ca.ibm.com
5,3.1 RC3,100001,---,target_milestone,2005-06-14 20:26:08,konradk@ca.ibm.com
6,P2,100001,P3,priority,2005-06-14 20:44:53,dejan@ca.ibm.com
7,RESOLVED,100001,NEW,status,2005-06-14 21:26:05,dejan@ca.ibm.com
8,FIXED,100001,,resolution,2005-06-14 21:26:05,dejan@ca.ibm.com
9,david_audel@fr.ibm.com,100002,jdt-core-inbox@eclipse.org,assigned_to,2006-10-06 19:42:16,Olivier_Thomann@ca.ibm.com


In [20]:
version_buglist=df_all[df_all['what']=='version'].id.unique()
version_bugs=df_all[df_all['id'].isin(version_buglist)]
only_version_bugs=version_bugs[version_bugs['what']=='version']

version_bugs_grouped=only_version_bugs[['id','removed']].rename(index=str,columns={'removed':'version'})

version_bugs_grouped['source'] = 'bug history'

bug_versions = bug_versions.append(version_bugs_grouped,ignore_index=True)

version_bugs_grouped=only_version_bugs[['id','added']].rename(index=str,columns={'added':'version'})
version_bugs_grouped['source'] = 'bug history'

bug_versions = bug_versions.append(version_bugs_grouped,ignore_index=True)

bug_versions = bug_versions.drop_duplicates()
bug_versions.head()


Unnamed: 0,id,version,source
0,475361,4.5,version field
1,475365,4.6,version field
2,475370,4.5,version field
3,475379,4.5,version field
4,475407,4.5,version field


In [21]:
#if a version exists both because of the version field and the bug history, only keep the version field entry
bug_versions = bug_versions.sort_values(by=['id','version','source'])
bug_versions = bug_versions.drop_duplicates(['id','version'],keep='last')
bug_versions

Unnamed: 0,id,version,source
139852,2382,2.0,bug history
132067,2382,3.6,version field
140899,2802,2.0,bug history
18483,2802,3.3,version field
142200,3091,2.0,bug history
132068,3091,3.0,version field
142287,3109,2.0,bug history
132069,3109,3.8.0 Juno,version field
147230,4745,2.0,bug history
33037,4745,4.8,version field


In [22]:
bug_versions.version.unique()

array(['2.0', '3.6', '3.3', '3.0', '3.8.0 Juno', '4.8', '3.2', '3.1',
       'unspecified', '2.1.2', '3.5', '2.1', '2.0.2', '1.0', '1.1',
       '2.1.1', '1.2', '3.1.2', '3.8', '3.0.1', '3.4', '2.1.3', '1.1.3',
       '2.0.1', '3.7', '1.0.0', '3.0.2', '4.0', '3.1.1', '4.7', '0.7',
       '3.2.1', '2.2', '4.4', '4.5.1', '3.2.2', '4.1', '4.4.2', '1.3.0',
       '1.5', '1.0.1', '3.4.1', '1.0.2', '4.2', '2.0.0', '3.3.1', '0.7.1',
       '0.9', '4.2.2', '2.1.0', '3.6.1', '4.3', '0.2', '0.6', '0.8',
       '4.5', '1.2.1', '1.5.2', '3.3.2', '1.5.3', '0.1.3', '1.5.4',
       '4.0.1', '4.6', '3.4.2', '4.0.2', '2.4.0', '1.5.5', 'dev', '2.3.0',
       '2.3', '5.0', '4.0.3', '3.5.1', '1.3.1', '3.0.3', '2.5.0', '3.6.2',
       '2.3.1', '1.0.3', '3.5.2', '5.0.1', '2008-Ganymede', '0.1', '6.0',
       'DD 1.1', '5.0.2', '3.0.0', '1.6.4', '0.9.2', '1.7', '1.1.0',
       '3.7.2', '1.2.0', 'Europa', 'Galileo', '1.7.1', '1.8', '4.2.1',
       '3.8.1', '0.8.0', '6.0.2', '2.6.0', '0 DD 1.1', '3.8.2', '7.0'

In [23]:
#filter 1: fix names that we know the number for, e.g., Galileo
bug_versions.loc[bug_versions['version']=='Europa','version'] = '3.3'
bug_versions.loc[bug_versions['version']=='2008-Ganymede','version'] = '3.4'
bug_versions.loc[bug_versions['version']=='Galileo','version'] = '3.5'


In [24]:
#filter 2: 
bug_versions = bug_versions[bug_versions['version'].str.find('.')>=0]
bug_versions.head()


Unnamed: 0,id,version,source
139852,2382,2.0,bug history
132067,2382,3.6,version field
140899,2802,2.0,bug history
18483,2802,3.3,version field
142200,3091,2.0,bug history


In [25]:
bug_versions.version.unique()

array(['2.0', '3.6', '3.3', '3.0', '3.8.0 Juno', '4.8', '3.2', '3.1',
       '2.1.2', '3.5', '2.1', '2.0.2', '1.0', '1.1', '2.1.1', '1.2',
       '3.1.2', '3.8', '3.0.1', '3.4', '2.1.3', '1.1.3', '2.0.1', '3.7',
       '1.0.0', '3.0.2', '4.0', '3.1.1', '4.7', '0.7', '3.2.1', '2.2',
       '4.4', '4.5.1', '3.2.2', '4.1', '4.4.2', '1.3.0', '1.5', '1.0.1',
       '3.4.1', '1.0.2', '4.2', '2.0.0', '3.3.1', '0.7.1', '0.9', '4.2.2',
       '2.1.0', '3.6.1', '4.3', '0.2', '0.6', '0.8', '4.5', '1.2.1',
       '1.5.2', '3.3.2', '1.5.3', '0.1.3', '1.5.4', '4.0.1', '4.6',
       '3.4.2', '4.0.2', '2.4.0', '1.5.5', '2.3.0', '2.3', '5.0', '4.0.3',
       '3.5.1', '1.3.1', '3.0.3', '2.5.0', '3.6.2', '2.3.1', '1.0.3',
       '3.5.2', '5.0.1', '0.1', '6.0', 'DD 1.1', '5.0.2', '3.0.0',
       '1.6.4', '0.9.2', '1.7', '1.1.0', '3.7.2', '1.2.0', '1.7.1', '1.8',
       '4.2.1', '3.8.1', '0.8.0', '6.0.2', '2.6.0', '0 DD 1.1', '3.8.2',
       '7.0', '3.7.1', '4.3.1', '4.3.2', '3.0.5', '2.2.1', '8.0', '0.5',

In [26]:
#fix abnormal values because we might miss closest minor releases
for versioninit in bug_versions.version.unique().tolist():
    version = versioninit
    if ' ' in version:
        if '.' not in version[:version.find(' ')]:
            continue
        else:
            version = version[:version.find(' ')]
            bug_versions.loc[bug_versions['version']==versioninit,'version'] = version

bug_versions.version.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


array(['2.0', '3.6', '3.3', '3.0', '3.8.0', '4.8', '3.2', '3.1', '2.1.2',
       '3.5', '2.1', '2.0.2', '1.0', '1.1', '2.1.1', '1.2', '3.1.2',
       '3.8', '3.0.1', '3.4', '2.1.3', '1.1.3', '2.0.1', '3.7', '1.0.0',
       '3.0.2', '4.0', '3.1.1', '4.7', '0.7', '3.2.1', '2.2', '4.4',
       '4.5.1', '3.2.2', '4.1', '4.4.2', '1.3.0', '1.5', '1.0.1', '3.4.1',
       '1.0.2', '4.2', '2.0.0', '3.3.1', '0.7.1', '0.9', '4.2.2', '2.1.0',
       '3.6.1', '4.3', '0.2', '0.6', '0.8', '4.5', '1.2.1', '1.5.2',
       '3.3.2', '1.5.3', '0.1.3', '1.5.4', '4.0.1', '4.6', '3.4.2',
       '4.0.2', '2.4.0', '1.5.5', '2.3.0', '2.3', '5.0', '4.0.3', '3.5.1',
       '1.3.1', '3.0.3', '2.5.0', '3.6.2', '2.3.1', '1.0.3', '3.5.2',
       '5.0.1', '0.1', '6.0', 'DD 1.1', '5.0.2', '3.0.0', '1.6.4',
       '0.9.2', '1.7', '1.1.0', '3.7.2', '1.2.0', '1.7.1', '1.8', '4.2.1',
       '3.8.1', '0.8.0', '6.0.2', '2.6.0', '0 DD 1.1', '3.8.2', '7.0',
       '3.7.1', '4.3.1', '4.3.2', '3.0.5', '2.2.1', '8.0', '0.5', '7.0

In [27]:
bug_versions['release'] = bug_versions.apply(lambda x: assign_to_closest_minor(str(x['version'])),axis=1)
bug_versions['release'] = bug_versions['release'].astype(str)
bug_versions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,id,version,source,release
139852,2382,2.0,bug history,2.0
132067,2382,3.6,version field,3.6
140899,2802,2.0,bug history,2.0
18483,2802,3.3,version field,3.3
142200,3091,2.0,bug history,2.0
132068,3091,3.0,version field,3.0
142287,3109,2.0,bug history,2.0
132069,3109,3.8.0,version field,4.2
147230,4745,2.0,bug history,2.0
33037,4745,4.8,version field,4.8


In [28]:
bug_versions.head()

Unnamed: 0,id,version,source,release
139852,2382,2.0,bug history,2.0
132067,2382,3.6,version field,3.6
140899,2802,2.0,bug history,2.0
18483,2802,3.3,version field,3.3
142200,3091,2.0,bug history,2.0


In [29]:
print('Current releases:'+str(sorted(bug_versions.release.unique().tolist())))

Current releases:['0 DD 0.9', '0 DD 1.1', '0.1', '0.11', '0.12', '0.13', '0.14', '0.15', '0.17', '0.2', '0.21', '0.22', '0.24', '0.3', '0.5', '0.6', '0.7', '0.8', '0.9', '1.0', '1.1', '1.10', '1.2', '1.3', '1.4', '1.5', '1.6', '1.7', '1.8', '1.9', '2.0', '2.1', '2.11', '2.12', '2.2', '2.3', '2.4', '2.5', '2.6', '2.7', '2.8', '2.9', '3.0', '3.1', '3.10', '3.11', '3.13', '3.14', '3.16', '3.18', '3.19', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.9', '4.0', '4.1', '4.10', '4.11', '4.2', '4.3', '4.4', '4.5', '4.6', '4.7', '4.8', '4.9', '5.0', '5.1', '5.2', '5.9', '6.0', '7.0', '8.0', '8.1', '8.2', '8.3', '8.4', '8.5', '8.6', '8.7', '8.8', '9.0', '9.2', '9.3', '9.4', '9.5', 'DD 1.1']


In [30]:
#filter 3: exclude the releases not in our study
bug_versions = bug_versions[bug_versions['release'].isin(relase_creation_ts_all)]
print('After filtering releases:'+str(sorted(bug_versions.release.unique().tolist())))

After filtering releases:['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '4.10', '4.2', '4.3', '4.4', '4.5', '4.6', '4.7', '4.8', '4.9']


In [32]:
def get_tap(value):
    tap = tuple(int(i) for i in value.split('.'))
    return tap

def ag_function_min(values):
    values = values.unique().tolist()
    if len(values)>1:
        values = [get_tap(x) for x in values]
        values = sorted(values)
        min_val = values[0]
        return str(min_val[0])+'.'+str(min_val[1])
    else:
        return values[0]
    
def ag_function_max(values):
    values = values.unique().tolist()
    if len(values)>1:
        values = [get_tap(x) for x in values]
        values = sorted(values,reverse=True)
        max_val = values[0]
        return str(max_val[0])+'.'+str(max_val[1])
    else:
        return values[0]

min_releases = (
    bug_versions
    .groupby(by=['id'])
    .agg({'release':ag_function_min})
    .reset_index()
    .rename(index=str,columns={'release':'min_release'})
)
max_releases = (
    bug_versions
    .groupby(by=['id'])
    .agg({'release':ag_function_max})
    .reset_index()
    .rename(index=str,columns={'release':'max_release'})
)

if 'min_release' not in df_bugs.columns:
    df_bugs = pd.merge(df_bugs,min_releases,on=['id'],how='left')
    
if 'max_release' not in df_bugs.columns:
    df_bugs = pd.merge(df_bugs,max_releases,on=['id'],how='left')

In [33]:
df_bugs.head()

Unnamed: 0,id,release,Product,creation_time,version,resolution,is_assigned,is_resolved,is_fixed,first_assignment_date,last_assignment_date,first_resolved_date,last_resolved_date,first_fixed_date,last_fixed_date,time_assign_diff,time_resolve_diff,time_fix_diff,min_release,max_release
0,475361,4.5,Platform,2015-08-19 10:50:25,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03,2015-08-19 11:53:03,,0.0,0.0,4.5,4.5
1,475365,4.6,Platform,2015-08-19 11:34:37,4.6,FIXED,0,1,1,NaT,NaT,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30,2015-08-26 14:29:30,,0.0,0.0,4.6,4.6
2,475370,4.5,Platform,2015-08-19 12:09:06,4.5,DUPLICATE,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT,,,,4.5,4.5
3,475379,4.5,Platform,2015-08-19 13:39:12,4.5,,0,0,0,NaT,NaT,NaT,NaT,NaT,NaT,,,,4.5,4.5
4,475407,4.5,Platform,2015-08-19 17:06:10,4.5,FIXED,0,1,1,NaT,NaT,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25,2015-08-19 17:15:25,,0.0,0.0,4.5,4.5


In [34]:
#the bugs with not version coming from the bug history will have None values in the min and max release columns
for index,row in df_bugs[df_bugs['min_release'].isna()].iterrows():
    df_bugs.loc[index,'min_release'] = row['release']
    
for index,row in df_bugs[df_bugs['max_release'].isna()].iterrows():
    df_bugs.loc[index,'max_release'] = row['release']


In [35]:
print('Bugs in more than one release:'+str(len(df_bugs[df_bugs['min_release']!=df_bugs['max_release']])))
perc = len(df_bugs[df_bugs['min_release']!=df_bugs['max_release']])/len(df_bugs)
print('Percentage:'+str(perc))

Bugs in more than one release:3604
Percentage:0.026031998266459605


In [36]:
#create a temp dataframe to calculate this
dftemp = df_bugs
dftemp['min_release_before_version_field'] = dftemp.apply(
    lambda x :
    is_larger_release(x['release'],x['min_release']),
    axis=1
)
print('Bugs started in earlier releases (compared to the verion field value):'+str(len(dftemp[dftemp['min_release_before_version_field']==True])))

Bugs started in earlier releases (compared to the verion field value):1709


In [37]:
dftemp['min_release_after_version_field'] = dftemp.apply(
    lambda x :
    is_larger_release(x['min_release'],x['release']),
    axis=1
)
print('Bugs started in subsequent releases (compared to the verion field value):'+str(len(dftemp[dftemp['min_release_after_version_field']==True])))

Bugs started in subsequent releases (compared to the verion field value):0


In [38]:
#EXPORT FILES
df_bugs.to_csv(bugs_final_info_file,index=False)
bug_versions.to_csv(bug_versions_file,index=False)

# Take into account if the release is changed when calculating the difference between the first and last assingment date

### WILL WE DO THIS???? IF NOT, REMOVE THE SUBSEQUENT CELLS

In [None]:
@ELENI: CONTINUE FROM HERE

In [None]:
df_all = pd.read_csv('.'+os.sep+'data'+os.sep+'bugs_info_initial_versions.csv',index_col=False)
df_all = df_bugs.fillna('')
df_all.head()

In [None]:
#@Zeinab: what does this count?
assigned_in_zero=assigned_df[assigned_df['f_l_assign_diff']!=pd.Timedelta(0)]

assigned_in_zero=assigned_in_zero[['bug_id']]
assigned_in_zero.columns

vers_bugs=pd.merge(assigned_in_zero, df_all, on=['bug_id'], how='inner')

len(vers_bugs.bug_id.unique())


vers=vers_bugs[vers_bugs['version']!=vers_bugs['intial_version']]


version_change=pd.merge(vers, df, on=['bug_id', 'release', 'Product', 'creation_ts', 'version'], how='inner')


len(version_change.bug_id.unique())

In [None]:
version_change=version_change[version_change['What']=='version']
version_change['last_assignment_date'] = pd.to_datetime(version_change['last_assignment_date'])
version_change['resolved_date'] = pd.to_datetime(version_change['resolved_date'])


In [None]:
version_btw_l_f=version_change[version_change['When']<version_change['last_assignment_date']]
len(version_btw_l_f.bug_id.unique())

In [None]:
version_btw_l_f=version_btw_l_f[version_btw_l_f['When']>version_btw_l_f['resolved_date']]
len(version_btw_l_f.bug_id.unique())

In [None]:
version_btw_l_f['added_minor']=version_btw_l_f.apply(lambda x: str(closest_minor(x['Added'])),axis=1)
version_btw_l_f['removed_minor']=version_btw_l_f.apply(lambda x: str(closest_minor(x['removed'])),axis=1)

In [None]:
version_btw_l_f[version_btw_l_f['added_minor']!=version_btw_l_f['removed_minor']]

In [None]:

version_btw_l_f[version_btw_l_f['bug_id']==246547]