In [1]:
import pandas as pd
import os
import json
import csv
import re, datetime
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, timedelta

from global_functions import *

### Information

Input files:
data_processing/bugs_info.zip
raw_data/bug_history.zip

Output files:
data_processing/bugs_info.zip
data_processing/bug_history.zip
data/assigned_to_ids.zip

In [2]:
# Bug table that this notebook both reads (as df_info) and rewrites at the end.
# Joined with os.sep so the separator matches the host operating system.
bugs_processing_file = os.sep.join(['.', 'data_processing', 'bugs_info.zip'])

# Raw bug history (input) and the filtered/augmented history (output).
bug_history_orig = './raw_data/bug_history.zip'
processed_history_file = './data_processing/bug_history.zip'

# Output holding only the ASSIGNED rows injected by this notebook.
injected_assignments_file = './data/assigned_to_ids.zip'

In [3]:
# Load the bug table from the zipped CSV. 'release' and 'version' are read as
# text (dtype=str) and 'creation_time' is converted to a pandas datetime.
df_info = (
    pd.read_csv(
        bugs_processing_file,
        compression='zip',
        dtype={'release': str, 'version': str},
        index_col=False,
    )
    .assign(creation_time=lambda frame: pd.to_datetime(frame['creation_time']))
)
df_info.head(2)

Unnamed: 0,id,Product,version,resolution,status,severity,creation_time,priority,release
0,475361,Platform,4.5,FIXED,RESOLVED,normal,2015-08-19 10:50:25,P3,4.5
1,475365,Platform,4.6,FIXED,RESOLVED,minor,2015-08-19 11:34:37,P3,4.6


In [4]:
# Number of rows (bugs) loaded.
df_info.shape[0]

138445

In [5]:
# Keep only the history rows that belong to bugs still present in df_info.
kept_bug_ids = df_info['id'].unique()
bug_history = pd.read_csv(bug_history_orig, index_col=False)
bug_history = bug_history.loc[bug_history['id'].isin(kept_bug_ids)]

# Write the filtered history only when no processed file exists yet;
# an existing file is left untouched by this cell.
history_already_saved = os.path.exists(processed_history_file)
if not history_already_saved:
    bug_history.to_csv(processed_history_file, index=False, compression='zip')

In [6]:
# Build the per-bug summary table: the identifying columns from df_info, a 0/1
# flag per lifecycle event, and the first/last time each event was recorded.
df_bugs = df_info[['id','release','Product','creation_time','version','resolution']].drop_duplicates()

df = bug_history.copy()

# An event is a history row in which field `what` received the value `added`.
# Tuple layout: (what, added, flag column, stem used in the date column names).
lifecycle_events = [
    ('status', 'ASSIGNED', 'is_assigned', 'assignment'),
    ('status', 'RESOLVED', 'is_resolved', 'resolved'),
    ('resolution', 'FIXED', 'is_fixed', 'fixed'),
]

# Select the matching history rows once per event and reuse them below.
event_rows = {
    flag: df[(df['what'] == what) & (df['added'] == added)]
    for what, added, flag, stem in lifecycle_events
}

#Bugs that were assigned
assigned_bugs = event_rows['is_assigned'].id.unique().tolist()
#Bugs that were resolved
resolved_bugs = event_rows['is_resolved'].id.unique().tolist()
#Bugs that were fixed
fixed_bugs = event_rows['is_fixed'].id.unique().tolist()

# Add all flag columns before any date column so the column order is
# is_assigned, is_resolved, is_fixed, then the first_/last_ date pairs.
for what, added, flag, stem in lifecycle_events:
    df_bugs[flag] = df_bugs['id'].isin(event_rows[flag]['id']).astype('int64')

# 'when' is not parsed to datetime in this cell, so min/max compare the raw
# values; the ISO-8601 strings seen in the output sort chronologically.
for what, added, flag, stem in lifecycle_events:
    first_last = (
        event_rows[flag]
        .groupby('id')['when']
        .agg(['min', 'max'])
        .rename(columns={'min': 'first_' + stem + '_date',
                         'max': 'last_' + stem + '_date'})
        .reset_index()
    )
    # first_last has one row per id, so a left merge keeps exactly the rows
    # of df_bugs; bugs without the event get NaN in both date columns.
    df_bugs = pd.merge(df_bugs, first_last, on=['id'], how='left')

# Missing values (e.g. dates of events that never happened) become ''.
df_bugs = df_bugs.fillna('')
df_bugs.head()

Unnamed: 0,id,release,Product,creation_time,version,resolution,is_assigned,is_resolved,is_fixed,first_assignment_date,last_assignment_date,first_resolved_date,last_resolved_date,first_fixed_date,last_fixed_date
0,475361,4.5,Platform,2015-08-19 10:50:25,4.5,FIXED,0,1,1,,,2015-08-19T11:53:03Z,2015-08-19T11:53:03Z,2015-08-19T11:53:03Z,2015-08-19T11:53:03Z
1,475365,4.6,Platform,2015-08-19 11:34:37,4.6,FIXED,0,1,1,,,2015-08-26T14:29:30Z,2015-08-26T14:29:30Z,2015-08-26T14:29:30Z,2015-08-26T14:29:30Z
2,475370,4.5,Platform,2015-08-19 12:09:06,4.5,DUPLICATE,0,0,0,,,,,,
3,475379,4.5,Platform,2015-08-19 13:39:12,4.5,,0,0,0,,,,,,
4,475407,4.5,Platform,2015-08-19 17:06:10,4.5,FIXED,0,1,1,,,2015-08-19T17:15:25Z,2015-08-19T17:15:25Z,2015-08-19T17:15:25Z,2015-08-19T17:15:25Z


In [7]:
# Row count of the summary table (compare with the row count of df_info).
df_bugs.shape[0]

138445

### Assignment fixes

In Eclipse bug handling, a bug can be effectively assigned without ever receiving the ASSIGNED status: this happens when the bug is assigned to an email address of the form component_name-triaged@eclipse.org.

We inject ASSIGNED status entries into the history of such bugs in order to minimize this threat to validity.

#### CASE 1: -triaged@eclipse.org

In [8]:
# Empty accumulator for the ASSIGNED status rows injected into the history.
injected_assignments = pd.DataFrame(data=None)

In [9]:
# Bugs already flagged as assigned through an explicit ASSIGNED status.
bugs_assigned = df_bugs[df_bugs['is_assigned']==1].id.unique().tolist()
# Set copy for O(1) membership tests; `x not in <list>` inside the
# comprehension below would rescan the whole list for every candidate id.
already_assigned = set(bugs_assigned)

# History rows whose 'added' value contains '-triaged@eclipse.org'.
# regex=False matches the text literally; na=False treats missing values as
# non-matching, which is what the previous `str.find(...) >= 0` test did.
triaged_with_email = bug_history[
    bug_history['added'].str.contains('-triaged@eclipse.org', regex=False, na=False)
]

# Of those, keep the bugs that never received an explicit ASSIGNED status.
triaged_with_email_bugs = [
    bug_id
    for bug_id in triaged_with_email['id'].unique().tolist()
    if bug_id not in already_assigned
]
len(triaged_with_email_bugs)

4811

In [10]:
# Turn every '-triaged@eclipse.org' history row of a not-yet-assigned bug into
# a synthetic status change to ASSIGNED, keeping the original id and timestamp.
to_inject = triaged_with_email[triaged_with_email['id'].isin(triaged_with_email_bugs)].copy()
to_inject['removed'] = ''
to_inject['who'] = to_inject['added']   # the triage address is stored as the actor
to_inject['added'] = 'ASSIGNED'
to_inject['what'] = 'status'

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement and
# produces the same result on older versions.
injected_assignments = pd.concat([injected_assignments, to_inject], ignore_index=True)

bug_history = pd.concat([bug_history, to_inject], ignore_index=True)

# Reflect the injection in the summary table right away.
df_bugs.loc[df_bugs['id'].isin(to_inject.id.unique().tolist()),'is_assigned'] = 1

print(len(injected_assignments.id.unique()))

4811


In [14]:
# Spot check: show the injected rows of a single bug (id 322487).
injected_assignments.loc[injected_assignments['id'].eq(322487)]

Unnamed: 0,added,id,removed,what,when,who
2413,ASSIGNED,322487,,status,2010-08-12T16:52:55Z,platform-ui-triaged@eclipse.org
2414,ASSIGNED,322487,,status,2010-08-16T13:16:34Z,platform-ui-triaged@eclipse.org
2415,ASSIGNED,322487,,status,2010-08-17T11:38:01Z,platform-ui-triaged@eclipse.org


In [17]:
# Persist the injected ASSIGNED rows as a zipped CSV without the index column.
injected_assignments.to_csv(
    path_or_buf=injected_assignments_file,
    compression='zip',
    index=False,
)

In [16]:
#Re-run the info analysis after the injection
# Rebuilds df_bugs from df_info and the (now augmented) bug_history: a 0/1 flag
# per lifecycle event plus the first/last time each event was recorded.
df_bugs = df_info[['id','release','Product','creation_time','version','resolution']].drop_duplicates()

df = bug_history.copy()

# An event is a history row in which field `what` received the value `added`.
# Tuple layout: (what, added, flag column, stem used in the date column names).
lifecycle_events = [
    ('status', 'ASSIGNED', 'is_assigned', 'assignment'),
    ('status', 'RESOLVED', 'is_resolved', 'resolved'),
    ('resolution', 'FIXED', 'is_fixed', 'fixed'),
]

# Select the matching history rows once per event and reuse them below.
event_rows = {
    flag: df[(df['what'] == what) & (df['added'] == added)]
    for what, added, flag, stem in lifecycle_events
}

#Bugs that were assigned
assigned_bugs = event_rows['is_assigned'].id.unique().tolist()
#Bugs that were resolved
resolved_bugs = event_rows['is_resolved'].id.unique().tolist()
#Bugs that were fixed
fixed_bugs = event_rows['is_fixed'].id.unique().tolist()

# Add all flag columns before any date column so the column order is
# is_assigned, is_resolved, is_fixed, then the first_/last_ date pairs.
for what, added, flag, stem in lifecycle_events:
    df_bugs[flag] = df_bugs['id'].isin(event_rows[flag]['id']).astype('int64')

# 'when' is not parsed to datetime in this cell, so min/max compare the raw
# values; the ISO-8601 strings seen in the output sort chronologically.
for what, added, flag, stem in lifecycle_events:
    first_last = (
        event_rows[flag]
        .groupby('id')['when']
        .agg(['min', 'max'])
        .rename(columns={'min': 'first_' + stem + '_date',
                         'max': 'last_' + stem + '_date'})
        .reset_index()
    )
    # how='left' (not 'outer'): the result must contain exactly the rows of
    # df_bugs, in their existing order. An outer merge could add rows for ids
    # absent from df_bugs and, in recent pandas versions, sorts by key.
    df_bugs = pd.merge(df_bugs, first_last, on=['id'], how='left')

# Missing values (e.g. dates of events that never happened) become ''.
df_bugs = df_bugs.fillna('')
df_bugs.head()

Unnamed: 0,id,release,Product,creation_time,version,resolution,is_assigned,is_resolved,is_fixed,first_assignment_date,last_assignment_date,first_resolved_date,last_resolved_date,first_fixed_date,last_fixed_date
0,475361,4.5,Platform,2015-08-19 10:50:25,4.5,FIXED,0,1,1,,,2015-08-19T11:53:03Z,2015-08-19T11:53:03Z,2015-08-19T11:53:03Z,2015-08-19T11:53:03Z
1,475365,4.6,Platform,2015-08-19 11:34:37,4.6,FIXED,0,1,1,,,2015-08-26T14:29:30Z,2015-08-26T14:29:30Z,2015-08-26T14:29:30Z,2015-08-26T14:29:30Z
2,475370,4.5,Platform,2015-08-19 12:09:06,4.5,DUPLICATE,0,0,0,,,,,,
3,475379,4.5,Platform,2015-08-19 13:39:12,4.5,,0,0,0,,,,,,
4,475407,4.5,Platform,2015-08-19 17:06:10,4.5,FIXED,0,1,1,,,2015-08-19T17:15:25Z,2015-08-19T17:15:25Z,2015-08-19T17:15:25Z,2015-08-19T17:15:25Z


In [18]:
# Report how many distinct bugs received an injected ASSIGNED row.
injected_bug_count = len(injected_assignments['id'].unique())
print('Injected assignments for bugs:' + str(injected_bug_count))

Injected assignments for bugs:4811


In [19]:
# Write both outputs as zipped CSVs without the index column.
# Note: bugs_processing_file is the same file this notebook loads as df_info,
# so the second write replaces the notebook's own input with df_bugs.
for frame, target_path in (
    (bug_history, processed_history_file),
    (df_bugs, bugs_processing_file),
):
    frame.to_csv(target_path, index=False, compression='zip')