# Triaging time of pre-release bugs as the release date approaches

In [None]:
import pandas as pd
import os
import json
import csv
import re, datetime
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import date, timedelta
from lifelines.statistics import logrank_test
from lifelines import KaplanMeierFitter

from global_functions import *

In [None]:
from scipy import stats

def compare_distributions(tt,var1,var2,what): 
    """Compare the triaging times of two groups and print the verdict.

    The rows of ``tt`` whose ``what`` column equals ``var1`` form the first
    sample, the rows equal to ``var2`` the second one; the compared values
    come from the ``triaging_time`` column.

    Each sample is first checked with a Kolmogorov-Smirnov test against
    ``'norm'``.  If at least one of them is rejected (p < 0.05) the samples
    are compared with the Wilcoxon rank-sum test, otherwise with an
    independent two-sample t-test.  The two groups are separate sets of rows
    and may differ in size, so a paired test is not applicable.

    Parameters
    ----------
    tt : pandas.DataFrame
        Table holding a ``triaging_time`` column and the ``what`` column.
    var1, var2 : object
        Values of ``tt[what]`` selecting the two groups.  ``var1`` is also
        used as the label in the printed message and may be of any type.
    what : str
        Name of the grouping column.

    Returns
    -------
    None
        The normality verdict, the significance verdict (threshold 0.05) and
        the raw test result are printed.
    """
    sample1 = tt[tt[what] == var1].triaging_time
    sample2 = tt[tt[what] == var2].triaging_time

    # NOTE(review): without extra arguments kstest(..., 'norm') compares the
    # raw values with the standard normal N(0, 1), so data that is normal but
    # not centred/scaled is still rejected -- confirm this is intended.
    normal_test_var1 = stats.kstest(sample1.values.tolist(), 'norm')
    normal_test_var2 = stats.kstest(sample2.values.tolist(), 'norm')

    if normal_test_var1.pvalue < 0.05 or normal_test_var2.pvalue < 0.05:
        print('At least one sample not normally distributed')
        # Non-parametric comparison (Wilcoxon rank-sum).
        result = stats.ranksums(sample1, sample2)
    else:
        print('Both samples are normally distributed')
        # Independent-samples t-test: the groups are unrelated observations.
        result = stats.ttest_ind(sample1, sample2)

    # str() keeps the message working for non-string labels such as months.
    label = str(var1)
    if result.pvalue < 0.05:
        print('Statistically significant difference found for:' + label)
    else:
        print('Statistically significant difference NOT found for:' + label)
    print(result)
 

In [None]:
# Release dates used by the cells below: the result is indexed by release name
# (release_creation_ts_all[release]) and its keys are listed via .keys().
# NOTE(review): get_release_dates is not defined in the visible part of this
# file; presumably it comes from the wildcard import of global_functions.
release_creation_ts_all = get_release_dates()

In [None]:
# Read the bug dump and replace every missing value with an empty string.
df = pd.read_csv(
    os.path.join('data', 'bugs_info.zip'),
    index_col=False,
    compression='zip',
).fillna('')

# Derive the release label of each bug from its reported version (stored as text).
df['release'] = df.apply(
    lambda row: str(assign_to_closest_minor(row['version'])),
    axis=1,
)
df['release'] = df['release'].astype(str)

# Hand all timestamp columns to the project's datetime conversion helper.
df = trans_to_datetime(
    df,
    [
        'creation_time',
        'first_assignment_date',
        'last_assignment_date',
        'first_resolved_date',
        'last_resolved_date',
        'first_fixed_date',
        'last_fixed_date',
    ],
)
df.head(n=2)

In [None]:
# Second copy of the bug dump, used for the per-month statistics below.
yearly_file = os.path.join('data', 'bugs_info.zip')
df_monthly = pd.read_csv(
    yearly_file,
    index_col=False,
    compression='zip',
    dtype={'release': str},
)

# The release label is recomputed from the reported version of each bug.
df_monthly['release'] = df_monthly.apply(
    lambda row: str(assign_to_closest_minor(row['version'])),
    axis=1,
)
df_monthly.head()

In [None]:
# Show the distinct release labels present in the monthly frame.
df_monthly['release'].unique()

In [None]:
def get_monthly_triaging_stats_2(df_work,assignedFirst,resolvedFirst):
    """Collect, per release, the bugs assigned in each 30-day window before it.

    For every release label found in ``df_work`` the function walks backwards
    from the release date (looked up in the module-level
    ``release_creation_ts_all``) through at most twelve consecutive 30-day
    windows.  Window ``month == 1`` ends on the release date, window 2 ends
    30 days earlier, and so on.  The walk for a release stops as soon as a
    window would end before the earliest creation / first-assignment date
    recorded for that release.

    Parameters
    ----------
    df_work : pandas.DataFrame
        Bug table.  Columns read here: ``id``, ``release``, ``Product``,
        ``creation_time``, ``first_assignment_date`` and, when
        ``assignedFirst`` is false, ``last_assignment_date``.  The date
        columns are passed through ``trans_to_datetime``.  The ``release``
        column is rewritten as ``str``.
    assignedFirst : bool
        Use ``first_assignment_date`` (true) or ``last_assignment_date``
        (false) to decide in which window a bug falls; the chosen column is
        also the one returned.
    resolvedFirst : bool
        Kept for interface compatibility; no resolution date is used by this
        function.

    Returns
    -------
    pandas.DataFrame
        One row per bug and window with the columns ``id``, ``release``,
        ``Product``, ``creation_time``, the chosen assignment column,
        ``type`` (always ``'assigned'``), ``start_period``, ``end_period``,
        ``month`` (1-12) and ``period`` (always ``'before'``).  An empty
        frame is returned when no window was inspected at all.
    """
    date_columns = ['creation_time','first_assignment_date',
                    'last_assignment_date', 'first_resolved_date',
                    'last_resolved_date', 'first_fixed_date']
    df_work = trans_to_datetime(df_work, date_columns)
    df_work['release'] = df_work['release'].astype(str)

    if assignedFirst:
        assignment_column = 'first_assignment_date'
    else:
        assignment_column = 'last_assignment_date'
    kept_columns = ['id','release','Product','creation_time',assignment_column]

    # Frames are gathered in a list and concatenated once at the end:
    # DataFrame.append no longer exists in pandas 2.x and copied all
    # accumulated rows on every call.
    windows = []

    for release in sorted(df_work.release.unique().tolist()):
        release = str(release)
        release_date = pd.to_datetime(release_creation_ts_all[release])

        dfw = df_work[df_work['release']==release]
        dfw = trans_to_datetime(dfw, date_columns + ['last_fixed_date'])

        # Earliest recorded event of the release; older windows are skipped.
        end_inspection = min([dfw.creation_time.min(),
                              dfw.first_assignment_date.min()])

        for month in range(12):
            end_period = release_date-timedelta(30*month)
            if end_period < end_inspection:
                break
            start_period = release_date-timedelta(30*(month+1))

            # Half-open window [start_period, end_period).
            in_window = (
                (dfw[assignment_column]>=start_period)
                &
                (dfw[assignment_column]<end_period)
            )
            bugs_assigned = dfw[in_window][kept_columns].drop_duplicates()
            bugs_assigned = bugs_assigned.reset_index(drop=True)
            bugs_assigned['type'] = 'assigned'
            bugs_assigned['start_period'] = start_period
            bugs_assigned['end_period'] = end_period
            bugs_assigned['month'] = month+1
            bugs_assigned['period'] = 'before'
            bugs_assigned['release'] = release

            windows.append(bugs_assigned)

    if not windows:
        return pd.DataFrame()
    return pd.concat(windows, ignore_index=True)

In [None]:
# The monthly first-assignment statistics are cached on disk: the archive is
# read back when it exists, otherwise the statistics are computed from
# df_monthly and written to the archive.
e_target_file = os.path.join(
    'data', 'RQ3', 'releases_monthly_stats_first_assign_E_2.zip'
)
if os.path.exists(e_target_file):
    df_monthly_stats_first_2 = pd.read_csv(
        e_target_file,
        index_col=False,
        compression='zip',
        dtype={'release': str, 'predecessor': str},
    )
else:
    df_monthly_stats_first_2 = get_monthly_triaging_stats_2(
        df_monthly, assignedFirst=True, resolvedFirst=False
    )
    df_monthly_stats_first_2.to_csv(
        e_target_file, index=False, compression='zip'
    )
    


In [None]:
# Keep the release label as text and pass every row's month through reverse_month.
df_monthly_stats_first_2['release'] = (
    df_monthly_stats_first_2['release'].astype(str)
)
df_monthly_stats_first_2['month'] = df_monthly_stats_first_2.apply(
    lambda row: reverse_month(row['month']),
    axis=1,
)
#df_monthly_stats_first_2 = sort_df(df_monthly_stats_first_2,'release')
df_monthly_stats_first_2.head()

In [None]:
# Triaging time (whole days between creation and first assignment) per bug,
# cached as a gzip-compressed CSV.
file_target_e = os.path.join('data', 'RQ3', 'time_assign_E_2.csv.gz')
if os.path.exists(file_target_e):
    df_total = pd.read_csv(
        file_target_e,
        index_col=False,
        compression='gzip',
        dtype={'release': str},
    )
else:
    df_total = trans_to_datetime(
        df_monthly_stats_first_2.copy(),
        ['first_assignment_date', 'creation_time'],
    )
    df_total['triaging_time'] = df_total.apply(
        lambda row: (row['first_assignment_date'] - row['creation_time']).days,
        axis=1,
    )
    # Only these four columns are kept and written to the cache file.
    df_total = df_total[['id', 'month', 'triaging_time', 'release']]
    df_total.to_csv(file_target_e, index=False, compression='gzip')
df_total.head()

In [None]:
# One box plot of the triaging duration per month is drawn for every release.
# For releases 4.9 and 4.10 only the months listed in r_month are plotted and
# their ticks are relabelled 1-3.
r_month=[10,11,12]

releases_only = df_total[['release']].drop_duplicates()
releases_only = sort_df(releases_only,'release')
releases_only = releases_only[['release']]

# The seaborn theme does not depend on the release, so it is set once.
sns.set(style="whitegrid")

for release in releases_only.release.unique():
    release = str(release)
    print(release)

    only_r_months = release in ('4.9', '4.10')
    selected = df_total['release'] == release
    if only_r_months:
        selected = selected & df_total['month'].isin(r_month)
    draft = df_total[selected]

    ax = plt.subplot(1, 1, 1)
    sns.boxplot(x="month", y="triaging_time",
                data=draft, palette="Set1")
    ax.set_xlabel("Approaching release "+release)
    ax.set_ylabel("Triaging duration in days")
    ax.set_yscale('log')
    # A log axis cannot start at 0: matplotlib rejects a non-positive bottom
    # limit with a warning and ignores it, so only the upper bound is fixed.
    ax.set_ylim(top=1800)
    if only_r_months:
        ax.set_xticklabels( ('1', '2','3') )

    plt.tight_layout()
    plt.show()
    #ax.get_figure().savefig('..'+os.sep+'paper'+os.sep+'figs'+os.sep+'RQ3'+os.sep+'triagingRate_next'+release.replace('.', '-')+'.pdf', bbox_inches="tight")
#plt.savefig('rq3t4-4.pdf')