In [2]:
import pandas
import matplotlib.pyplot as plt
import seaborn
import collections
import itertools
import numpy as np
import warnings
import subprocess
import requests
import lifelines


import sys
sys.path.append('..')

warnings.filterwarnings('ignore')

%matplotlib inline

### Load Commits

In [3]:
# Read the commit table and index it by commit hash.
Cts = (
    pandas
    .read_csv(
        '../data/commits.csv.gz',
        parse_dates=['commit_date'],
        # Parse the dates, then drop timezone information (tz-naive result).
        date_parser=lambda col: pandas.to_datetime(col).tz_localize(None),
    )
    .set_index('commit_hash')
)
Cts.head()

Unnamed: 0_level_0,Unnamed: 0,author_email,author_name,commit_date,committer_email,committer_name,project_name
commit_hash,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
cbed88c2606aae9dd721fae9f5e5cead4df01f00,0,eduard.bopp@aepsil0n.de,Eduard Bopp,2017-09-22 13:44:19,eduard.bopp@aepsil0n.de,Eduard Bopp,https://github.com/aepsil0n/acacia
561136bebb18ae6769f282f331df02d499e7f382,1,eduard.bopp@aepsil0n.de,Eduard Bopp,2017-09-22 13:43:45,eduard.bopp@aepsil0n.de,Eduard Bopp,https://github.com/aepsil0n/acacia
0f334c8b1ebfb10c17a46cb6d1e0b656abc397ed,2,code@andre-bubel.de,Andr\xc3\xa9-Patrick Bubel,2017-02-26 16:30:54,code@andre-bubel.de,Andr\xc3\xa9-Patrick Bubel,https://github.com/aepsil0n/acacia
486c9edd907105cc989b9ae05299b68ecde7fbfe,3,github@andre-bubel.de,Andr\xc3\xa9-Patrick Bubel,2017-02-26 16:28:36,noreply@github.com,GitHub,https://github.com/aepsil0n/acacia
2f46bc1585103106deb87213b3db547a1d922898,4,carol.nichols@gmail.com,Carol (Nichols || Goulding),2017-02-26 09:46:52,noreply@github.com,GitHub,https://github.com/aepsil0n/acacia


### Load Pull Requests

In [4]:
# Columns of the pull-request table that the rest of the notebook uses.
pr_columns = ['pr_id', 'project_name', 'user_login', 'created_at',
              'merged_at', 'closed_at', 'merge_commit_sha']
pr_date_columns = ['closed_at', 'created_at', 'merged_at', 'updated_at']

Prs = pandas.read_csv(
    '../data/pullrequests.csv.gz',
    parse_dates=pr_date_columns,
    # Parse the dates, then drop timezone information (tz-naive result).
    date_parser=lambda col: pandas.to_datetime(col).tz_localize(None),
)[pr_columns]
Prs.head()

Unnamed: 0,pr_id,project_name,user_login,created_at,merged_at,closed_at,merge_commit_sha
0,94.0,https://github.com/aepsil0n/acacia,carols10cents,2017-02-26 14:51:23,2017-02-26 15:28:37,2017-02-26 15:28:37,486c9edd907105cc989b9ae05299b68ecde7fbfe
1,93.0,https://github.com/aepsil0n/acacia,Moredread,2017-01-08 18:01:04,2017-01-13 11:11:27,2017-01-13 11:11:27,6b174c9af91c48008548074e788f106c17cc4d3d
2,91.0,https://github.com/aepsil0n/acacia,Moredread,2016-09-16 07:25:12,2016-09-16 20:41:32,2016-09-16 20:41:32,7a47ac25ca172d0af73e8845db89d418a0479bb4
3,90.0,https://github.com/aepsil0n/acacia,Moredread,2016-09-16 07:02:06,2016-09-16 20:45:28,2016-09-16 20:45:28,e1c518f2cfa22634d2510a6d9d06cb4dcf346000
4,89.0,https://github.com/aepsil0n/acacia,Moredread,2016-09-16 06:54:41,2016-09-16 20:43:22,2016-09-16 20:43:22,c594be4bfe08e92e360578841877354cc3a4d12e


#### Find the committer of each pull request (if one exists)

In [5]:
def getcommiter(row, commits=None):
    """Return the committer name of the commit that merged a pull request.

    Parameters
    ----------
    row : mapping or pandas.Series
        Pull-request record; only ``row['merge_commit_sha']`` is read.
    commits : pandas.DataFrame, optional
        Commit table indexed by commit hash and holding a
        ``committer_name`` column. Defaults to the notebook-level ``Cts``.

    Returns
    -------
    The committer name, or ``np.nan`` when the pull request has no merge
    commit or its hash is not present in the commit table.
    """
    if commits is None:
        commits = Cts
    sha = row['merge_commit_sha']
    if pandas.isnull(sha) or sha not in commits.index:
        return np.nan
    committer = commits.loc[sha, 'committer_name']
    if isinstance(committer, pandas.Series):
        # The hash occurs more than once in the index, so `.loc` yields a
        # Series instead of a scalar; keep the first entry.
        # NOTE(review): assumes duplicated hashes share one committer — confirm.
        return committer.iloc[0]
    return committer
    

# Look up the committer row by row; pull requests without a match get NaN.
Prs['committer'] = Prs.apply(getcommiter, axis=1)

### Melt the dataframe into one record per event (creation and final state of each PR)

In [7]:
# The merge commit hash is no longer needed once the committer is attached.
Prs = Prs.drop(columns='merge_commit_sha')

# One row per (pull request, event): the remaining date columns become
# `Event` / `time` pairs, ordered chronologically.
df_tmp = (
    Prs
    .melt(
        id_vars=['pr_id', 'project_name', 'user_login', 'committer', 'merged_at'],
        var_name='Event',
        value_name='time',
    )
    .sort_values('time')
)

In [8]:
def checkevent(row):
    """Map a melted pull-request record to its event label.

    Reads ``row['Event']`` and, for ``'closed_at'`` records, also
    ``row['merged_at']`` and ``row['time']``.

    Returns ``'created'`` for creation records, ``'accepted'`` for closing
    records with a merge date, ``'open'`` for closing records with neither
    a merge date nor a time, and ``'rejected'`` in every other case.
    """
    event = row['Event']
    if event == 'created_at':
        return 'created'
    if event != 'closed_at':
        return 'rejected'
    if pandas.notnull(row['merged_at']):
        return 'accepted'
    # Closing record without a merge date: still open if it has no time.
    return 'open' if pandas.isnull(row['time']) else 'rejected'
# Label every melted record with its event name.
df_tmp['concept'] = df_tmp.apply(checkevent, axis=1)

In [9]:
# Records without a timestamp (e.g. the close time of a pull request that is
# still open) receive the latest timestamp found in the log. The maximum is
# computed once and applied in a single vectorised step instead of being
# recomputed for every row.
latest_time = df_tmp['time'].max()
df_tmp['time'] = df_tmp['time'].fillna(latest_time)

### Load Comments

In [11]:
# Read the pull-request comments and keep only the columns needed for the log.
comment_columns = ['issue_number', 'project_name', 'user_login', 'created_at']
Cms = pandas.read_csv(
    '../data/pulls_comments.csv.gz',
    parse_dates=['created_at', 'updated_at'],
    # Parse the dates, then drop timezone information (tz-naive result).
    date_parser=lambda col: pandas.to_datetime(col).tz_localize(None),
)[comment_columns]

# Align the column names with the pull-request events and tag every row
# as a 'commented' event.
Cms = (
    Cms
    .rename(columns={'issue_number': 'pr_id', 'created_at': 'time'})
    .assign(concept='commented')
)

In [12]:
# Number of comment rows and columns loaded.
Cms.shape

(707530, 5)

#### Check whether each commenter is also a contributor (has opened a pull request)

In [13]:
# True where the comment author also appears as the author of a pull request.
is_pr_author = Cms['user_login'].isin(Prs['user_login'])
exitingcms = Cms[is_pr_author]

In [14]:
# Cast the pull-request ids of both frames to int.
# `exitingcms` is a filtered selection of `Cms`; building a new frame with
# `assign` avoids writing into that selection (the pattern pandas reports as
# SettingWithCopyWarning, which the global warnings filter hides here).
exitingcms = exitingcms.assign(pr_id=exitingcms['pr_id'].astype(int))
df_tmp['pr_id'] = df_tmp['pr_id'].astype(int)

### Concatenate comments and pull requests

In [15]:
# Columns shared by the final log; comment rows have no 'committer' value,
# so that column is filled with NaN for them by the concatenation.
event_columns = ['pr_id', 'project_name', 'user_login', 'committer', 'time', 'concept']

# Column names expected by the XES-style log.
xes_names = {
    'project_name': 'case:concept:name',
    'time': 'time:timestamp',
    'concept': 'concept:name',
}

combined_events = pandas.concat([df_tmp[event_columns], exitingcms])
df_xes = (
    combined_events[event_columns]
    .sort_values('time')
    .rename(columns=xes_names)
)
#df_xes['org:resource'] = df_xes['user_login']

##### Load the previously saved file, because creating it is time-consuming

In [16]:
# The event log is cached on disk because building it is time consuming.
# NOTE(review): assumes prlog1.csv is a saved copy of `df_xes` written with
# its index (the later `.iloc[:, 1:]` step drops a leading column) — confirm.
prlog_path = '../data/prlog1.csv'
try:
    df = pandas.read_csv(prlog_path)
except FileNotFoundError:
    # No cache yet (fresh checkout): write it once, then read it back so `df`
    # has the same columns and dtypes as when the cache already exists.
    df_xes.to_csv(prlog_path)
    df = pandas.read_csv(prlog_path)

In [17]:
def addlifecycle(row):
    """Return the lifecycle transition for a log record.

    Every known event name ('created', 'commented', 'rejected', 'accepted',
    'open') maps to ``'complete'``; any other value of
    ``row['concept:name']`` yields ``None``.
    """
    known_concepts = ('created', 'commented', 'rejected', 'accepted', 'open')
    if row['concept:name'] in known_concepts:
        return 'complete'
    return None
# Attach the lifecycle transition to every record of the log.
df['lifecycle:transition'] = df.apply(addlifecycle, axis=1)

In [18]:
# The author becomes the XES resource; the project name moves to its own
# column so that 'case:concept:name' can be rebuilt per pull request.
df_tmp = df.rename(
    columns={
        'user_login': 'org:resource',
        'case:concept:name': 'project',
    }
)

In [19]:
# A case is one pull request of one project: "<project>|<pr_id>".
df_tmp = (
    df_tmp
    .assign(pr_id=lambda frame: frame['pr_id'].astype(str))
    .assign(**{'case:concept:name': lambda frame: frame['project'] + "|" + frame['pr_id']})
)

In [34]:
#### Data cleaning (there was a record with accepted status without being opened)

In [31]:
# Row label of the record removed by the data cleaning step.
bad_record_label = 224942

ordered_events = df_tmp.sort_values(['case:concept:name', 'time:timestamp'])
trimmed_events = (
    ordered_events
    .drop(columns=['pr_id', 'project'])
    .drop(index=bad_record_label)
)
# Discard the first remaining column by position.
df_tmp = trimmed_events.iloc[:, 1:]

In [None]:
# Export the records of the servo project on their own, without the case column.
servo_events = df[df['case:concept:name'] == 'https://github.com/servo/servo']
(
    servo_events
    .drop(columns='case:concept:name')
    .to_csv('../data/prlog-servo.csv', index=False)
)

In [32]:
# pm4py's log conversion factory.
# NOTE(review): the `factory` modules look like the pm4py 1.x API; newer
# releases appear to expose this differently — confirm the installed version.
from pm4py.objects.conversion.log import factory as conversion_factory

# Convert the prepared dataframe; the result is exported to XES further down.
log = conversion_factory.apply(df_tmp)

In [33]:
# pm4py's XES exporter.
# NOTE(review): the `factory` modules look like the pm4py 1.x API; newer
# releases appear to expose this differently — confirm the installed version.
from pm4py.objects.log.exporter.xes import factory as xes_exporter

# Write the converted log to an XES file in the data directory.
xes_exporter.export_log(log, "../data/prlog_lifecycle.xes")

In [None]:
# Show the converted log object as the cell output.
log