# Adding dependency usage times to the dataset

This notebook adds columns to the dataset indicating when a dependency was added to or excluded from a package, the first and last release times of the package, the exclusion period, and the lifespan of each dependency.

In [212]:
import pandas
import numpy as np

import sys
sys.path.append('..')

from helper import load_data , load_repo

In [213]:
# Load the 'Cargo' repository table through the project helper.
cargo = load_repo('Cargo')

# Keep rows whose repository URL is set and mentions GitHub, one row per URL.
cargo_git = (
    cargo
    .loc[lambda frame: frame.Repository_URL.notnull()]
    .loc[lambda frame: frame.Repository_URL.str.contains('github')]
    .drop_duplicates(subset='Repository_URL')
)

# Repository URL of each remaining package, indexed by package name.
repos = cargo_git[['Name', 'Repository_URL']].set_index('Name')

# Shape of the subset of packages that have no repository URL at all.
cargo.loc[cargo.Repository_URL.isnull()].shape

(1571, 59)

In [214]:
# Shape of the packages whose repository URL is set but does not mention GitHub.
(
    cargo
    .loc[lambda frame: frame.Repository_URL.notnull()]
    .loc[lambda frame: ~frame.Repository_URL.str.contains('github')]
    .shape
)

(413, 59)

In [215]:
# Number of distinct GitHub repository URLs among the packages.
len(
    cargo
    .loc[lambda frame: frame.Repository_URL.notnull()]
    .loc[lambda frame: frame.Repository_URL.str.contains('github')]
    .drop_duplicates(subset='Repository_URL', keep='last')
)

9954

In [216]:
# Load the two 'Cargo' tables through the project helper.
# NOTE(review): the schemas are not visible in this notebook; the following
# cells rely on both frames sharing `package` and `version` columns and on a
# `date` / `target` column being present after merging — confirm against
# helper.load_data.
packages, dependencies = load_data('Cargo')

In [217]:
# Pair every package release with its dependency rows (default inner join on
# the shared keys), order the rows by date within each package and renumber
# them from zero.
packages_dependencies = (
    packages
    .merge(dependencies, on=['package', 'version'])
    .sort_values(by=['package', 'date'])
    .reset_index()
    .drop('index', axis=1)
)
packages_dependencies.shape

(302834, 5)

#### Unique package versions

In [218]:
# One row per (package, version): rows are ordered by date inside each package
# and only the first dependency row of every release is kept.
package_dependecies_unique = (
    packages_dependencies
    .sort_values(by=['package', 'date'])
    .drop_duplicates(subset=['package', 'version'], keep='first')
)
package_dependecies_unique.shape

(66105, 5)

#### Drop duplicates with keep='last' to find the last release in which each package used a given dependency

In [219]:
# For every (package, target) pair keep the chronologically last row, i.e. the
# most recent release of the package that still lists the target.
last_time_depend_used = (
    packages_dependencies
    .sort_values(by=['package', 'date'])
    .drop_duplicates(subset=['package', 'target'], keep='last')
)
last_time_depend_used.shape

(48597, 5)

#### Adding the first and last release times of each package

In [220]:
def get_package_finish(pack_name):
    """Return the `date` of the last row stored for `pack_name`.

    The lookup runs against the notebook-level `package_dependecies_unique`
    frame; an IndexError is raised when the package has no rows there.
    """
    is_package = package_dependecies_unique['package'] == pack_name
    return package_dependecies_unique.loc[is_package, 'date'].iloc[-1]
def get_package_start(pack_name):
    """Return the `date` of the first row stored for `pack_name`.

    The lookup runs against the notebook-level `package_dependecies_unique`
    frame; an IndexError is raised when the package has no rows there.
    """
    is_package = package_dependecies_unique['package'] == pack_name
    return package_dependecies_unique.loc[is_package, 'date'].iloc[0]
def get_last_time_visit(row):
    """Return the time at which `row['target']` stopped being used by `row['package']`.

    The last version of the package that lists the target is read from the
    notebook-level `last_time_depend_used` frame. The result is the `date` of
    the release that directly follows that version in
    `package_dependecies_unique`; when that version is the last row of the
    package, its own `date` is returned instead.

    Parameters
    ----------
    row : mapping with 'package' and 'target' keys (e.g. a DataFrame row).

    Raises
    ------
    IndexError
        If the (package, target) pair is missing from `last_time_depend_used`.
    KeyError
        If the version is missing from the package's rows.

    NOTE(review): `package_dependecies_unique` derives from a merge with the
    dependency table, so releases without any dependency rows presumably never
    appear as the "next release" here — confirm this is intended.
    """
    same_package = last_time_depend_used['package'] == row['package']
    same_target = last_time_depend_used['target'] == row['target']
    vers = last_time_depend_used.loc[same_package & same_target, 'version'].values[0]

    target_df = package_dependecies_unique[package_dependecies_unique['package'] == row['package']]

    # Locate the last-use release once instead of rebuilding the version index
    # for every branch of the decision below.
    last_use_pos = target_df.set_index('version').index.get_loc(vers)
    next_pos = last_use_pos + 1

    if next_pos < len(target_df):
        # A later release exists: the dependency is gone as of that release.
        return target_df.iloc[next_pos]['date']
    # The dependency is still listed in the package's last recorded release.
    return target_df.iloc[last_use_pos]['date']

# First appearance of every (package, target) pair: order the rows by date
# inside each package, then discard all later repetitions of the pair.
df_tmp = packages_dependencies.sort_values(by=['package', 'date'])
df_tmp = df_tmp[~df_tmp.duplicated(subset=['package', 'target'], keep='first')]

In [221]:
# Row by row, attach the package's first and last release dates and the time
# at which the dependency was excluded.
df_tmp = (
    df_tmp
    .assign(FirstRelease=lambda frame: frame.apply(
        lambda row: get_package_start(row['package']), axis=1))
    .assign(LastRelease=lambda frame: frame.apply(
        lambda row: get_package_finish(row['package']), axis=1))
    .assign(ExclusionTime=lambda frame: frame.apply(get_last_time_visit, axis=1))
)

#### Time interval (in days) between the exclusion of the dependency and the last release of the package

In [222]:
# `date` holds the release in which the dependency first appeared, so it is
# renamed to FirstUse; ExclusionDuration is LastRelease - ExclusionTime in days.
df_tmp = (
    df_tmp
    .rename(columns={'date': 'FirstUse'})
    .assign(
        ExclusionDuration=lambda frame: frame.apply(
            lambda row: (row['LastRelease'] - row['ExclusionTime']).total_seconds() / 3600 / 24,
            axis=1,
        )
    )
)

#### Calculating lifespan of a dependency in days

In [223]:
# Date of the last release in which each (package, target) dependency was
# still listed (one row per pair in `last_time_depend_used`).
last_use_date = last_time_depend_used.set_index(['package', 'target'])['date']

df_tmp = (
    df_tmp
    # Lifespan: days between the first use of the dependency and its exclusion time.
    .assign(Lifespan = df_tmp.apply(lambda d: (d['ExclusionTime'] - d['FirstUse']).total_seconds()/3600/24,axis=1))
    # Excluded: 1 when the package's last release no longer lists the dependency.
    # ExclusionTime cannot be used for this test: it is the date of the release
    # that follows the last use, so a dependency dropped in the final release
    # has ExclusionTime == LastRelease and would wrongly be flagged as still in
    # use. Comparing against the date of the last use avoids that.
    .assign(Excluded = df_tmp.apply(
        lambda d: 1 if d['LastRelease'] > last_use_date.loc[(d['package'], d['target'])] else 0,
        axis=1))
    [['package','version','target','FirstRelease','LastRelease','FirstUse','ExclusionTime','ExclusionDuration','Lifespan','Excluded']]
)

In [224]:
# Preview only the first rows instead of rendering the whole frame.
df_tmp.head(10)

Unnamed: 0,package,version,target,FirstRelease,LastRelease,FirstUse,ExclusionTime,ExclusionDuration,Lifespan,Excluded
0,BrewStillery,1.0.0,gtk,2017-10-05 21:12:40,2018-03-05 00:56:07,2017-10-05 21:12:40,2018-03-05 00:56:07,0.000000,150.155174,0
1,BrewStillery,1.0.0,regex,2017-10-05 21:12:40,2018-03-05 00:56:07,2017-10-05 21:12:40,2017-10-15 04:42:53,140.842523,9.312650,1
10,BrewStillery,5.0.0,gio,2017-10-05 21:12:40,2018-03-05 00:56:07,2018-01-11 02:18:35,2018-03-05 00:56:07,0.000000,52.942731,0
11,BrewStillery,5.0.0,gdk,2017-10-05 21:12:40,2018-03-05 00:56:07,2018-01-11 02:18:35,2018-03-05 00:56:07,0.000000,52.942731,0
27,ConExpression,0.1.0,rand,2017-07-20 07:48:55,2017-07-20 07:48:55,2017-07-20 07:48:55,2017-07-20 07:48:55,0.000000,0.000000,0
28,CoreFoundation-sys,0.1.0,libc,2015-06-01 16:21:50,2017-11-12 17:33:12,2015-06-01 16:21:50,2017-11-12 17:33:12,0.000000,895.049560,0
33,CoreFoundation-sys,0.1.4,mach,2015-06-01 16:21:50,2017-11-12 17:33:12,2017-11-12 17:33:12,2017-11-12 17:33:12,0.000000,0.000000,0
34,FPS,0.0.0,ilog2,2015-10-16 04:44:37,2015-12-07 04:01:51,2015-10-16 04:44:37,2015-11-09 13:14:44,27.616053,24.354248,1
35,FPS,0.0.1,time,2015-10-16 04:44:37,2015-12-07 04:01:51,2015-11-09 13:14:44,2015-12-07 03:15:23,0.032269,27.583785,1
36,FPS,0.0.2,millefeuille,2015-10-16 04:44:37,2015-12-07 04:01:51,2015-12-07 03:15:23,2015-12-07 04:01:51,0.000000,0.032269,0


In [225]:
# Persist the enriched dependency table as a gzip-compressed CSV.
# The DataFrame index is written too (pandas' default `index=True`), so it
# shows up as an unnamed first column in the file.
df_tmp.to_csv('../data/package_dependencies_time.csv.gz',compression='gzip')