# Adding the start and end time of each dependency's use to the dataset

In this notebook I add columns to the dataset indicating when a dependency was added to or removed from a package. Whether a package has stopped using a dependency is decided by checking its next version.

In [1]:
import pandas
import numpy as np

import sys
sys.path.append('..')

from helper import load_data , load_repo

In [20]:
# Keep one row per distinct GitHub repository URL from the Cargo repo table.
# load_repo is a project helper; only the Name / Repository_URL columns are used here.
cargo = load_repo('Cargo')
cargo_git = (
    cargo
    .loc[lambda frame: frame.Repository_URL.notnull()]
    .loc[lambda frame: frame.Repository_URL.str.contains('github')]
    .drop_duplicates(subset='Repository_URL')
)
# Name -> Repository_URL lookup table.
repos = cargo_git.loc[:, ['Name', 'Repository_URL']].set_index('Name')

In [21]:
# load_data is a project helper (imported from helper); the cells below merge the
# two returned frames on their shared 'package' and 'version' columns.
# NOTE(review): the full schema of both frames is not visible in this notebook.
packages, dependencies = load_data('Cargo')

In [22]:
# Attach each dependency row to its package release (default inner merge, so only
# matching (package, version) pairs survive), then order rows chronologically
# within each package and renumber the index from 0.
packages_dependencies = (
    packages
    .merge(dependencies, on=['package', 'version'])
    .sort_values(by=['package', 'date'])
    .reset_index(drop=True)
)

#### Drop duplicates: keep='first' per (package, version) to get one row per release, and keep='last' per (package, target) to find the last time each package used a dependency

In [24]:
# Sort once by package and release date; both de-duplicated views depend on this order.
_by_release_date = packages_dependencies.sort_values(['package', 'date'])

# One row per (package, version): the first row of each release.
package_dependecies_unique = _by_release_date.drop_duplicates(
    subset=['package', 'version'], keep='first'
)

# One row per (package, target): the latest-dated row in which the package lists that target.
last_time_depend_used = _by_release_date.drop_duplicates(
    subset=['package', 'target'], keep='last'
)

#### Adding each package's first release time and the time of its last version

In [26]:
def get_package_finish(pack_name, releases=None):
    """Return the ``date`` of the last row recorded for ``pack_name``.

    Parameters
    ----------
    pack_name
        Value compared against the ``package`` column.
    releases : DataFrame, optional
        Frame with ``package`` and ``date`` columns. Defaults to the
        notebook-level ``package_dependecies_unique``. Rows are taken in the
        order given, so the frame must already be sorted by date for the
        result to be the latest release.

    Raises
    ------
    IndexError
        If no row matches ``pack_name``.
    """
    if releases is None:
        releases = package_dependecies_unique
    return releases.loc[releases['package'] == pack_name, 'date'].iloc[-1]


def get_package_start(pack_name, releases=None):
    """Return the ``date`` of the first row recorded for ``pack_name``.

    Parameters
    ----------
    pack_name
        Value compared against the ``package`` column.
    releases : DataFrame, optional
        Frame with ``package`` and ``date`` columns. Defaults to the
        notebook-level ``package_dependecies_unique``. Rows are taken in the
        order given, so the frame must already be sorted by date for the
        result to be the earliest release.

    Raises
    ------
    IndexError
        If no row matches ``pack_name``.
    """
    if releases is None:
        releases = package_dependecies_unique
    return releases.loc[releases['package'] == pack_name, 'date'].iloc[0]


def get_last_time_visit(pack_name, depend, last_uses=None):
    """Return the ``date`` of the first row matching ``pack_name`` and ``depend``.

    Parameters
    ----------
    pack_name
        Value compared against the ``package`` column.
    depend
        Value compared against the ``target`` column.
    last_uses : DataFrame, optional
        Frame with ``package``, ``target`` and ``date`` columns. Defaults to
        the notebook-level ``last_time_depend_used``, which holds a single
        (last) row per (package, target) pair.

    Returns
    -------
    The raw array element (``.values[0]``), i.e. a NumPy scalar for datetime columns.

    Raises
    ------
    IndexError
        If no row matches the pair.
    """
    if last_uses is None:
        last_uses = last_time_depend_used
    # One combined mask instead of two successive frame filters.
    is_match = (last_uses['package'] == pack_name) & (last_uses['target'] == depend)
    return last_uses.loc[is_match, 'date'].values[0]

# First row, in (package, date) order, for every (package, target) pair —
# i.e. the earliest-dated row in which a package lists a given dependency.
df_tmp = packages_dependencies.sort_values(by=['package', 'date']).drop_duplicates(
    subset=['package', 'target'],
    keep='first',
)

In [27]:
# Release window per package and last use per (package, target), computed with
# grouped aggregations instead of filtering the whole frame once per row
# (the row-wise lookups were quadratic in the number of dependency rows).
#
# NOTE(review): the release window is derived from rows that survived the
# packages/dependencies merge, so a release with no dependency rows at all is
# presumably not counted as the "last release" — confirm this is intended,
# since it affects whether a dependency is later flagged as removed.
_release_dates = package_dependecies_unique.groupby('package')['date']
_first_release = _release_dates.min()
_last_release = _release_dates.max()

# Indexed like packages_dependencies; .assign aligns it to df_tmp by index
# label, which is valid because df_tmp's rows are a subset of that frame's rows.
_last_use = packages_dependencies.groupby(['package', 'target'])['date'].transform('max')

df_tmp = df_tmp.assign(
    fst_release=df_tmp['package'].map(_first_release),
    lst_release=df_tmp['package'].map(_last_release),
    last_time=_last_use,
)

#### Time interval (in days) between the last release of the package and the last time the dependency was used

In [28]:
# Days between the package's last release and the last time the dependency was
# used. Computed column-wise: a per-row apply is slow here and does not return
# a Series when the frame is empty.
_days_since_last_use = (
    (df_tmp['lst_release'] - df_tmp['last_time']).dt.total_seconds() / 3600 / 24
)
df_tmp = (
    df_tmp.rename(columns={'date': 'first_time'})  # 'date' is the first use of the dependency
    .assign(elimination=_days_since_last_use)
)

#### Calculating lifespan of a dependency in days

In [29]:
# period:  days between the first and last use of the dependency.
# removed: 1 when the package has a release dated after the dependency's last
#          use, otherwise 0.
# Both are computed column-wise instead of with a per-row apply.
_lifespan = df_tmp['last_time'] - df_tmp['first_time']
df_tmp = (
    df_tmp.assign(
        period=_lifespan.dt.total_seconds() / 3600 / 24,
        removed=(df_tmp['lst_release'] > df_tmp['last_time']).astype(int),
    )
    [['package','version','target','fst_release','lst_release','first_time','last_time','elimination','period','removed']]
)

In [30]:
# Persist the result as a gzip-compressed CSV. No index= argument is passed, so
# the frame's index labels are written as the first (unnamed) column.
df_tmp.to_csv('../data/package_dependencies_time.csv.gz',compression='gzip')