In [1]:
from pathlib import Path
import pandas as pd
import sys
import os
# Resolve to an absolute path *before* changing directory, so that
# `module_path` keeps pointing at the same folder afterwards and the cell
# can be re-run safely.
module_path = Path("../src/").resolve()
os.chdir(module_path)
# sys.path entries must be plain strings: the import machinery ignores
# non-str entries such as Path objects. Skip the append when the entry is
# already present so re-running the cell does not grow sys.path.
if str(module_path) not in sys.path:
    sys.path.append(str(module_path))
import warnings
warnings.filterwarnings("ignore")
from data_access.load_file import load_table
from data_processing.functions import time_for_phase
from data_processing import features
from data_preparation import compute_work_item_times

In this notebook we give an overview of the functions in the module data_processing.features. First we load the data and compute the times for every phase.

In [2]:
# Relative path: resolved against the working directory, which the first
# cell changes to ../src via os.chdir.
filename = Path("../data/gdsc2_public.csv")
data = load_table(filename)
# Per-phase time records (previewed in the next cell).
times = time_for_phase(data, process=False)
# Times per work item; passed further down to features.timestamp_information
# with timestamp_col="start".
total_times = compute_work_item_times(data)

In [3]:
# Preview the first rows of the per-phase times table.
times.head()

Unnamed: 0,work_item,from_timestamp,current_phase,current_resource,process_index,to_timestamp,duration,duration_in_days
0,WI_000001,2015-01-02 14:39:14,Analyze,ER_00043,1,2015-01-05 11:48:08,2 days 21:08:54,2.88
1,WI_000002,2015-01-02 15:04:20,Analyze,ER_00225,1,2015-01-14 09:46:19,11 days 18:41:59,11.78
2,WI_000003,2015-01-02 15:28:22,Analyze,ER_00225,1,2015-01-21 11:37:06,18 days 20:08:44,18.84
3,WI_000004,2015-01-02 15:33:54,Analyze,ER_00225,1,2015-01-14 10:41:26,11 days 19:07:32,11.8
4,WI_000005,2015-01-02 16:32:11,Analyze,ER_00206,1,2015-01-28 09:03:38,25 days 16:31:27,25.69


The first function reshapes the times dataframe into one row per work item, where each current_phase label becomes a feature column and its value is taken from duration_in_days.

In [4]:
# With time_col given, each phase column holds a duration value
# (here taken from the duration_in_days column).
work_times = features.work_times(times, phase_col="current_phase", time_col="duration_in_days")
work_times.head()

Unnamed: 0,work_item,Accept,Analyze,Build,Clarify,Deploy,Design,End,Package,Test
0,WI_000001,1.1,4.82,5.83,0.0,7.02,0.17,0.0,3.91,2.03
1,WI_000002,0.0,11.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,WI_000003,0.0,44.89,0.0,0.0,0.0,6.88,0.0,0.0,3.08
3,WI_000004,2.05,11.8,0.0,0.0,11.89,0.0,0.0,0.0,0.0
4,WI_000005,0.05,25.69,0.98,0.0,6.1,0.03,0.0,0.0,0.0


If we don't pass a value for the parameter time_col, the function instead calculates how often each phase occurs and uses these frequencies as values.

In [5]:
# Same function without time_col: the phase columns now hold occurrence
# counts and carry a "_freq" suffix in the output.
work_frequency = features.work_times(times, phase_col="current_phase")
work_frequency.head()

Unnamed: 0,work_item,Accept_freq,Analyze_freq,Build_freq,Clarify_freq,Deploy_freq,Design_freq,End_freq,Package_freq,Test_freq
0,WI_000001,1,3,1,0,1,3,1,1,1
1,WI_000002,0,1,1,0,0,1,1,0,0
2,WI_000003,0,2,1,0,0,2,1,1,1
3,WI_000004,1,1,1,0,1,1,1,1,1
4,WI_000005,1,1,2,0,1,2,1,0,0


The next function, process_length, calculates the total process length. A phase that occurs several times is counted each time it occurs.

In [6]:
# One process_length value per work item. In the preview below it equals the
# sum of that item's "_freq" columns from the previous cell (e.g. 12 for
# WI_000001).
process_length = features.process_length(times)
process_length.head()

Unnamed: 0,work_item,process_length
0,WI_000001,12
1,WI_000002,4
2,WI_000003,8
3,WI_000004,8
4,WI_000005,8


The function is_open returns a 0/1 column, which is 1 if the item is still open and 0 if the item is closed.

In [7]:
# Takes the raw data table (not `times`) and yields one 0/1 is_open flag per
# work item.
is_open = features.is_open(data)
is_open.head()

Unnamed: 0,work_item,is_open
0,WI_000001,0
1,WI_000002,0
2,WI_000003,0
3,WI_000004,0
4,WI_000005,0


This function extracts calendar information (weekday, day, week, month and year) from a timestamp column.

In [8]:
# Uses total_times (not `times`); the output columns are prefixed with the
# timestamp_col name: start_weekday, start_day, start_week, start_month,
# start_year.
start_time = features.timestamp_information(times=total_times, timestamp_col="start")
start_time.head()

Unnamed: 0,work_item,start_weekday,start_day,start_week,start_month,start_year
0,WI_000001,5,2,1,1,2015
1,WI_000002,5,2,1,1,2015
2,WI_000003,5,2,1,1,2015
3,WI_000004,5,2,1,1,2015
4,WI_000005,5,2,1,1,2015


This function clusters platform and components

In [9]:
# Assigns each work item a platcomp_cluster label.
# NOTE(review): min_samples=10 is presumably the minimum number of samples
# needed to form a cluster — confirm in features.plat_comp_cluster.
platcomp_cluster = features.plat_comp_cluster(data, min_samples=10)
platcomp_cluster.head()

Unnamed: 0,work_item,platcomp_cluster
0,WI_000001,0
1,WI_000002,1
2,WI_000003,1
3,WI_000004,1
4,WI_000005,2


This function clusters resources for collaboration

In [11]:
# Only a small slice of `times` is passed. `.loc[:10]` is a label-based
# slice, so the end label 10 is included (unlike positional slicing).
# NOTE(review): on this slice every previewed work item gets cluster -1;
# presumably the sample is too small to form clusters — TODO confirm.
resource_cluster = features.resource_cluster(times.loc[:10])
resource_cluster.head()

Unnamed: 0,work_item,resource_cluster
0,WI_000001,-1
1,WI_000002,-1
2,WI_000003,-1
3,WI_000004,-1
4,WI_000005,-1


This function computes the current workload

In [12]:
# Computed on a small label-based slice of `times` (end label 10 inclusive),
# so the values below reflect that sample only.
resource_workload = features.resource_workload(times.loc[:10])
resource_workload.head()

Unnamed: 0,work_item,resource_workload
0,WI_000001,0.428571
1,WI_000002,0.428571
2,WI_000003,0.428571
3,WI_000004,0.428571
4,WI_000005,0.428571


This function computes the average workload, experience and employment rate of a work_item

In [13]:
# resource_col names the column of `times` that holds the resource ids.
# The result has one row per work item with columns x_emp, x_exp and x_load.
# Computed on a small label-based slice (end label 10 inclusive).
resource_measures = features.resource_measures(times.loc[:10], resource_col="current_resource")
resource_measures.head()

Unnamed: 0,work_item,x_emp,x_exp,x_load
0,WI_000001,0,0,0.166667
1,WI_000002,0,0,0.5
2,WI_000003,0,0,0.5
3,WI_000004,0,0,0.5
4,WI_000005,0,0,0.333333


This function computes whether the receiving timestamp falls on a holiday or in a vacation period, and the number of days until the next holiday and the next vacation.

In [14]:
# Output columns: is_holiday, days_to_nexthol, is_vacation, days_to_nextvac.
# Computed on a small label-based slice of `times` (end label 10 inclusive).
hol = features.get_holidays(times.loc[:10])
hol.head()

Unnamed: 0,work_item,is_holiday,days_to_nexthol,is_vacation,days_to_nextvac
0,WI_000001,0,93.0,1,0.0
1,WI_000002,0,93.0,1,0.0
2,WI_000003,0,93.0,1,0.0
3,WI_000004,0,93.0,1,0.0
4,WI_000005,0,93.0,1,0.0
