# Aggregating Metrics from Raw Job Records

##### Purpose

The purpose of this document is to show how to aggregate data from Raw, and to provide a function that accomplishes this or that can serve as a starting point for more complex analysis.

##### Files

For this demo we are using a raw sample slice of Raw containing 4 companies.

In [97]:
# Import Libraries

import itertools
import pandas as pd
import numpy as np

In [109]:
# Read the raw sample slice of job records; the four timestamp columns are
# parsed into datetimes while the CSV is loaded.
JobRecords = pd.read_csv(
    '../raw-sample/raw-sample-jobs.csv',
    parse_dates=['created', 'delete_date', 'last_checked', 'last_updated'],
)


# Function for Aggregation

In [114]:
def combo_roll_up(df, by = ['state','onet_occupation_code','company_id','company_name']):
    
    # Filling in missing values 
    df[by+['hash']]= df[by+['hash']].fillna('Blank')  
    
    # Drop timestamp to just keep date
    try:
        df['created'] = pd.to_datetime(df['created'].dt.date)
        df['delete_date'] = pd.to_datetime(df['delete_date'].dt.date)
    except:
        pass
    # Created Duration column in Job Records
    df['duration'] = (df.delete_date - df.created).astype('timedelta64[D]')
    df.loc[df['duration']>180,'duration'] = 180
    
    # Creating Daily Aggregate
    Aggs = pd.merge(
            df.groupby(by = ['created']+by)['hash'].count().reset_index().rename(columns={'hash': 'n_created', 'created': 'date'}),
            df.groupby(by = ['delete_date']+by)['hash'].count().reset_index().rename(columns={'hash': 'n_deleted', 'delete_date': 'date'}),
            on=['date']+by, how='outer').fillna(0)
     
    # Calculate Cumulative Created
    Aggs = Aggs.sort_values(by + ['date'])  
    cumsums = Aggs[by+['date','n_created']].groupby(by = by + ['date']).sum().fillna(0).groupby(level = list(
                                                                                    range(0,len(by)))).cumsum()
    Aggs.set_index(by + ['date'], inplace = True)
    Aggs['cumulative_created'] = cumsums['n_created']
    Aggs.reset_index(inplace=True)
    
    # Calculate Cumulative Deleted
    Aggs = Aggs.sort_values(by + ['date'])
    cumsums = Aggs[by+['date','n_deleted']].groupby(by + ['date']).sum().fillna(0).groupby(level = list(
                                                                                    range(0,len(by)))).cumsum()
    Aggs.set_index(by + ['date'], inplace = True)
    Aggs['cumulative_deleted'] = cumsums['n_deleted']
    Aggs.reset_index(inplace=True)    
    
    #calculate Unique Active
    Aggs['n_active'] = Aggs['cumulative_created']-Aggs['cumulative_deleted']
    Aggs = Aggs.sort_values(by + ['date'])    
            
    #Calculate Closed Duration
    Aggs = pd.merge(
        Aggs, 
        df.groupby(['delete_date'] + by)['duration'].agg(np.nanmean).reset_index(),
        how = 'left', left_on = ['date'] + by, right_on = ['delete_date']+by)
    
    # Get rid of calculation columns
    Aggs = Aggs.drop(['cumulative_created','cumulative_deleted','delete_date'],axis =1)
    Aggs.sort_values(by + ['date'])
    return Aggs   

# Roll the job records up per company and state, then preview the first
# ten rows of the result.
by = ['company_id', 'company_name', 'state']
Aggregates = combo_roll_up(JobRecords, by=by)
Aggregates.head(10)

Unnamed: 0,company_id,company_name,state,date,n_created,n_deleted,n_active,duration
0,2,"General Mills, Inc.",AB,2018-03-06,1.0,0.0,1.0,
1,2,"General Mills, Inc.",AB,2018-04-25,0.0,1.0,0.0,50.0
2,2,"General Mills, Inc.",AB,2018-06-21,1.0,0.0,1.0,
3,2,"General Mills, Inc.",AB,2018-07-11,0.0,1.0,0.0,20.0
4,2,"General Mills, Inc.",AB,2018-11-21,1.0,0.0,1.0,
5,2,"General Mills, Inc.",AB,2019-01-29,0.0,1.0,0.0,69.0
6,2,"General Mills, Inc.",AL,2015-03-06,1.0,0.0,1.0,
7,2,"General Mills, Inc.",AL,2015-03-18,0.0,1.0,0.0,12.0
8,2,"General Mills, Inc.",AL,2015-04-29,1.0,0.0,1.0,
9,2,"General Mills, Inc.",AL,2015-05-08,0.0,1.0,0.0,9.0
