# Aggregating Metrics from Raw Job Records

##### Purpose

The purpose of this document is to communicate how to aggregate data from raw job records to be used as a starting point for more complex analysis.

##### Files

For this demo we are using a sample slice of the Raw data containing 4 companies.

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import tarfile

# Load the job records data

##### For this example we will use a slice of job records and descriptions that is in one csv file.  The full month historical archive is split into 16 files, but the same methodology can be used across all of those files.

In [2]:
# Read the job records CSV straight out of the compressed sample archive
sample_archive_path = '../../../Feeds/raw-sample.tar.gz'
timestamp_columns = ['created','delete_date','last_checked','last_updated']

with tarfile.open(sample_archive_path, "r:*") as tar:
    # The job_records file is taken to be the second entry listed in the
    # archive -- NOTE(review): this relies on member order; confirm against the archive
    csv_path = tar.getnames()[1]
    # Stream that member into a DataFrame, parsing the timestamp columns as datetimes
    job_records = pd.read_csv(
        tar.extractfile(csv_path),
        parse_dates = timestamp_columns,
        low_memory = False,
    )

In [3]:
# Preview the identifying columns ('hash' through 'company_name') of the first records.
# .loc slices by label and includes both endpoints of each slice.
job_records.loc[:10,'hash':'company_name']

Unnamed: 0,hash,title,company_id,company_name
0,0f659e59f8967f986ab53b898e543095,Financial Solutions Advisor - Bilingual Mandar...,381,Bank of America Corporation
1,058e013f78e737baf7a6d9c36b33ba9e,"Teller - Bishop, CA (Parttime, 20hrs)",381,Bank of America Corporation
2,0234422f9c5eff7f6d1d008c3c31dae6,"UI Artist, Double Helix Games, Amazon Game Stu...",469,"Amazon.com, Inc."
3,0a5cf0310984a8c86bb42b1269640b28,Home Retention Specialist/Collector I - Russel...,381,Bank of America Corporation
4,0a7cd92ca3f7277442bbd157c018a968,South San Diego-Sales & Service Specialist-Jac...,381,Bank of America Corporation
5,03ffb7f37da9f17b3c9806fb032363c8,Personal Banker - Hialeah Gardens Banking Cent...,381,Bank of America Corporation
6,07faeba744c2ca2d0525e78b6b08e142,Mortgage Loan Officer-Palm Desert,381,Bank of America Corporation
7,0c392d37eb6dfb15b1ffc2a48b8d95e7,"Warehouse Team Member (Seasonal, Part Time, Fl...",469,"Amazon.com, Inc."
8,0ee6d7263a67121e8405af8ad8fdef2a,Responsable des opérations de contrôle d'inven...,469,"Amazon.com, Inc."
9,0aaf97d36a5279ac18a43d1e34344232,"Relationship Manager-Newton/Waltham, MA Area",381,Bank of America Corporation


# Aggregating the data

In [4]:
def combo_roll_up(df, by = ['state','onet_occupation_code','company_id','company_name'],
                  max_duration = 180):
    """Roll raw job records up into daily metrics per combination of `by` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Job records. Must contain the columns named in `by` plus 'hash',
        'created' and 'delete_date'. The frame is NOT modified; all work is
        done on a copy of the needed columns.
    by : list of str
        Columns that define a group. Missing values in these columns (and in
        'hash') are replaced by the string 'Blank' so that those records are
        kept in a group of their own instead of being dropped by groupby.
    max_duration : number or None, default 180
        Upper cap, in days, applied to each record's open duration before
        averaging. Pass None to disable the cap.

    Returns
    -------
    pandas.DataFrame
        One row per (group, date) on which at least one record of the group was
        created or deleted, sorted by `by` then 'date', with a fresh 0..n-1
        index and the columns ``by + ['date', 'n_created', 'n_deleted',
        'n_active', 'closed_duration']``:

        * n_created / n_deleted -- records created / deleted on that date
        * n_active -- running total created minus running total deleted within
          the group, up to and including that date
        * closed_duration -- mean (capped) days between creation and deletion
          of the records deleted on that date; NaN when none were deleted

    Raises
    ------
    KeyError
        If a required column is missing from `df`.
    """
    by = list(by)   # private copy; the default list object is shared between calls
    count_cols = ['n_created', 'n_deleted']

    # Work on a copy of just the needed columns so the caller's frame is untouched
    jobs = df[by + ['hash', 'created', 'delete_date']].copy()

    # Filling in missing values
    jobs = jobs.fillna({key: 'Blank' for key in by + ['hash']})

    # Drop timestamp to just keep date (timezone, if any, is removed keeping the
    # local wall-clock date). Conversion problems surface as errors instead of
    # being silently swallowed.
    for col in ('created', 'delete_date'):
        stamps = pd.to_datetime(jobs[col])
        if stamps.dt.tz is not None:
            stamps = stamps.dt.tz_localize(None)
        jobs[col] = stamps.dt.normalize()

    # Days each record stayed open, as float so never-deleted records stay NaN.
    # Dividing by a one-day Timedelta works on every pandas version, unlike
    # .astype('timedelta64[D]') which raises on pandas >= 2.0.
    open_days = (jobs['delete_date'] - jobs['created']) / pd.Timedelta(days=1)
    jobs['closed_duration'] = open_days.clip(upper=max_duration)

    # Daily counts of created records
    created = (jobs.groupby(['created'] + by)['hash'].count()
                   .rename('n_created')
                   .reset_index()
                   .rename(columns={'created': 'date'}))

    # Daily counts of deleted records and their mean open duration (one pass)
    deleted = (jobs.groupby(['delete_date'] + by)
                   .agg(n_deleted=('hash', 'count'),
                        closed_duration=('closed_duration', 'mean'))
                   .reset_index()
                   .rename(columns={'delete_date': 'date'}))

    # Outer merge keeps dates that only saw creations or only saw deletions
    daily = created.merge(deleted, on=['date'] + by, how='outer')
    daily[count_cols] = daily[count_cols].fillna(0)   # closed_duration keeps its NaN
    daily = daily.sort_values(by + ['date']).reset_index(drop=True)

    # Running totals within each group give the number of active records
    if by:
        running = daily.groupby(by)[count_cols].cumsum()
    else:
        running = daily[count_cols].cumsum()
    daily['n_active'] = running['n_created'] - running['n_deleted']

    return daily[by + ['date', 'n_created', 'n_deleted', 'n_active', 'closed_duration']]

# Grouping keys for the roll-up: state, occupation code and company
by = ['state','onet_occupation_code','company_id','company_name']
# Build the daily aggregates from the loaded job records
Aggregates = combo_roll_up(job_records, by)
# Preview the first 10 rows of the result
Aggregates.head(10)

Unnamed: 0,state,onet_occupation_code,company_id,company_name,date,n_created,n_deleted,n_active,closed_duration
0,AB,11-1021.00,469,"Amazon.com, Inc.",2017-07-26,2.0,0.0,2.0,
1,AB,11-1021.00,469,"Amazon.com, Inc.",2017-07-27,2.0,2.0,2.0,1.0
2,AB,11-1021.00,469,"Amazon.com, Inc.",2017-08-02,0.0,2.0,0.0,6.0
3,AB,11-1021.00,469,"Amazon.com, Inc.",2018-08-09,1.0,0.0,1.0,
4,AB,11-1021.00,469,"Amazon.com, Inc.",2018-12-05,0.0,1.0,0.0,118.0
5,AB,11-2011.00,469,"Amazon.com, Inc.",2014-02-12,2.0,0.0,2.0,
6,AB,11-2011.00,469,"Amazon.com, Inc.",2014-02-25,1.0,0.0,3.0,
7,AB,11-2011.00,469,"Amazon.com, Inc.",2014-03-12,2.0,1.0,4.0,28.0
8,AB,11-2011.00,469,"Amazon.com, Inc.",2014-06-12,0.0,1.0,3.0,107.0
9,AB,11-2011.00,469,"Amazon.com, Inc.",2014-07-15,1.0,2.0,2.0,139.0
