# Aggregating Metrics from Raw Job Records

##### Purpose

The purpose of this document is to communicate how to aggregate data from raw job records to be used as a starting point more more complex analysis.

##### Files

For this demo we are using a raw sample slice of Raw containing 4 companies.

In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import tarfile

# Load the job records data

##### For this example we will use a slice of job records and descriptions that is in one csv file.  The full month historical archive is split into 16 files, but the same methodology can be used across all of those files.

In [14]:
Onet_2019 = pd.read_csv('onet_taxonomy_2019.csv')
job_records = pd.read_csv('raw_job_archive.csv', parse_dates = ['created','delete_date','last_checked','last_updated'])

In [15]:
# update the raw data with Onet 2019 version
job_records = job_records.merge(Onet_2019.rename(columns={'job_hash':'hash','onet_occupation_code':'onet_occupation_code_2019'}), on = 'hash',how ='left')

In [16]:
job_records.loc[:8]

Unnamed: 0,hash,title,company_id,company_name,city,state,zip,country,created,last_checked,last_updated,delete_date,unmapped_location,onet_occupation_code,url,onet_occupation_code_2019
0,038ce0d30eb5e6eb8c4ed119d8ea23bc,Assistant-Patient Care,3255,Baptist Memorial Health Care,jonesboro,AR,72404.0,USA,2019-10-26 14:10:00+00:00,2019-11-19 19:39:00+00:00,NaT,2019-11-22 03:42:00+00:00,False,31-9092.00,https://careers-bmhcc.icims.com/jobs/96829/ass...,31-9092.00
1,038ce0e07929822b3868b0fb87cf276c,CDL A Company Team Dry Van Truck Driver,11906,"Celadon Trucking Services, Inc.",Horseheads,NY,14844.0,USA,2017-10-04 15:19:00+00:00,2017-11-22 00:06:00+00:00,2017-11-08 00:05:00+00:00,2017-12-13 00:08:00+00:00,False,53-3032.00,http://www.celadondrivers.com/teams-alt.php?ut...,53-3032.00
2,038ce0edbaf2cdb2b14390483033ee41,Recreation Supervisor,31926,City of Killeen,killeen,TX,76541.0,USA,2018-10-25 03:15:00+00:00,2018-10-29 06:44:00+00:00,NaT,2018-10-31 07:44:00+00:00,False,29-1125.00,http://agency.governmentjobs.com/killeen/defau...,39-1014.00
3,038ce0fee0ca25ab0469268a0e1d79d2,Service Engineer - Leicester,2878,Tyco International Ltd.,North West Leicestershire,England,,GBR,2016-07-27 22:58:00+00:00,2016-08-09 11:48:00+00:00,NaT,2016-08-11 08:20:00+00:00,False,49-2098.00,http://www.tyco.com/careers/detail/service-eng...,49-2098.00
4,038ce101aaec2c5c2e5d187916c7837e,Cloud Systems Administrator,36234,Leidos Holdings Inc.,gaithersburg,MD,20877.0,USA,2020-02-13 04:45:00+00:00,2020-02-21 17:39:00+00:00,NaT,2020-02-22 21:20:00+00:00,False,15-1142.00,https://careers.leidos.com/jobs/5042365-cloud-...,15-1244.00
5,038ce126078961493c6753a95b76bd7e,"Police Officer (Lateral Entry - February 10, 2...",3069,"City of Seattle, WA",Seattle,WA,98113.0,USA,2016-11-10 06:26:00+00:00,2017-01-26 08:21:00+00:00,NaT,2017-01-28 09:20:00+00:00,False,33-3051.01,http://agency.governmentjobs.com/seattle/defau...,33-3051.00
6,038ce12d909e501e4c8237aa8c29625e,Patient Care Technician,12630,Suburban Hospital,Bethesda,MD,20810.0,USA,2011-03-12 21:11:00+00:00,2011-03-22 00:02:00+00:00,NaT,2011-03-22 00:02:00+00:00,False,31-1014.00,https://www.healthcaresource.com/suburban/inde...,31-1131.00
7,038ce134670b8097a5f18e761c54b1e1,LAN WAN Technician / IT Specialist,180,Raytheon,Aurora,CO,80041.0,USA,2014-06-25 13:51:00+00:00,2014-06-25 13:51:00+00:00,NaT,2014-06-27 15:50:00+00:00,False,15-1152.00,http://jobs.raytheon.com/jobs/lan-wan-technici...,15-1231.00
8,038ce13d9c53adfafb9b3c5b5a29a8fd,Statistician and Quantitative Methodologist,34636,Arizona State University,,,,USA,2016-01-29 04:04:00+00:00,2016-02-10 16:29:00+00:00,NaT,2016-02-12 11:21:00+00:00,False,15-2041.00,https://sjobs.brassring.com/TGWebHost/jobdetai...,15-2041.00


# Aggregating the data

In [17]:
def combo_roll_up(df, by = ['state','onet_occupation_code_2019','company_id','company_name']):
    
    # Filling in missing values 
    df[by+['hash']]= df[by+['hash']].fillna('Blank')  
    
    # Drop timestamp to just keep date
    try:
        df['created'] = pd.to_datetime(df['created'].dt.date)
        df['delete_date'] = pd.to_datetime(df['delete_date'].dt.date)
    except:
        pass
    # Created Duration column in Job Records
    df['closed_duration'] = (df.delete_date - df.created).astype('timedelta64[D]')
    df.loc[df['closed_duration']>180,'closed_duration'] = 180
    
    # Creating Daily Aggregate
    Aggs = pd.merge(
            df.groupby(by = ['created']+by)['hash'].count().reset_index().rename(columns={
                'hash': 'n_created', 'created': 'date'}),
            df.groupby(by = ['delete_date']+by)['hash'].count().reset_index().rename(columns={
                'hash': 'n_deleted', 'delete_date': 'date'}),
            on=['date']+by, how='outer').fillna(0)
     
    # Calculate Cumulative Created
    Aggs = Aggs.sort_values(by + ['date'])  
    cumsums = Aggs[by+['date','n_created']].groupby(by = by + ['date']).sum()
    cumsums = cumsums.fillna(0).groupby(level = list(range(0,len(by)))).cumsum()
    Aggs.set_index(by + ['date'], inplace = True)
    Aggs['cumulative_created'] = cumsums['n_created']
    Aggs.reset_index(inplace=True)
    
    # Calculate Cumulative Deleted
    Aggs = Aggs.sort_values(by + ['date'])
    cumsums = Aggs[by+['date','n_deleted']].groupby(by + ['date']).sum()
    cumsums = cumsums.fillna(0).groupby(level = list(range(0,len(by)))).cumsum()
    Aggs.set_index(by + ['date'], inplace = True)
    Aggs['cumulative_deleted'] = cumsums['n_deleted']
    Aggs.reset_index(inplace=True)    
    
    #calculate Unique Active
    Aggs['n_active'] = Aggs['cumulative_created']-Aggs['cumulative_deleted']
    Aggs = Aggs.sort_values(by + ['date'])    
            
    #Calculate Closed Duration
    Aggs = pd.merge(
        Aggs, 
        df.groupby(['delete_date'] + by)['closed_duration'].agg(np.nanmean).reset_index(),
        how = 'left', left_on = ['date'] + by, right_on = ['delete_date']+by)
    
    # Get rid of calculation columns
    Aggs = Aggs.drop(['cumulative_created','cumulative_deleted','delete_date'],axis =1)
    Aggs.sort_values(by + ['date'])
    return Aggs   

by = ['state','onet_occupation_code_2019','company_id','company_name']
Aggregates = combo_roll_up(job_records, by)
Aggregates.head(10)



Unnamed: 0,state,onet_occupation_code_2019,company_id,company_name,date,n_created,n_deleted,n_active,closed_duration
0,AR,29-1141.03,13161,Catholic Health Initiatives,2010-03-03,1.0,0.0,1.0,
1,AR,29-1141.03,13161,Catholic Health Initiatives,2010-04-14,0.0,1.0,0.0,42.0
2,AR,31-9092.00,3255,Baptist Memorial Health Care,2019-10-26,1.0,0.0,1.0,
3,AR,31-9092.00,3255,Baptist Memorial Health Care,2019-11-22,0.0,1.0,0.0,27.0
4,AZ,15-1299.09,34636,Arizona State University,2015-11-16,1.0,0.0,1.0,
5,AZ,15-1299.09,34636,Arizona State University,2015-12-01,0.0,1.0,0.0,15.0
6,Blank,11-2021.00,469,"Amazon.com, Inc.",2021-04-09,1.0,0.0,1.0,
7,Blank,11-2021.00,469,"Amazon.com, Inc.",2021-04-16,0.0,1.0,0.0,7.0
8,Blank,11-3031.00,9159,UBS,2019-11-19,1.0,0.0,1.0,
9,Blank,11-3031.00,9159,UBS,2020-03-06,0.0,1.0,0.0,108.0


### question: why delete 'onet_occupation' attribute, the code fail?

In [18]:
def combo_roll_up(df, by = ['hash','state','company_id','company_name']):
    
    # Filling in missing values 
    df[by+['hash']]= df[by+['hash']].fillna('Blank')  
    
    # Drop timestamp to just keep date
    try:
        df['created'] = pd.to_datetime(df['created'].dt.date)
        df['delete_date'] = pd.to_datetime(df['delete_date'].dt.date)
    except:
        pass
    # Created Duration column in Job Records
    df['closed_duration'] = (df.delete_date - df.created).astype('timedelta64[D]')
    df.loc[df['closed_duration']>180,'closed_duration'] = 180
    
    # Creating Daily Aggregate
    Aggs = pd.merge(
            df.groupby(by = ['created']+by)['hash'].count().reset_index().rename(columns={
                'hash': 'n_created', 'created': 'date'}),
            df.groupby(by = ['delete_date']+by)['hash'].count().reset_index().rename(columns={
                'hash': 'n_deleted', 'delete_date': 'date'}),
            on=['date']+by, how='outer').fillna(0)
     
    # Calculate Cumulative Created
    Aggs = Aggs.sort_values(by + ['date'])  
    cumsums = Aggs[by+['date','n_created']].groupby(by = by + ['date']).sum()
    cumsums = cumsums.fillna(0).groupby(level = list(range(0,len(by)))).cumsum()
    Aggs.set_index(by + ['date'], inplace = True)
    Aggs['cumulative_created'] = cumsums['n_created']
    Aggs.reset_index(inplace=True)
    
    # Calculate Cumulative Deleted
    Aggs = Aggs.sort_values(by + ['date'])
    cumsums = Aggs[by+['date','n_deleted']].groupby(by + ['date']).sum()
    cumsums = cumsums.fillna(0).groupby(level = list(range(0,len(by)))).cumsum()
    Aggs.set_index(by + ['date'], inplace = True)
    Aggs['cumulative_deleted'] = cumsums['n_deleted']
    Aggs.reset_index(inplace=True)    
    
    #calculate Unique Active
    Aggs['n_active'] = Aggs['cumulative_created']-Aggs['cumulative_deleted']
    Aggs = Aggs.sort_values(by + ['date'])    
            
    #Calculate Closed Duration
    Aggs = pd.merge(
        Aggs, 
        df.groupby(['delete_date'] + by)['closed_duration'].agg(np.nanmean).reset_index(),
        how = 'left', left_on = ['date'] + by, right_on = ['delete_date']+by)
    
    # Get rid of calculation columns
    Aggs = Aggs.drop(['cumulative_created','cumulative_deleted','delete_date'],axis =1)
    Aggs.sort_values(by + ['date'])
    return Aggs   

by = ['hash','state','company_id','company_name']
Aggregates = combo_roll_up(job_records, by)
Aggregates.head(10)

ValueError: Columns must be same length as key