## Imports

In [1]:
import pandas as pd
import sqlite3 as sq
import matplotlib as mpl
from matplotlib import rcParams
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_rows', 1000); pd.set_option('display.max_columns', 1000); pd.set_option('display.width', 1000)
pd.options.mode.chained_assignment = None  # default='warn'

## Functions

In [2]:
def mantissa_round(x):
    y = np.floor(x)
    indices = np.argsort(x-y)
    hilowman = indices[::-1]
    diffxy = np.sum(x)-np.sum(y)
    z=0
    while z < diffxy:
        #z single dimemsion - need to adjust to take table
        y[hilowman[z]] +=1
        z +=1
    return y

## Dictionaries

In [3]:
wptobeadroppercents = {'WP%:Farm': 'Farm employment', 'WP%:Forestry, Fishing, & Related': 'Forestry, fishing, and related activities', 
                       'WP%:Mining': 'Mining, quarrying, and oil and gas extraction', 'WP%:Construction': 'Construction', 'WP%:Manufacturing': 'Manufacturing', 
                       'WP%:Utilities': 'Utilities', 'WP%:Wholesale Trade': 'Wholesale trade', 'WP%:Retail Trade': 'Retail trade', 
                       'WP%:Transportation & Warehousing': 'Transportation and warehousing', 'WP%:Information': 'Information', 
                       'WP%:Finance & Insurance': 'Finance and insurance', 'WP%:Real Estate, Rental, & Leasing': 'Real estate and rental and leasing', 
                       'WP%:Professional & Technical Services': 'Professional, scientific, and technical services', 
                       'WP%:Management of Companies & Enterprises': 'Management of companies and enterprises',
                       'WP%:Administrative & Waste Services': 'Administrative and support and waste management and remediation services', 
                       'WP%:Educational Services': 'Educational services', 'WP%:Healthcare & Social Assistance': 'Health care and social assistance', 
                       'WP%:Arts, Entertainment, & Recreation': 'Arts, entertainment, and recreation',
                       'WP%:Accommodation & Food Services': 'Accommodation and food services', 
                       'WP%:Other': 'Other services (except government and government enterprises)', 'WP%:Federal Civilian': 'Federal civilian', 
                       'WP%:Federal Military': 'Military', 'WP%:State & Local Government': 'State and local'}

In [4]:
wptobea = {'WP:Total': 'Total employment (number of jobs)', 'WP:Farm': 'Farm employment', 
           'WP:Forestry, Fishing, & Related': 'Forestry, fishing, and related activities', 'WP:Mining': 'Mining, quarrying, and oil and gas extraction', 
           'WP:Construction': 'Construction', 'WP:Manufacturing': 'Manufacturing', 'WP:Utilities': 'Utilities','WP:Wholesale Trade': 'Wholesale trade', 
           'WP:Retail Trade': 'Retail trade', 'WP:Transportation & Warehousing': 'Transportation and warehousing', 'WP:Information': 'Information', 
           'WP:Finance & Insurance': 'Finance and insurance','WP:Real Estate, Rental, & Leasing': 'Real estate and rental and leasing', 
           'WP:Professional & Technical Services': 'Professional, scientific, and technical services', 
           'WP:Management of Companies & Enterprises': 'Management of companies and enterprises',
           'WP:Administrative & Waste Services': 'Administrative and support and waste management and remediation services', 
           'WP:Educational Services': 'Educational services', 
           'WP:Healthcare & Social Assistance': 'Health care and social assistance', 'WP:Arts, Entertainment, & Recreation': 'Arts, entertainment, and recreation',
           'WP:Accommodation & Food Services': 'Accommodation and food services', 'WP:Other': 'Other services (except government and government enterprises)', 
           'WP:Federal Civilian': 'Federal civilian', 'WP:Federal Military': 'Military', 'WP:State & Local Government': 'State and local', 
           
           'WP%:Farm': 'Farm employment %', 'WP%:Forestry, Fishing, & Related': 'Forestry, fishing, and related activities %', 
           'WP%:Mining': 'Mining, quarrying, and oil and gas extraction %', 'WP%:Construction': 'Construction %', 'WP%:Manufacturing': 'Manufacturing %', 
           'WP%:Utilities': 'Utilities %', 'WP%:Wholesale Trade': 'Wholesale trade %', 'WP%:Retail Trade': 'Retail trade %', 
           'WP%:Transportation & Warehousing': 'Transportation and warehousing %', 'WP%:Information': 'Information %', 
           'WP%:Finance & Insurance': 'Finance and insurance %', 'WP%:Real Estate, Rental, & Leasing': 'Real estate and rental and leasing %', 
           'WP%:Professional & Technical Services': 'Professional, scientific, and technical services %', 
           'WP%:Management of Companies & Enterprises': 'Management of companies and enterprises %',
           'WP%:Administrative & Waste Services': 'Administrative and support and waste management and remediation services %', 
           'WP%:Educational Services': 'Educational services %', 'WP%:Healthcare & Social Assistance': 'Health care and social assistance %', 
           'WP%:Arts, Entertainment, & Recreation': 'Arts, entertainment, and recreation %', 'WP%:Accommodation & Food Services': 'Accommodation and food services %', 
           'WP%:Other': 'Other services (except government and government enterprises) %', 'WP%:Federal Civilian': 'Federal civilian %', 
           'WP%:Federal Military': 'Military %', 'WP%:State & Local Government': 'State and local %'}

# Initial Baseline

In [5]:
#bring in BEA data
conn = sq.connect('../../Data-Pipelines/Outputs/Labor_Economy.db')
sql_query = pd.read_sql('SELECT * FROM [BEA2017_2022Employment]', conn)
data = pd.DataFrame(sql_query)

In [6]:
#select only counties we want, narrow in on types of employment, and select only 2022 (most recent year)
thelist = ['Cheatham County, Tennessee', 'Davidson County, Tennessee', 'Dickson County, Tennessee', 'Houston County, Tennessee', 
           'Humphreys County, Tennessee', 'Maury County, Tennessee', 'Montgomery County, Tennessee', 'Robertson County, Tennessee', 
           'Rutherford County, Tennessee', 'Stewart County, Tennessee', 'Sumner County, Tennessee', 'Williamson County, Tennessee', 
           'Wilson County, Tennessee', 'Trousdale County, Tennessee']
data = data.loc[data['NAME'].isin(thelist)]
data = data.loc[(data['Category'] == 'By Type')|(data['Category'] == 'By industry')|(data['Category'] == 'Total')]
data = data[['NAME', 'Description', '2022']]

In [7]:
#index into employment categories
thelist = ['Total employment (number of jobs)', 'Farm employment', 'Forestry, fishing, and related activities', 
           'Mining, quarrying, and oil and gas extraction', 'Construction', 'Manufacturing', 
           'Utilities','Wholesale trade', 'Retail trade', 
           'Transportation and warehousing', 'Information', 'Finance and insurance',
           'Real estate and rental and leasing', 'Professional, scientific, and technical services', 'Management of companies and enterprises',
           'Administrative and support and waste management and remediation services', 'Educational services', 'Health care and social assistance', 
           'Arts, entertainment, and recreation','Accommodation and food services', 'Other services (except government and government enterprises)', 
           'Federal civilian', 'Military', 'State and local']
data = data.loc[data['Description'].isin(thelist)]

In [8]:
#suppressed industries are 666666666 in file, replace these with nan
value_to_replace = 666666666.0
data.replace(value_to_replace, np.nan, inplace = True)

In [9]:
#bring in the W+P Data
conn = sq.connect('../../Data-Pipelines/Outputs/Labor_Economy.db')
sql_query = pd.read_sql('SELECT * FROM [WP2023_IndustryEmployment_Annual_Change_2023Base]', conn)
initial = pd.DataFrame(sql_query)
thelist = ['Cheatham County, Tennessee', 'Davidson County, Tennessee', 'Dickson County, Tennessee', 'Houston County, Tennessee', 
           'Humphreys County, Tennessee', 'Maury County, Tennessee', 'Montgomery County, Tennessee', 'Robertson County, Tennessee', 
           'Rutherford County, Tennessee', 'Stewart County, Tennessee', 'Sumner County, Tennessee', 'Williamson County, Tennessee', 
           'Wilson County, Tennessee', 'Trousdale County, Tennessee']
initial = initial.loc[initial['NAME'].isin(thelist)].reset_index(drop = True)
initial = initial.loc[initial['Year'] == '2022'].reset_index(drop = True)
initial = initial[['NAME', 
                   'WP:Total', 'WP:Farm', 'WP:Forestry, Fishing, & Related','WP:Mining', 'WP:Construction', 'WP:Manufacturing', 'WP:Utilities', 
                   'WP:Wholesale Trade', 'WP:Retail Trade', 'WP:Transportation & Warehousing', 'WP:Information', 'WP:Finance & Insurance', 
                   'WP:Real Estate, Rental, & Leasing', 'WP:Professional & Technical Services', 'WP:Management of Companies & Enterprises',
                   'WP:Administrative & Waste Services', 'WP:Educational Services', 'WP:Healthcare & Social Assistance', 'WP:Arts, Entertainment, & Recreation', 
                   'WP:Accommodation & Food Services', 'WP:Other', 'WP:Federal Civilian', 'WP:Federal Military', 'WP:State & Local Government']]

In [10]:
#rename columns to match bea employment descriptions
initial = initial.rename(columns = wptobea)
cols = list(initial.columns)
cols.remove('NAME')
initial = initial.melt(id_vars = ['NAME'], value_vars = cols, var_name = 'Description', value_name = '2022')

In [11]:
initial.head(2)

Unnamed: 0,NAME,Description,2022
0,"Cheatham County, Tennessee",Total employment (number of jobs),16640.0
1,"Davidson County, Tennessee",Total employment (number of jobs),713524.0


In [12]:
# Merge based on category columns
merged_df = pd.merge(data, initial, on = ['NAME', 'Description'], how='left', suffixes=('_BEA', '_WP'))

In [13]:
df = merged_df

In [14]:
df.head(2)

Unnamed: 0,NAME,Description,2022_BEA,2022_WP
0,"Cheatham County, Tennessee",Total employment (number of jobs),17368.0,16640.0
1,"Cheatham County, Tennessee",Farm employment,432.0,423.0


In [15]:
#hardcode list of geographies that have suppressed industries and make this a separate dataframe
geos = ['Cheatham County, Tennessee', 'Dickson County, Tennessee', 'Houston County, Tennessee', 
           'Humphreys County, Tennessee', 'Maury County, Tennessee', 'Montgomery County, Tennessee', 'Robertson County, Tennessee', 
           'Rutherford County, Tennessee', 'Stewart County, Tennessee', 'Sumner County, Tennessee', 
           'Wilson County, Tennessee', 'Trousdale County, Tennessee']
#create a hardcoded separate dataframe of geographies with no suppressed industries to add back in after imputation process
thelist = ['Davidson County, Tennessee', 'Williamson County, Tennessee']
nonsuppressedcos = df.loc[df['NAME'].isin(thelist)]
#suppressed counties re-named as df
df = df.loc[~df['NAME'].isin(thelist)]

In [16]:
nonsuppressedcos = nonsuppressedcos.rename(columns = {'2022_BEA': '2022 Base'})
nonsuppressedcos = nonsuppressedcos.drop(columns = '2022_WP')

In [17]:
#impute suppressed industries
#identify suppressed industries in the BEA column by seeing where the value is null and grouping these
bea_suppressed_mask = df['2022_BEA'].isna()
suppressed_bea = df[bea_suppressed_mask].groupby('NAME')['Description'].apply(list)

# Use the same mask to identify mirroring suppressed industries in WP
wp_suppressed_sum = df[bea_suppressed_mask].groupby('NAME')['2022_WP'].sum()

#get the total of suppressed employment in BEA to later distribute to suppressed industries based on shares from W+P
bea_suppressed_sum = df[bea_suppressed_mask].groupby('NAME')['2022_BEA'].sum()

# Initialize the new column with NaN
df['Share of Suppressed Industries'] = np.nan

# Initialize a new column '2022 Base'
df['2022 Base'] = np.nan

# Iterate over unique geographies and update 'Share of Suppressed Industries' and '2022 Base'
for geography in geos:
    # Update 'Share of Suppressed Industries'
    #this is keeping track of the suppressed bea industries per geography
    wp_mask = (df['NAME'] == geography) & df['Description'].isin(suppressed_bea[geography])
    #then we're getting the sum of the equivalent industry employment from the w+p data to mirror the bea
    wp_suppressed_sum_geography = wp_suppressed_sum.get(geography, np.nan)
    if not np.isnan(wp_suppressed_sum_geography):
        df.loc[wp_mask, 'Share of Suppressed Industries'] = df.loc[wp_mask, '2022_WP'] / wp_suppressed_sum_geography
    
    # Calculate bea_suppressed_sum for the current geography
    bea_mask = (df['NAME'] == geography)
    total_bea = df.loc[bea_mask & (df['Description'] == 'Total employment (number of jobs)'), '2022_BEA'].iloc[0]
    non_null_values = df.loc[bea_mask & ~df['2022_BEA'].isnull() & (df['Description'] != 'Total employment (number of jobs)'), '2022_BEA'].sum()
    bea_suppressed_sum_geography = total_bea - non_null_values
    
    # Update '2022 Base'
    base_mask = (df['NAME'] == geography)
    bea_present_mask = base_mask & ~df['2022_BEA'].isnull()
    df.loc[bea_present_mask, '2022 Base'] = df.loc[bea_present_mask, '2022_BEA']
    wp_suppressed_mask = base_mask & df['Description'].isin(suppressed_bea[geography])
    df.loc[wp_suppressed_mask, '2022 Base'] = bea_suppressed_sum_geography * df.loc[wp_suppressed_mask, 'Share of Suppressed Industries']

In [18]:
#add the counties with no suppressed values back into the main df
df = df[['NAME', 'Description', '2022 Base']]
df = pd.concat([df, nonsuppressedcos])

# Industry Aggregations

In [19]:
#group the government industries
thelist = ['Federal civilian', 'Military', 'State and local']
government = df.loc[df['Description'].isin(thelist)]
other = df.loc[~df['Description'].isin(thelist)]
government = government.drop(columns = 'Description')
government = government.groupby(['NAME']).agg({'2022 Base': 'sum'}).reset_index()
government['Description'] = 'Government'
df = pd.concat([government, other])

In [20]:
#group the ag forestry fishing and hunting industries
thelist = ['Farm employment', 'Forestry, fishing, and related activities']
eleven = df.loc[df['Description'].isin(thelist)]
other = df.loc[~df['Description'].isin(thelist)]
eleven = eleven.drop(columns = 'Description')
eleven = eleven.groupby(['NAME']).agg({'2022 Base': 'sum'}).reset_index()
eleven['Description'] = 'Agriculture, forestry, fishing and hunting'
df = pd.concat([eleven, other])

In [21]:
df.head(2)

Unnamed: 0,NAME,2022 Base,Description
0,"Cheatham County, Tennessee",491.015712,"Agriculture, forestry, fishing and hunting"
1,"Davidson County, Tennessee",693.0,"Agriculture, forestry, fishing and hunting"


# Smoothing

In [22]:
conn = sq.connect('../../Data-Pipelines/Outputs/Labor_Economy.db')
sql_query = pd.read_sql('SELECT * FROM [WP2023_IndustryEmployment_Annual_Change_2023Base]', conn)
emp = pd.DataFrame(sql_query)
emp = emp.loc[emp['Year'] != 'None']
emp['Year'] = emp['Year'].astype(int)
emp= emp[emp['Year'] > 2021]
# Create a boolean array indicating which columns contain the string "%"
cols_to_drop = emp.columns[emp.columns.str.contains('%')]
# Drop the columns containing the string "%"
emp.drop(cols_to_drop, axis=1, inplace=True)
cols_to_drop = emp.columns[emp.columns.str.contains('Change')]
# Drop the columns containing the string "Change"
emp.drop(cols_to_drop, axis=1, inplace=True)
#filter and renname
emp['Agriculture, forestry, fishing and hunting'] = emp['WP:Farm'] + emp['WP:Forestry, Fishing, & Related']
thelist = [emp['WP:Federal Civilian'], emp['WP:Federal Military'], emp['WP:State & Local Government']]
emp['Government'] = sum(thelist)
emp = emp.drop(columns = ['WP:Farm', 'WP:Federal Civilian', 'WP:Federal Military','WP:Forestry, Fishing, & Related', 'WP:State & Local Government', 
                         'Ind:Construction', 'Ind:Education & Health Services', 'Ind:Farm', 'Ind:Financial Activities','Ind:Goods Producing', 'Ind:Information', 
                          'Ind:Leisure & Hospitality', 'Ind:Manufacturing', 'Ind:Natural Resources & Mining', 'Ind:Other','Ind:Professional & Business Services',
                          'Ind:Public Administration', 'Ind:Service Producing', 'Ind:Snapshot: Blue Collar','Ind:Snapshot: Farm','Ind:Snapshot: Government', 
                          'Ind:Snapshot: Industrial, Manufacturing, Utilities, & Logistics','Ind:Snapshot: Professional Services', 'Time Frame', 
                          'Ind:Snapshot: Retail & Hospitality', 'Ind:Snapshot: White Collar', 'Ind:Total', 'Source', 'GEO_ID', 'Ind:Trade, Transportation, & Utilities'])
inddict = {'WP:Accommodation & Food Services': 'Accommodation and food services', 
           'WP:Administrative & Waste Services': 'Administrative and support and waste management and remediation services',
           'WP:Arts, Entertainment, & Recreation': 'Arts, entertainment, and recreation', 
           'WP:Construction': 'Construction', 
           'WP:Educational Services': 'Educational services', 
           'WP:Finance & Insurance': 'Finance and insurance', 
           'WP:Healthcare & Social Assistance': 'Health care and social assistance', 
           'WP:Information': 'Information',
           'WP:Management of Companies & Enterprises': 'Management of companies and enterprises', 
           'WP:Manufacturing': 'Manufacturing', 
           'WP:Mining': 'Mining, quarrying, and oil and gas extraction', 
           'WP:Other': 'Other services (except government and government enterprises)', 
           'WP:Professional & Technical Services': 'Professional, scientific, and technical services',
           'WP:Real Estate, Rental, & Leasing': 'Real estate and rental and leasing', 
           'WP:Retail Trade': 'Retail trade', 
           'WP:Total': 'Total employment (number of jobs)', 
           'WP:Transportation & Warehousing': 'Transportation and warehousing',
           'WP:Utilities': 'Utilities', 
           'WP:Wholesale Trade': 'Wholesale trade'}
emp = emp.rename(columns = inddict)
emp['Year'] = emp['Year'].astype(str)
emp['Year'] = emp['Year'].replace({'2022':'2022WP'})
#select only counties we want, narrow in on types of employment, and select only 2022 (most recent year)
thelist = ['Cheatham County, Tennessee', 'Davidson County, Tennessee', 'Dickson County, Tennessee', 'Houston County, Tennessee', 
           'Humphreys County, Tennessee', 'Maury County, Tennessee', 'Montgomery County, Tennessee', 'Robertson County, Tennessee', 
           'Rutherford County, Tennessee', 'Stewart County, Tennessee', 'Sumner County, Tennessee', 'Williamson County, Tennessee', 
           'Wilson County, Tennessee', 'Trousdale County, Tennessee']
emp = emp.loc[emp['NAME'].isin(thelist)]
emp.head(2)

Unnamed: 0,NAME,Year,Accommodation and food services,Administrative and support and waste management and remediation services,"Arts, entertainment, and recreation",Construction,Educational services,Finance and insurance,Health care and social assistance,Information,Management of companies and enterprises,Manufacturing,"Mining, quarrying, and oil and gas extraction",Other services (except government and government enterprises),"Professional, scientific, and technical services",Real estate and rental and leasing,Retail trade,Total employment (number of jobs),Transportation and warehousing,Utilities,Wholesale trade,"Agriculture, forestry, fishing and hunting",Government
32,"Cheatham County, Tennessee",2022WP,924.0,1205.0,599.0,1870.0,246.0,508.0,759.0,130.0,111.0,2779.0,16.0,1154.0,726.0,771.0,1446.0,16640.0,884.0,36.0,216.0,477.0,1783.0
33,"Cheatham County, Tennessee",2023,957.0,1198.0,591.0,1856.0,250.0,481.0,760.0,121.0,117.0,2752.0,16.0,1159.0,718.0,778.0,1425.0,16590.0,896.0,35.0,212.0,473.0,1795.0


In [23]:
df = df.pivot(index = 'NAME', columns = 'Description', values = '2022 Base').reset_index(drop = False)
df['Year'] = '2022BL'

In [24]:
baseline = pd.concat([emp, df]).reset_index(drop = True)
baseline.head(2)

Unnamed: 0,NAME,Year,Accommodation and food services,Administrative and support and waste management and remediation services,"Arts, entertainment, and recreation",Construction,Educational services,Finance and insurance,Health care and social assistance,Information,Management of companies and enterprises,Manufacturing,"Mining, quarrying, and oil and gas extraction",Other services (except government and government enterprises),"Professional, scientific, and technical services",Real estate and rental and leasing,Retail trade,Total employment (number of jobs),Transportation and warehousing,Utilities,Wholesale trade,"Agriculture, forestry, fishing and hunting",Government
0,"Cheatham County, Tennessee",2022WP,924.0,1205.0,599.0,1870.0,246.0,508.0,759.0,130.0,111.0,2779.0,16.0,1154.0,726.0,771.0,1446.0,16640.0,884.0,36.0,216.0,477.0,1783.0
1,"Cheatham County, Tennessee",2023,957.0,1198.0,591.0,1856.0,250.0,481.0,760.0,121.0,117.0,2752.0,16.0,1159.0,718.0,778.0,1425.0,16590.0,896.0,35.0,212.0,473.0,1795.0


In [25]:
baseline = baseline.melt(id_vars = ['NAME', 'Year'], var_name = 'Industry', value_name = 'Employment')
baseline = baseline.pivot(index = ['Industry', 'NAME'], columns = 'Year', values = 'Employment').reset_index()
baseline.head(2)

Year,Industry,NAME,2022BL,2022WP,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,Accommodation and food services,"Cheatham County, Tennessee",877.0,924.0,957.0,989.0,1025.0,1061.0,1098.0,1136.0,1175.0,1215.0,1256.0,1297.0,1340.0,1384.0,1428.0,1474.0,1521.0,1569.0,1618.0,1668.0,1720.0,1772.0,1826.0,1881.0,1938.0,1995.0,2055.0,2115.0,2177.0,2241.0
1,Accommodation and food services,"Davidson County, Tennessee",59594.0,59437.0,63524.0,67610.0,68655.0,69679.0,70682.0,71665.0,72627.0,73567.0,74487.0,75385.0,76262.0,77118.0,77953.0,78766.0,79558.0,80329.0,81078.0,81806.0,82513.0,83199.0,83864.0,84507.0,85129.0,85731.0,86311.0,86871.0,87410.0,87929.0


In [26]:
#creating a non adjusted aside in case we want to export it later
non_adj = baseline
non_adj = non_adj.drop(columns = ['2022BL', '2022WP']).set_index(['NAME', 'Industry'])
non_adj = non_adj.add_suffix(" Non-Adjusted").reset_index()

In [27]:
years = list(baseline.columns)
years.remove('NAME')
years.remove('Industry')
years.remove('2022BL')
years.remove('2022WP')
years.remove('2050')

In [28]:
base_final = baseline[['Industry', 'NAME', '2022BL', '2022WP', '2050']]

In [29]:
def apply_formula(row):
    # Assuming '2022' is the base year for BL and WP, and '2050' is the last year
    base_year = 2022
    final_year = 2050

    # Calculate the denominator part of the formula
    denominator = final_year - base_year
    
    # If denominator is zero, return 0 to avoid division by zero error
    if denominator == 0:
        return 0
    # Loop through the years between 2024 and 2049 (inclusive)
    for year in range(base_year + 1, final_year):
        # Calculate the adjusted value for the current year
        adjusted_value = row[f'{year}'] - (row[f'{base_year}WP'] - row[f'{base_year}BL']) / denominator * (final_year - int(year))
       # Update the value for the current year
        row[f'{year}'] = adjusted_value 
    return row

# Apply the formula to each row in the DataFrame
baseline = baseline.apply(apply_formula, axis=1)

In [30]:
baseline.head(2)

Year,Industry,NAME,2022BL,2022WP,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,Accommodation and food services,"Cheatham County, Tennessee",877.0,924.0,911.678571,945.357143,983.035714,1020.714286,1059.392857,1099.071429,1139.75,1181.428571,1224.107143,1266.785714,1311.464286,1357.142857,1402.821429,1450.5,1499.178571,1548.857143,1599.535714,1651.214286,1704.892857,1758.571429,1814.25,1870.928571,1929.607143,1988.285714,2049.964286,2111.642857,2175.321429,2241.0
1,Accommodation and food services,"Davidson County, Tennessee",59594.0,59437.0,63675.392857,67755.785714,68795.178571,69813.571429,70810.964286,71788.357143,72744.75,73679.142857,74593.535714,75485.928571,76357.321429,77207.714286,78037.107143,78844.5,79630.892857,80396.285714,81139.678571,81862.071429,82563.464286,83243.857143,83903.25,84540.642857,85157.035714,85753.428571,86327.821429,86882.214286,87415.607143,87929.0


# Government Distribution

In [31]:
dist = pd.read_csv('../data/jobseqdistr.csv')
dist.head(2)

Unnamed: 0,NAME,Industry,Share Private,Share Self-Employed,Share Government
0,"Cheatham County, Tennessee",Accommodation and food services,89.455529,2.746548,7.797922
1,"Cheatham County, Tennessee",Administrative and support and waste managemen...,52.805013,42.308646,4.886341


In [32]:
data = baseline.merge(dist, on = ['NAME', 'Industry'], how = 'outer')
data.fillna(0, inplace = True)

In [33]:
data.head(2)

Unnamed: 0,Industry,NAME,2022BL,2022WP,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050,Share Private,Share Self-Employed,Share Government
0,Accommodation and food services,"Cheatham County, Tennessee",877.0,924.0,911.678571,945.357143,983.035714,1020.714286,1059.392857,1099.071429,1139.75,1181.428571,1224.107143,1266.785714,1311.464286,1357.142857,1402.821429,1450.5,1499.178571,1548.857143,1599.535714,1651.214286,1704.892857,1758.571429,1814.25,1870.928571,1929.607143,1988.285714,2049.964286,2111.642857,2175.321429,2241.0,89.455529,2.746548,7.797922
1,Accommodation and food services,"Davidson County, Tennessee",59594.0,59437.0,63675.392857,67755.785714,68795.178571,69813.571429,70810.964286,71788.357143,72744.75,73679.142857,74593.535714,75485.928571,76357.321429,77207.714286,78037.107143,78844.5,79630.892857,80396.285714,81139.678571,81862.071429,82563.464286,83243.857143,83903.25,84540.642857,85157.035714,85753.428571,86327.821429,86882.214286,87415.607143,87929.0,98.923177,1.076823,0.0


In [34]:
#hardcode these values in
data.loc[data['Industry'] == 'Government', 'Share Government'] = 100
#calculate numbers from the shares
cols = list(data.columns)
cols.remove('Industry')
cols.remove('NAME')
cols.remove('Share Private')
cols.remove('Share Government')
cols.remove('Share Self-Employed')
for col in cols:
    #currently don't want this value in the dataframe
    #data['{} Private'.format(col)] = data['{}'.format(col)] * (data['Share Private']/100)
    data['{} Government'.format(col)] = data['{}'.format(col)] * (data['Share Government']/100)
    #currently don't want this value in the dataframe
    #data['{} Self-Employed'.format(col)] = data['{}'.format(col)] * (data['Share Self-Employed']/100)
    data.loc[data['Industry'] == 'Government', '{} Government'.format(col)] = (data['{}'.format(col)]*data['Share Government']/100)

In [35]:
data.head()

Unnamed: 0,Industry,NAME,2022BL,2022WP,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050,Share Private,Share Self-Employed,Share Government,2022BL Government,2022WP Government,2023 Government,2024 Government,2025 Government,2026 Government,2027 Government,2028 Government,2029 Government,2030 Government,2031 Government,2032 Government,2033 Government,2034 Government,2035 Government,2036 Government,2037 Government,2038 Government,2039 Government,2040 Government,2041 Government,2042 Government,2043 Government,2044 Government,2045 Government,2046 Government,2047 Government,2048 Government,2049 Government,2050 Government
0,Accommodation and food services,"Cheatham County, Tennessee",877.0,924.0,911.678571,945.357143,983.035714,1020.714286,1059.392857,1099.071429,1139.75,1181.428571,1224.107143,1266.785714,1311.464286,1357.142857,1402.821429,1450.5,1499.178571,1548.857143,1599.535714,1651.214286,1704.892857,1758.571429,1814.25,1870.928571,1929.607143,1988.285714,2049.964286,2111.642857,2175.321429,2241.0,89.455529,2.746548,7.797922,68.387778,72.052801,71.091986,73.718215,76.65636,79.594506,82.610631,85.704735,88.876819,92.126881,95.454923,98.782965,102.266965,105.828944,109.390924,113.108862,116.904779,120.778675,124.730551,128.760406,132.946219,137.132032,141.473804,145.893555,150.469264,155.044974,159.854621,164.664268,169.629873,174.751437
1,Accommodation and food services,"Davidson County, Tennessee",59594.0,59437.0,63675.392857,67755.785714,68795.178571,69813.571429,70810.964286,71788.357143,72744.75,73679.142857,74593.535714,75485.928571,76357.321429,77207.714286,78037.107143,78844.5,79630.892857,80396.285714,81139.678571,81862.071429,82563.464286,83243.857143,83903.25,84540.642857,85157.035714,85753.428571,86327.821429,86882.214286,87415.607143,87929.0,98.923177,1.076823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Accommodation and food services,"Dickson County, Tennessee",2097.0,2266.0,2245.035714,2393.071429,2472.107143,2553.142857,2634.178571,2717.214286,2800.25,2885.285714,2971.321429,3058.357143,3146.392857,3235.428571,3325.464286,3417.5,3509.535714,3603.571429,3699.607143,3795.642857,3893.678571,3992.714286,4093.75,4195.785714,4298.821429,4403.857143,4510.892857,4618.928571,4727.964286,4839.0,98.636114,1.363886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Accommodation and food services,"Houston County, Tennessee",148.0,174.0,147.928571,147.857143,152.785714,157.714286,162.642857,168.571429,173.5,178.428571,184.357143,189.285714,195.214286,200.142857,206.071429,211.0,216.928571,221.857143,227.785714,233.714286,238.642857,244.571429,250.5,256.428571,262.357143,268.285714,274.214286,280.142857,286.071429,292.0,95.627575,4.372425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Accommodation and food services,"Humphreys County, Tennessee",725.0,719.0,724.785714,724.571429,730.357143,737.142857,742.928571,748.714286,754.5,760.285714,765.071429,769.857143,774.642857,779.428571,784.214286,788.0,792.785714,796.571429,800.357143,803.142857,806.928571,809.714286,813.5,816.285714,819.071429,821.857143,823.642857,826.428571,828.214286,831.0,98.109217,1.890783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


So what we need is to find the share of government employment that is distributed to which industries based on the number employed in the government in each industry where there is any government employment.

In [36]:
cols.append('NAME')
cols.append('Industry')
data = data.fillna(0)

In [37]:
#aside = data[['NAME', '2022 Base', 'Description', 'Government']]
#creating a couple "aside" dfs, one that has the 2022 base employment for the "government industry" 
aside1 = data.loc[data['Industry'] == 'Government']
aside1 = aside1[cols]
#and one that is the government employment per geo that isn't in the "government industry"
aside2 = data.loc[(data['Industry'] != 'Government') & (data['Industry'] != 'Total employment (number of jobs)')]
aside2 = aside2[['NAME', '2022BL Government', '2022WP Government','2023 Government','2024 Government','2025 Government','2026 Government',
                 '2027 Government','2028 Government','2029 Government','2030 Government','2031 Government','2032 Government','2033 Government',
                 '2034 Government','2035 Government','2036 Government','2037 Government','2038 Government','2039 Government','2040 Government',
                 '2041 Government','2042 Government','2043 Government','2044 Government','2045 Government','2046 Government','2047 Government',
                 '2048 Government','2049 Government','2050 Government']]

In [38]:
aside1.head(2)

Unnamed: 0,2022BL,2022WP,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050,NAME,Industry
98,1774.0,1783.0,1786.321429,1796.642857,1802.964286,1811.285714,1818.607143,1825.928571,1833.25,1840.571429,1847.892857,1855.214286,1862.535714,1869.857143,1876.178571,1883.5,1890.821429,1898.142857,1905.464286,1911.785714,1919.107143,1926.428571,1931.75,1939.071429,1945.392857,1952.714286,1959.035714,1966.357143,1972.678571,1980.0,"Cheatham County, Tennessee",Government
99,48130.0,47132.0,48155.357143,48180.714286,48004.071429,47822.428571,47636.785714,47448.142857,47255.5,47059.857143,46860.214286,46656.571429,46449.928571,46240.285714,46026.642857,45811.0,45591.357143,45368.714286,45143.071429,44913.428571,44681.785714,44446.142857,44208.5,43967.857143,43725.214286,43479.571429,43229.928571,42978.285714,42725.642857,42470.0,"Davidson County, Tennessee",Government


In [39]:
aside1 = aside1.melt(id_vars = 'NAME', var_name = 'Year', value_name = 'Government Industry Employment')

In [40]:
aside1.head(2)

Unnamed: 0,NAME,Year,Government Industry Employment
0,"Cheatham County, Tennessee",2022BL,1774.0
1,"Davidson County, Tennessee",2022BL,48130.0


In [41]:
aside2 = aside2.melt(id_vars = 'NAME', var_name = 'Year', value_name = 'Employment')

In [42]:
boop = aside2['Year'].str.split(pat = " ", expand = True)
aside2['Year'] = boop[0].str.strip()

In [43]:
#group by geo to get the total non-"government industry" government employment
aside2 = aside2.groupby(['NAME', 'Year'])['Employment'].sum()
aside2 = pd.DataFrame(aside2)
aside2.reset_index(inplace = True)
aside2 = aside2.rename(columns = {'Employment': 'Non-Public Administration Government'})

In [44]:
aside2.head(2)

Unnamed: 0,NAME,Year,Non-Public Administration Government
0,"Cheatham County, Tennessee",2022BL,565.743096
1,"Cheatham County, Tennessee",2022WP,543.172516


In [45]:
aside3 = aside1.merge(aside2, on = ['NAME', 'Year'])

In [46]:
aside3.head(2)

Unnamed: 0,NAME,Year,Government Industry Employment,Non-Public Administration Government
0,"Cheatham County, Tennessee",2022BL,1774.0,565.743096
1,"Davidson County, Tennessee",2022BL,48130.0,21867.714936


In [47]:
#the only industry is government so we are subtracting non public admin from the total for "public admin"
aside3['Public Administration'] = aside3['Government Industry Employment'] - aside3['Non-Public Administration Government']

In [48]:
aside3['Employment'] = aside3['Public Administration']
aside3['Industry'] = 'Public Administration'
aside3 = aside3[['NAME', 'Year', 'Industry', 'Employment']]

In [49]:
aside3.head(2)

Unnamed: 0,NAME,Year,Industry,Employment
0,"Cheatham County, Tennessee",2022BL,Public Administration,1208.256904
1,"Davidson County, Tennessee",2022BL,Public Administration,26262.285064


In [50]:
aside3 = aside3.pivot(index = ['NAME', 'Industry'], columns = 'Year', values = 'Employment').reset_index()
aside3.head(2)

Year,NAME,Industry,2022BL,2022WP,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050
0,"Cheatham County, Tennessee",Public Administration,1208.256904,1239.827484,1215.415368,1221.145069,1215.296572,1209.957381,1204.881155,1199.72346,1193.549865,1187.723651,1182.086544,1176.536511,1170.734795,1164.400053,1157.301427,1152.045294,1145.658708,1139.174759,1133.052414,1125.221057,1118.244473,1112.087031,1103.244242,1095.750036,1087.975116,1080.136992,1071.485129,1064.189096,1055.697225,1047.901315
1,"Davidson County, Tennessee",Public Administration,26262.285064,24755.739839,25700.588654,25139.278615,24492.546221,23862.824402,23239.137446,22624.321598,22015.892757,21413.603219,20821.254161,20235.576614,19658.854631,19089.450041,18527.778976,17974.895235,17429.583853,16892.452091,16362.836421,15841.303884,15328.757958,14822.106827,14325.142267,13835.334787,13352.118452,12876.510275,12406.51973,11943.493915,11487.947328,11038.669488


In [51]:
data.head(2)

Unnamed: 0,Industry,NAME,2022BL,2022WP,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050,Share Private,Share Self-Employed,Share Government,2022BL Government,2022WP Government,2023 Government,2024 Government,2025 Government,2026 Government,2027 Government,2028 Government,2029 Government,2030 Government,2031 Government,2032 Government,2033 Government,2034 Government,2035 Government,2036 Government,2037 Government,2038 Government,2039 Government,2040 Government,2041 Government,2042 Government,2043 Government,2044 Government,2045 Government,2046 Government,2047 Government,2048 Government,2049 Government,2050 Government
0,Accommodation and food services,"Cheatham County, Tennessee",877.0,924.0,911.678571,945.357143,983.035714,1020.714286,1059.392857,1099.071429,1139.75,1181.428571,1224.107143,1266.785714,1311.464286,1357.142857,1402.821429,1450.5,1499.178571,1548.857143,1599.535714,1651.214286,1704.892857,1758.571429,1814.25,1870.928571,1929.607143,1988.285714,2049.964286,2111.642857,2175.321429,2241.0,89.455529,2.746548,7.797922,68.387778,72.052801,71.091986,73.718215,76.65636,79.594506,82.610631,85.704735,88.876819,92.126881,95.454923,98.782965,102.266965,105.828944,109.390924,113.108862,116.904779,120.778675,124.730551,128.760406,132.946219,137.132032,141.473804,145.893555,150.469264,155.044974,159.854621,164.664268,169.629873,174.751437
1,Accommodation and food services,"Davidson County, Tennessee",59594.0,59437.0,63675.392857,67755.785714,68795.178571,69813.571429,70810.964286,71788.357143,72744.75,73679.142857,74593.535714,75485.928571,76357.321429,77207.714286,78037.107143,78844.5,79630.892857,80396.285714,81139.678571,81862.071429,82563.464286,83243.857143,83903.25,84540.642857,85157.035714,85753.428571,86327.821429,86882.214286,87415.607143,87929.0,98.923177,1.076823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
data = pd.concat([data, aside3])

In [53]:
data.head(2)

Unnamed: 0,Industry,NAME,2022BL,2022WP,2023,2024,2025,2026,2027,2028,2029,2030,2031,2032,2033,2034,2035,2036,2037,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047,2048,2049,2050,Share Private,Share Self-Employed,Share Government,2022BL Government,2022WP Government,2023 Government,2024 Government,2025 Government,2026 Government,2027 Government,2028 Government,2029 Government,2030 Government,2031 Government,2032 Government,2033 Government,2034 Government,2035 Government,2036 Government,2037 Government,2038 Government,2039 Government,2040 Government,2041 Government,2042 Government,2043 Government,2044 Government,2045 Government,2046 Government,2047 Government,2048 Government,2049 Government,2050 Government
0,Accommodation and food services,"Cheatham County, Tennessee",877.0,924.0,911.678571,945.357143,983.035714,1020.714286,1059.392857,1099.071429,1139.75,1181.428571,1224.107143,1266.785714,1311.464286,1357.142857,1402.821429,1450.5,1499.178571,1548.857143,1599.535714,1651.214286,1704.892857,1758.571429,1814.25,1870.928571,1929.607143,1988.285714,2049.964286,2111.642857,2175.321429,2241.0,89.455529,2.746548,7.797922,68.387778,72.052801,71.091986,73.718215,76.65636,79.594506,82.610631,85.704735,88.876819,92.126881,95.454923,98.782965,102.266965,105.828944,109.390924,113.108862,116.904779,120.778675,124.730551,128.760406,132.946219,137.132032,141.473804,145.893555,150.469264,155.044974,159.854621,164.664268,169.629873,174.751437
1,Accommodation and food services,"Davidson County, Tennessee",59594.0,59437.0,63675.392857,67755.785714,68795.178571,69813.571429,70810.964286,71788.357143,72744.75,73679.142857,74593.535714,75485.928571,76357.321429,77207.714286,78037.107143,78844.5,79630.892857,80396.285714,81139.678571,81862.071429,82563.464286,83243.857143,83903.25,84540.642857,85157.035714,85753.428571,86327.821429,86882.214286,87415.607143,87929.0,98.923177,1.076823,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
data = data.drop(columns = ['Share Private', 'Share Government', 'Share Self-Employed'])

In [55]:
cols = ['2022BL', '2022WP', '2023', '2024', '2025', '2026', '2027', '2028', '2029', '2030', '2031', '2032', '2033', '2034', '2035', 
        '2036', '2037', '2038', '2039', '2040', '2041', '2042', '2043', '2044', '2045', '2046', '2047', '2048', '2049', '2050']

In [56]:
#for all columns (years) when industry is equal to public administration, the value of that equivalent year's government owned establishment employment for that 
#industry is equal to the value of public administration's value for the year (total industry employment)
#essentially all public administration employment is at a government establishment so can properly re-sum the whole so we can redistribute government emp.
for col in cols:
    data.loc[data['Industry'] == 'Public Administration', '{} Government'.format(col)] = data.loc[data['Industry'] == 'Public Administration', '{}'.format(col)]

In [57]:
data = data.melt(id_vars = ['NAME', 'Industry'], var_name = 'Year', value_name = 'Employment')

In [58]:
data.head(2)

Unnamed: 0,NAME,Industry,Year,Employment
0,"Cheatham County, Tennessee",Accommodation and food services,2022BL,877.0
1,"Davidson County, Tennessee",Accommodation and food services,2022BL,59594.0


In [59]:
#Get the total base for "Government" for each "NAME" and Year
gov_total_base = data[data['Industry'] == 'Government']
gov_total_base = gov_total_base.rename(columns = {'Employment': 'Total Government Employment'})
gov_total_base.drop(columns = 'Industry', inplace = True)

#Map this total base to the original DataFrame
data = data.merge(gov_total_base, on = ['NAME', 'Year'], how = 'left')

In [60]:
gov_total_base.head(2)

Unnamed: 0,NAME,Year,Total Government Employment
98,"Cheatham County, Tennessee",2022BL,1774.0
99,"Davidson County, Tennessee",2022BL,48130.0


In [61]:
data.head(2)

Unnamed: 0,NAME,Industry,Year,Employment,Total Government Employment
0,"Cheatham County, Tennessee",Accommodation and food services,2022BL,877.0,1774.0
1,"Davidson County, Tennessee",Accommodation and food services,2022BL,59594.0,48130.0


In [62]:
#reformat so that government establishment industry employment is a separate column
thelist = ['2022BL Government', '2022WP Government', '2023 Government', '2024 Government', '2025 Government', '2026 Government', '2027 Government', 
           '2028 Government', '2029 Government', '2030 Government', '2031 Government', '2032 Government', '2033 Government', '2034 Government', 
           '2035 Government', '2036 Government', '2037 Government', '2038 Government', '2039 Government', '2040 Government', '2041 Government', 
           '2042 Government', '2043 Government', '2044 Government', '2045 Government', '2046 Government', '2047 Government', 
           '2048 Government', '2049 Government', '2050 Government']
gov = data.loc[data['Year'].isin(thelist)]
gov.drop(columns = 'Total Government Employment', inplace = True)
nongov = data.loc[~data['Year'].isin(thelist)]

In [63]:
nongov.head(2)

Unnamed: 0,NAME,Industry,Year,Employment,Total Government Employment
0,"Cheatham County, Tennessee",Accommodation and food services,2022BL,877.0,1774.0
1,"Davidson County, Tennessee",Accommodation and food services,2022BL,59594.0,48130.0


In [64]:
nongov['Year'].unique()

array(['2022BL', '2022WP', '2023', '2024', '2025', '2026', '2027', '2028',
       '2029', '2030', '2031', '2032', '2033', '2034', '2035', '2036',
       '2037', '2038', '2039', '2040', '2041', '2042', '2043', '2044',
       '2045', '2046', '2047', '2048', '2049', '2050'], dtype=object)

In [65]:
gov.head(2)

Unnamed: 0,NAME,Industry,Year,Employment
9240,"Cheatham County, Tennessee",Accommodation and food services,2022BL Government,68.387778
9241,"Davidson County, Tennessee",Accommodation and food services,2022BL Government,0.0


In [66]:
gov['Year'].unique()

array(['2022BL Government', '2022WP Government', '2023 Government',
       '2024 Government', '2025 Government', '2026 Government',
       '2027 Government', '2028 Government', '2029 Government',
       '2030 Government', '2031 Government', '2032 Government',
       '2033 Government', '2034 Government', '2035 Government',
       '2036 Government', '2037 Government', '2038 Government',
       '2039 Government', '2040 Government', '2041 Government',
       '2042 Government', '2043 Government', '2044 Government',
       '2045 Government', '2046 Government', '2047 Government',
       '2048 Government', '2049 Government', '2050 Government'],
      dtype=object)

In [67]:
gov = gov.rename(columns = {'Employment': 'Industry Government Employment'})
boop = gov['Year'].str.split(pat = " ", expand = True)
gov['Year'] = boop[0].str.strip()

In [68]:
data = nongov.merge(gov, on = ['NAME', 'Industry', 'Year'])

In [69]:
data.head(2)

Unnamed: 0,NAME,Industry,Year,Employment,Total Government Employment,Industry Government Employment
0,"Cheatham County, Tennessee",Accommodation and food services,2022BL,877.0,1774.0,68.387778
1,"Davidson County, Tennessee",Accommodation and food services,2022BL,59594.0,48130.0,0.0


In [70]:
data['Year'].unique()

array(['2022BL', '2022WP', '2023', '2024', '2025', '2026', '2027', '2028',
       '2029', '2030', '2031', '2032', '2033', '2034', '2035', '2036',
       '2037', '2038', '2039', '2040', '2041', '2042', '2043', '2044',
       '2045', '2046', '2047', '2048', '2049', '2050'], dtype=object)

In [71]:
#Calculate the "Share of Total Government Employment"
#This is done by dividing the 'Government' by the 'Total Gov Base' for each row
data['Share of Total Government Employment'] = (data['Industry Government Employment'] / data['Total Government Employment'])*100

In [72]:
data.head(2)

Unnamed: 0,NAME,Industry,Year,Employment,Total Government Employment,Industry Government Employment,Share of Total Government Employment
0,"Cheatham County, Tennessee",Accommodation and food services,2022BL,877.0,1774.0,68.387778,3.855004
1,"Davidson County, Tennessee",Accommodation and food services,2022BL,59594.0,48130.0,0.0,0.0


In [73]:
#hardcode the value of share of total government employment to be 0 for both total and government employment
data.loc[data['Industry'] == 'Total employment (number of jobs)', 'Share of Total Government Employment'] = 0
data.loc[data['Industry'] == 'Government', 'Share of Total Government Employment'] = 0

In [74]:
#multiply value of total government employment by the shares
data.loc[data['Industry'] == 'Public Administration', 'Employment'] = 0
data['Extra Government Employment'] = (data['Share of Total Government Employment']/100) * data['Total Government Employment']
data['NAICS Employment'] = data['Employment'] + data['Extra Government Employment']
#hardcode government NAICS employment to be 0 as an extra measure
data.loc[data['Industry'] == 'Government', 'NAICS Employment'] = 0

In [75]:
catdict = {'Agriculture, forestry, fishing and hunting': 'BEA and NAICS', 'Government': 'BEA', 'Total employment (number of jobs)': 'BEA and NAICS', 
           'Mining, quarrying, and oil and gas extraction': 'BEA and NAICS', 'Utilities': 'BEA and NAICS', 'Construction': 'BEA and NAICS', 
           'Manufacturing': 'BEA and NAICS', 'Wholesale trade': 'BEA and NAICS', 'Retail trade': 'BEA and NAICS', 
           'Transportation and warehousing': 'BEA and NAICS', 'Information': 'BEA and NAICS', 'Finance and insurance': 'BEA and NAICS', 
           'Real estate and rental and leasing': 'BEA and NAICS', 'Professional, scientific, and technical services': 'BEA and NAICS', 
           'Management of companies and enterprises': 'BEA and NAICS', 
           'Administrative and support and waste management and remediation services': 'BEA and NAICS', 'Educational services': 'BEA and NAICS', 
           'Health care and social assistance': 'BEA and NAICS', 'Arts, entertainment, and recreation': 'BEA and NAICS', 
           'Accommodation and food services': 'BEA and NAICS', 'Other services (except government and government enterprises)': 'BEA and NAICS', 
           'Public Administration': 'NAICS'}
naicsdict = {'Agriculture, forestry, fishing and hunting': '11', 'Government': 'NA', 'Total employment (number of jobs)': 'NA', 
           'Mining, quarrying, and oil and gas extraction': '21', 'Utilities': '22', 'Construction': '23', 'Manufacturing': '31-33', 
           'Wholesale trade': '42', 'Retail trade': '44-45', 'Transportation and warehousing': '48-49', 'Information': '51', 
           'Finance and insurance': '52', 'Real estate and rental and leasing': '53', 'Professional, scientific, and technical services': '54', 
           'Management of companies and enterprises': '55', 'Administrative and support and waste management and remediation services': '56', 
           'Educational services': '61', 'Health care and social assistance': '62', 'Arts, entertainment, and recreation': '71', 
           'Accommodation and food services': '72', 'Other services (except government and government enterprises)': '81', 'Public Administration': '92'}
beadict = {'Agriculture, forestry, fishing and hunting': '70, 100', 'Government': '2000', 'Total employment (number of jobs)': '10', 
           'Mining, quarrying, and oil and gas extraction': '200', 'Utilities': '300', 'Construction': '400', 'Manufacturing': '500', 
           'Wholesale trade': '600', 'Retail trade': '700', 'Transportation and warehousing': '800', 'Information': '900', 'Finance and insurance': '1000', 
           'Real estate and rental and leasing': '1100', 'Professional, scientific, and technical services': '1200', 
           'Management of companies and enterprises': '1300', 'Administrative and support and waste management and remediation services': '1400', 
           'Educational services': '1500', 'Health care and social assistance': '1600', 'Arts, entertainment, and recreation': '1700', 
           'Accommodation and food services': '1800', 'Other services (except government and government enterprises)': '1900', 'Public Administration': 'NA'}

In [76]:
data['NAICS Code'] = data['Industry'].map(naicsdict)
data['BEA Line Code'] = data['Industry'].map(beadict)
data['Category'] = data['Industry'].map(catdict)

In [77]:
data.head(2)

Unnamed: 0,NAME,Industry,Year,Employment,Total Government Employment,Industry Government Employment,Share of Total Government Employment,Extra Government Employment,NAICS Employment,NAICS Code,BEA Line Code,Category
0,"Cheatham County, Tennessee",Accommodation and food services,2022BL,877.0,1774.0,68.387778,3.855004,68.387778,945.387778,72,1800,BEA and NAICS
1,"Davidson County, Tennessee",Accommodation and food services,2022BL,59594.0,48130.0,0.0,0.0,0.0,59594.0,72,1800,BEA and NAICS


In [78]:
data['Year'].unique()

array(['2022BL', '2022WP', '2023', '2024', '2025', '2026', '2027', '2028',
       '2029', '2030', '2031', '2032', '2033', '2034', '2035', '2036',
       '2037', '2038', '2039', '2040', '2041', '2042', '2043', '2044',
       '2045', '2046', '2047', '2048', '2049', '2050'], dtype=object)

In [79]:
data = data.rename(columns = {'Employment': 'BEA Employment'}).sort_values(by = 'NAME')

In [80]:
data = data[['NAME', 'Year', 'Industry', 'Category', 'NAICS Code', 'BEA Line Code', 'BEA Employment', 'NAICS Employment']]

# Mantissa Round

In [81]:
df = data.pivot(columns = 'NAME', index = ['Industry', 'Year'], values = 'NAICS Employment')
df.reset_index(drop = False, inplace = True)
df = df.loc[df['Industry'] != 'Government']

In [82]:
df.head(2)

NAME,Industry,Year,"Cheatham County, Tennessee","Davidson County, Tennessee","Dickson County, Tennessee","Houston County, Tennessee","Humphreys County, Tennessee","Maury County, Tennessee","Montgomery County, Tennessee","Robertson County, Tennessee","Rutherford County, Tennessee","Stewart County, Tennessee","Sumner County, Tennessee","Trousdale County, Tennessee","Williamson County, Tennessee","Wilson County, Tennessee"
0,Accommodation and food services,2022BL,945.387778,59594.0,2097.0,148.0,725.0,4387.911113,9337.0,2143.0,15705.0,277.548701,7007.0,166.0,14921.0,6540.0
1,Accommodation and food services,2022WP,996.052801,59437.0,2266.0,174.0,719.0,4427.67146,9809.0,2047.0,16331.0,278.0,7073.0,183.0,16163.0,6569.0


In [83]:
years = list(df['Year'].unique())

In [84]:
cols = list(df.columns)
cols.remove('Industry')
cols.remove('Year')
df[cols] = df[cols].astype(float)
df = df.fillna(0)

In [85]:
#separate the total as this shouldn't be included in the mantissa
totes = df.loc[df['Industry'] == 'Total employment (number of jobs)']
df = df.loc[df['Industry'] != 'Total employment (number of jobs)']

In [86]:
df.head()

NAME,Industry,Year,"Cheatham County, Tennessee","Davidson County, Tennessee","Dickson County, Tennessee","Houston County, Tennessee","Humphreys County, Tennessee","Maury County, Tennessee","Montgomery County, Tennessee","Robertson County, Tennessee","Rutherford County, Tennessee","Stewart County, Tennessee","Sumner County, Tennessee","Trousdale County, Tennessee","Williamson County, Tennessee","Wilson County, Tennessee"
0,Accommodation and food services,2022BL,945.387778,59594.0,2097.0,148.0,725.0,4387.911113,9337.0,2143.0,15705.0,277.548701,7007.0,166.0,14921.0,6540.0
1,Accommodation and food services,2022WP,996.052801,59437.0,2266.0,174.0,719.0,4427.67146,9809.0,2047.0,16331.0,278.0,7073.0,183.0,16163.0,6569.0
2,Accommodation and food services,2023,982.770557,63675.392857,2245.035714,147.928571,724.785714,4489.241741,9784.857143,2157.571429,16609.357143,290.564819,7249.357143,169.607143,15876.357143,6851.035714
3,Accommodation and food services,2024,1019.075358,67755.785714,2393.071429,147.857143,724.571429,4589.552873,10232.714286,2173.142857,17512.714286,302.580937,7492.714286,173.214286,16830.714286,7163.071429
4,Accommodation and food services,2025,1059.692075,68795.178571,2472.107143,152.785714,730.357143,4691.902997,10625.571429,2197.714286,18176.071429,307.597055,7782.071429,177.821429,17540.071429,7459.107143


In [87]:
# Create an empty list to collect dataframes
df_list = []
for year in years:
    df_filtered = df.loc[df['Year'] == year]
    df_filtered.reset_index(drop=False, inplace=True)
    cols = ['Cheatham County, Tennessee', 'Davidson County, Tennessee',
            'Dickson County, Tennessee', 'Houston County, Tennessee',
            'Humphreys County, Tennessee', 'Maury County, Tennessee',
            'Montgomery County, Tennessee', 'Robertson County, Tennessee',
            'Rutherford County, Tennessee', 'Stewart County, Tennessee',
            'Sumner County, Tennessee', 'Trousdale County, Tennessee',
            'Williamson County, Tennessee', 'Wilson County, Tennessee']
    for col in cols:
        df_filtered['{}'.format(col)] = mantissa_round(df_filtered['{}'.format(col)])
    # Append the processed dataframe to the list
    df_list.append(df_filtered)
# Concatenate all dataframes in the list
final_df = pd.concat(df_list, ignore_index=True)

In [88]:
final_df.head(2)

NAME,index,Industry,Year,"Cheatham County, Tennessee","Davidson County, Tennessee","Dickson County, Tennessee","Houston County, Tennessee","Humphreys County, Tennessee","Maury County, Tennessee","Montgomery County, Tennessee","Robertson County, Tennessee","Rutherford County, Tennessee","Stewart County, Tennessee","Sumner County, Tennessee","Trousdale County, Tennessee","Williamson County, Tennessee","Wilson County, Tennessee"
0,0,Accommodation and food services,2022BL,945.0,59595.0,2098.0,149.0,726.0,4387.0,9338.0,2144.0,15706.0,277.0,7008.0,167.0,14922.0,6541.0
1,30,Administrative and support and waste managemen...,2022BL,1061.0,63312.0,2147.0,67.0,246.0,3723.0,6029.0,2328.0,14139.0,174.0,6392.0,237.0,15036.0,5025.0


In [89]:
totes.head(2)

NAME,Industry,Year,"Cheatham County, Tennessee","Davidson County, Tennessee","Dickson County, Tennessee","Houston County, Tennessee","Humphreys County, Tennessee","Maury County, Tennessee","Montgomery County, Tennessee","Robertson County, Tennessee","Rutherford County, Tennessee","Stewart County, Tennessee","Sumner County, Tennessee","Trousdale County, Tennessee","Williamson County, Tennessee","Wilson County, Tennessee"
540,Total employment (number of jobs),2022BL,17368.0,740334.0,28454.0,2641.0,8902.0,59490.0,96324.0,38264.0,208413.0,4950.0,102088.0,3630.0,238165.0,94211.0
541,Total employment (number of jobs),2022WP,16640.0,713524.0,27915.0,2730.0,8953.0,56581.0,90326.0,36638.0,200971.0,4876.0,98641.0,3474.0,235941.0,85956.0


In [90]:
final_df.drop(columns = 'index', inplace = True)
final_df = pd.concat([final_df, totes])

In [91]:
final_df.head()

NAME,Industry,Year,"Cheatham County, Tennessee","Davidson County, Tennessee","Dickson County, Tennessee","Houston County, Tennessee","Humphreys County, Tennessee","Maury County, Tennessee","Montgomery County, Tennessee","Robertson County, Tennessee","Rutherford County, Tennessee","Stewart County, Tennessee","Sumner County, Tennessee","Trousdale County, Tennessee","Williamson County, Tennessee","Wilson County, Tennessee"
0,Accommodation and food services,2022BL,945.0,59595.0,2098.0,149.0,726.0,4387.0,9338.0,2144.0,15706.0,277.0,7008.0,167.0,14922.0,6541.0
1,Administrative and support and waste managemen...,2022BL,1061.0,63312.0,2147.0,67.0,246.0,3723.0,6029.0,2328.0,14139.0,174.0,6392.0,237.0,15036.0,5025.0
2,"Agriculture, forestry, fishing and hunting",2022BL,491.0,693.0,1273.0,408.0,698.0,1742.0,1075.0,1914.0,1650.0,414.0,1656.0,304.0,1589.0,1763.0
3,"Arts, entertainment, and recreation",2022BL,593.0,31535.0,450.0,7.0,75.0,1301.0,1620.0,639.0,3525.0,66.0,3144.0,29.0,7932.0,1944.0
4,Construction,2022BL,2020.0,42154.0,2700.0,261.0,856.0,3963.0,6246.0,3586.0,13691.0,693.0,8569.0,325.0,12314.0,6593.0


In [92]:
final_df.to_csv('../data/TwoDigit_FinalCountyLevel.csv', index = False)

# Aggregate to higher-level industry groups

In [93]:
newagg = final_df.melt(id_vars = ['Industry', 'Year'], var_name = 'NAME', value_name = 'Employment')
newagg = newagg.pivot(index = ['Year', 'NAME'], columns = 'Industry', values = 'Employment').reset_index(drop = False)

In [94]:
newagg.head(2)

Industry,Year,NAME,Accommodation and food services,Administrative and support and waste management and remediation services,"Agriculture, forestry, fishing and hunting","Arts, entertainment, and recreation",Construction,Educational services,Finance and insurance,Health care and social assistance,Information,Management of companies and enterprises,Manufacturing,"Mining, quarrying, and oil and gas extraction",Other services (except government and government enterprises),"Professional, scientific, and technical services",Public Administration,Real estate and rental and leasing,Retail trade,Total employment (number of jobs),Transportation and warehousing,Utilities,Wholesale trade
0,2022BL,"Cheatham County, Tennessee",945.0,1061.0,491.0,593.0,2020.0,519.0,623.0,832.0,230.0,121.0,2963.0,17.0,1148.0,823.0,1208.0,913.0,1517.0,17368.0,1052.0,57.0,236.0
1,2022BL,"Davidson County, Tennessee",59595.0,63312.0,693.0,31535.0,42154.0,38447.0,45266.0,97834.0,22181.0,16621.0,22976.0,708.0,37087.0,65586.0,26262.0,40628.0,51864.0,740334.0,50941.0,711.0,25933.0


In [95]:
thelist = [newagg['Utilities'], newagg['Manufacturing'], newagg['Wholesale trade'], newagg['Transportation and warehousing']]
newagg['Industrial'] = sum(thelist)
newagg = newagg.drop(columns = ['Utilities', 'Manufacturing', 'Wholesale trade', 'Transportation and warehousing'])

In [96]:
thelist = [newagg['Information'], newagg['Finance and insurance'], newagg['Real estate and rental and leasing'], 
           newagg['Professional, scientific, and technical services'], newagg['Management of companies and enterprises'], 
           newagg['Administrative and support and waste management and remediation services']]
newagg['Office'] = sum(thelist)
newagg = newagg.drop(columns = ['Information', 'Finance and insurance', 'Real estate and rental and leasing', 'Professional, scientific, and technical services', 
                                'Management of companies and enterprises', 'Administrative and support and waste management and remediation services'])

In [97]:
thelist = [newagg['Arts, entertainment, and recreation'], newagg['Other services (except government and government enterprises)']]
newagg['Service'] = sum(thelist)
newagg = newagg.drop(columns = ['Arts, entertainment, and recreation', 'Other services (except government and government enterprises)'])

In [98]:
thelist = [newagg['Agriculture, forestry, fishing and hunting'], newagg['Mining, quarrying, and oil and gas extraction'], newagg['Construction']]
newagg['Other'] = sum(thelist)
newagg = newagg.drop(columns = ['Agriculture, forestry, fishing and hunting', 'Mining, quarrying, and oil and gas extraction', 'Construction'])

In [99]:
newagg['Education'] = newagg['Educational services']
newagg['Food Services'] = newagg['Accommodation and food services']
newagg['Government'] = newagg['Public Administration']
newagg['Medical'] = newagg['Health care and social assistance']
newagg['Retail'] = newagg['Retail trade']
newagg = newagg.drop(columns = ['Educational services', 'Accommodation and food services', 'Public Administration', 'Health care and social assistance', 
                                'Retail trade'])

In [105]:
newagg.head()

Industry,Year,NAME,Total employment (number of jobs),Industrial,Office,Service,Other,Education,Food Services,Government,Medical,Retail
0,2022BL,"Cheatham County, Tennessee",17368.0,4308.0,3771.0,1741.0,2528.0,519.0,945.0,1208.0,832.0,1517.0
1,2022BL,"Davidson County, Tennessee",740334.0,100561.0,253594.0,68622.0,43555.0,38447.0,59595.0,26262.0,97834.0,51864.0
2,2022BL,"Dickson County, Tennessee",28454.0,5412.0,5758.0,2182.0,4063.0,390.0,2098.0,2235.0,3155.0,3161.0
3,2022BL,"Houston County, Tennessee",2641.0,299.0,283.0,197.0,670.0,20.0,149.0,473.0,331.0,219.0
4,2022BL,"Humphreys County, Tennessee",8902.0,2257.0,984.0,586.0,1663.0,159.0,726.0,846.0,726.0,956.0


In [106]:
newagg.to_csv('../data/Aggregated_FinalCountyLevel.csv', index = False)