In [55]:
import pandas as pd
import sqlite3 as sq
import matplotlib as mpl
from matplotlib import rcParams
import matplotlib.pyplot as plt
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 150)
np.set_printoptions(suppress=True)
pd.options.display.float_format = '{:20,.2f}'.format

In [56]:
data = pd.read_csv('../data/jobseq_2022_farmbreakout.csv', dtype = str)
data.head(2)

Unnamed: 0,Industry,NAICS,Ownership,"Cheatham County, Tennessee","Davidson County, Tennessee","Dickson County, Tennessee","Houston County, Tennessee","Humphreys County, Tennessee","Maury County, Tennessee","Montgomery County, Tennessee","Robertson County, Tennessee","Rutherford County, Tennessee","Stewart County, Tennessee","Sumner County, Tennessee","Trousdale County, Tennessee","Williamson County, Tennessee","Wilson County, Tennessee"
0,Total - All Industries,,Total Employment,11083.33494,580063.7505,20915.34806,1825.910939,6790.175738,44525.94432,67755.70831,26078.00992,155101.0868,3549.294632,68570.03751,2389.200001,168647.5906,68695.96063
1,Total - All Industries,,Covered Employment,9410.410541,547039.6749,18691.62486,1451.611485,6006.57841,40363.62806,62006.51248,23063.31165,143131.6684,3077.965375,60736.79941,2003.156011,155134.444,62553.87491


In [57]:
#reformat the JobsEQ download so that the geos are a columns
data = data.melt(id_vars = ['Industry', 'NAICS', 'Ownership'], value_vars = ['Cheatham County, Tennessee', 'Davidson County, Tennessee', 'Dickson County, Tennessee', 
                                                                           'Houston County, Tennessee', 'Humphreys County, Tennessee', 'Maury County, Tennessee', 
                                                                           'Montgomery County, Tennessee', 'Robertson County, Tennessee', 
                                                                           'Rutherford County, Tennessee', 'Stewart County, Tennessee', 'Sumner County, Tennessee', 
                                                                           'Trousdale County, Tennessee', 'Williamson County, Tennessee', 'Wilson County, Tennessee'], 
               var_name = 'NAME', value_name = 'Empl')

In [58]:
data.head(2)

Unnamed: 0,Industry,NAICS,Ownership,NAME,Empl
0,Total - All Industries,,Total Employment,"Cheatham County, Tennessee",11083.33494
1,Total - All Industries,,Covered Employment,"Cheatham County, Tennessee",9410.410541


In [59]:
#we are removing covered employment, that is the sum of private and all three categories of government employment
#this leaves Total, Private, Local Government, State Government, and Federal Government, as well as Self-Employment
data = data.loc[data['Ownership'] != 'Covered Employment']
#drop the NAICS code, no need to carry it through all of the code
data = data.drop(columns = 'NAICS')

In [6]:
#we need railroad and religious organization employment - which will be the difference between the total employment and all covered + self-employed

In [10]:
#make sure the 
data['Empl'] = data['Empl'].astype(float)
totals = data.loc[data['Ownership'] == 'Total Employment'].reset_index(drop = True)
grouped = data.loc[data['Ownership'] != 'Total Employment'].reset_index(drop = True)

In [11]:
totals.head()

Unnamed: 0,Industry,Ownership,NAME,Empl
0,Total - All Industries,Total Employment,"Cheatham County, Tennessee",11083.33
1,Crop Production,Total Employment,"Cheatham County, Tennessee",76.18
2,Animal Production and Aquaculture,Total Employment,"Cheatham County, Tennessee",83.95
3,Forestry and Logging,Total Employment,"Cheatham County, Tennessee",4.6
4,"Fishing, Hunting and Trapping",Total Employment,"Cheatham County, Tennessee",0.0


In [12]:
test = grouped.groupby(['NAME', 'Industry'])['Empl'].sum()

In [13]:
test = pd.DataFrame(test)
test.reset_index(inplace = True)
test['Ownership'] = 'All Covered and Self-Employed'
test.head()

Unnamed: 0,NAME,Industry,Empl,Ownership
0,"Cheatham County, Tennessee",Accommodation and Food Services,967.66,All Covered and Self-Employed
1,"Cheatham County, Tennessee",Administrative and Support and Waste Managemen...,415.08,All Covered and Self-Employed
2,"Cheatham County, Tennessee",Animal Production and Aquaculture,83.95,All Covered and Self-Employed
3,"Cheatham County, Tennessee","Arts, Entertainment, and Recreation",249.35,All Covered and Self-Employed
4,"Cheatham County, Tennessee",Construction,1276.81,All Covered and Self-Employed


In [14]:
df = pd.concat([test, totals])

In [15]:
data = df

In [16]:
data.tail()

Unnamed: 0,NAME,Industry,Empl,Ownership
387,"Wilson County, Tennessee","Arts, Entertainment, and Recreation",723.06,Total Employment
388,"Wilson County, Tennessee",Accommodation and Food Services,6406.05,Total Employment
389,"Wilson County, Tennessee",Other Services (except Public Administration),3164.97,Total Employment
390,"Wilson County, Tennessee",Public Administration,1961.44,Total Employment
391,"Wilson County, Tennessee",Unclassified,40.1,Total Employment


In [17]:
pivot_df = df.pivot_table(index=['NAME', 'Industry'], columns='Ownership', values='Empl', aggfunc='sum', fill_value=0)

In [18]:
pivot_df.head()

Unnamed: 0_level_0,Ownership,All Covered and Self-Employed,Total Employment
NAME,Industry,Unnamed: 2_level_1,Unnamed: 3_level_1
"Cheatham County, Tennessee",Accommodation and Food Services,967.66,967.66
"Cheatham County, Tennessee",Administrative and Support and Waste Management and Remediation Services,415.08,415.08
"Cheatham County, Tennessee",Animal Production and Aquaculture,83.95,83.95
"Cheatham County, Tennessee","Arts, Entertainment, and Recreation",249.35,249.35
"Cheatham County, Tennessee",Construction,1276.81,1276.81


In [19]:
pivot_df['Non-Covered Religious and Rail'] = pivot_df['Total Employment'] - pivot_df['All Covered and Self-Employed']
pivot_df['Non-Covered Religious and Rail'] = pivot_df['Non-Covered Religious and Rail'].clip(lower=0)

In [20]:
result_df = pivot_df.reset_index().melt(id_vars=['NAME', 'Industry'], var_name='Ownership', value_name='Empl')

In [21]:
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1176 entries, 0 to 1175
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   NAME       1176 non-null   object 
 1   Industry   1176 non-null   object 
 2   Ownership  1176 non-null   object 
 3   Empl       1176 non-null   float64
dtypes: float64(1), object(3)
memory usage: 36.9+ KB


In [22]:
rr = result_df.loc[result_df['Ownership'] == 'Non-Covered Religious and Rail']
rr.head(2)

Unnamed: 0,NAME,Industry,Ownership,Empl
784,"Cheatham County, Tennessee",Accommodation and Food Services,Non-Covered Religious and Rail,0.0
785,"Cheatham County, Tennessee",Administrative and Support and Waste Managemen...,Non-Covered Religious and Rail,0.0


In [23]:
rr['Ownership'] = 'Private'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rr['Ownership'] = 'Private'


In [24]:
rr.head(2)

Unnamed: 0,NAME,Industry,Ownership,Empl
784,"Cheatham County, Tennessee",Accommodation and Food Services,Private,0.0
785,"Cheatham County, Tennessee",Administrative and Support and Waste Managemen...,Private,0.0


In [25]:
private_only = grouped.loc[grouped['Ownership'] == 'Private']
non_private = grouped.loc[grouped['Ownership'] != 'Private']

In [26]:
non_private.head()

Unnamed: 0,Industry,Ownership,NAME,Empl
0,Total - All Industries,Federal Government,"Cheatham County, Tennessee",78.18
1,Total - All Industries,State Government,"Cheatham County, Tennessee",76.42
2,Total - All Industries,Local Government,"Cheatham County, Tennessee",1413.97
4,Total - All Industries,Self-Employment,"Cheatham County, Tennessee",1562.27
5,Crop Production,Federal Government,"Cheatham County, Tennessee",0.0


In [27]:
temp_private = rr.merge(private_only, on = ['NAME', 'Industry', 'Ownership'])

In [28]:
temp_private['Empl'] = temp_private['Empl_x'] + temp_private['Empl_y']
temp_private = temp_private.drop(columns = ['Empl_x', 'Empl_y'])

In [29]:
temp_private.head()

Unnamed: 0,NAME,Industry,Ownership,Empl
0,"Cheatham County, Tennessee",Accommodation and Food Services,Private,865.62
1,"Cheatham County, Tennessee",Administrative and Support and Waste Managemen...,Private,219.18
2,"Cheatham County, Tennessee",Animal Production and Aquaculture,Private,4.8
3,"Cheatham County, Tennessee","Arts, Entertainment, and Recreation",Private,143.45
4,"Cheatham County, Tennessee",Construction,Private,801.15


In [30]:
final = pd.concat([temp_private, non_private, totals])

In [31]:
final.tail()

Unnamed: 0,NAME,Industry,Ownership,Empl
387,"Wilson County, Tennessee","Arts, Entertainment, and Recreation",Total Employment,723.06
388,"Wilson County, Tennessee",Accommodation and Food Services,Total Employment,6406.05
389,"Wilson County, Tennessee",Other Services (except Public Administration),Total Employment,3164.97
390,"Wilson County, Tennessee",Public Administration,Total Employment,1961.44
391,"Wilson County, Tennessee",Unclassified,Total Employment,40.1


In [32]:
final['Ownership'].unique()

array(['Private', 'Federal Government', 'State Government',
       'Local Government', 'Self-Employment', 'Total Employment'],
      dtype=object)

In [33]:
data = final

In [34]:
#group federal state and local government ownership as "all government ownership"
thelist = ['Federal Government', 'State Government', 'Local Government']
government = data.loc[data['Ownership'].isin(thelist)]
data = data.loc[~data['Ownership'].isin(thelist)]
#group agricultural industries as farm
thelist = ['Crop Production', 'Animal Production and Aquaculture', 'Support Activities for Crop Production', 'Support Activities for Animal Production', 
          'Forestry and Logging', 'Fishing, Hunting and Trapping', 'Support Activities for Forestry']
eleven = data.loc[data['Industry'].isin(thelist)]
data = data.loc[~data['Industry'].isin(thelist)]

In [35]:
# #group agricultural industries as farm
# thelist = ['Crop Production', 'Animal Production and Aquaculture', 'Support Activities for Crop Production', 'Support Activities for Animal Production']
# farm = data.loc[data['Industry'].isin(thelist)]
# data = data.loc[~data['Industry'].isin(thelist)]
# #group forestry fishing hunting and trapping industries as one
# thelist = ['Forestry and Logging', 'Fishing, Hunting and Trapping', 'Support Activities for Forestry']
# forestry = data.loc[data['Industry'].isin(thelist)]
# data = data.loc[~data['Industry'].isin(thelist)]

In [36]:
data['Ownership'].unique()

array(['Private', 'Self-Employment', 'Total Employment'], dtype=object)

In [37]:
government = government.groupby(['NAME', 'Industry']).agg({'Empl': 'sum'}).reset_index()
government['Ownership'] = 'Government'
eleven = eleven.groupby(['NAME', 'Ownership']).agg({'Empl': 'sum'}).reset_index()
eleven['Industry'] = 'Agriculture, forestry, fishing and hunting'

In [38]:
# farm = farm.groupby(['NAME', 'Ownership']).agg({'Empl': 'sum'}).reset_index()
# farm['Industry'] = 'Farm employment'
# forestry = forestry.groupby(['NAME', 'Ownership']).agg({'Empl': 'sum'}).reset_index()
# forestry['Industry'] = 'Forestry, fishing, and related activities'

In [39]:
data = pd.concat([data, government, eleven])

In [40]:
# data = pd.concat([data, government, farm, forestry])

In [41]:
data.head()

Unnamed: 0,NAME,Industry,Ownership,Empl
0,"Cheatham County, Tennessee",Accommodation and Food Services,Private,865.62
1,"Cheatham County, Tennessee",Administrative and Support and Waste Managemen...,Private,219.18
3,"Cheatham County, Tennessee","Arts, Entertainment, and Recreation",Private,143.45
4,"Cheatham County, Tennessee",Construction,Private,801.15
6,"Cheatham County, Tennessee",Educational Services,Private,33.58


In [42]:
data['Ownership'].unique()

array(['Private', 'Self-Employment', 'Total Employment', 'Government'],
      dtype=object)

In [43]:
thelist = ['Animal Production and Aquaculture', 'Crop Production',
           'Fishing, Hunting and Trapping', 'Forestry and Logging', 'Support Activities for Animal Production',
           'Support Activities for Crop Production', 'Support Activities for Forestry', 'Unclassified', 'Public Administration']
data = data.loc[~data['Industry'].isin(thelist)]

In [44]:
data['Ownership'].unique()

array(['Private', 'Self-Employment', 'Total Employment', 'Government'],
      dtype=object)

In [45]:
pivot_df = data.pivot_table(index=['NAME', 'Industry'], columns='Ownership', values='Empl', aggfunc='sum')
pivot_df.reset_index(inplace=True)

In [46]:
pivot_df.head()

Ownership,NAME,Industry,Government,Private,Self-Employment,Total Employment
0,"Cheatham County, Tennessee",Accommodation and Food Services,75.46,865.62,26.58,967.66
1,"Cheatham County, Tennessee",Administrative and Support and Waste Managemen...,20.28,219.18,175.61,415.08
2,"Cheatham County, Tennessee","Agriculture, forestry, fishing and hunting",,10.1,161.39,171.48
3,"Cheatham County, Tennessee","Arts, Entertainment, and Recreation",0.0,143.45,105.9,249.35
4,"Cheatham County, Tennessee",Construction,35.09,801.15,440.57,1276.81


In [47]:
# Calculate the "Share of Total Employment" for Private, Self-Employment, and Government
for ownership_type in ['Private', 'Self-Employment', 'Government']:
    column_name = f'Share of {ownership_type}'
    pivot_df[column_name] = (pivot_df[ownership_type] / pivot_df['Total Employment']) * 100


In [48]:
#pivot_df['NAICS'] = pivot_df['Industry'].map(indnaics)

In [49]:
data = pivot_df[['NAME', 'Industry', 'Share of Private', 'Share of Self-Employment', 'Share of Government']]

In [50]:
data = data.rename(columns = {'Share of Private': 'Share Private', 'Share of Self-Employment': 'Share Self-Employed', 
                             'Share of Government': 'Share Government'})

In [51]:
inddict = {'Total - All Industries': 'Total employment (number of jobs)',    
           'Accommodation and Food Services': 'Accommodation and food services',
           'Administrative and Support and Waste Management and Remediation Services': 'Administrative and support and waste management and remediation services',
           
           'Arts, Entertainment, and Recreation': 'Arts, entertainment, and recreation', 
           'Educational Services': 'Educational services', 
           'Finance and Insurance': 'Finance and insurance',
           'Health Care and Social Assistance': 'Health care and social assistance', 
           'Management of Companies and Enterprises': 'Management of companies and enterprises', 
           'Mining, Quarrying, and Oil and Gas Extraction': 'Mining, quarrying, and oil and gas extraction',
           'Other Services (except Public Administration)': 'Other services (except government and government enterprises)',
           'Professional, Scientific, and Technical Services': 'Professional, scientific, and technical services',
           'Real Estate and Rental and Leasing': 'Real estate and rental and leasing',
           'Retail Trade': 'Retail trade', 
           'Transportation and Warehousing': 'Transportation and warehousing', 
           'Utilities': 'Utilities', 
           'Wholesale Trade': 'Wholesale trade'}

In [52]:
data['Industry'] = data['Industry'].replace(inddict)

In [54]:
data.to_csv('../data/jobseqdistr.csv', index = False)