In [88]:
import pandas as pd
import numpy as np
import requests
import geopandas as gpd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 150)
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
import sys
sys.path.append("../../Functions and Dictionaries/") # Adds higher directory to python modules path
import sqlite3 as sq

In [89]:
import geodict
tofullcensus = geodict.tofullcensus
geotogeoid = geodict.geotogeoid
GNRC = geodict.GNRC
GNRCFULLFIPS = geodict.GNRCFULLFIPS
namestocommon = geodict.namestocommon
KY = geodict.KY

In [90]:
def percent(x, y):
    return (x/y)

# Selecting LEHD datasets: 

https://lehd.ces.census.gov/data/lodes/LODES7/LODESTechDoc7.5.pdf  

Pages 5 & 6 of this document outlines the file structure for Origin-Destination data. This data is limited to disaggregations by the following categories:  
+ age (29 and younger, 30 to 54, and 55 or older)  
+ earnings (1,250 a month or less, 1,251 to 3,333 a month, and greater than 3,333 a month)  
+ Goods-Producing versus Trade, Transportation & Utilities, versus All Other Services industry sectors  

The earnings breakout is kind of useless so we'll only use the age and industry sector data.  

Page 7 of this document outlines the file structure for the Residence Area Characteristics data. This shows the same totals for age and earnings listed above, but further disaggregates the industry component into 2-digit NAICS codes. It also disaggregates job counts by race, Hispanic or Latino ethnicity, sex, and educational attainment (High School or Equivalent, Some college or Associate degree, Bachelor's degree or advanced degree). 

Page 8 of this document outlines the file structure for the Workplace Area Characteristics. This shows all of the same information as the RAC but also firm age and size. 

This data package also has a geography crosswalk which will be helpful in aggregating all of these numbers to a place and county level.


## Geography Crosswalk  
What we need from the geography crosswalk:  
+ a list of block groups in the GNRC region to narrow down the WAC and RAC preliminarily (we'll only be looking at the region here, not the state, because we're simply looking at in vs out instead of flow)  
+ a way to tag county and place to WAC, RAC, and OD block codes so that we can aggregate everything available by county, place, and unincorporated areas - dictionaries for now

In [91]:
def mantissa_round(x):
    y = np.floor(x)
    indices = np.argsort(x-y)
    hilowman = indices[::-1]
    diffxy = np.sum(x)-np.sum(y)
    z=0
    while z < diffxy:
        #z single dimemsion - need to adjust to take table
        y[hilowman[z]] +=1
        z +=1
    return y

In [92]:
#Tennessee Blocks
url= 'https://www2.census.gov/geo/tiger/TIGER2020/TABBLOCK20/tl_2020_47_tabblock20.zip'
tnb = gpd.read_file(url)
print('Your geographic file is ready.')

Your geographic file is ready.


In [93]:
tnb = tnb[['GEOID20', 'geometry']]

In [94]:
url = 'https://lehd.ces.census.gov/data/lodes/LODES8/tn/tn_xwalk.csv.gz'
geo = pd.read_csv(url, dtype = str)
print('Your file is ready.')

Your file is ready.


In [95]:
#rename and drop unneeded columns
geocols = {'tabblk2020': 'Block Code', 'st': 'StateFIPS', 'stusps': 'StateUSPS', 'stname': 'State', 'cty': 'CountyFIPS', 'ctyname': 'County', 
           'trct': 'TractCODE', 'trctname': 'Tract', 'bgrp': 'BlockgroupCODE', 'bgrpname': 'Blockgroup', 'cbsa': 'CBSACODE', 'cbsaname': 'CBSA', 
           'zcta': 'ZCTACODE', 'zctaname': 'ZCTA', 'stplc': 'PlaceCODE', 'stplcname': 'Place', 'stwib': 'WIBCODE', 'stwibname': 'WIB', 
           'blklatdd': 'Lat', 'blklondd': 'Lon'}
geo.rename(columns = geocols, inplace = True)
cols = ['ctycsub', 'ctycsubname', 'stcd116', 'stcd116name', 'stsldl', 'stsldlname', 'stsldu', 'stslduname', 'stschool',
       'stschoolname', 'stsecon', 'stseconname', 'trib', 'tribname', 'tsub', 'tsubname', 'stanrc', 'stanrcname', 'necta', 'nectaname', 'mil',
       'milname']
geo = geo.drop(columns = cols)

In [96]:
countyfips = ['47021', '47037', '47043', '47083', '47085', '47119', '47125', '47147', '47149', '47161', '47165', '47169', '47187', '47189']
geo = geo.loc[geo['CountyFIPS'].isin(countyfips)].reset_index(drop = True)

In [97]:
test = geo[['County']]
test['Summer'] = 1
test = test.groupby('County').sum()
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Summer'] = 1


Unnamed: 0_level_0,Summer
County,Unnamed: 1_level_1
"Cheatham County, TN",829
"Davidson County, TN",9097
"Dickson County, TN",1826
"Houston County, TN",544
"Humphreys County, TN",1360
"Maury County, TN",2986
"Montgomery County, TN",2954
"Robertson County, TN",1568
"Rutherford County, TN",4461
"Stewart County, TN",884


In [98]:
# #old method
#make a list of block codes in the GNRC region
blockcodegnrcregion = list(geo['Block Code'].unique())
# #blockcodegnrcregion
#create a dictionary of county names to a list of their block codes
tncounties = list(geo['County'].unique())

c = {}
for i in tncounties:
    c['{}'.format(i)] = list(geo.loc[geo['County'] == '{}'.format(i), 'Block Code'])

In [99]:
#invert that dictionary so that we can map it later
c_inverted = {}
for k,v in c.items():
    for x in v:
        c_inverted.setdefault(x,[]).append(k)
#c_inverted

In [100]:
allblockcodes = pd.DataFrame(c_inverted.items(), columns=['BC', 'County'])
allblockcodes.head()

Unnamed: 0,BC,County
0,471190102051036,"[Maury County, TN]"
1,471190103011065,"[Maury County, TN]"
2,471190102051047,"[Maury County, TN]"
3,471190104012016,"[Maury County, TN]"
4,471190108024001,"[Maury County, TN]"


In [101]:
allblockcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36642 entries, 0 to 36641
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   BC      36642 non-null  object
 1   County  36642 non-null  object
dtypes: object(2)
memory usage: 572.7+ KB


## Workplace Area Characteristics

In [102]:
#al jobs: https://lehd.ces.census.gov/data/lodes/LODES8/LODESTechDoc8.1.pdf
url = 'https://lehd.ces.census.gov/data/lodes/LODES8/tn/wac/tn_wac_S000_JT00_2021.csv.gz'
data = pd.read_csv(url)
print('Your file is ready.')

Your file is ready.


In [103]:
data['w_geocode'].nunique()

42009

In [104]:
waccols = {'w_geocode': 'GEO_ID', 'C000': 'Total Jobs', 'CA01': 'Age:Workers 29 or Younger', 'CA02': 'Age:Workers 30 to 54', 
           'CA03': 'Age:Workers 55 Older', 'CE01': 'Earnings:1250mo or Less', 'CE02': 'Earnings:1251 to 3333mo', 
           'CE03': 'Earnings:Greater Than 3333mo', 
           
           'CNS01': 'Agriculture, forestry, fishing and hunting', 
           'CNS02': 'Mining, quarrying, and oil and gas extraction', 
           'CNS03': 'Utilities', 
           'CNS04': 'Construction', 
           'CNS05': 'Manufacturing',
           'CNS06': 'Wholesale trade', 
           'CNS07': 'Retail trade', 
           'CNS08': 'Transportation and warehousing',
           'CNS09': 'Information', 
           'CNS10': 'Finance and insurance', 
           'CNS11': 'Real estate and rental and leasing', 
           'CNS12': 'Professional, scientific, and technical services', 
           'CNS13': 'Management of companies and enterprises', 
           'CNS14': 'Administrative and support and waste management and remediation services', 
           'CNS15': 'Educational services', 
           'CNS16': 'Health care and social assistance', 
           'CNS17': 'Arts, entertainment, and recreation', 
           'CNS18': 'Accommodation and food services', 
           'CNS19': 'Other services (except government and government enterprises)', 
           'CNS20': 'Public Administration',
           
           'CR01': 'Race:White Alone', 'CR02': 'Race:Black or African American Alone', 'CR03': 'Race:American Indian or Alaska Native Alone', 
           'CR04': 'Race:Asian Alone', 'CR05': 'Race:Native Hawaiian or Other Pacific Islander Alone', 'CR07': 'Race:Two or More Races', 
           'CT01': 'Ethnicity:Not Hispanic or Latino', 'CT02': 'Ethnicity:Hispanic or Latino', 'CD01':'Ed:Less than High School', 
           'CD02': 'Ed:High School or Equivalent', 'CD03': 'Ed:Some College or Associates', 'CD04': 'Ed:Bachelors or Advanced Degree', 'CS01': 'Sex:Male', 
           'CS02': 'Sex:Female', 'CFA01': 'FirmAge:0 to 1 Year', 'CFA02': 'FirmAge:2 to 3 Years', 'CFA03': 'FirmAge:4 to 5 Years', 'CFA04': 'FirmAge:6 to 10 Years',
           'CFA05': 'FirmAge:11 or More Years', 'CFS01': 'FirmSize:0 to 19 Employees', 'CFS02': 'FirmSize:20 to 49 Employees', 
           'CFS03': 'FirmSize:50 to 249 Employees', 'CFS04': 'FirmSize:250 to 499 Employees', 'CFS05': 'FirmSize:500 or More Employees'}

In [105]:
data.rename(columns = waccols, inplace = True)
data = data.fillna(0)

In [106]:
data['GEO_ID'] = data['GEO_ID'].astype(str)

In [107]:
public = pd.read_csv('../data/PublicSchools.csv', dtype = str)
public = public[['FTE', 'GEOID_BLOCK']]
public['FTE'] = public['FTE'].replace({'<Null>':0})
public = public.rename(columns = {'FTE': 'FTE PUBLIC'})
public['FTE PUBLIC'] = public['FTE PUBLIC'].astype(float)
public = public.groupby('GEOID_BLOCK').sum()
public.reset_index(drop = False, inplace = True)
private = pd.read_csv('../data/PrivateSchools.csv', dtype = str)
private['GEOID_BLOCK'] = private['GEOID20']
private = private[['FTE', 'GEOID_BLOCK']]
private['FTE'] = private['FTE'].replace({'<Null>':0})
private = private.rename(columns = {'FTE': 'FTE PRIVATE'})
private['FTE PRIVATE'] = private['FTE PRIVATE'].astype(float)
private = private.groupby('GEOID_BLOCK').sum()
private.reset_index(drop = False, inplace = True)
universities = pd.read_csv('../data/Universities.csv', dtype = str)
universities = universities[['EMP', 'GEOID_BLOCK']]
universities['EMP'] = universities['EMP'].replace({'<Null>':0})
universities = universities.rename(columns = {'EMP': 'FTE UNI'})
universities['FTE UNI'] = universities['FTE UNI'].astype(float)
universities = universities.groupby('GEOID_BLOCK').sum()
universities.reset_index(drop = False, inplace = True)
schools = public.merge(private, on = 'GEOID_BLOCK', how = 'outer')
schools = schools.merge(universities, on = 'GEOID_BLOCK', how = 'outer')
schools.fillna(0, inplace = True)
cols = ['FTE PUBLIC', 'FTE PRIVATE', 'FTE UNI']
schools[cols] = schools[cols].astype(float)
thelist = [schools['FTE PUBLIC'], schools['FTE PRIVATE'], schools['FTE UNI']]
schools['Employment'] = sum(thelist)
#schools['County'] = schools['GEOID_BLOCK'].map(c_inverted).str.get(0)
schools = schools[['GEOID_BLOCK', 'Employment']]
schools = schools.rename(columns = {'GEOID_BLOCK': 'GEO_ID', 'Employment': 'Educational services'})

data = data.drop(columns = 'Educational services')

data = data.merge(schools, on = 'GEO_ID', how = 'outer')
data.fillna(0, inplace = True)

In [108]:
data['Educational services'] = mantissa_round(data['Educational services'])

In [109]:
thelist = [data['Agriculture, forestry, fishing and hunting'], data['Mining, quarrying, and oil and gas extraction'], 
           data['Utilities'], data['Construction'], data['Manufacturing'],data['Wholesale trade'], data['Retail trade'], 
           data['Transportation and warehousing'],data['Information'], data['Finance and insurance'], data['Real estate and rental and leasing'], 
           data['Professional, scientific, and technical services'], data['Management of companies and enterprises'], 
           data['Administrative and support and waste management and remediation services'], data['Educational services'], 
           data['Health care and social assistance'], data['Arts, entertainment, and recreation'], data['Accommodation and food services'], 
           data['Other services (except government and government enterprises)'], data['Public Administration']]
data['Total Jobs'] = sum(thelist)

In [110]:
data.head(2)

Unnamed: 0,GEO_ID,Total Jobs,Age:Workers 29 or Younger,Age:Workers 30 to 54,Age:Workers 55 Older,Earnings:1250mo or Less,Earnings:1251 to 3333mo,Earnings:Greater Than 3333mo,"Agriculture, forestry, fishing and hunting","Mining, quarrying, and oil and gas extraction",Utilities,Construction,Manufacturing,Wholesale trade,Retail trade,Transportation and warehousing,Information,Finance and insurance,Real estate and rental and leasing,"Professional, scientific, and technical services",Management of companies and enterprises,Administrative and support and waste management and remediation services,Health care and social assistance,"Arts, entertainment, and recreation",Accommodation and food services,Other services (except government and government enterprises),Public Administration,Race:White Alone,Race:Black or African American Alone,Race:American Indian or Alaska Native Alone,Race:Asian Alone,Race:Native Hawaiian or Other Pacific Islander Alone,Race:Two or More Races,Ethnicity:Not Hispanic or Latino,Ethnicity:Hispanic or Latino,Ed:Less than High School,Ed:High School or Equivalent,Ed:Some College or Associates,Ed:Bachelors or Advanced Degree,Sex:Male,Sex:Female,FirmAge:0 to 1 Year,FirmAge:2 to 3 Years,FirmAge:4 to 5 Years,FirmAge:6 to 10 Years,FirmAge:11 or More Years,FirmSize:0 to 19 Employees,FirmSize:20 to 49 Employees,FirmSize:50 to 249 Employees,FirmSize:250 to 499 Employees,FirmSize:500 or More Employees,createdate,Educational services
0,470010201001000,148.0,59.0,61.0,27.0,43.0,73.0,31.0,0.0,0.0,0.0,1.0,0.0,15.0,15.0,0.0,0.0,15.0,0.0,14.0,0.0,0.0,22.0,6.0,42.0,17.0,0.0,133.0,6.0,1.0,4.0,1.0,2.0,145.0,2.0,17.0,22.0,28.0,21.0,46.0,101.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20231016.0,1.0
1,470010201001001,99.0,43.0,34.0,22.0,51.0,38.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,94.0,4.0,0.0,0.0,0.0,1.0,96.0,3.0,7.0,25.0,13.0,11.0,49.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20231016.0,0.0


In [111]:
data['GEO_ID'] = data['GEO_ID'].astype(str)
data = data.loc[data['GEO_ID'].isin(blockcodegnrcregion)]

In [112]:
data.head(2)

Unnamed: 0,GEO_ID,Total Jobs,Age:Workers 29 or Younger,Age:Workers 30 to 54,Age:Workers 55 Older,Earnings:1250mo or Less,Earnings:1251 to 3333mo,Earnings:Greater Than 3333mo,"Agriculture, forestry, fishing and hunting","Mining, quarrying, and oil and gas extraction",Utilities,Construction,Manufacturing,Wholesale trade,Retail trade,Transportation and warehousing,Information,Finance and insurance,Real estate and rental and leasing,"Professional, scientific, and technical services",Management of companies and enterprises,Administrative and support and waste management and remediation services,Health care and social assistance,"Arts, entertainment, and recreation",Accommodation and food services,Other services (except government and government enterprises),Public Administration,Race:White Alone,Race:Black or African American Alone,Race:American Indian or Alaska Native Alone,Race:Asian Alone,Race:Native Hawaiian or Other Pacific Islander Alone,Race:Two or More Races,Ethnicity:Not Hispanic or Latino,Ethnicity:Hispanic or Latino,Ed:Less than High School,Ed:High School or Equivalent,Ed:Some College or Associates,Ed:Bachelors or Advanced Degree,Sex:Male,Sex:Female,FirmAge:0 to 1 Year,FirmAge:2 to 3 Years,FirmAge:4 to 5 Years,FirmAge:6 to 10 Years,FirmAge:11 or More Years,FirmSize:0 to 19 Employees,FirmSize:20 to 49 Employees,FirmSize:50 to 249 Employees,FirmSize:250 to 499 Employees,FirmSize:500 or More Employees,createdate,Educational services
3450,470210701021001,79.0,9.0,50.0,20.0,12.0,27.0,40.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,0.0,0.0,0.0,8.0,70.0,7.0,1.0,0.0,0.0,1.0,78.0,1.0,8.0,27.0,19.0,16.0,15.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20231016.0,0.0
3451,470210701021004,17.0,6.0,8.0,3.0,3.0,4.0,10.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,16.0,1.0,0.0,4.0,5.0,2.0,12.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20231016.0,0.0


In [113]:
#add columns with county name and place name respectively
data['County'] = data['GEO_ID'].map(c_inverted)
data['County'] = data['County'].str.get(0)

In [114]:
tnb.head()

Unnamed: 0,GEOID20,geometry
0,471570026001003,"POLYGON ((-90.00569 35.14487, -90.00550 35.146..."
1,471570042003027,"POLYGON ((-90.05694 35.13600, -90.05684 35.135..."
2,471839684003070,"POLYGON ((-88.64560 36.25843, -88.64555 36.258..."
3,471839682013032,"POLYGON ((-88.87268 36.29224, -88.87267 36.293..."
4,471570217212002,"POLYGON ((-89.86599 35.05808, -89.86599 35.058..."


In [115]:
tnb = tnb.rename(columns = {'GEOID20': 'GEO_ID'})
df_geo = data.merge(tnb, on = 'GEO_ID', how = 'inner')
df_geo = gpd.GeoDataFrame(df_geo)

In [116]:
df_geo.head()

Unnamed: 0,GEO_ID,Total Jobs,Age:Workers 29 or Younger,Age:Workers 30 to 54,Age:Workers 55 Older,Earnings:1250mo or Less,Earnings:1251 to 3333mo,Earnings:Greater Than 3333mo,"Agriculture, forestry, fishing and hunting","Mining, quarrying, and oil and gas extraction",Utilities,Construction,Manufacturing,Wholesale trade,Retail trade,Transportation and warehousing,Information,Finance and insurance,Real estate and rental and leasing,"Professional, scientific, and technical services",Management of companies and enterprises,Administrative and support and waste management and remediation services,Health care and social assistance,"Arts, entertainment, and recreation",Accommodation and food services,Other services (except government and government enterprises),Public Administration,Race:White Alone,Race:Black or African American Alone,Race:American Indian or Alaska Native Alone,Race:Asian Alone,Race:Native Hawaiian or Other Pacific Islander Alone,Race:Two or More Races,Ethnicity:Not Hispanic or Latino,Ethnicity:Hispanic or Latino,Ed:Less than High School,Ed:High School or Equivalent,Ed:Some College or Associates,Ed:Bachelors or Advanced Degree,Sex:Male,Sex:Female,FirmAge:0 to 1 Year,FirmAge:2 to 3 Years,FirmAge:4 to 5 Years,FirmAge:6 to 10 Years,FirmAge:11 or More Years,FirmSize:0 to 19 Employees,FirmSize:20 to 49 Employees,FirmSize:50 to 249 Employees,FirmSize:250 to 499 Employees,FirmSize:500 or More Employees,createdate,Educational services,County,geometry
0,470210701021001,79.0,9.0,50.0,20.0,12.0,27.0,40.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67.0,0.0,0.0,0.0,8.0,70.0,7.0,1.0,0.0,0.0,1.0,78.0,1.0,8.0,27.0,19.0,16.0,15.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20231016.0,0.0,"Cheatham County, TN","POLYGON ((-87.05783 36.23509, -87.05755 36.235..."
1,470210701021004,17.0,6.0,8.0,3.0,3.0,4.0,10.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,16.0,1.0,0.0,4.0,5.0,2.0,12.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20231016.0,0.0,"Cheatham County, TN","POLYGON ((-87.04684 36.22030, -87.04678 36.220..."
2,470210701021009,3.0,0.0,2.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20231016.0,0.0,"Cheatham County, TN","POLYGON ((-87.04717 36.17217, -87.04674 36.172..."
3,470210701021012,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20231016.0,0.0,"Cheatham County, TN","POLYGON ((-87.04587 36.16943, -87.04531 36.169..."
4,470210701021013,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20231016.0,0.0,"Cheatham County, TN","POLYGON ((-86.99860 36.13944, -86.99854 36.139..."


In [117]:
data = data.drop(columns = ['createdate', 'Age:Workers 29 or Younger', 'Age:Workers 30 to 54', 'Age:Workers 55 Older', 'Earnings:1250mo or Less', 
                 'Earnings:1251 to 3333mo', 'Earnings:Greater Than 3333mo', 'Race:White Alone', 'Race:Black or African American Alone', 
                 'Race:American Indian or Alaska Native Alone', 'Race:Asian Alone', 'Race:Native Hawaiian or Other Pacific Islander Alone', 
                 'Race:Two or More Races', 'Ethnicity:Not Hispanic or Latino', 'Ethnicity:Hispanic or Latino', 'Ed:Less than High School', 
                 'Ed:High School or Equivalent', 'Ed:Some College or Associates', 'Ed:Bachelors or Advanced Degree', 'Sex:Male', 'Sex:Female', 
                 'FirmAge:0 to 1 Year', 'FirmAge:2 to 3 Years', 'FirmAge:4 to 5 Years', 'FirmAge:6 to 10 Years', 'FirmAge:11 or More Years', 
                 'FirmSize:0 to 19 Employees', 'FirmSize:20 to 49 Employees', 'FirmSize:50 to 249 Employees', 'FirmSize:250 to 499 Employees', 
                 'FirmSize:500 or More Employees'])

In [118]:
interim = data
interim['Blocks'] = 1

In [119]:
stop

NameError: name 'stop' is not defined

In [120]:
counties = interim.drop(columns = 'GEO_ID')
counties = counties.groupby('County').sum()
counties = counties.drop(columns = 'Blocks')

In [121]:
counties.head()

Unnamed: 0_level_0,Total Jobs,"Agriculture, forestry, fishing and hunting","Mining, quarrying, and oil and gas extraction",Utilities,Construction,Manufacturing,Wholesale trade,Retail trade,Transportation and warehousing,Information,Finance and insurance,Real estate and rental and leasing,"Professional, scientific, and technical services",Management of companies and enterprises,Administrative and support and waste management and remediation services,Health care and social assistance,"Arts, entertainment, and recreation",Accommodation and food services,Other services (except government and government enterprises),Public Administration,Educational services
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Cheatham County, TN",8123.0,7.0,5.0,50.0,943.0,2584.0,75.0,909.0,390.0,29.0,140.0,66.0,270.0,2.0,349.0,581.0,114.0,692.0,146.0,394.0,377.0
"Davidson County, TN",506544.0,223.0,172.0,1453.0,24102.0,22255.0,24150.0,44858.0,27180.0,14969.0,27934.0,9826.0,43313.0,13210.0,45840.0,88236.0,9034.0,41016.0,15364.0,33374.0,20035.0
"Dickson County, TN",16721.0,28.0,17.0,258.0,1053.0,3747.0,688.0,2331.0,182.0,103.0,533.0,78.0,294.0,159.0,1146.0,2611.0,84.0,1590.0,398.0,820.0,601.0
"Houston County, TN",1215.0,11.0,0.0,7.0,97.0,190.0,1.0,148.0,3.0,16.0,41.0,0.0,42.0,0.0,2.0,345.0,0.0,112.0,18.0,98.0,84.0
"Humphreys County, TN",5279.0,9.0,34.0,43.0,387.0,1456.0,123.0,668.0,342.0,19.0,96.0,6.0,71.0,0.0,146.0,631.0,22.0,549.0,216.0,257.0,204.0


In [122]:
thelist = [counties['Utilities'], counties['Manufacturing'], counties['Wholesale trade'], counties['Transportation and warehousing']]
counties['Industrial'] = sum(thelist)
counties = counties.drop(columns = ['Utilities', 'Manufacturing', 'Wholesale trade', 'Transportation and warehousing'])

thelist = [counties['Information'], counties['Finance and insurance'], counties['Real estate and rental and leasing'], 
           counties['Professional, scientific, and technical services'], counties['Management of companies and enterprises'], 
           counties['Administrative and support and waste management and remediation services']]
counties['Office'] = sum(thelist)
counties = counties.drop(columns = ['Information', 'Finance and insurance', 'Real estate and rental and leasing', 
                            'Professional, scientific, and technical services', 'Management of companies and enterprises', 
                            'Administrative and support and waste management and remediation services'])

thelist = [counties['Arts, entertainment, and recreation'], counties['Other services (except government and government enterprises)']]
counties['Service'] = sum(thelist)
counties = counties.drop(columns = ['Arts, entertainment, and recreation', 'Other services (except government and government enterprises)'])

thelist = [counties['Agriculture, forestry, fishing and hunting'], counties['Mining, quarrying, and oil and gas extraction'], counties['Construction']]
counties['Other'] = sum(thelist)
counties = counties.drop(columns = ['Agriculture, forestry, fishing and hunting', 'Mining, quarrying, and oil and gas extraction', 'Construction'])

counties['Education'] = counties['Educational services']
counties['Food Services'] = counties['Accommodation and food services']
counties['Government'] = counties['Public Administration']
counties['Medical'] = counties['Health care and social assistance']
counties['Retail'] = counties['Retail trade']
counties = counties.drop(columns = ['Educational services', 'Accommodation and food services', 'Public Administration', 
                            'Health care and social assistance', 'Retail trade'])

In [123]:
#counties = counties.drop(columns = 'Blocks')
counties = counties.add_suffix(' Entire County')
counties.reset_index(drop = False, inplace = True)

In [124]:
#counties = counties[['County', 'Total Jobs Entire County']]

In [125]:
counties

Unnamed: 0,County,Total Jobs Entire County,Industrial Entire County,Office Entire County,Service Entire County,Other Entire County,Education Entire County,Food Services Entire County,Government Entire County,Medical Entire County,Retail Entire County
0,"Cheatham County, TN",8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0
1,"Davidson County, TN",506544.0,75038.0,155092.0,24398.0,24497.0,20035.0,41016.0,33374.0,88236.0,44858.0
2,"Dickson County, TN",16721.0,4875.0,2313.0,482.0,1098.0,601.0,1590.0,820.0,2611.0,2331.0
3,"Houston County, TN",1215.0,201.0,101.0,18.0,108.0,84.0,112.0,98.0,345.0,148.0
4,"Humphreys County, TN",5279.0,1964.0,338.0,238.0,430.0,204.0,549.0,257.0,631.0,668.0
5,"Maury County, TN",35387.0,9320.0,7184.0,1015.0,1532.0,1515.0,3174.0,1505.0,5677.0,4465.0
6,"Montgomery County, TN",52136.0,9260.0,8806.0,1638.0,2923.0,4211.0,7237.0,2123.0,7138.0,8800.0
7,"Robertson County, TN",22298.0,9404.0,2642.0,557.0,1523.0,919.0,1717.0,856.0,1778.0,2902.0
8,"Rutherford County, TN",128098.0,41356.0,22892.0,3868.0,5880.0,5288.0,12782.0,3580.0,15088.0,17364.0
9,"Stewart County, TN",2004.0,577.0,145.0,71.0,302.0,113.0,170.0,199.0,154.0,273.0


See what percentage of the county whole every block has per industry. Here's where I'll aggregate the new groups 5/29

In [126]:
thelist = [data['Utilities'], data['Manufacturing'], data['Wholesale trade'], data['Transportation and warehousing']]
data['Industrial'] = sum(thelist)
data = data.drop(columns = ['Utilities', 'Manufacturing', 'Wholesale trade', 'Transportation and warehousing'])

thelist = [data['Information'], data['Finance and insurance'], data['Real estate and rental and leasing'], 
           data['Professional, scientific, and technical services'], data['Management of companies and enterprises'], 
           data['Administrative and support and waste management and remediation services']]
data['Office'] = sum(thelist)
data = data.drop(columns = ['Information', 'Finance and insurance', 'Real estate and rental and leasing', 
                            'Professional, scientific, and technical services', 'Management of companies and enterprises', 
                            'Administrative and support and waste management and remediation services'])

thelist = [data['Arts, entertainment, and recreation'], data['Other services (except government and government enterprises)']]
data['Service'] = sum(thelist)
data = data.drop(columns = ['Arts, entertainment, and recreation', 'Other services (except government and government enterprises)'])

thelist = [data['Agriculture, forestry, fishing and hunting'], data['Mining, quarrying, and oil and gas extraction'], data['Construction']]
data['Other'] = sum(thelist)
data = data.drop(columns = ['Agriculture, forestry, fishing and hunting', 'Mining, quarrying, and oil and gas extraction', 'Construction'])

data['Education'] = data['Educational services']
data['Food Services'] = data['Accommodation and food services']
data['Government'] = data['Public Administration']
data['Medical'] = data['Health care and social assistance']
data['Retail'] = data['Retail trade']
data = data.drop(columns = ['Educational services', 'Accommodation and food services', 'Public Administration', 
                            'Health care and social assistance', 'Retail trade'])

In [127]:
data.head()

Unnamed: 0,GEO_ID,Total Jobs,County,Blocks,Industrial,Office,Service,Other,Education,Food Services,Government,Medical,Retail
3450,470210701021001,79.0,"Cheatham County, TN",1,0.0,0.0,0.0,4.0,0.0,0.0,8.0,67.0,0.0
3451,470210701021004,17.0,"Cheatham County, TN",1,0.0,6.0,0.0,10.0,0.0,0.0,0.0,1.0,0.0
3452,470210701021009,3.0,"Cheatham County, TN",1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3453,470210701021012,2.0,"Cheatham County, TN",1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3454,470210701021013,2.0,"Cheatham County, TN",1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [128]:
data = data.drop(columns = 'Blocks')

In [129]:
joined = data.merge(counties, on = 'County', how = 'outer')

In [130]:
joined.head()

Unnamed: 0,GEO_ID,Total Jobs,County,Industrial,Office,Service,Other,Education,Food Services,Government,Medical,Retail,Total Jobs Entire County,Industrial Entire County,Office Entire County,Service Entire County,Other Entire County,Education Entire County,Food Services Entire County,Government Entire County,Medical Entire County,Retail Entire County
0,470210701021001,79.0,"Cheatham County, TN",0.0,0.0,0.0,4.0,0.0,0.0,8.0,67.0,0.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0
1,470210701021004,17.0,"Cheatham County, TN",0.0,6.0,0.0,10.0,0.0,0.0,0.0,1.0,0.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0
2,470210701021009,3.0,"Cheatham County, TN",1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0
3,470210701021012,2.0,"Cheatham County, TN",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0
4,470210701021013,2.0,"Cheatham County, TN",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0


In [131]:
joined.tail()

Unnamed: 0,GEO_ID,Total Jobs,County,Industrial,Office,Service,Other,Education,Food Services,Government,Medical,Retail,Total Jobs Entire County,Industrial Entire County,Office Entire County,Service Entire County,Other Entire County,Education Entire County,Food Services Entire County,Government Entire County,Medical Entire County,Retail Entire County
12035,471890305001003,41.0,"Wilson County, TN",0.0,0.0,0.0,0.0,41.0,0.0,0.0,0.0,0.0,51934.0,19003.0,8235.0,1912.0,3287.0,1825.0,4859.0,1875.0,3968.0,6970.0
12036,471890305002003,36.0,"Wilson County, TN",0.0,0.0,0.0,0.0,36.0,0.0,0.0,0.0,0.0,51934.0,19003.0,8235.0,1912.0,3287.0,1825.0,4859.0,1875.0,3968.0,6970.0
12037,471890306002013,72.0,"Wilson County, TN",0.0,0.0,0.0,0.0,72.0,0.0,0.0,0.0,0.0,51934.0,19003.0,8235.0,1912.0,3287.0,1825.0,4859.0,1875.0,3968.0,6970.0
12038,471890310004030,23.0,"Wilson County, TN",0.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0,0.0,51934.0,19003.0,8235.0,1912.0,3287.0,1825.0,4859.0,1875.0,3968.0,6970.0
12039,471890304011033,0.0,"Wilson County, TN",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,51934.0,19003.0,8235.0,1912.0,3287.0,1825.0,4859.0,1875.0,3968.0,6970.0


In [132]:
joined.columns

Index(['GEO_ID', 'Total Jobs', 'County', 'Industrial', 'Office', 'Service', 'Other', 'Education', 'Food Services', 'Government', 'Medical', 'Retail',
       'Total Jobs Entire County', 'Industrial Entire County', 'Office Entire County', 'Service Entire County', 'Other Entire County',
       'Education Entire County', 'Food Services Entire County', 'Government Entire County', 'Medical Entire County', 'Retail Entire County'],
      dtype='object')

In [133]:
data.head()

Unnamed: 0,GEO_ID,Total Jobs,County,Industrial,Office,Service,Other,Education,Food Services,Government,Medical,Retail
3450,470210701021001,79.0,"Cheatham County, TN",0.0,0.0,0.0,4.0,0.0,0.0,8.0,67.0,0.0
3451,470210701021004,17.0,"Cheatham County, TN",0.0,6.0,0.0,10.0,0.0,0.0,0.0,1.0,0.0
3452,470210701021009,3.0,"Cheatham County, TN",1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3453,470210701021012,2.0,"Cheatham County, TN",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3454,470210701021013,2.0,"Cheatham County, TN",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [134]:
data = joined

industries = ['Total Jobs', 'Industrial', 'Office', 'Service', 
             'Other', 'Education', 'Food Services', 'Government', 'Medical', 'Retail']
for ind in industries:
    data['{} %'.format(ind)] = percent(data['{}'.format(ind)], data['{} Entire County'.format(ind)])

In [135]:
data.head()

Unnamed: 0,GEO_ID,Total Jobs,County,Industrial,Office,Service,Other,Education,Food Services,Government,Medical,Retail,Total Jobs Entire County,Industrial Entire County,Office Entire County,Service Entire County,Other Entire County,Education Entire County,Food Services Entire County,Government Entire County,Medical Entire County,Retail Entire County,Total Jobs %,Industrial %,Office %,Service %,Other %,Education %,Food Services %,Government %,Medical %,Retail %
0,470210701021001,79.0,"Cheatham County, TN",0.0,0.0,0.0,4.0,0.0,0.0,8.0,67.0,0.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0,0.009725,0.0,0.0,0.0,0.004188,0.0,0.0,0.020305,0.115318,0.0
1,470210701021004,17.0,"Cheatham County, TN",0.0,6.0,0.0,10.0,0.0,0.0,0.0,1.0,0.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0,0.002093,0.0,0.007009,0.0,0.010471,0.0,0.0,0.0,0.001721,0.0
2,470210701021009,3.0,"Cheatham County, TN",1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0,0.000369,0.000323,0.0,0.0,0.001047,0.0,0.0,0.0,0.0,0.0011
3,470210701021012,2.0,"Cheatham County, TN",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0,0.000246,0.0,0.002336,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,470210701021013,2.0,"Cheatham County, TN",0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8123.0,3099.0,856.0,260.0,955.0,377.0,692.0,394.0,581.0,909.0,0.000246,0.0,0.002336,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
data = data[['GEO_ID', 'County', 'Total Jobs %', 'Industrial %', 'Office %', 'Service %', 
             'Other %', 'Education %', 'Food Services %', 'Government %', 'Medical %', 'Retail %']]

In [137]:
data.head()

Unnamed: 0,GEO_ID,County,Total Jobs %,Industrial %,Office %,Service %,Other %,Education %,Food Services %,Government %,Medical %,Retail %
0,470210701021001,"Cheatham County, TN",0.009725,0.0,0.0,0.0,0.004188,0.0,0.0,0.020305,0.115318,0.0
1,470210701021004,"Cheatham County, TN",0.002093,0.0,0.007009,0.0,0.010471,0.0,0.0,0.0,0.001721,0.0
2,470210701021009,"Cheatham County, TN",0.000369,0.000323,0.0,0.0,0.001047,0.0,0.0,0.0,0.0,0.0011
3,470210701021012,"Cheatham County, TN",0.000246,0.0,0.002336,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,470210701021013,"Cheatham County, TN",0.000246,0.0,0.002336,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
data.to_csv('../data/LEHDDistributionnewgroups.csv', index = False)

In [139]:
countytest = data.drop(columns = 'GEO_ID')
countytest = countytest.groupby('County').sum()

In [140]:
countytest

Unnamed: 0_level_0,Total Jobs %,Industrial %,Office %,Service %,Other %,Education %,Food Services %,Government %,Medical %,Retail %
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"Cheatham County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Davidson County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Dickson County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Houston County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Humphreys County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Maury County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Montgomery County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Robertson County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Rutherford County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"Stewart County, TN",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
c_inverted