In [1]:
import pathlib
import pandas as pd
from tqdm import tqdm

In [23]:
# Load the raw data

data_dir = pathlib.Path.cwd().parent / 'data_raw'
bea_data_dir = data_dir / 'BEA_Industry_Factors' / 'county_level'
[f.name for f in bea_data_dir.iterdir() if '_2012' in f.name]

['industriesPerCounty_colorado_2012.csv',
 'industriesPerCounty_southdakota_2012.csv',
 'industriesPerCounty_northcarolina_2012.csv',
 'industriesPerCounty_delaware_2012.csv',
 'industriesPerCounty_arkansas_2012.csv',
 'industriesPerCounty_rhodeisland_2012.csv',
 'industriesPerCounty_virginia_2012.csv',
 'industriesPerCounty_kansas_2012.csv',
 'industriesPerCounty_georgia_2012.csv',
 'industriesPerCounty_wyoming_2012.csv',
 'industriesPerCounty_wisconsin_2012.csv',
 'industriesPerCounty_vermont_2012.csv',
 'industriesPerCounty_utah_2012.csv',
 'industriesPerCounty_newyork_2012.csv',
 'industriesPerCounty_southcarolina_2012.csv',
 'industriesPerCounty_alaska_2012.csv',
 'industriesPerCounty_louisiana_2012.csv',
 'industriesPerCounty_massachusetts_2012.csv',
 'industriesPerCounty_tennessee_2012.csv',
 'industriesPerCounty_pennsylvania_2012.csv',
 'industriesPerCounty_minnesota_2012.csv',
 'industriesPerCounty_california_2012.csv',
 'industriesPerCounty_arizona_2012.csv',
 'industriesPerC

In [48]:
# Load the 2012 data

def load_all_states(bea_data_dir):

    files = [f for f in bea_data_dir.iterdir() if '_2012' in f.name]
    
    for f in files:
        
        df = pd.read_csv(f).drop("Unnamed: 11", axis=1)
        df['NAICS_Sector'] = df.NAICS2012.apply(lambda v: v[:2]).apply(int)

        yield df
    
df = pd.concat(load_all_states(bea_data_dir))

# test = list(load_all_states(bea_data_dir))

# df

In [47]:
files

[PosixPath('/Users/merrelbook/Projects/HealthDataVizGA/DataVisualAnalytics_Industries-cancer/data_raw/BEA_Industry_Factors/county_level/industriesPerCounty_colorado_2012.csv'),
 PosixPath('/Users/merrelbook/Projects/HealthDataVizGA/DataVisualAnalytics_Industries-cancer/data_raw/BEA_Industry_Factors/county_level/industriesPerCounty_southdakota_2012.csv'),
 PosixPath('/Users/merrelbook/Projects/HealthDataVizGA/DataVisualAnalytics_Industries-cancer/data_raw/BEA_Industry_Factors/county_level/industriesPerCounty_northcarolina_2012.csv'),
 PosixPath('/Users/merrelbook/Projects/HealthDataVizGA/DataVisualAnalytics_Industries-cancer/data_raw/BEA_Industry_Factors/county_level/industriesPerCounty_delaware_2012.csv'),
 PosixPath('/Users/merrelbook/Projects/HealthDataVizGA/DataVisualAnalytics_Industries-cancer/data_raw/BEA_Industry_Factors/county_level/industriesPerCounty_arkansas_2012.csv'),
 PosixPath('/Users/merrelbook/Projects/HealthDataVizGA/DataVisualAnalytics_Industries-cancer/data_raw/BEA_I

In [49]:
# [_ for _ in df.NAICS2012.unique() if _[:2] == '11']

df.state.unique()

array([ 8, 46, 37, 10,  5, 44, 51, 20, 13, 56, 55, 50, 49, 36, 45,  2, 22,
       25, 47, 42, 27,  6,  4, 26, 38, 16, 48, 19,  9, 53, 54, 30, 23, 18,
       41, 34, 17, 24, 28,  1, 39, 12, 31, 33, 21, 29, 35, 40, 15, 32])

### Process FIPS Code

FIPS is the federal/census unique ID for each geographic area.  States have 2 digives and counties have 5

In [50]:
# Process FIPS code
df['fips'] = df.GEO_ID.apply(lambda GID: GID.split('US')[1])
# # Remove the state_level data
# df_state = df[df['fips'].str.len() == 2]
# df_county = df[df['fips'].str.len() != 2]

def county_level(df):
    return df[df['fips'].str.len() == 5]

def state_level(df):
    return df[df['fips'].str.len() == 5]

### Group data by NAICS Sector

NAICS is the North American Industry Classification System. The coarsest level of classification is the *Sector*.

The organization of NAICS is as follows:  <-- from [this page](https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html) on census.gov
- Sector: 2-digit code
    - Subsector: 3-digit code
        - Industry Group: 4-digit code
            - NAICS Industry: 5-digit code
                - National Industry: 6-digit code

Start by grouping the data by sector:

In [51]:
def naics_level(df, naics_level):
    return df[df['NAICS2012'].str.len() == naics_level]

In [52]:
df.head()
# df_county['NAICS2012'].apply(lambda v: v.split('-')[0])

df_naics_2 = naics_level(df, 2).reset_index(drop=True)
df_naics_3 = naics_level(df, 3).reset_index(drop=True)
df_naics_4 = naics_level(df, 4).reset_index(drop=True)
df_naics_5 = naics_level(df, 5).reset_index(drop=True)
df_naics_6 = naics_level(df, 6).reset_index(drop=True)

# First remove all the per-county totals across all sectors
df_totals = df_naics_2[df_naics_2.NAICS2012 == '00']
df_naics_2 = df_naics_2[df_naics_2.NAICS2012 != '00']

In [53]:


# Now extract all the top-level sectors
# mask = (df['NAICS2012'].str.len() == 2) | (df['NAICS2012'].str.contains('-'))
# df[mask]
# df.query('fips==13001 & NAICS_Sector==32')

In [54]:
county_level(df_naics_2)

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2012,NAICS2012_TTL,ESTAB,EMP,PAYANN,state,county,NAICS_Sector,fips
1,0500000US08001,"Adams County, Colorado",1,2012,11,"Agriculture, forestry, fishing and hunting",7,0,323,8,1,11,08001
2,0500000US08001,"Adams County, Colorado",1,2012,21,"Mining, quarrying, and oil and gas extraction",34,346,28632,8,1,21,08001
3,0500000US08001,"Adams County, Colorado",1,2012,22,Utilities,19,0,0,8,1,22,08001
4,0500000US08001,"Adams County, Colorado",1,2012,23,Construction,1111,14050,712224,8,1,23,08001
5,0500000US08001,"Adams County, Colorado",1,2012,42,Wholesale trade,727,14970,868645,8,1,42,08001
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51902,0500000US32510,"Carson City, Nevada",510,2012,62,Health care and social assistance,227,3655,186052,32,510,62,32510
51903,0500000US32510,"Carson City, Nevada",510,2012,71,"Arts, entertainment, and recreation",49,1164,29021,32,510,71,32510
51904,0500000US32510,"Carson City, Nevada",510,2012,72,Accommodation and food services,159,2742,41412,32,510,72,32510
51905,0500000US32510,"Carson City, Nevada",510,2012,81,Other services (except public administration),145,811,22399,32,510,81,32510


In [55]:
df_naics_5[df_naics_5.fips=='13001']

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2012,NAICS2012_TTL,ESTAB,EMP,PAYANN,state,county,NAICS_Sector,fips
107107,0500000US13001,"Appling County, Georgia",1,2012,11331,Logging,11,61,2256,13,1,11,13001
107108,0500000US13001,"Appling County, Georgia",1,2012,11511,Support activities for crop production,1,0,0,13,1,11,13001
107109,0500000US13001,"Appling County, Georgia",1,2012,22111,Electric power generation,1,0,0,13,1,22,13001
107110,0500000US13001,"Appling County, Georgia",1,2012,22112,"Electric power transmission, control, and dist...",6,0,0,13,1,22,13001
107111,0500000US13001,"Appling County, Georgia",1,2012,23611,Residential building construction,3,0,0,13,1,23,13001
...,...,...,...,...,...,...,...,...,...,...,...,...,...
107257,0500000US13001,"Appling County, Georgia",1,2012,81299,All other personal services,1,0,0,13,1,81,13001
107258,0500000US13001,"Appling County, Georgia",1,2012,81311,Religious organizations,23,94,1669,13,1,81,13001
107259,0500000US13001,"Appling County, Georgia",1,2012,81331,Social advocacy organizations,1,0,0,13,1,81,13001
107260,0500000US13001,"Appling County, Georgia",1,2012,81341,Civic and social organizations,1,0,0,13,1,81,13001


---

In [56]:
# # Load NAICS 2007--> 2012 concordance
# naics_keys = pd.read_csv(data_dir / '2007_to_2012_NAICS.csv', 
#                          header=4, 
#                          usecols=['2007 NAICS Code','2007 NAICS Title','2012 NAICS Code','2012 NAICS Title'])

In [57]:
# naics_keys

In [58]:
# # Load USEEIO --> NAICS Concordance
# useeio_keys = pd.read_csv(data_dir / 'USEEIO-NAICS-Concordance_2012.csv', header=5).loc[0:958,:]

---
### Join USEEIO and BEA data

In [59]:
# Load USEEIO Indicators matrix

indicator_list = ['ACID','ENRG','ETOX','EUTR','FOOD','GCC','HAPS','HAZW','HC','HNC','HRSP','HTOX','JOBS','LAND','METL','MINE','MSW','NREN','OZON','PEST','REN','SMOG','VADD','WATR']

indicators = pd.read_csv(data_dir / 'USEEIO_Industry_Indicators' / 'indicators.csv')

# Filter out 'rous' duplicates
indicators = indicators[indicators.industry_region == 'us']

# REMOVE Construction industry due to incompatible subsets
indicators = indicators[~indicators['relavant_naics_codes'].str.contains('\*')]

# Reset the index
indicators = indicators.reset_index(drop=True)

# Convert column 'relavant_naics_codes' fmor strinm to list of integers
conv_str_list = lambda x: [int(_) for _ in x.strip('][').split(', ')]
indicators.loc[:, 'relavant_naics_codes'] = indicators.loc[:, 'relavant_naics_codes'].apply(conv_str_list)

indicators.head()

Unnamed: 0,industry_code,industry_detail,industry_region,ACID,ENRG,ETOX,EUTR,FOOD,GCC,HAPS,...,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR,Related 2012 NAICS Codes,relavant_naics_codes
0,1111a0,"fresh soybeans, canola, flaxseeds, and other o...",us,0.003038,0.0,31.608836,0.011122,2.6e-05,0.558816,2.5e-05,...,9.9e-05,0.0,0.0,0.000366175,0.0,0.022389,0.471064,0.002513,11111-2,"[11111, 11112]"
1,1111b0,"fresh wheat, corn, rice, and other grains",us,0.020069,0.0,12.615068,0.032458,0.000122,1.97852,6.7e-05,...,0.000472,0.0,2.278307e-09,0.0006212053,0.0,0.055404,-0.002548,0.460768,"11113-6, 11119","[11113, 11114, 11115, 11116, 11119]"
2,111200,"fresh vegetables, melons, and potatoes",us,0.007174,0.003225,28.396543,0.006305,0.001194,0.457168,0.000285,...,0.004637,0.0,7.327268e-06,0.001075716,0.003225,0.021458,0.604597,0.591335,1112,[1112]
3,111300,fresh fruits and tree nuts,us,0.003656,0.0,20.718815,0.000275,0.001945,0.286005,3.8e-05,...,0.007557,0.0,1.289248e-06,0.0004691275,0.0,0.018263,0.605345,0.456172,1113,[1113]
4,111400,"greenhouse crops, mushrooms, nurseries, and fl...",us,0.000952,5e-06,0.000677,0.000118,0.001786,0.70975,4.4e-05,...,0.006971,0.0,8.190052e-10,3.562148e-14,5e-06,0.0378,0.53402,0.046371,1114,[1114]


In [60]:
for naic in indicators.loc[10, 'relavant_naics_codes']:
#     print(naic)

    test = df[df.NAICS2012 == str(naic)]

test

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2012,NAICS2012_TTL,ESTAB,EMP,PAYANN,state,county,NAICS_Sector,fips
2,0500000US08001,"Adams County, Colorado",1,2012,113,Forestry and logging,1,0,0,8,1,11,08001
3615,0500000US08007,"Archuleta County, Colorado",7,2012,113,Forestry and logging,1,0,0,8,7,11,08007
4569,0500000US08013,"Boulder County, Colorado",13,2012,113,Forestry and logging,3,0,0,8,13,11,08013
8613,0500000US08023,"Costilla County, Colorado",23,2012,113,Forestry and logging,1,0,0,8,23,11,08023
8854,0500000US08027,"Custer County, Colorado",27,2012,113,Forestry and logging,1,0,0,8,27,11,08027
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39039,0500000US40143,"Tulsa County, Oklahoma",143,2012,113,Forestry and logging,1,0,0,40,143,11,40143
2,0500000US15001,"Hawaii County, Hawaii",1,2012,113,Forestry and logging,1,0,0,15,1,11,15001
599,0500000US32003,"Clark County, Nevada",3,2012,113,Forestry and logging,1,0,0,32,3,11,32003
2223,0500000US32005,"Douglas County, Nevada",5,2012,113,Forestry and logging,1,0,0,32,5,11,32005


In [61]:
available_naics = []

for idx, row in indicators.iterrows():
    for naic in row.relavant_naics_codes:
        if str(naic) in df.NAICS2012.unique():
            available_naics.append(naic)

In [62]:
# iterate through the counties
for fips in county_level(df).fips.unique():
    # Get county data
    
    df_county = df[df.fips==fips]

In [63]:
df_county

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2012,NAICS2012_TTL,ESTAB,EMP,PAYANN,state,county,NAICS_Sector,fips
9137,0500000US32510,"Carson City, Nevada",510,2012,115,Support activities for agriculture and forestry,1,0,0,32,510,11,32510
9144,0500000US32510,"Carson City, Nevada",510,2012,00,Total for all sectors,1971,21485,835579,32,510,0,32510
9145,0500000US32510,"Carson City, Nevada",510,2012,11,"Agriculture, forestry, fishing and hunting",5,0,0,32,510,11,32510
9146,0500000US32510,"Carson City, Nevada",510,2012,114,"Fishing, hunting and trapping",4,0,0,32,510,11,32510
9147,0500000US32510,"Carson City, Nevada",510,2012,1141,Fishing,3,0,0,32,510,11,32510
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10272,0500000US32510,"Carson City, Nevada",510,2012,813930,Labor unions and similar labor organizations,1,0,0,32,510,81,32510
10273,0500000US32510,"Carson City, Nevada",510,2012,81394,Political organizations,1,0,0,32,510,81,32510
10274,0500000US32510,"Carson City, Nevada",510,2012,813940,Political organizations,1,0,0,32,510,81,32510
10275,0500000US32510,"Carson City, Nevada",510,2012,81399,"Other similar organizations (except business, ...",4,0,0,32,510,81,32510


In [64]:
entries = []

unique_fips_list = county_level(df).fips.unique()
n_unique_fips = len(unique_fips_list)

# iterate through the counties
for fips in unique_fips_list:
    # Get county data
    
    df_county = df[df.fips==fips]


    available_naics = []
    available_industry_codes = []

    for idx, row in indicators.iterrows():
        for naic in row.relavant_naics_codes:
            if str(naic) in df_county.NAICS2012.unique():
                available_naics.append(naic)
                if row.industry_code not in available_industry_codes:
                    available_industry_codes.append(row.industry_code)
                    

    for code in available_industry_codes:
        df_ind = indicators.set_index('industry_code').loc[code]

        # Make the new row
        payann_total = 0
        for naic in df_ind.relavant_naics_codes:
            # pull county data
            test = df_county[df_county.NAICS2012 == str(naic)]
            try:
                payann_total += test.PAYANN.values[0]
            except IndexError:
                continue


        new_entry = {
            'fips': df_county.iloc[0].fips,
            'county': df_county.iloc[0].COUNTY,
            'name': df_county.iloc[0].GEO_TTL,
            'industry_code': df_ind.name,
            'industry_detail': df_ind.industry_detail,
            'relevant_naics': df_ind.relavant_naics_codes,
            'year': df_county.iloc[0].YEAR,
            'payann_total': payann_total,
        }

        for indicator_name, indicator_rate in df_ind[indicator_list].iteritems():
            new_entry.update({
                indicator_name: indicator_rate * payann_total
            })

        entries.append(new_entry)

In [65]:
# available_naics

code = available_industry_codes[1]

# row

In [66]:
# indicator_list

In [67]:
# df_ind = indicators.set_index('industry_code').loc[code]
# # df_ind[indicator_list]
# df_ind

In [68]:
# entries = []

# for code in available_industry_codes:
#     df_ind = indicators.set_index('industry_code').loc[code]

#     # Make the new row
#     payann_total = 0
#     for naic in df_ind.relavant_naics_codes:
#         # pull county data
#         test = df_county[df_county.NAICS2012 == str(naic)]
#         try:
#             payann_total += test.PAYANN.values[0]
#         except IndexError:
#             continue

        
#     new_entry = {
#         'fips': df_county.iloc[0].fips,
#         'county': df_county.iloc[0].COUNTY,
#         'name': df_county.iloc[0].GEO_TTL,
#         'industry_code': df_ind.name,
#         'industry_detail': df_ind.industry_detail,
#         'relevant_naics': df_ind.relavant_naics_codes,
#         'year': df_county.iloc[0].YEAR,
#         'payann_total': payann_total,
#     }

#     for indicator_name, indicator_rate in df_ind[indicator_list].iteritems():
#         new_entry.update({
#             indicator_name: indicator_rate * payann_total
#         })
        
#     entries.append(new_entry)

In [69]:
results = pd.DataFrame(data=entries)

In [70]:
results.shape

(306032, 32)

In [71]:
results.head()

Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,year,payann_total,ACID,ENRG,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],2012,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],2012,290,0.537394,0.95877,...,8e-05,0.0,3.405586,0.0,0.000183,0.008769,0.95877,2.011564,211.423463,15.653502
2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],2012,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8001,1,"Adams County, Colorado",2122a0,"iron, gold, silver, and other metal ores","[21221, 21222, 21229]",2012,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8001,1,"Adams County, Colorado",212310,dimensional stone,[21231],2012,0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Clean data dir
clean_data_dir = pathlib.Path.cwd().parent / 'data_clean'
results.to_csv(clean_data_dir / 'indicators_per-industry_per-county.csv')

In [73]:
clean_data_dir

PosixPath('/Users/merrelbook/Projects/HealthDataVizGA/DataVisualAnalytics_Industries-cancer/data_clean')

In [107]:
# for idx in range(len(indicators)):
#     test_code = indicators.loc[idx, 'relavant_naics_codes'][0]
#     print(f"{test_code} - {str(test_code) in df['NAICS2012'].tolist()}")

In [109]:
# "113" in df['NAICS2012'].tolist()

In [108]:
# df.query('NAICS_Sector == 11')

# df_naics_3.NAICS2012.unique()
# df.query('NAICS_Sector==11').NAICS2012_TTL.unique()

In [110]:
# df.query('NAICS_Sector == 11').NAICS2012.unique()

In [103]:
# df.NAICS2012.apply(int)