In [1]:
import pathlib
import pandas as pd

In [22]:
# Load the raw data

data_dir = pathlib.Path.cwd().parent / 'data_raw'
bea_data_dir = data_dir / 'BEA_Industry_Factors' / 'county_level'
[f.name for f in bea_data_dir.iterdir()]

['.DS_Store',
 'industriesPerCountyGA-2016.csv',
 'industriesPerCountyGA-2017.csv',
 'industriesPerCountyGA-2015.csv',
 'industriesPerCountyGA-2000.csv',
 'industriesPerCountyGA-2014.csv',
 'industriesPerCountyGA-2007.csv',
 'industriesPerCountyGA-2013.csv',
 'industriesPerCountyGA-2012.csv']

In [3]:
# Load the 2012 data
df = pd.read_csv(bea_data_dir/'industriesPerCountyGA-2012.csv').drop("Unnamed: 11", axis=1)
df.head()

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2012,NAICS2012_TTL,ESTAB,EMP,PAYANN,state,county
0,0500000US13001,"Appling County, Georgia",1,2012,0,Total for all sectors,380,5369,220875,13,1
1,0500000US13001,"Appling County, Georgia",1,2012,11,"Agriculture, forestry, fishing and hunting",12,84,4034,13,1
2,0500000US13001,"Appling County, Georgia",1,2012,113,Forestry and logging,11,61,2256,13,1
3,0500000US13001,"Appling County, Georgia",1,2012,1133,Logging,11,61,2256,13,1
4,0500000US13001,"Appling County, Georgia",1,2012,11331,Logging,11,61,2256,13,1


### Process FIPS Code

FIPS is the federal/census unique ID for each geographic area.  States have 2 digives and counties have 5

In [4]:
# Process FIPS code
df['fips'] = df.GEO_ID.apply(lambda GID: GID.split('US')[1])
# # Remove the state_level data
# df_state = df[df['fips'].str.len() == 2]
# df_county = df[df['fips'].str.len() != 2]

def county_level(df):
    return df[df['fips'].str.len() == 5]

def state_level(df):
    return df[df['fips'].str.len() == 5]

### Group data by NAICS Sector

NAICS is the North American Industry Classification System. The coarsest level of classification is the *Sector*.

The organization of NAICS is as follows:  <-- from [this page](https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html) on census.gov
- Sector: 2-digit code
    - Subsector: 3-digit code
        - Industry Group: 4-digit code
            - NAICS Industry: 5-digit code
                - National Industry: 6-digit code

Start by grouping the data by sector:

In [9]:
def naics_level(df, naics_level):
    return df[df['NAICS2012'].str.len() == naics_level]

In [10]:
df['NAICS_Sector'] = df.NAICS2012.apply(lambda v: v[:2]).apply(int)
df.head()
# df_county['NAICS2012'].apply(lambda v: v.split('-')[0])

df_naics_2 = naics_level(df, 2).reset_index(drop=True)
df_naics_3 = naics_level(df, 3).reset_index(drop=True)
df_naics_4 = naics_level(df, 4).reset_index(drop=True)
df_naics_5 = naics_level(df, 5).reset_index(drop=True)
df_naics_6 = naics_level(df, 6).reset_index(drop=True)

# First remove all the per-county totals across all sectors
df_totals = df_naics_2[df_naics_2.NAICS2012 == '00']
df_naics_2 = df_naics_2[df_naics_2.NAICS2012 != '00']

In [11]:


# Now extract all the top-level sectors
# mask = (df['NAICS2012'].str.len() == 2) | (df['NAICS2012'].str.contains('-'))
# df[mask]
# df.query('fips==13001 & NAICS_Sector==32')

In [13]:
# county_level(df_naics_2)

In [18]:
df_naics_2[df_naics_2.fips=='13001']

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2012,NAICS2012_TTL,ESTAB,EMP,PAYANN,state,county,fips,NAICS_Sector
1,0500000US13001,"Appling County, Georgia",1,2012,11,"Agriculture, forestry, fishing and hunting",12,84,4034,13,1,13001,11
2,0500000US13001,"Appling County, Georgia",1,2012,22,Utilities,7,0,0,13,1,13001,22
3,0500000US13001,"Appling County, Georgia",1,2012,23,Construction,39,380,11920,13,1,13001,23
4,0500000US13001,"Appling County, Georgia",1,2012,42,Wholesale trade,24,137,4593,13,1,13001,42
5,0500000US13001,"Appling County, Georgia",1,2012,51,Information,4,0,0,13,1,13001,51
6,0500000US13001,"Appling County, Georgia",1,2012,52,Finance and insurance,20,152,4973,13,1,13001,52
7,0500000US13001,"Appling County, Georgia",1,2012,53,Real estate and rental and leasing,10,30,603,13,1,13001,53
8,0500000US13001,"Appling County, Georgia",1,2012,54,"Professional, scientific, and technical services",20,54,1665,13,1,13001,54
9,0500000US13001,"Appling County, Georgia",1,2012,55,Management of companies and enterprises,1,0,0,13,1,13001,55
10,0500000US13001,"Appling County, Georgia",1,2012,56,Administrative and support and waste managemen...,4,0,0,13,1,13001,56


---

In [31]:
# Load NAICS 2007--> 2012 concordance
naics_keys = pd.read_csv(data_dir / '2007_to_2012_NAICS.csv', 
                         header=4, 
                         usecols=['2007 NAICS Code','2007 NAICS Title','2012 NAICS Code','2012 NAICS Title'])

In [32]:
naics_keys

Unnamed: 0,2007 NAICS Code,2007 NAICS Title,2012 NAICS Code,2012 NAICS Title
0,111110,Soybean Farming,111110,Soybean Farming
1,111120,Oilseed (except Soybean) Farming,111120,Oilseed (except Soybean) Farming
2,111130,Dry Pea and Bean Farming,111130,Dry Pea and Bean Farming
3,111140,Wheat Farming,111140,Wheat Farming
4,111150,Corn Farming,111150,Corn Farming
...,...,...,...,...
1179,926140,Regulation of Agricultural Marketing and Commo...,926140,Regulation of Agricultural Marketing and Commo...
1180,926150,"Regulation, Licensing, and Inspection of Misce...",926150,"Regulation, Licensing, and Inspection of Misce..."
1181,927110,Space Research and Technology,927110,Space Research and Technology
1182,928110,National Security,928110,National Security


In [103]:
# Load USEEIO --> NAICS Concordance
useeio_keys = pd.read_csv(data_dir / 'USEEIO-NAICS-Concordance_2012.csv', header=5).loc[0:958,:]

In [147]:
def expand_keys(key_string):
    
    keys = []
    
    for key_subset in key_string.split(','):

        key_subset = key_subset.strip()
        try:
            start = key_subset.split('-')[0]
            end = key_subset.split('-')[1]

            try:
                key = int(start)
                keys.append(key)

                while str(key)[-1] != end:
                    key += 1
                    keys.append(key)
                    
            except ValueError:
                keys.append('*')
                
        except IndexError:
            try:
                keys.append(int(key_subset))
            except ValueError:
                keys.append('*')
                
    return keys

Detail                                                   1111B0
Related 2012 NAICS Codes                         11113-6, 11119
relavant_naics_codes        [11113, 11114, 11115, 11116, 11119]
Name: 1, dtype: object

In [150]:
# useeio_keys
useeio_naic_concordance = useeio_keys[['Detail', 'Related 2012 NAICS Codes']].dropna().reset_index(drop=True)
useeio_naic_concordance['relavant_naics_codes'] = useeio_naic_concordance.apply(lambda x: expand_keys(x["Related 2012 NAICS Codes"]), axis=1)

In [151]:
useeio_naic_concordance

Unnamed: 0,Detail,Related 2012 NAICS Codes,relavant_naics_codes
0,1111A0,11111-2,"[11111, 11112]"
1,1111B0,"11113-6, 11119","[11113, 11114, 11115, 11116, 11119]"
2,111200,1112,[1112]
3,111300,1113,[1113]
4,111400,1114,[1114]
...,...,...,...
389,812900,8129,[8129]
390,813100,8131,[8131]
391,813A00,"8132, 8133","[8132, 8133]"
392,813B00,"8134, 8139","[8134, 8139]"
