# Data Preparation

This notebook processes and prepares data related to **country**, **states**, and **counties** for analysis and visualization.


## County-Level Data (Retrieval, Process) NAICS Files

This notebook processes County NAICS data and is a filtered and edited version of `us_econ_old.ipynb` designed to generate yearly data starting from 2017.

### Important Notes
- After running the notebook, please delete the `data_raw` folder.
- The zip level is not available. For reference, visit the [Census API Variables](https://api.census.gov/data/2011/cbp/variables.html).
- For earlier code that generates averaged data over multiple years, refer to `us_econ_old.ipynb`.
- The output will be located in the `community-data` repository, specifically within the `industries/{<i>variable</i>}-update` folders. 
- We will manually copy the results to `community-data/industries/{<i>variable</i>}` folders.

In [1]:
import csv
import requests as r
import pandas as pd
import zipfile, io
import os
from tqdm import tqdm
import pathlib
import datetime
import requests as r
from pathlib import Path
import os
import pickle
import time

endyear = datetime.date.today().year
api_headers = {}
api_headers['x-api-key'] = '975f39a54e48438ceebf303d6018e34db212e804'

In [2]:
# Set a relative location to save the data from the request
repo_dir = pathlib.Path().cwd()
#print(repo_dir)

raw_data_dir = repo_dir / 'data_raw'
out_data_dir = raw_data_dir / 'BEA_Industry_Factors'
    
county_data_dir = out_data_dir / 'county_level'
if not county_data_dir.exists():
    county_data_dir.mkdir(parents=True)


In [3]:
# Load the state FIPS codes key
state_fips = pd.read_csv('../../../community-data/us/id_lists/state_fips.csv', usecols=['Name', 'Postal Code', 'FIPS'])
state_fips = state_fips.head(50)  # <-- limit to only US states, not teritories

### Retrieve County Data from API

In [6]:
# Base URL for the API call
base_url = "https://api.census.gov/data"

#
# NOTE Years Prior to 2012 Currently have a bug when specifying NAICS#### as one of the columns
#      - stick to 2012 and later for now
#

def get_county_cbp(fips, state, years):
    count = 0
    for year in years:
        print(f"Getting data for state: {state}\tyear: {year}")
        if year >= 2000 and year <= 2002:
            columns_to_select = "GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS1997_TTL,ESTAB,EMP,PAYANN"
            url = f"{base_url}/{year}/cbp?get={columns_to_select}&for=county:*&in=state:{fips:02d}"
        elif year >= 2003 and year <=2007:
            columns_to_select = "GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2002_TTL,ESTAB,EMP,PAYANN"
            url = f"{base_url}/{year}/cbp?get={columns_to_select}&for=county:*&in=state:{fips:02d}"
        elif year >= 2008 and year <= 2011:
            columns_to_select = "GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2007_TTL,ESTAB,EMP,PAYANN"
            url = f"{base_url}/{year}/cbp?get={columns_to_select}&for=county:*&in=state:{fips:02d}"
        elif year >= 2012 and year <= 2016:
            columns_to_select = "GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2012,NAICS2012_TTL,ESTAB,EMP,PAYANN"
            url = f"{base_url}/{year}/cbp?get={columns_to_select}&for=county:*&in=state:{fips:02d}"
        elif year >= 2017 and year <= 2021:
            columns_to_select = "GEO_ID,NAME,COUNTY,YEAR,NAICS2017,NAICS2017_LABEL,ESTAB,EMP,PAYANN"
            url = f"{base_url}/{year}/cbp?get={columns_to_select}&for=county:*&in=state:{fips:02d}"
    
    
        response = r.get(url, headers=api_headers)

        with open(county_data_dir / f"industriesPerCounty_{str.lower(state.replace(' ', ''))}_{year}.csv",'w') as resultPath:
            for line in response.text.strip().split('\n'):
                line=line.replace('[',"").replace(']',"")
                resultPath.write(line + "\n")

        print("  > Finished CSV for year"+str(year))

In [7]:
#Years Initialization for data generation
startyear = 2017
endyear = 2021

In [None]:
for fips in state_fips.FIPS.unique():
     state = state_fips.query(f'FIPS=={fips}').values[0][0]
     years=range(startyear, endyear+1)
     get_county_cbp(fips, state, years)

## Data Aggregation
This part allows us to manage different Fips level (county/state) and different NAICS level (sector/industry/etc...)  
Trick: Getting the NAICS code from all the NAICS files that we downloaded

In [7]:
# Load the data from startyear

def load_all_states(bea_data_dir):
    
    for i in range(startyear,endyear+1):

        x="_"+str(i)
        files = [f for f in bea_data_dir.iterdir() if x in f.name]

        for f in files:

            # variable selection based on census year
            naics_str = "NAICS2012" if i < 2017 else "NAICS2017"
            naics_ttl = "NAICS2012_TTL" if i < 2017 else "NAICS2017_LABEL"
            geo_ttl = "GEO_TTL" if i < 2017 else "NAME"

            df = pd.read_csv(f,encoding='latin-1',dtype={naics_str: str})
            if 'Unnamed: 11' in df.columns:
                df=df.drop("Unnamed: 11", axis=1)
            if 'Unnamed: 10' in df.columns:
                df=df.drop("Unnamed: 10", axis=1)

            # renaming columns so similar data from 2012 census & 2017 census are entered into appropriate columns
            df = df.rename(columns={"fips": "id", naics_str: "relevant_naics","EMP":"emp","PAYANN":"payann","ESTAB":"estab", naics_ttl:"NAICS_TTL", geo_ttl:"GEO_TTL"})
            naics_str = "relevant_naics"
            naics_ttl = "NAICS_TTL"
            geo_ttl = "GEO_TTL"

            df['is5'] = df[naics_str].apply(lambda x: 'True' if len(x) == 5 else 'False')

            df.loc[(df['is5'] == 'True') & (df[naics_str].apply(lambda v: v[2:3]) == '-'), 'NAICS_Sector'] = df[naics_str]
            df.loc[(df['is5'] == 'True') & (df[naics_str].apply(lambda v: v[2:3]) != '-'), 'NAICS_Sector'] = df[naics_str].apply(lambda v: v[:2])
            df.loc[(df['is5'] == 'False') , 'NAICS_Sector'] = df[naics_str].apply(lambda v: v[:2])

            yield df
    
df = pd.concat(load_all_states(county_data_dir)).drop("is5", axis=1)

#df

In [8]:
df=df.drop("county", axis=1)

### Process FIPS Code

FIPS (Federal Information Processing Standards) codes are unique identifiers assigned to geographic areas. States are represented by 2-digit codes, while counties are represented by 5-digit codes.

In [9]:
# Process FIPS code
df['fips'] = df.GEO_ID.apply(lambda GID: GID.split('US')[1])

def county_level(df):
    return df[df['id'].str.len() == 5]

def state_level(df):
    return df[df['id'].str.len() == 2]

In [10]:
# NOTE If this block is run please delete the generated file before pushing into repo (file size too large)
#df.to_csv("allll.csv")

### Renaming Columns for Aggregate DataFrame

Please note that we are no longer averaging data across all years. This functionality is retained in the original `us_econ_old.ipynb` notebook.

In [11]:
newDF = df.rename(columns={"fips": "id","EMP":"emp","PAYANN":"payann","ESTAB":"estab"})
newDF

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,relevant_naics,NAICS_TTL,estab,emp,payann,state,NAICS_Sector,id
0,0500000US51127,"New Kent County, Virginia",127,2017,237310,"Highway, street, and bridge construction",4,47,4387,51,23,51127
1,0500000US51127,"New Kent County, Virginia",127,2017,238,Specialty trade contractors,58,295,15102,51,23,51127
2,0500000US51127,"New Kent County, Virginia",127,2017,2381,"Foundation, structure, and building exterior c...",12,79,6415,51,23,51127
3,0500000US51127,"New Kent County, Virginia",127,2017,23814,Masonry contractors,6,24,1010,51,23,51127
4,0500000US51127,"New Kent County, Virginia",127,2017,238140,Masonry contractors,6,24,1010,51,23,51127
...,...,...,...,...,...,...,...,...,...,...,...,...
2922,0500000US10005,"Sussex County, Delaware",5,2021,813910,Business associations,11,33,1793,10,81,10005
2923,0500000US10005,"Sussex County, Delaware",5,2021,81393,Labor unions and similar labor organizations,3,8,72,10,81,10005
2924,0500000US10005,"Sussex County, Delaware",5,2021,81399,"Other similar organizations (except business, ...",20,58,2657,10,81,10005
2925,0500000US10005,"Sussex County, Delaware",5,2021,813990,"Other similar organizations (except business, ...",20,58,2657,10,81,10005


In [12]:
newDF.tail(50)

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,relevant_naics,NAICS_TTL,estab,emp,payann,state,NAICS_Sector,id
2877,0500000US10005,"Sussex County, Delaware",5,2021,81141,Home and garden equipment and appliance repair...,6,14,548,10,81,10005
2878,0500000US10005,"Sussex County, Delaware",5,2021,811412,Appliance repair and maintenance,5,14,535,10,81,10005
2879,0500000US10005,"Sussex County, Delaware",5,2021,81142,Reupholstery and furniture repair,5,5,325,10,81,10005
2880,0500000US10005,"Sussex County, Delaware",5,2021,811420,Reupholstery and furniture repair,5,5,325,10,81,10005
2881,0500000US10005,"Sussex County, Delaware",5,2021,81149,Other personal and household goods repair and ...,10,23,866,10,81,10005
2882,0500000US10005,"Sussex County, Delaware",5,2021,811490,Other personal and household goods repair and ...,10,23,866,10,81,10005
2883,0500000US10005,"Sussex County, Delaware",5,2021,812,Personal and laundry services,172,744,26470,10,81,10005
2884,0500000US10005,"Sussex County, Delaware",5,2021,8121,Personal care services,118,519,17266,10,81,10005
2885,0500000US10005,"Sussex County, Delaware",5,2021,81211,"Hair, nail, and skin care services",94,442,15280,10,81,10005
2886,0500000US10005,"Sussex County, Delaware",5,2021,812111,Barber shops,4,1,390,10,81,10005


### Group Data by NAICS Sector

The North American Industry Classification System (NAICS) categorizes industries, with the coarsest level of classification being the *Sector*.

The organization of NAICS is as follows (from [this page](https://www.census.gov/programs-surveys/economic-census/guidance/understanding-naics.html) on census.gov):
- **Sector**: 2-digit code
    - **Subsector**: 3-digit code
        - **Industry Group**: 4-digit code
            - **NAICS Industry**: 5-digit code
                - **National Industry**: 6-digit code

We will begin by grouping the data by sector:


In [13]:
def naics_level(df, naics_level):
    return df[df['relevant_naics'].str.len() == naics_level]

In [14]:
df_naics_2 = naics_level(newDF, 2).reset_index(drop=True)
df_naics_3 = naics_level(newDF, 3).reset_index(drop=True)
df_naics_4 = naics_level(newDF, 4).reset_index(drop=True)
df_naics_5 = naics_level(newDF, 5).reset_index(drop=True)
df_naics_6 = naics_level(newDF, 6).reset_index(drop=True)

df_naics_2 = df_naics_2[df_naics_2.relevant_naics != '00']
df_naics_3 = df_naics_3[df_naics_3.relevant_naics != '00']
df_naics_4 = df_naics_4[df_naics_4.relevant_naics != '00']
df_naics_5 = df_naics_5[df_naics_5.relevant_naics != '00']
df_naics_6 = df_naics_6[df_naics_6.relevant_naics != '00']

In [15]:
#s2=state_level(df_naics_2)
c2=county_level(df_naics_2)
#s3=state_level(df_naics_3)
#c3=county_level(df_naics_3)
#s4=state_level(df_naics_4)
c4=county_level(df_naics_4)
#s5=state_level(df_naics_5)
#c5=county_level(df_naics_5)
s6=state_level(df_naics_6)
#c6=county_level(df_naics_6)

In [16]:
newDF

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,relevant_naics,NAICS_TTL,estab,emp,payann,state,NAICS_Sector,id
0,0500000US51127,"New Kent County, Virginia",127,2017,237310,"Highway, street, and bridge construction",4,47,4387,51,23,51127
1,0500000US51127,"New Kent County, Virginia",127,2017,238,Specialty trade contractors,58,295,15102,51,23,51127
2,0500000US51127,"New Kent County, Virginia",127,2017,2381,"Foundation, structure, and building exterior c...",12,79,6415,51,23,51127
3,0500000US51127,"New Kent County, Virginia",127,2017,23814,Masonry contractors,6,24,1010,51,23,51127
4,0500000US51127,"New Kent County, Virginia",127,2017,238140,Masonry contractors,6,24,1010,51,23,51127
...,...,...,...,...,...,...,...,...,...,...,...,...
2922,0500000US10005,"Sussex County, Delaware",5,2021,813910,Business associations,11,33,1793,10,81,10005
2923,0500000US10005,"Sussex County, Delaware",5,2021,81393,Labor unions and similar labor organizations,3,8,72,10,81,10005
2924,0500000US10005,"Sussex County, Delaware",5,2021,81399,"Other similar organizations (except business, ...",20,58,2657,10,81,10005
2925,0500000US10005,"Sussex County, Delaware",5,2021,813990,"Other similar organizations (except business, ...",20,58,2657,10,81,10005


Skipped the code block for `The statewide data does not include NAICS starting with 1!` from us_econ nb

## Utilities: NAICS Code to Name Translation

This section utilizes 2012 NAICS codes and industries, as the 2017 NAICS codes and industries are not available in the current crosswalk data.

**TODO**: Update the 2012 codes with the 2017 codes when the updated data becomes available.

In [17]:
NAICS_codes = pd.read_csv('../../../community-data/us/Crosswalk_MasterCrosswalk.csv', usecols=['2012_NAICS_Code', '2012_NAICS_Industry'])

In [18]:
NAICS_codes=NAICS_codes.rename(columns={"2012_NAICS_Code": "relevant_naics", "2012_NAICS_Industry": "industry_detail"})

In [19]:
NAICS_codes=NAICS_codes.dropna()

In [20]:
NAICS_codes=NAICS_codes.drop_duplicates()

In [21]:
NAICS_codes

Unnamed: 0,relevant_naics,industry_detail
0,11.0,"Agriculture, Forestry, Fishing and Hunting"
13,111.0,Crop Production
19,1111.0,Oilseed and Grain Farming
21,11111.0,Soybean Farming
22,111110.0,Soybean Farming
...,...,...
3885,9281.0,National Security and International Affairs
3887,92811.0,National Security
3888,928110.0,National Security
3889,92812.0,International Affairs


In [22]:
#adding the row for Industries not classified
NAICS_codes
new_row = {'relevant_naics':99, 'industry_detail':"Industries not classified"}

#append row to the dataframe
NAICS_codes = pd.concat([NAICS_codes, pd.DataFrame([new_row])], ignore_index=True)

In [23]:
NAICS_codes

Unnamed: 0,relevant_naics,industry_detail
0,11.0,"Agriculture, Forestry, Fishing and Hunting"
1,111.0,Crop Production
2,1111.0,Oilseed and Grain Farming
3,11111.0,Soybean Farming
4,111110.0,Soybean Farming
...,...,...
2210,92811.0,National Security
2211,928110.0,National Security
2212,92812.0,International Affairs
2213,928120.0,International Affairs


In [24]:
NAICS_codes.to_csv('../../../community-data/us/id_lists/industry_ID_list.csv')

## Utilities: Making a States JSON

In [25]:
stateFips = pd.read_csv('../../../community-data/us/id_lists/state_fips.csv')

In [26]:
stateFips=stateFips.drop(['Unnamed: 3','Unnamed: 4','Unnamed: 5','Unnamed: 6'],axis=1)

In [27]:
stateFips

Unnamed: 0,Name,Postal Code,FIPS
0,Alabama,AL,1
1,Alaska,AK,2
2,Arizona,AZ,4
3,Arkansas,AR,5
4,California,CA,6
5,Colorado,CO,8
6,Connecticut,CT,9
7,Delaware,DE,10
8,Florida,FL,12
9,Georgia,GA,13


In [28]:
#stateFips.to_json(county_data_dir/'states.json', orient = "records", date_format = "epoch", double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)

## Utilities: Making County to FIPS CSV

In [29]:
countyDF=c2[['GEO_TTL','id']].drop_duplicates()

In [30]:
countyDF

Unnamed: 0,GEO_TTL,id
0,"New Kent County, Virginia",51127
15,"Augusta County, Virginia",51015
31,"Northampton County, Virginia",51131
42,"Cumberland County, Virginia",51049
56,"Dickenson County, Virginia",51051
...,...,...
57725,"Issaquena County, Mississippi",28055
94413,"Arthur County, Nebraska",31005
118819,"Clark County, Idaho",16033
156682,"Loving County, Texas",48301


In [31]:
countyDF['hascomma'] = countyDF['GEO_TTL'].apply(lambda x: 'True' if ',' in x else 'False')
countyDF

Unnamed: 0,GEO_TTL,id,hascomma
0,"New Kent County, Virginia",51127,True
15,"Augusta County, Virginia",51015,True
31,"Northampton County, Virginia",51131,True
42,"Cumberland County, Virginia",51049,True
56,"Dickenson County, Virginia",51051,True
...,...,...,...
57725,"Issaquena County, Mississippi",28055,True
94413,"Arthur County, Nebraska",31005,True
118819,"Clark County, Idaho",16033,True
156682,"Loving County, Texas",48301,True


In [32]:
countyDF.loc[(countyDF['hascomma'] == 'True'), 'county'] = countyDF.GEO_TTL.apply(lambda GTT: GTT.split(', ')[0])
countyDF.loc[(countyDF['hascomma'] == 'True'), 'state'] = countyDF.GEO_TTL.apply(lambda GTT: GTT.split(', ')[-1])

In [33]:
countyDF=countyDF[['state','county','id']].drop_duplicates()

In [34]:
countyDF = countyDF.dropna()

In [35]:
countyDF

Unnamed: 0,state,county,id
0,Virginia,New Kent County,51127
15,Virginia,Augusta County,51015
31,Virginia,Northampton County,51131
42,Virginia,Cumberland County,51049
56,Virginia,Dickenson County,51051
...,...,...,...
57725,Mississippi,Issaquena County,28055
94413,Nebraska,Arthur County,31005
118819,Idaho,Clark County,16033
156682,Texas,Loving County,48301


In [36]:
stats = stateFips.rename(columns={"Name": "state"})

In [37]:
stats = stats.drop("FIPS",axis=1)
stats

Unnamed: 0,state,Postal Code
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA
5,Colorado,CO
6,Connecticut,CT
7,Delaware,DE
8,Florida,FL
9,Georgia,GA


In [38]:
countyDF = countyDF.merge(stats, on='state', how='left')

In [39]:
countyDF = countyDF.rename(columns={"Postal Code": "abvr"})
countyDF

Unnamed: 0,state,county,id,abvr
0,Virginia,New Kent County,51127,VA
1,Virginia,Augusta County,51015,VA
2,Virginia,Northampton County,51131,VA
3,Virginia,Cumberland County,51049,VA
4,Virginia,Dickenson County,51051,VA
...,...,...,...,...
3129,Mississippi,Issaquena County,28055,MS
3130,Nebraska,Arthur County,31005,NE
3131,Idaho,Clark County,16033,ID
3132,Texas,Loving County,48301,TX


#### Note: The variable 'countyDF' is defined but not used in later sections of the code.

## County-Level Data - Saving (**Commented**)


**Comment and uncomment the following code block with caution. This is very important!**

In [40]:
#NOTE Code block to generate individual county level files for each county. (Commented because no need of each county files)
# states = newDF.state.unique()
# #states=[13]

# df_naics_6 = df_naics_6.astype({'relevant_naics': 'string'})

# a = county_level(df_naics_2)
# b = county_level(df_naics_4)
# c = county_level(df_naics_6)

# for state in states:
#     stateName = stateFips.loc[stateFips.FIPS==state,"Postal Code"].values[0]
#     print(stateName)

#     repo_dir = pathlib.Path().cwd()
#     state_dir = repo_dir.parents[2] / 'community-data' / 'industries' / 'naics' / 'US' / 'counties-update' / stateName
    
#     if not state_dir.exists():
#         state_dir.mkdir(parents=True)
    
#     a1 = a[a.state==state]
#     b1 = b[b.state==state]
#     c1 = c[c.state==state]

#     for year in range(startyear, endyear+1):
#         print(year)

#         a1y = a1[a1.YEAR==year]
#         b1y = b1[b1.YEAR==year]
#         c1y = c1[c1.YEAR==year]

#         def save_county_data(state, df, counties, naics_level_str):
#             for county in counties:

#                 curr_df = df[df.COUNTY==county]
                
#                 county = str(county)
#                 # print(county)
#                 if len(county) == 2:
#                     county = "0" + county
#                 elif len(county) == 1:
#                     county = "00" + county

#                 state = str(state) if len(str(state)) == 2 else "0" + str(state)

#                 curr_df = curr_df.drop(["GEO_ID", "GEO_TTL", "COUNTY", "YEAR", "NAICS_TTL", "state", "NAICS_Sector"], axis=1)
#                 curr_df = curr_df.rename(columns={"id":"fips"})

#                 filename = "US" + state + county + "-" + "census-" + naics_level_str + "-" + str(year) + ".csv"
                  
                 #NOTE: Need to change output path if the nb is transferred to data-pipeline repository
#                 curr_df.to_csv(f"../../../industries/naics/US/counties-update/state-naics-update/{stateName}/{filename}")

#         c_a1 = a1.COUNTY.unique()
#         c_b1 = b1.COUNTY.unique()
#         c_c1 = c1.COUNTY.unique()

#         save_county_data(state, a1y, c_a1, "naics2")
#         save_county_data(state, b1y, c_b1, "naics4")
#         save_county_data(state, c1y, c_c1, "naics6")

In [41]:
states = newDF.state.unique()
#states=[13]

df_naics_6 = df_naics_6.astype({'relevant_naics': 'string'})

a = county_level(df_naics_2)
b = county_level(df_naics_4)
c = county_level(df_naics_6)

for state in states:
    stateName = stateFips.loc[stateFips.FIPS==state,"Postal Code"].values[0]
    print(stateName)

    repo_dir = pathlib.Path().cwd()
    # state_dir = repo_dir.parents[2] / 'us' / 'state-naics-update' / stateName
    state_dir = repo_dir.parents[2] / 'community-data' / 'industries' / 'naics' / 'US' / 'counties-update' / stateName

    
    if not state_dir.exists():
        state_dir.mkdir(parents=True)
    
    a1 = a[a.state==state]
    b1 = b[b.state==state]
    c1 = c[c.state==state]

    for year in range(startyear, endyear+1):
        print(year)

        a1y = a1[a1.YEAR==year]
        b1y = b1[b1.YEAR==year]
        c1y = c1[c1.YEAR==year]

        def save_county_data(state, df, naics_level_str):

            state = str(state) if len(str(state)) == 2 else "0" + str(state)

            curr_df = df.drop(["GEO_ID", "GEO_TTL", "COUNTY", "YEAR", "NAICS_TTL", "state", "NAICS_Sector"], axis=1)
            curr_df = curr_df.rename(columns={"id":"Fips", "relevant_naics":"Naics", "estab":"Establishments", "emp":"Employees", "payann":"Payroll"})
            curr_df = curr_df[["Fips", "Naics", "Establishments", "Employees", "Payroll"]]

            filename = "US-" + stateName + "-" + "census-" + naics_level_str + "-counties-" + str(year) + ".csv"

            curr_df.to_csv(f"../../../community-data/industries/naics/US/counties-update/{stateName}/{filename}", index=False)
        
        save_county_data(state, a1y, "naics2")
        save_county_data(state, b1y, "naics4")
        save_county_data(state, c1y, "naics6")

VA
2017
2018
2019
2020
2021
KS
2017
2018
2019
2020
2021
AR
2017
2018
2019
2020
2021
RI
2017
2018
2019
2020
2021
DE
2017
2018
2019
2020
2021
CO
2017
2018
2019
2020
2021
SD
2017
2018
2019
2020
2021
NC
2017
2018
2019
2020
2021
ND
2017
2018
2019
2020
2021
MI
2017
2018
2019
2020
2021
ID
2017
2018
2019
2020
2021
TX
2017
2018
2019
2020
2021
IA
2017
2018
2019
2020
2021
MN
2017
2018
2019
2020
2021
PA
2017
2018
2019
2020
2021
CA
2017
2018
2019
2020
2021
AZ
2017
2018
2019
2020
2021
UT
2017
2018
2019
2020
2021
NY
2017
2018
2019
2020
2021
SC
2017
2018
2019
2020
2021
AK
2017
2018
2019
2020
2021
LA
2017
2018
2019
2020
2021
MA
2017
2018
2019
2020
2021
TN
2017
2018
2019
2020
2021
GA
2017
2018
2019
2020
2021
WY
2017
2018
2019
2020
2021
VT
2017
2018
2019
2020
2021
WI
2017
2018
2019
2020
2021
MS
2017
2018
2019
2020
2021
AL
2017
2018
2019
2020
2021
OH
2017
2018
2019
2020
2021
MD
2017
2018
2019
2020
2021
ME
2017
2018
2019
2020
2021
MT
2017
2018
2019
2020
2021
IN
2017
2018
2019
2020
2021
OR
2017
2018
2019
20

## State-Level Data - Retreival, Process, Saving


### Retrieve Statewide Data from County API (**Commented**)

**Comment and uncomment the following code block with caution. This is very important!**

In [None]:
#NOTE: Activate this cell only if you want to generate state naics data USING county cbp api (NOT the state cbp api)
#NOTE: If this block is activated, comment the data generation block for state naics which is using state cbp api request directly

# states=newDF.state.unique()
# #states=[13]

# for state in states:
    
#     b1 = county_level(df_naics_2)
#     c1 = b1[b1.state==state]
#     c1.astype({'NAICS_Sector': 'int'})
#     d1 = c1.groupby(['NAICS_Sector','NAICS_TTL','state','relevant_naics', "YEAR"],as_index=False).sum()
#     d1 = d1.drop(["COUNTY","id", "GEO_TTL", "NAICS_Sector", "NAICS_TTL", "state", "GEO_ID"],axis=1)
#     # d1.insert(0, 'Fips', state)
#     # d1.insert(1, 'COUNTY', 999)
#     # d1.insert(2, 'GEO_TTL', 'Statewide')

#     b2 = county_level(df_naics_4)
#     c2 = b2[b2.state==state]
#     c2.astype({'NAICS_Sector': 'int'})
#     d2 = c2.groupby(['NAICS_Sector','NAICS_TTL','state','relevant_naics', "YEAR"],as_index=False).sum()
#     d2 = d2.drop(["COUNTY","id", "GEO_TTL", "NAICS_Sector", "NAICS_TTL", "state", "GEO_ID"],axis=1)
#     # d2.insert(0, 'Fips', state)
#     # d2.insert(1, 'COUNTY', 999)
#     # d2.insert(2, 'GEO_TTL', 'Statewide')

#     b3 = county_level(df_naics_6)
#     c3 = b3[b3.state==state]
#     c3.astype({'NAICS_Sector': 'int'})
#     d3 = c3.groupby(['NAICS_Sector','NAICS_TTL','state','relevant_naics', "YEAR"],as_index=False).sum()
#     d3 = d3.drop(["COUNTY","id", "GEO_TTL", "NAICS_Sector", "NAICS_TTL", "state", "GEO_ID"],axis=1)
#     # d3.insert(0, 'Fips', state)
#     # d3.insert(1, 'COUNTY', 999)
#     # d3.insert(2, 'GEO_TTL', 'Statewide')
    
#     stateName=stateFips.loc[stateFips.FIPS==state,"Postal Code"].values[0]
#     print(stateName)

    # repo_dir = pathlib.Path().cwd()
    # state_dir = repo_dir.parents[2] / 'community-data' / 'industries' / 'naics' / 'US' / 'states-update' / stateName
    
    # if not state_dir.exists():
    #     state_dir.mkdir(parents=True)

#     for year in range(startyear, endyear+1):
#         print(year)

#         def save_state_data(state, df, naics_level_str):
                
#             state = str(state) if len(str(state)) == 2 else "0" + str(state)

#             filename = "US-" + stateName + "-" + "census-" + naics_level_str + "-" + str(year) + ".csv"

#             curr_df = df.rename(columns={"relevant_naics":"Naics", "estab":"Establishments", "emp":"Employees", "payann":"Payroll"})

#             curr_df.to_csv(f"../../../community-data/industries/naics/US/states-update/{stateName}/{filename}", index=False)

#         d1y = d1[d1.YEAR==year]
#         d2y = d2[d2.YEAR==year]
#         d3y = d3[d3.YEAR==year]

#         d1y = d1y.drop(["YEAR"],axis=1)
#         d2y = d2y.drop(["YEAR"],axis=1)
#         d3y = d3y.drop(["YEAR"],axis=1)

#         save_state_data(state, d1y, "naics2")
#         save_state_data(state, d2y, "naics4")
#         save_state_data(state, d3y, "naics6")

### Retrieve Statewide Data from State API Directly (**Correct Version**)

In [43]:
state_data_dir = out_data_dir / 'state_level'
if not state_data_dir.exists():
    state_data_dir.mkdir(parents=True)

In [44]:
# Base URL for the API call
base_url = "https://api.census.gov/data"

#
# NOTE Years Prior to 2012 Currently have a bug when specifying NAICS#### as one of the columns
#      - stick to 2012 and later for now
#

def get_state_cbp(fips, state, years):
    count=0
    for year in years:
        print(f"Getting data for state: {state}\tyear: {year}")
        if year>=2000 and year<=2002:
            columns_to_select = "GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS1997_TTL,ESTAB,EMP,PAYANN"
            url=f"{base_url}/{year}/cbp?get={columns_to_select}&for=state:{fips:02d}"
        elif year>=2003 and year<=2007:
            columns_to_select = "GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2002_TTL,ESTAB,EMP,PAYANN"
            url=f"{base_url}/{year}/cbp?get={columns_to_select}&for=state:{fips:02d}"
        elif year>=2008 and year<=2011:
            columns_to_select = "GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2007_TTL,ESTAB,EMP,PAYANN"
            url=f"{base_url}/{year}/cbp?get={columns_to_select}&for=state:{fips:02d}"
        elif year>=2012 and year<=2016:
            columns_to_select = "GEO_ID,GEO_TTL,COUNTY,YEAR,NAICS2012,NAICS2012_TTL,ESTAB,EMP,PAYANN"
            url=f"{base_url}/{year}/cbp?get={columns_to_select}&for=state:{fips:02d}"
        elif year>=2017 and year <= 2021:
            columns_to_select = "GEO_ID,NAME,COUNTY,YEAR,NAICS2017,NAICS2017_LABEL,ESTAB,EMP,PAYANN"
            url=f"{base_url}/{year}/cbp?get={columns_to_select}&for=state:{fips:02d}"
    
    
        response = r.get(url, headers=api_headers)

        with open(state_data_dir / f"industriesPerState_{str.lower(state.replace(' ', ''))}_{year}.csv",'w') as resultPath:
            for line in response.text.strip().split('\n'):
                line=line.replace('[',"").replace(']',"")
                resultPath.write(line + "\n")

        print("  > Finished CSV for year"+str(year))

In [45]:
for fips in state_fips.FIPS.unique():
     state = state_fips.query(f'FIPS=={fips}').values[0][0]
     years=range(startyear,endyear+1)
     get_state_cbp(fips, state, years)

Getting data for state: Alabama	year: 2017
  > Finished CSV for year2017
Getting data for state: Alabama	year: 2018
  > Finished CSV for year2018
Getting data for state: Alabama	year: 2019
  > Finished CSV for year2019
Getting data for state: Alabama	year: 2020
  > Finished CSV for year2020
Getting data for state: Alabama	year: 2021
  > Finished CSV for year2021
Getting data for state: Alaska	year: 2017
  > Finished CSV for year2017
Getting data for state: Alaska	year: 2018
  > Finished CSV for year2018
Getting data for state: Alaska	year: 2019
  > Finished CSV for year2019
Getting data for state: Alaska	year: 2020
  > Finished CSV for year2020
Getting data for state: Alaska	year: 2021
  > Finished CSV for year2021
Getting data for state: Arizona	year: 2017
  > Finished CSV for year2017
Getting data for state: Arizona	year: 2018
  > Finished CSV for year2018
Getting data for state: Arizona	year: 2019
  > Finished CSV for year2019
Getting data for state: Arizona	year: 2020
  > Finished 

In [46]:
df_state = pd.concat(load_all_states(state_data_dir)).drop("is5", axis=1)

In [47]:
df_state

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,relevant_naics,NAICS_TTL,estab,emp,payann,state,NAICS_Sector
0,0400000US32,Nevada,,2017,324122,Asphalt shingle and coating materials manufact...,3,0,0,32,32
1,0400000US32,Nevada,,2017,325,Chemical manufacturing,80,1359,77548,32,32
2,0400000US32,Nevada,,2017,00,Total for all sectors,66430,1191625,50960493,32,00
3,0400000US32,Nevada,,2017,21311,Support activities for mining,69,1595,175576,32,21
4,0400000US32,Nevada,,2017,213111,Drilling oil and gas wells,7,29,2134,32,21
...,...,...,...,...,...,...,...,...,...,...,...
1271,0400000US38,North Dakota,,2021,813930,Labor unions and similar labor organizations,41,254,6176,38,81
1272,0400000US38,North Dakota,,2021,81394,Political organizations,9,22,711,38,81
1273,0400000US38,North Dakota,,2021,813940,Political organizations,9,22,711,38,81
1274,0400000US38,North Dakota,,2021,81399,"Other similar organizations (except business, ...",10,107,2150,38,81


In [48]:
df_state['fips'] = df_state.GEO_ID.apply(lambda GID: GID.split('US')[1])
df_state

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,relevant_naics,NAICS_TTL,estab,emp,payann,state,NAICS_Sector,fips
0,0400000US32,Nevada,,2017,324122,Asphalt shingle and coating materials manufact...,3,0,0,32,32,32
1,0400000US32,Nevada,,2017,325,Chemical manufacturing,80,1359,77548,32,32,32
2,0400000US32,Nevada,,2017,00,Total for all sectors,66430,1191625,50960493,32,00,32
3,0400000US32,Nevada,,2017,21311,Support activities for mining,69,1595,175576,32,21,32
4,0400000US32,Nevada,,2017,213111,Drilling oil and gas wells,7,29,2134,32,21,32
...,...,...,...,...,...,...,...,...,...,...,...,...
1271,0400000US38,North Dakota,,2021,813930,Labor unions and similar labor organizations,41,254,6176,38,81,38
1272,0400000US38,North Dakota,,2021,81394,Political organizations,9,22,711,38,81,38
1273,0400000US38,North Dakota,,2021,813940,Political organizations,9,22,711,38,81,38
1274,0400000US38,North Dakota,,2021,81399,"Other similar organizations (except business, ...",10,107,2150,38,81,38


In [49]:
df_state["COUNTY"] = 0
df_state

Unnamed: 0,GEO_ID,GEO_TTL,COUNTY,YEAR,relevant_naics,NAICS_TTL,estab,emp,payann,state,NAICS_Sector,fips
0,0400000US32,Nevada,0,2017,324122,Asphalt shingle and coating materials manufact...,3,0,0,32,32,32
1,0400000US32,Nevada,0,2017,325,Chemical manufacturing,80,1359,77548,32,32,32
2,0400000US32,Nevada,0,2017,00,Total for all sectors,66430,1191625,50960493,32,00,32
3,0400000US32,Nevada,0,2017,21311,Support activities for mining,69,1595,175576,32,21,32
4,0400000US32,Nevada,0,2017,213111,Drilling oil and gas wells,7,29,2134,32,21,32
...,...,...,...,...,...,...,...,...,...,...,...,...
1271,0400000US38,North Dakota,0,2021,813930,Labor unions and similar labor organizations,41,254,6176,38,81,38
1272,0400000US38,North Dakota,0,2021,81394,Political organizations,9,22,711,38,81,38
1273,0400000US38,North Dakota,0,2021,813940,Political organizations,9,22,711,38,81,38
1274,0400000US38,North Dakota,0,2021,81399,"Other similar organizations (except business, ...",10,107,2150,38,81,38


In [50]:
naics_str = "relevant_naics"
naics_ttl = "NAICS_TTL"
geo_ttl = "GEO_TTL"

newDF_state = df_state.filter(['fips', 'state', 'COUNTY', 'YEAR' ,geo_ttl, naics_str, naics_ttl,'NAICS_Sector', "estab", "emp", "payann"], axis=1)
newDF_state

Unnamed: 0,fips,state,COUNTY,YEAR,GEO_TTL,relevant_naics,NAICS_TTL,NAICS_Sector,estab,emp,payann
0,32,32,0,2017,Nevada,324122,Asphalt shingle and coating materials manufact...,32,3,0,0
1,32,32,0,2017,Nevada,325,Chemical manufacturing,32,80,1359,77548
2,32,32,0,2017,Nevada,00,Total for all sectors,00,66430,1191625,50960493
3,32,32,0,2017,Nevada,21311,Support activities for mining,21,69,1595,175576
4,32,32,0,2017,Nevada,213111,Drilling oil and gas wells,21,7,29,2134
...,...,...,...,...,...,...,...,...,...,...,...
1271,38,38,0,2021,North Dakota,813930,Labor unions and similar labor organizations,81,41,254,6176
1272,38,38,0,2021,North Dakota,81394,Political organizations,81,9,22,711
1273,38,38,0,2021,North Dakota,813940,Political organizations,81,9,22,711
1274,38,38,0,2021,North Dakota,81399,"Other similar organizations (except business, ...",81,10,107,2150


In [51]:
df_naics_2_state = naics_level(newDF_state, 2).reset_index(drop=True)
df_naics_3_state = naics_level(newDF_state, 3).reset_index(drop=True)
df_naics_4_state = naics_level(newDF_state, 4).reset_index(drop=True)
df_naics_5_state = naics_level(newDF_state, 5).reset_index(drop=True)
df_naics_6_state = naics_level(newDF_state, 6).reset_index(drop=True)

df_naics_2_state = df_naics_2_state[df_naics_2_state.relevant_naics != '00']
df_naics_3_state = df_naics_3_state[df_naics_3_state.relevant_naics != '00']
df_naics_4_state = df_naics_4_state[df_naics_4_state.relevant_naics != '00']
df_naics_5_state = df_naics_5_state[df_naics_5_state.relevant_naics != '00']
df_naics_6_state = df_naics_6_state[df_naics_6_state.relevant_naics != '00']

In [52]:
df_naics_2_state

Unnamed: 0,fips,state,COUNTY,YEAR,GEO_TTL,relevant_naics,NAICS_TTL,NAICS_Sector,estab,emp,payann
1,32,32,0,2017,Nevada,22,Utilities,22,124,4757,507532
2,32,32,0,2017,Nevada,11,"Agriculture, forestry, fishing and hunting",11,55,287,12273
3,32,32,0,2017,Nevada,21,"Mining, quarrying, and oil and gas extraction",21,184,12935,1251647
4,32,32,0,2017,Nevada,23,Construction,23,4979,75417,4208010
5,32,32,0,2017,Nevada,42,Wholesale trade,42,3120,37565,2259481
...,...,...,...,...,...,...,...,...,...,...,...
4495,38,38,0,2021,North Dakota,62,Health care and social assistance,62,2234,65242,3666950
4496,38,38,0,2021,North Dakota,71,"Arts, entertainment, and recreation",71,486,5068,103278
4497,38,38,0,2021,North Dakota,72,Accommodation and food services,72,2090,30977,654445
4498,38,38,0,2021,North Dakota,81,Other services (except public administration),81,2577,13674,489660



### Save Statewide Data


**Comment and uncomment the following code block with caution. This is very important!**

In [53]:
#NOTE: Uncomment if you want to generate the state naics datasets using the state cbp api directly.
#NOTE: If the code block is uncommented, make sure that the state naics generation block (above in the nb) using county cbp api is COMMENTED.

states=newDF.state.unique()

a = df_naics_2_state
b = df_naics_4_state
c = df_naics_6_state
for state in states:
    stateName=stateFips.loc[stateFips.FIPS==state,"Postal Code"].values[0]
    print(stateName)

    a1 = a[a.state==state]
    b1 = b[b.state==state]
    c1 = c[c.state==state]

    a1 = a1.drop(["COUNTY", "GEO_TTL", "NAICS_Sector", "NAICS_TTL", "state"],axis=1)
    b1 = b1.drop(["COUNTY", "GEO_TTL", "NAICS_Sector", "NAICS_TTL", "state"],axis=1)
    c1 = c1.drop(["COUNTY", "GEO_TTL", "NAICS_Sector", "NAICS_TTL", "state"],axis=1)


    repo_dir = pathlib.Path().cwd()
    # state_dir = repo_dir.parents[2] / 'us' / 'state-naics-update' / stateName / 'state-naics-api'
    # state_dir = repo_dir.parents[2] / 'industries' / 'naics' / 'US' / 'counties-update' / stateName / 'state-naics-api'
    state_dir = repo_dir.parents[2] / 'community-data' / 'industries' / 'naics' / 'US' / 'states-update' / stateName
    
    if not state_dir.exists():
        state_dir.mkdir(parents=True)

    for year in range(startyear, endyear+1):
        
        ay = a1[a1.YEAR==year]
        by = b1[b1.YEAR==year]
        cy = c1[c1.YEAR==year]

        ay = ay.drop(["YEAR"],axis=1)
        ay = ay.drop(["fips"],axis=1)
        by = by.drop(["YEAR"],axis=1)
        by = by.drop(["fips"],axis=1)
        cy = cy.drop(["YEAR"],axis=1)
        cy = cy.drop(["fips"],axis=1)

        def save_state_data(state, df, naics_level_str):
                
            state = str(state) if len(str(state)) == 2 else "0" + str(state)

            filename = "US-" + stateName + "-" + "census-" + naics_level_str + "-" + str(year) + ".csv"

            curr_df = df.rename(columns={"relevant_naics":"Naics", "estab":"Establishments", "emp":"Employees", "payann":"Payroll"})

            curr_df.to_csv(f"../../../community-data/industries/naics/US/states-update/{stateName}/{filename}", index=False)


        save_state_data(state, ay, "naics2")
        save_state_data(state, by, "naics4")
        save_state_data(state, cy, "naics6")

VA
KS
AR
RI
DE
CO
SD
NC
ND
MI
ID
TX
IA
MN
PA
CA
AZ
UT
NY
SC
AK
LA
MA
TN
GA
WY
VT
WI
MS
AL
OH
MD
ME
MT
IN
OR
NJ
IL
CT
WA
WV
NV
NE
NH
KY
MO
NM
OK
HI
FL


## Country-Level Data - Retreival, Process, Saving


In [54]:
states=newDF.state.unique()

a = df_naics_2_state
b = df_naics_4_state
c = df_naics_6_state

for year in range(startyear, endyear+1):
    print(year)

    ay = a[a["YEAR"]==year]
    by = b[b["YEAR"]==year]
    cy = c[c["YEAR"]==year]

    ay = ay.drop(["YEAR"],axis=1)
    by = by.drop(["YEAR"],axis=1)
    cy = cy.drop(["YEAR"],axis=1)

    a1 = ay.drop(["COUNTY", "GEO_TTL", "NAICS_Sector", "NAICS_TTL", "state"],axis=1)
    b1 = by.drop(["COUNTY", "GEO_TTL", "NAICS_Sector", "NAICS_TTL", "state"],axis=1)
    c1 = cy.drop(["COUNTY", "GEO_TTL", "NAICS_Sector", "NAICS_TTL", "state"],axis=1)

    a1 = a1.groupby(['fips','relevant_naics'], as_index=False).sum()
    b1 = b1.groupby(['fips','relevant_naics'], as_index=False).sum()
    c1 = c1.groupby(['fips','relevant_naics'], as_index=False).sum()

    repo_dir = pathlib.Path().cwd()
    country_dir = repo_dir.parents[2] / 'community-data' / 'industries' / 'naics' / 'US' / 'country-update'

    if not country_dir.exists():
        country_dir.mkdir(parents=True)

    def save_country_data(df, naics_level_str):

        filename = "US-" + "census-" + naics_level_str + "-" + str(year) + ".csv"

        curr_df = df.rename(columns={"fips":"Fips", "relevant_naics":"Naics", "estab":"Establishments", "emp":"Employees", "payann":"Payroll"})

        curr_df.to_csv(f"../../../community-data/industries/naics/US/country-update/{filename}", index=False)

    save_country_data(a1, "naics2")
    save_country_data(b1, "naics4")
    save_country_data(c1, "naics6")
    

2017
2018
2019
2020
2021


In [10]:
# NOTE: Code to remove files and directories from data_raw
# NOTE: Uncomment and run only after you have generated the datasets by running all the abaove code blocks.
# def del_data_raw(bea_data_dir):
    
#     for i in range(startyear,endyear+1):

#         x="_"+str(i)
#         files = [f for f in bea_data_dir.iterdir() if x in f.name]

#         for f in files:
#             os.remove(f)

#     os.rmdir(bea_data_dir)

# del_data_raw(county_data_dir)
# del_data_raw(state_data_dir)

## Zipcode-Level Data - Retreival, Process, Saving


In [11]:
import utilities.zipcode_utility as zu

zipcodes = pd.read_csv('../../../community-data/us/zipcodes/zipcodes2.csv')

zip_data_dir = out_data_dir
if not zip_data_dir.exists():
    zip_data_dir.mkdir(parents=True)

# Default Zip path is '../../../community-data/industries/naics/US/zips'
zip_util = zu.ZipCodeUtility(api_headers=api_headers, base_path=zip_data_dir)

In [None]:
zip_util.get_all_zip_zbp(zipcodes)

Getting data for zipcode: 1001	year: 2000
Finished processing 1001 for 2000.
Getting data for zipcode: 1001	year: 2001
Finished processing 1001 for 2001.
Getting data for zipcode: 1001	year: 2002
Finished processing 1001 for 2002.
Getting data for zipcode: 1001	year: 2003
Finished processing 1001 for 2003.
Getting data for zipcode: 1001	year: 2004
Finished processing 1001 for 2004.
Getting data for zipcode: 1001	year: 2005
Finished processing 1001 for 2005.
Getting data for zipcode: 1001	year: 2006
Finished processing 1001 for 2006.
Getting data for zipcode: 1001	year: 2007
Finished processing 1001 for 2007.
Getting data for zipcode: 1001	year: 2008
Finished processing 1001 for 2008.
Getting data for zipcode: 1001	year: 2009
Finished processing 1001 for 2009.
Getting data for zipcode: 1001	year: 2010
Finished processing 1001 for 2010.
Getting data for zipcode: 1001	year: 2011
Finished processing 1001 for 2011.
Getting data for zipcode: 1001	year: 2012
Finished processing 1001 for 2012.