In [1]:
import pathlib
import pandas as pd
import numpy as np

### Goal: Process Industry Data Data --> .tsv for choropleth

Current US choropleth example takes .tsv with the following format:

```
id	rate
01001	5.1
01003	4.9
01005	8.6
01007	6.2
```

where id is the county FIPS and rate is some value for coloring the choropleth

In [3]:
# Helper Functions
def fips_to_str(fips):
    fips_str = str(int(fips))

    while len(fips_str) < 5:
        fips_str = '0' + fips_str

    return fips_str


def extract_state_fips(fips_str):
    return int(fips_str[0:2])


def contains_3digit_naics(row):
    '''return True if any of the relevant naics is a three digit naic'''
    naics_list = row['relevant_naics'].replace(' ', '').strip('[]').split(',')
    check = any([len(naic)==3 for naic in naics_list])
    return check

In [29]:
base_dir = pathlib.Path.cwd()

# Load list of all county fips
county_fips_file = base_dir.parent.parent / 'data_raw' / 'counties_fips.csv'
counties = pd.read_csv(county_fips_file)
counties['fips_str'] = counties.FIPS.apply(fips_to_str)

# Load the cancer incidence data
data_dir = base_dir.parent.parent / 'data_clean' 

industry_data_csv = data_dir / 'indicators_per-industry_per-county.csv'

# Load ALL incidence rate data to dataframe
df = pd.read_csv(industry_data_csv)
df['fips_str'] = df['fips'].apply(fips_to_str)

df3 = df[df.apply(contains_3digit_naics, axis=1)]

df3.head()

Unnamed: 0.1,Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,year,payann,estab,...,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR,fips_str
0,0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],2012,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8001
1,1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],2012,290,6,...,0.0,3.405586,0.0,0.000183,0.008769,0.95877,2.011564,211.423463,15.653502,8001
2,2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],2012,0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8001
25,25,8001,1,"Adams County, Colorado",315000,clothing,[315],2012,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8001
26,26,8001,1,"Adams County, Colorado",316000,leather,[316],2012,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8001


In [39]:
# df3[
#     df3.relevant_naics.apply(
#         lambda x: True if len(x.strip('[]').split(','))>1 
#                   else False
#     )]
df3.relevant_naics = df3.relevant_naics.apply(
        lambda x: int(x.strip('[]').split(',')[0]) if len(x.strip('[]').split(','))>1 
                  else int(x.strip('[]')))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [43]:
df3.columns

Index(['Unnamed: 0', 'fips', 'county', 'name', 'industry_code',
       'industry_detail', 'relevant_naics', 'year', 'payann', 'estab', 'emp',
       'ACID', 'ENRG', 'ETOX', 'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC',
       'HNC', 'HRSP', 'HTOX', 'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN',
       'OZON', 'PEST', 'REN', 'SMOG', 'VADD', 'WATR', 'fips_str'],
      dtype='object')

In [54]:
info_cols = ['fips_str', 'relevant_naics', 'industry_code', 'industry_detail']
data_cols = ['emp']
#     'payann', 'estab', 'emp']
#     'ACID', 'ENRG', 'ETOX', 'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC',
#     'HNC', 'HRSP', 'HTOX', 'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN',
#     'OZON', 'PEST', 'REN', 'SMOG', 'VADD', 'WATR']

df_out = df3[info_cols + data_cols].sort_values(by=['fips_str', 'relevant_naics'])

df_out.rename(columns={'fips_str': 'id'}, inplace=True)

df_out.head()

Unnamed: 0,id,relevant_naics,industry_code,industry_detail,emp
241739,1001,113,113000,timber and raw forest products,30
241761,1001,441,441000,vehicles and parts sales,273
241762,1001,445,445000,food and beverage stores,255
241763,1001,452,452000,general merchandise stores,0
241764,1001,484,484000,truck transport,0


In [59]:
df_out.to_csv(
    data_dir / 'tables_for_viz' / 'industry_byCounty_byType.tsv',
    index=False, sep='\t'
)

In [78]:
df_key = (df3[['relevant_naics', 'industry_detail']]
          .groupby('industry_detail').first()
          .sort_values(by='relevant_naics'))

df_key.to_csv(
    data_dir / 'tables_for_viz' / 'industry_ID_list.csv',
#     index=False
)

df_key.head()

Unnamed: 0_level_0,relevant_naics
industry_detail,Unnamed: 1_level_1
timber and raw forest products,113
wild-caught fish and game,114
agriculture and forestry support,115
unrefined oil and gas,211
clothing,315


In [77]:
# len(df.industry_code.unique())