In [1]:
# Importing package
import re
import pandas as pd
import numpy as np

In [2]:
# Helper Functions
def fips_to_str(fips):
    fips_str = str(int(fips))

    while len(fips_str) < 5:
        fips_str = '0' + fips_str

    return fips_str

def extract_state_fips(fips_str):
    return int(fips_str[0:2])


# Loading data

In [3]:
# Load list of all county fips
counties = pd.read_csv("../../data_raw/counties_fips.csv")
counties['id'] = counties.FIPS.apply(fips_to_str)
counties = counties.drop(['FIPS'], axis=1)
counties

Unnamed: 0,Name,State,id
0,Autauga,AL,01001
1,Baldwin,AL,01003
2,Barbour,AL,01005
3,Bibb,AL,01007
4,Blount,AL,01009
...,...,...,...
3227,Yabucoa,PR,72151
3228,Yauco,PR,72153
3229,St. Croix,VI,78010
3230,St. John,VI,78020


In [4]:
# Load industrial impacts data
df = pd.read_csv("../../data_clean/indicators_per-industry_per-county.csv")
df['id'] = df.fips.apply(fips_to_str)
df

Unnamed: 0.1,Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,year,payann,estab,...,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR,id
0,0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],2012,0,1,...,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,08001
1,1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],2012,290,6,...,0.0,3.405586,0.0,1.827157e-04,8.769207e-03,0.958770,2.011564,211.423463,15.653502,08001
2,2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],2012,0,6,...,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,08001
3,3,8001,1,"Adams County, Colorado",2122a0,"iron, gold, silver, and other metal ores","[21221, 21222, 21229]",2012,0,1,...,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,08001
4,4,8001,1,"Adams County, Colorado",212310,dimensional stone,[21231],2012,0,4,...,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,08001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306027,306027,32510,510,"Carson City, Nevada",812300,dry-cleaning and laundry,[8123],2012,0,4,...,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000,32510
306028,306028,32510,510,"Carson City, Nevada",812900,"pet care, photofinishing, parking and other su...",[8129],2012,331,4,...,0.0,2.148818,0.0,1.318396e-07,2.876990e-09,0.025598,0.076440,191.834961,0.096584,32510
306029,306029,32510,510,"Carson City, Nevada",813100,religious organizations,[8131],2012,2441,13,...,0.0,6.541864,0.0,0.000000e+00,0.000000e+00,0.000000,0.312826,696.409510,2.551873,32510
306030,306030,32510,510,"Carson City, Nevada",813a00,"grantmaking, giving, and social advocacy organ...","[8132, 8133]",2012,635,9,...,0.0,4.713797,0.0,0.000000e+00,0.000000e+00,0.000000,0.020206,440.264876,0.051385,32510


In [5]:
# Loading the cancer incidents file
df_cancer = pd.read_csv("../../data_clean/CDC_CancerByCounty/incidencerates/cancer_by_type.csv")
df_cancer

Unnamed: 0,locale,fips,met_health_obj,incidence rate_per_100000,incidence rate_lower_95_confidence,incidence rate_upper_95_confidence,annual_count_avg,recent_trend_str,trend_last_5,trend_last_5_lower_95_confidence,...,cancer,file_name,race,sex,source_url,stage,stateFIPS,type,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,46,incd,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,***,448,447.7,448.4,1638110,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,2,incd,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,33,incd,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,17,incd,illinois,All Cancer Sites
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74492,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,...,86,incidencerates_015.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,Non-Hodgkin Lymphoma
74493,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,...,90,incidencerates_011.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,Leukemia
74494,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,...,400,incidencerates_004.csv,0,2,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,Breast (in situ) (Female)
74495,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,...,515,incidencerates_007.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,"Childhood (Ages <20, All Sites)"


In [6]:
df_cancer_list = pd.read_csv("../../data_raw/cancer_ID_list.csv")
df_cancer_list.columns = ['cancerName', 'cancer']
mask = ~df_cancer_list["cancerName"].str.contains("Cancer")
df_cancer_list.loc[mask, 'cancerName'] = df_cancer_list['cancerName'] + ' Cancer'
df_cancer_list

ModuleNotFoundError: No module named 'numpy.core._multiarray_umath'

Unnamed: 0,cancerName,cancer
0,All Cancer Sites,1
1,Bladder Cancer,71
2,Brain & ONS Cancer,76
3,Breast (Female) Cancer,55
4,Breast (in situ) (Female) Cancer,400
5,Cervix (Female) Cancer,57
6,"Childhood (Ages <15, All Sites) Cancer",516
7,"Childhood (Ages <20, All Sites) Cancer",515
8,Colon & Rectum Cancer,20
9,Esophagus Cancer,17


In [7]:
df_cancer_list = pd.read_csv("resources/cancer_ID_list.csv")
df_cancer_list.columns = ['cancerName', 'cancer']
mask = ~df_cancer_list["cancerName"].str.contains("Cancer")
df_cancer_list.loc[mask, 'cancerName'] = df_cancer_list['cancerName'] + ' Cancer'
df_cancer_list

Unnamed: 0,cancerName,cancer
0,All Cancer Sites,1
1,Bladder Cancer,71
2,Brain & ONS Cancer,76
3,Breast (Female) Cancer,55
4,Breast (in situ) (Female) Cancer,400
5,Cervix (Female) Cancer,57
6,Childhood (Age <15) Cancer,516
7,Childhood (Age <20) Cancer,515
8,Colon & Rectum Cancer,20
9,Esophagus Cancer,17


# Data Processing

Grouping the industrial impacts data by fips ID and adding across the impacts parameters

In [8]:
df_new = df.groupby(['id', 'name', 'year'], as_index=False)['payann', 'estab', 
    'emp', 'ACID', 'ENRG', 'ETOX', 'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 
    'HNC', 'HRSP', 'HTOX', 'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN',
       'OZON', 'PEST', 'REN', 'SMOG', 'VADD', 'WATR'].sum()
df_new

Unnamed: 0,id,name,year,payann,estab,emp,ACID,ENRG,ETOX,EUTR,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,01001,"Autauga County, Alabama",2012,128668,594,4954,299.152569,7.286445e+05,1963.026340,26.933851,...,0.050200,0.000000e+00,2149.369209,4.351692e+05,0.002923,3.497395e-02,293475.377416,2949.632736,79914.159702,7851.498268
1,01003,"Baldwin County, Alabama",2012,953498,3340,32629,133.420049,6.931874e+03,3765.662479,132.635472,...,0.057225,1.674254e+04,16148.087370,4.932628e+03,0.007422,8.577729e-02,1999.246044,4438.579174,576096.081165,1633.902304
2,01005,"Barbour County, Alabama",2012,53138,361,1715,102.817249,6.064801e+00,3298.083216,13.598555,...,0.002351,0.000000e+00,554.606997,0.000000e+00,0.000153,4.371549e-02,6.064801,3649.913530,29611.275038,511.844967
3,01007,"Bibb County, Alabama",2012,20778,205,655,30.042694,2.012683e+04,1807.410043,4.819015,...,0.006445,0.000000e+00,309.053357,0.000000e+00,0.000593,2.437340e-02,20126.828138,1064.780004,10666.884789,285.555832
4,01009,"Blount County, Alabama",2012,61132,455,2161,32.192658,1.131970e+01,416.998063,4.320160,...,0.004599,0.000000e+00,896.650774,0.000000e+00,0.002463,6.501047e-03,11.319704,1165.911030,36511.029249,98.727235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3135,56037,"Sweetwater County, Wyoming",2012,491651,989,9029,587.935749,3.629382e+06,813.473206,105.117742,...,0.063982,1.394053e+06,4115.339612,3.628234e+06,0.003711,8.917167e-03,1147.804947,19410.599796,312217.341832,5945.474594
3136,56039,"Teton County, Wyoming",2012,361056,1402,8689,37.178584,9.249760e+02,7592.166446,25.091516,...,0.020598,8.200278e+00,6423.591808,4.857347e+02,0.003347,2.019774e-01,439.241290,1412.961358,210925.667302,1777.330068
3137,56041,"Uinta County, Wyoming",2012,145379,442,3287,96.737078,1.919977e+06,112.140420,13.936063,...,0.011720,3.997271e+05,1420.508279,1.919961e+06,0.000773,4.621011e-04,16.608615,3486.862556,97322.166731,582.400056
3138,56043,"Washakie County, Wyoming",2012,31951,283,907,31.417667,2.756565e+05,28.072519,4.183471,...,0.000551,0.000000e+00,518.501787,2.756546e+05,0.000140,3.860924e-07,1.845259,1176.237930,20257.950393,20.344697


In the cancer incidents file, some faulty values such as \*, 3 or fewer and ��� present. So these values are replaced by null value.

In [9]:
# Some initial additions
df_cancer['fips_str'] = df_cancer.fips.apply(fips_to_str)
#df_cancer['fips_state'] = df_cancer.fips_str.apply(extract_state_fips)
df_cancer['rate'] = (pd.to_numeric(df_cancer['incidence rate_per_100000'], errors='coerce')
                             .replace(np.nan, 0, regex=True))
#df_cancer['count'] = df_cancer.annual_count_avg.str.split().str[0]
df_cancer['count'] = (pd.to_numeric(df_cancer['annual_count_avg'], errors='coerce')
                             .replace(np.nan, 0, regex=True))
df_cancer

Unnamed: 0,locale,fips,met_health_obj,incidence rate_per_100000,incidence rate_lower_95_confidence,incidence rate_upper_95_confidence,annual_count_avg,recent_trend_str,trend_last_5,trend_last_5_lower_95_confidence,...,sex,source_url,stage,stateFIPS,type,state,cancer_description,fips_str,rate,count
0,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,All Cancer Sites,00000,448.0,1638110.0
1,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,46,incd,southdakota,All Cancer Sites,00000,448.0,1638110.0
2,"US (SEER+NPCR)(1,10)",0.0,***,448,447.7,448.4,1638110,falling,-0.9,-1.6,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,2,incd,alaska,All Cancer Sites,00000,448.0,1638110.0
3,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,33,incd,newhampshire,All Cancer Sites,00000,448.0,1638110.0
4,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,17,incd,illinois,All Cancer Sites,00000,448.0,1638110.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74492,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,Non-Hodgkin Lymphoma,56045,0.0,0.0
74493,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,Leukemia,56045,0.0,0.0
74494,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,...,2,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,Breast (in situ) (Female),56045,0.0,0.0
74495,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,"Childhood (Ages <20, All Sites)",56045,0.0,0.0


**Criterion** cancer ID 1 is also being used, and rows having missing cancer stages are used.

In [10]:
criterion = ((df_cancer.stage.isna())) 

columns = ['fips', 'fips_str', 'stateFIPS', 'count', 'rate', 'cancer']

df_cancer_type = df_cancer[criterion][columns]



# Sort values by fips
df_cancer_type.sort_values(by='fips', inplace=True)

df_cancer_type.fips = df_cancer_type.fips.apply(int)

In [11]:
df_cancer_type['cancer'].unique()

array([  1,  72,  71,  76,  80,  61,  58,  66, 515, 400, 516,  86,  90,
        18,  20,  17,  35,   3,  55,  57,  53,  40,  47])

In [12]:
# rename columns
df_cancer_type.rename(columns={'fips_str': 'id'}, inplace=True)

# Only take cancer rows which are in file of list of all county fips
df_single_cancer = df_cancer_type[
        (df_cancer_type.id.isin(counties.id.tolist()))][['id', 'rate', 'stateFIPS','count','cancer']]

In [13]:
df_single_cancer['stateFIPS'].unique()

array([ 1,  2,  4,  5,  6,  8,  9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56])

In [14]:
# Save cancer data to tsv
df_single_cancer.to_csv('cancer.tsv', sep='\t', index=False)

# only taking cancer type 72
#df_single_cancer = df_single_cancer[df_single_cancer['cancer'] == 72]
df_single_cancer

Unnamed: 0,id,rate,stateFIPS,count,cancer
1185,01001,0.0,1,0.0,61
1194,01001,22.6,1,3.0,515
1193,01001,24.2,1,8.0,400
1192,01001,13.5,1,8.0,90
1191,01001,14.7,1,9.0,86
...,...,...,...,...,...
74475,56045,0.0,56,0.0,3
74474,56045,403.9,56,40.0,1
74495,56045,0.0,56,0.0,515
74484,56045,0.0,56,0.0,57


In [15]:
result = pd.merge(df_single_cancer, counties, on='id', how='inner')
result

Unnamed: 0,id,rate,stateFIPS,count,cancer,Name,State
0,01001,0.0,1,0.0,61,Autauga,AL
1,01001,22.6,1,3.0,515,Autauga,AL
2,01001,24.2,1,8.0,400,Autauga,AL
3,01001,13.5,1,8.0,90,Autauga,AL
4,01001,14.7,1,9.0,86,Autauga,AL
...,...,...,...,...,...,...,...
72123,56045,0.0,56,0.0,3,Weston,WY
72124,56045,403.9,56,40.0,1,Weston,WY
72125,56045,0.0,56,0.0,515,Weston,WY
72126,56045,0.0,56,0.0,57,Weston,WY


**Final merge**

In [16]:
df_final = pd.merge(result, df_new, on='id', how='inner')
df_final.sort_values(by=['id','cancer'], inplace=True)
df_final

Unnamed: 0,id,rate,stateFIPS,count,cancer,Name,State,name,year,payann,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
13,01001,495.6,1,304.0,1,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
14,01001,13.5,1,8.0,3,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
15,01001,0.0,1,0.0,17,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
16,01001,9.1,1,6.0,18,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
22,01001,52.2,1,32.0,20,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72018,56045,0.0,56,0.0,86,Weston,WY,"Weston County, Wyoming",2012,19638,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72019,56045,0.0,56,0.0,90,Weston,WY,"Weston County, Wyoming",2012,19638,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72016,56045,0.0,56,0.0,400,Weston,WY,"Weston County, Wyoming",2012,19638,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72033,56045,0.0,56,0.0,515,Weston,WY,"Weston County, Wyoming",2012,19638,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143


In [17]:
df_result = pd.merge(df_final, df_cancer_list, on='cancer', how='inner')
df_result.sort_values(by=['id','cancer'], inplace=True)
df_result

Unnamed: 0,id,rate,stateFIPS,count,cancer,Name,State,name,year,payann,...,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR,cancerName
0,01001,495.6,1,304.0,1,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268,All Cancer Sites
3132,01001,13.5,1,8.0,3,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268,Oral Cavity & Pharynx Cancer
6264,01001,0.0,1,0.0,17,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268,Esophagus Cancer
9396,01001,9.1,1,6.0,18,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268,Stomach Cancer
12528,01001,52.2,1,32.0,20,Autauga,AL,"Autauga County, Alabama",2012,128668,...,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268,Colon & Rectum Cancer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59507,56045,0.0,56,0.0,86,Weston,WY,"Weston County, Wyoming",2012,19638,...,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143,Non-Hodgkin Lymphoma Cancer
62639,56045,0.0,56,0.0,90,Weston,WY,"Weston County, Wyoming",2012,19638,...,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143,Leukemia Cancer
65771,56045,0.0,56,0.0,400,Weston,WY,"Weston County, Wyoming",2012,19638,...,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143,Breast (in situ) (Female) Cancer
68903,56045,0.0,56,0.0,515,Weston,WY,"Weston County, Wyoming",2012,19638,...,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143,Childhood (Age <20) Cancer


In [18]:
df_result.columns

Index(['id', 'rate', 'stateFIPS', 'count', 'cancer', 'Name', 'State', 'name',
       'year', 'payann', 'estab', 'emp', 'ACID', 'ENRG', 'ETOX', 'EUTR',
       'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX', 'JOBS',
       'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN', 'SMOG',
       'VADD', 'WATR', 'cancerName'],
      dtype='object')

In [19]:
df_result.rename(columns={'payann': 'Annual payroll', 'estab': 'Number of establishments', 'emp': 'Total employment'}, inplace=True)
df_result.columns

Index(['id', 'rate', 'stateFIPS', 'count', 'cancer', 'Name', 'State', 'name',
       'year', 'Annual payroll', 'Number of establishments',
       'Total employment', 'ACID', 'ENRG', 'ETOX', 'EUTR', 'FOOD', 'GCC',
       'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX', 'JOBS', 'LAND', 'METL',
       'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN', 'SMOG', 'VADD', 'WATR',
       'cancerName'],
      dtype='object')

In [20]:
df_result.to_csv("data_viz_full.csv", index=False)

## Counties omitted in final merge

In [21]:
df_new[~df_new.id.isin(result.id)]

Unnamed: 0,id,name,year,payann,estab,emp,ACID,ENRG,ETOX,EUTR,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
76,2105,"Hoonah-Angoon Census Area, Alaska",2012,2867,63,48,1.109523,3.136368,46.491002,0.368733,...,0.002392,0.0,31.100974,0.0,1.1e-05,0.001219662,3.136368,36.632252,1596.212392,12.299408
86,2195,"Petersburg Census Area, Alaska",2012,14274,127,265,1.566714,2.111854,0.097588,39.488054,...,7.4e-05,0.0,286.680154,0.0,0.00041,1.619062e-07,2.111854,46.467452,5303.738436,41.794329
87,2198,"Prince of Wales-Hyder Census Area, Alaska",2012,7959,108,64,1.44959,3.572667,1468.019312,0.69583,...,0.000201,0.0,301.774266,0.0,0.000315,0.02000628,3.572667,34.330463,5130.736122,231.326596
89,2230,"Skagway Municipality, Alaska",2012,9146,70,51,3.857452,2.698056,199.648536,0.838943,...,0.004959,0.0,176.27819,0.0,2.5e-05,0.005250683,2.698056,127.04595,4617.941097,45.997763
93,2275,"Wrangell City and Borough, Alaska",2012,2974,56,70,0.006525,0.10334,0.602039,0.005933,...,1.3e-05,0.0,175.067308,0.0,0.000216,7.960902e-06,0.10334,0.671285,1999.111515,2.547659
251,8014,"Broomfield County, Colorado",2012,768362,1328,12768,21.194234,5890.534754,2251.843587,10.751894,...,0.014802,0.214567,6242.677021,3520.166485,0.001491,0.05981235,2370.368269,761.527359,451934.621161,700.450567
2826,51019,"Bedford County, Virginia",2012,125055,888,4163,69.850595,8954.856731,1559.510492,9.42468,...,0.00641,0.004886,1931.941669,80.164441,0.005647,0.01988242,8874.69229,2460.658179,73151.220333,283.508267
2913,51515,"Bedford city, Virginia",2012,27819,269,1071,0.082351,2.097059,0.969239,0.144011,...,0.00011,0.0,346.046969,0.0,3.9e-05,4.824651e-08,2.097059,7.678301,16664.974743,19.504993


In [22]:
df[df['name'].str.contains('Ketchikan')]

Unnamed: 0.1,Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,year,payann,estab,...,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR,id
91298,91298,2130,130,"Ketchikan Gateway Borough, Alaska",113000,timber and raw forest products,[113],2012,9976,6,...,0.0,29.066989,0.0,3.410899e-06,0.110267,2.491440,181.425507,5928.324678,1233.281947,02130
91299,91299,2130,130,"Ketchikan Gateway Borough, Alaska",114000,wild-caught fish and game,[114],2012,478,8,...,0.0,0.406757,0.0,5.157491e-09,0.000022,0.016584,0.020507,358.550768,0.328166,02130
91300,91300,2130,130,"Ketchikan Gateway Borough, Alaska",212310,dimensional stone,[21231],2012,0,1,...,0.0,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,02130
91301,91301,2130,130,"Ketchikan Gateway Borough, Alaska",21311a,other support activities for mining,"[213112, 213113, 213114, 213115]",2012,0,1,...,0.0,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,02130
91302,91302,2130,130,"Ketchikan Gateway Borough, Alaska",311300,"sugar, candy, and chocolate",[3113],2012,0,1,...,0.0,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,02130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91378,91378,2130,130,"Ketchikan Gateway Borough, Alaska",812300,dry-cleaning and laundry,[8123],2012,0,2,...,0.0,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,02130
91379,91379,2130,130,"Ketchikan Gateway Borough, Alaska",812900,"pet care, photofinishing, parking and other su...",[8129],2012,0,2,...,0.0,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,02130
91380,91380,2130,130,"Ketchikan Gateway Borough, Alaska",813100,religious organizations,[8131],2012,681,10,...,0.0,1.825076,0.0,0.000000e+00,0.000000,0.000000,0.087273,194.287127,0.711932,02130
91381,91381,2130,130,"Ketchikan Gateway Borough, Alaska",813a00,"grantmaking, giving, and social advocacy organ...","[8132, 8133]",2012,0,2,...,0.0,0.000000,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000,02130


   # ERROR TESTING (SKIP THIS SECTION)

In [23]:
criterion = ((df_cancer.stage.isna())) #(df_cancer.cancer!=1) & 

columns = ['fips', 'fips_str', 'fips_state', 'count', 'rate', 'cancer']

df_cancer_type = df_cancer[criterion][columns]

KeyError: "['fips_state'] not in index"

In [None]:
df_cancer_type['points'] = np.where( ( df_cancer_type['cancer'] == 1) , 1, 0)
df_cancer_type

In [None]:
df_single_cancer = df_cancer_type[
        (df_cancer_type.fips.isin(counties.id.tolist()))][['fips_str', 'rate', 'fips_state','count','cancer', 'points']]
df_new2 = df_single_cancer.groupby(['fips_state', 'fips_str', 'points'], as_index=False)['count', 'rate'].sum()
df_new2

In [None]:
counts = df_new2['count']
odd_i = [] 
even_i = [] 
for i in range(0, len(counts)): 
    if i % 2: 
        even_i.append(counts[i]) 
    else : 
        odd_i.append(counts[i])

np.count_nonzero(np.array(odd_i) == np.array(even_i))

In [None]:
df_cancer_type1 = df_cancer_type[df_cancer_type['fips_str'] == '01001']
# Sort values by cancer
df_cancer_type1.sort_values(by='cancer', inplace=True)
df_cancer_type1

In [None]:
df_single_cancer = df_single_cancer[df_single_cancer['fips_str'] == '01001']
df_new3 = df_single_cancer.groupby(['fips_state', 'fips_str', 'cancer'], as_index=False)['count', 'rate'].count()
df_new3