# Clustering

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import numpy as np
import random

## Part 1: Importing the data and exploring it

### Importing cancer data

In [2]:
cancer_data = pd.read_csv("cancer_by_type.csv")
cancer_data.head()

Unnamed: 0,locale,fips,met_health_obj,incidence rate_per_100000,incidence rate_lower_95_confidence,incidence rate_upper_95_confidence,annual_count_avg,recent_trend_str,trend_last_5,trend_last_5_lower_95_confidence,...,cancer,file_name,race,sex,source_url,stage,stateFIPS,type,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,46,incd,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,2,incd,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,33,incd,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,17,incd,illinois,All Cancer Sites


In [3]:
cancer_data.columns

Index(['locale', 'fips', 'met_health_obj', 'incidence rate_per_100000',
       'incidence rate_lower_95_confidence',
       'incidence rate_upper_95_confidence', 'annual_count_avg',
       'recent_trend_str', 'trend_last_5', 'trend_last_5_lower_95_confidence',
       'trend_last_5_upper_95_confidence', 'age', 'areatype', 'cancer',
       'file_name', 'race', 'sex', 'source_url', 'stage', 'stateFIPS', 'type',
       'state', 'cancer_description'],
      dtype='object')

In [4]:
# Keep only the identifying columns, the incidence rate and the average annual count.
# .copy() makes the result an independent frame: the cells below assign to its
# columns, and doing that on a slice of the raw frame triggers pandas'
# SettingWithCopyWarning and leaves it unclear which object is modified.
cancer_data = cancer_data[['locale', 'fips', 'incidence rate_per_100000','annual_count_avg', 'areatype', 'cancer', 'stateFIPS',
       'state', 'cancer_description']].copy()
cancer_data.head()

Unnamed: 0,locale,fips,incidence rate_per_100000,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,29,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,46,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,2,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,33,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,17,illinois,All Cancer Sites


In [5]:
cancer_data.tail()

Unnamed: 0,locale,fips,incidence rate_per_100000,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description
74492,"Weston County(6,10)",56045.0,*,3 or fewer,county,86,56,wyoming,Non-Hodgkin Lymphoma
74493,"Weston County(6,10)",56045.0,*,3 or fewer,county,90,56,wyoming,Leukemia
74494,"Weston County(6,10)",56045.0,*,3 or fewer,county,400,56,wyoming,Breast (in situ) (Female)
74495,"Weston County(6,10)",56045.0,*,3 or fewer,county,515,56,wyoming,"Childhood (Ages <20, All Sites)"
74496,"Weston County(6,10)",56045.0,*,3 or fewer,county,516,56,wyoming,"Childhood (Ages <15, All Sites)"


In [6]:
# Counts reported as the text '3 or fewer' are replaced by the string '3';
# every other entry of the column is left untouched.
is_three_or_fewer = cancer_data['annual_count_avg'] == '3 or fewer'
cancer_data['annual_count_avg'] = cancer_data['annual_count_avg'].mask(is_three_or_fewer, '3')
cancer_data

Unnamed: 0,locale,fips,incidence rate_per_100000,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,29,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,46,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,448,1638110,country,1,2,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,33,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,17,illinois,All Cancer Sites
...,...,...,...,...,...,...,...,...,...
74492,"Weston County(6,10)",56045.0,*,3,county,86,56,wyoming,Non-Hodgkin Lymphoma
74493,"Weston County(6,10)",56045.0,*,3,county,90,56,wyoming,Leukemia
74494,"Weston County(6,10)",56045.0,*,3,county,400,56,wyoming,Breast (in situ) (Female)
74495,"Weston County(6,10)",56045.0,*,3,county,515,56,wyoming,"Childhood (Ages <20, All Sites)"


In [29]:
# Map the remaining non-numeric markers to '0' across the whole frame.
# The result is reassigned instead of using inplace=True, so the replacement
# never operates on a frame that may still be a view of the raw data.
# NOTE(review): an earlier cell already rewrites '3 or fewer' to '3' in
# 'annual_count_avg', so here that marker can only still match in other
# columns -- confirm which of the two values ('3' or '0') is intended.
cancer_data = cancer_data.replace(to_replace=['3 or fewer', '�'], value='0')
# Cast through float64 first, presumably because some entries are text such as
# '1638110.0', which cannot be converted to int64 directly.
cancer_data['annual_count_avg'] = cancer_data['annual_count_avg'].astype("float64").astype("int64")

In [30]:
def split_keep_first(text):
    """Return the first whitespace-separated token of ``text``.

    Used to strip trailing footnote markers from numeric cells
    (e.g. ``"62.1 #"`` -> ``"62.1"``).

    Parameters
    ----------
    text : object
        Cell value. Strings are split on whitespace; any other value
        (e.g. a float or NaN produced by the CSV parser) is returned unchanged.

    Returns
    -------
    object
        The first token for a non-blank string, otherwise ``text`` itself.
    """
    # Non-string cells have no .split(); pass them through instead of raising.
    if not isinstance(text, str):
        return text
    tokens = text.split()
    # An empty or whitespace-only string has no tokens; indexing [0] would raise IndexError.
    return tokens[0] if tokens else text

In [32]:
# Keep only the first whitespace-separated token of every incidence-rate entry
# (see split_keep_first), dropping anything that follows the value.
cancer_data['incidence rate_per_100000'] = cancer_data['incidence rate_per_100000'].apply(split_keep_first)
# The two confidence-interval columns are not part of the column subset selected
# for cancer_data earlier in the notebook, so the lines below stay disabled.
#cancer_data['incidence rate_lower_95_confidence'] = cancer_data['incidence rate_lower_95_confidence'].apply(split_keep_first)
#cancer_data['incidence rate_upper_95_confidence'] = cancer_data['incidence rate_upper_95_confidence'].apply(split_keep_first)

In [33]:
# Replace the placeholder markers with '0' everywhere in the frame so the
# incidence-rate column can be cast to float in the next cell.
# Reassigning (instead of inplace=True) keeps the operation off any frame that
# may still be a view of the raw data.
# NOTE(review): '*' appears where the source reports no rate (presumably
# suppressed data); after this step those rows carry a rate of 0 rather than
# a missing value -- confirm that this is acceptable for the later analysis.
cancer_data = cancer_data.replace(to_replace=['* ', '*', '�'], value='0')

In [34]:
# With markers and trailing tokens removed in the cells above, the rate column
# can now be stored as float64.
cancer_data['incidence rate_per_100000'] = cancer_data['incidence rate_per_100000'].astype("float64")

In [35]:
# Convert "fips" to int64 (it is displayed as a float, e.g. 56045.0, right after loading).
# 'fips' is the key used further down to compare with and join onto the industry data.
cancer_data['fips'] = cancer_data['fips'].astype("int64")

In [36]:
# How many distinct fips codes appear in the cancer data
cancer_data['fips'].nunique(dropna=False)

3190

### Importing industry data per county

In [9]:
industry_data = pd.read_csv("indicators_per-industry_per-county.csv")
industry_data

Unnamed: 0.1,Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,year,payann,estab,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],2012,0,1,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
1,1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],2012,290,6,...,7.965438e-05,0.0,3.405586,0.0,1.827157e-04,8.769207e-03,0.958770,2.011564,211.423463,15.653502
2,2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],2012,0,6,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
3,3,8001,1,"Adams County, Colorado",2122a0,"iron, gold, silver, and other metal ores","[21221, 21222, 21229]",2012,0,1,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
4,4,8001,1,"Adams County, Colorado",212310,dimensional stone,[21231],2012,0,4,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306027,306027,32510,510,"Carson City, Nevada",812300,dry-cleaning and laundry,[8123],2012,0,4,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
306028,306028,32510,510,"Carson City, Nevada",812900,"pet care, photofinishing, parking and other su...",[8129],2012,331,4,...,1.483303e-05,0.0,2.148818,0.0,1.318396e-07,2.876990e-09,0.025598,0.076440,191.834961,0.096584
306029,306029,32510,510,"Carson City, Nevada",813100,religious organizations,[8131],2012,2441,13,...,1.670618e-07,0.0,6.541864,0.0,0.000000e+00,0.000000e+00,0.000000,0.312826,696.409510,2.551873
306030,306030,32510,510,"Carson City, Nevada",813a00,"grantmaking, giving, and social advocacy organ...","[8132, 8133]",2012,635,9,...,8.127071e-09,0.0,4.713797,0.0,0.000000e+00,0.000000e+00,0.000000,0.020206,440.264876,0.051385


#### Removing unnecessary columns

In [10]:
# 'Unnamed: 0' only repeats the row index, so it is removed.
# A non-inplace drop with errors='ignore' keeps this cell re-runnable: the
# original in-place drop raised KeyError when executed a second time.
industry_data = industry_data.drop(columns='Unnamed: 0', errors='ignore')

In [11]:
def remove_useless(df):
    """Drop, in place, every column of ``df`` that holds exactly one distinct value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is modified in place.

    Returns
    -------
    None
    """
    # Identify all constant columns first, then remove them in a single call.
    constant_columns = [name for name in df.columns if len(df[name].unique()) == 1]
    df.drop(columns=constant_columns, inplace=True)

In [12]:
remove_useless(industry_data)
industry_data.columns

Index(['fips', 'county', 'name', 'industry_code', 'industry_detail',
       'relevant_naics', 'payann', 'estab', 'emp', 'ACID', 'ENRG', 'ETOX',
       'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX',
       'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN',
       'SMOG', 'VADD', 'WATR'],
      dtype='object')

In [13]:
industry_data.head()

Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,payann,estab,emp,ACID,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],290,6,10,0.537394,...,8e-05,0.0,3.405586,0.0,0.000183,0.008769,0.95877,2.011564,211.423463,15.653502
2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],0,6,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8001,1,"Adams County, Colorado",2122a0,"iron, gold, silver, and other metal ores","[21221, 21222, 21229]",0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8001,1,"Adams County, Colorado",212310,dimensional stone,[21231],0,4,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
industry_data

Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,payann,estab,emp,ACID,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],0,1,0,0.000000,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],290,6,10,0.537394,...,7.965438e-05,0.0,3.405586,0.0,1.827157e-04,8.769207e-03,0.958770,2.011564,211.423463,15.653502
2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],0,6,0,0.000000,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
3,8001,1,"Adams County, Colorado",2122a0,"iron, gold, silver, and other metal ores","[21221, 21222, 21229]",0,1,0,0.000000,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
4,8001,1,"Adams County, Colorado",212310,dimensional stone,[21231],0,4,0,0.000000,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306027,32510,510,"Carson City, Nevada",812300,dry-cleaning and laundry,[8123],0,4,0,0.000000,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
306028,32510,510,"Carson City, Nevada",812900,"pet care, photofinishing, parking and other su...",[8129],331,4,9,0.000970,...,1.483303e-05,0.0,2.148818,0.0,1.318396e-07,2.876990e-09,0.025598,0.076440,191.834961,0.096584
306029,32510,510,"Carson City, Nevada",813100,religious organizations,[8131],2441,13,150,0.007487,...,1.670618e-07,0.0,6.541864,0.0,0.000000e+00,0.000000e+00,0.000000,0.312826,696.409510,2.551873
306030,32510,510,"Carson City, Nevada",813a00,"grantmaking, giving, and social advocacy organ...","[8132, 8133]",635,9,0,0.000509,...,8.127071e-09,0.0,4.713797,0.0,0.000000e+00,0.000000e+00,0.000000,0.020206,440.264876,0.051385


In [15]:
# How many distinct fips codes appear in the industry data
industry_data['fips'].nunique(dropna=False)

3140

### Finding out differences in "fips" values between the two datasets
Check the counties_fips.csv file in data_raw to find out whether a fips value actually corresponds to a county or to a state or national value.

In [16]:
# Distinct fips codes of each dataset as Python sets, so the cells below can
# compare them with set operations.
industry_fips = set(industry_data['fips'].unique())
cancer_fips = set(cancer_data['fips'].unique())

#### Getting "fips" values in cancer data that don't have corresponding industry data and the other way around

In [17]:
# fips codes that occur in the cancer data but not in the industry data
cancer_diff_industry = cancer_fips - industry_fips
print(cancer_diff_industry)

{0, 48000, 32000, 16000, 53000, 37000, 21000, 5000, 48269, 42000, 26000, 10000, 47000, 2201, 31000, 15000, 15005, 36000, 20000, 4000, 41000, 25000, 9000, 46000, 30000, 51000, 2232, 35000, 19000, 56000, 40000, 24000, 8000, 45000, 29000, 13000, 51917, 50000, 34000, 18000, 2900, 55000, 39000, 23000, 44000, 28000, 12000, 33000, 17000, 1000, 2280, 49000, 54000, 38000, 22000, 6000, 27000}


In [18]:
county_corresp_fips = pd.read_csv('counties_fips.csv')

In [19]:
county_corresp_fips.head()

Unnamed: 0,FIPS,Name,State
0,1001,Autauga,AL
1,1003,Baldwin,AL
2,1005,Barbour,AL
3,1007,Bibb,AL
4,1009,Blount,AL


In [20]:
# fips values that correspond to actual counties (they are listed in the county
# reference table) but have no entries in the industry dataset.
# The reference codes are turned into a set once, so each membership test is a
# hash lookup instead of a filter over the whole reference frame per fips code.
known_county_fips = set(county_corresp_fips['FIPS'])
missing_industry_values = {fips for fips in cancer_diff_industry if fips in known_county_fips}
print(missing_industry_values)

{2232, 2201, 2280, 48269}


In [21]:
# fips codes that occur in the industry data but not in the cancer data
industry_diff_cancer = industry_fips - cancer_fips
len(industry_diff_cancer)

7

In [22]:
# fips values that do not correspond to actual counties (absent from the county
# reference table) but still have entries in the industry dataset.
# The reference codes are turned into a set once, so each membership test is a
# hash lookup instead of a filter over the whole reference frame per fips code.
known_county_fips = set(county_corresp_fips['FIPS'])
extra_industry_values = {fips for fips in industry_diff_cancer if fips not in known_county_fips}
print(extra_industry_values)

{2275, 2195, 2198, 2230, 2105}


In [23]:
industry_data.columns

Index(['fips', 'county', 'name', 'industry_code', 'industry_detail',
       'relevant_naics', 'payann', 'estab', 'emp', 'ACID', 'ENRG', 'ETOX',
       'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX',
       'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN',
       'SMOG', 'VADD', 'WATR'],
      dtype='object')

#### Aggregating factors per fips

In [24]:
# Keep the join key 'fips' plus the 24 indicator columns (ACID ... WATR).
# The descriptive columns (county, name, industry_code, industry_detail,
# relevant_naics) and payann / estab / emp are left out of the aggregation.
industryData=industry_data[['fips','ACID', 'ENRG', 'ETOX',
       'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX',
       'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN',
       'SMOG', 'VADD', 'WATR']]

In [25]:
# Sum every indicator over all industry rows that share a fips code.
# The result has one row per fips, with 'fips' as the index.
factorsPerFips=industryData.groupby(['fips']).sum()
factorsPerFips

Unnamed: 0_level_0,ACID,ENRG,ETOX,EUTR,FOOD,GCC,HAPS,HAZW,HC,HNC,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,299.152569,7.286445e+05,1963.026340,26.933851,709.634497,106259.392004,10.732168,15.515155,0.000006,0.000024,...,0.050200,0.000000e+00,2149.369209,4.351692e+05,0.002923,3.497395e-02,293475.377416,2949.632736,79914.159702,7851.498268
1003,133.420049,6.931874e+03,3765.662479,132.635472,4843.498718,43881.445254,31.163349,614.863426,0.000025,0.000110,...,0.057225,1.674254e+04,16148.087370,4.932628e+03,0.007422,8.577729e-02,1999.246044,4438.579174,576096.081165,1633.902304
1005,102.817249,6.064801e+00,3298.083216,13.598555,126.810329,19913.180328,7.238123,3.329358,0.000016,0.000062,...,0.002351,0.000000e+00,554.606997,0.000000e+00,0.000153,4.371549e-02,6.064801,3649.913530,29611.275038,511.844967
1007,30.042694,2.012683e+04,1807.410043,4.819015,76.420984,5751.086704,2.622439,4.793377,0.000005,0.000030,...,0.006445,0.000000e+00,309.053357,0.000000e+00,0.000593,2.437340e-02,20126.828138,1064.780004,10666.884789,285.555832
1009,32.192658,1.131970e+01,416.998063,4.320160,232.490017,6564.143301,3.770756,9.737300,0.000005,0.000021,...,0.004599,0.000000e+00,896.650774,0.000000e+00,0.002463,6.501047e-03,11.319704,1165.911030,36511.029249,98.727235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,587.935749,3.629382e+06,813.473206,105.117742,1001.624899,205214.901809,46.136744,40.714622,0.000113,0.000664,...,0.063982,1.394053e+06,4115.339612,3.628234e+06,0.003711,8.917167e-03,1147.804947,19410.599796,312217.341832,5945.474594
56039,37.178584,9.249760e+02,7592.166446,25.091516,2044.202199,14614.797183,39.699930,167.219233,0.000015,0.000061,...,0.020598,8.200278e+00,6423.591808,4.857347e+02,0.003347,2.019774e-01,439.241290,1412.961358,210925.667302,1777.330068
56041,96.737078,1.919977e+06,112.140420,13.936063,384.841414,32123.362015,10.644368,8.324102,0.000019,0.000117,...,0.011720,3.997271e+05,1420.508279,1.919961e+06,0.000773,4.621011e-04,16.608615,3486.862556,97322.166731,582.400056
56043,31.417667,2.756565e+05,28.072519,4.183471,101.472042,6346.644246,2.766800,2.369265,0.000005,0.000024,...,0.000551,0.000000e+00,518.501787,2.756546e+05,0.000140,3.860924e-07,1.845259,1176.237930,20257.950393,20.344697


### Joining datasets on "fips" column

#### Inner join: Only keep fips values that are in both datasets

In [37]:
# Inner join: keeps only the cancer rows whose fips also exists in factorsPerFips.
# 'fips' is a regular column in cancer_data and the index of factorsPerFips
# (it comes out of the groupby above); the per-fips indicator totals are
# repeated on every cancer row of that fips.
df_inner = pd.merge(cancer_data, factorsPerFips, on='fips', how='inner')

In [38]:
# Distinct fips codes that survive the inner join
df_inner["fips"].nunique(dropna=False)

3133

In [39]:
df_inner

Unnamed: 0,locale,fips,incidence rate_per_100000,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description,ACID,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,495.6,304,county,1,1,alabama,All Cancer Sites,299.152569,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
1,"Autauga County(6,10)",1001,13.5,8,county,3,1,alabama,Oral Cavity & Pharynx,299.152569,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
2,"Autauga County(6,10)",1001,0.0,3,county,17,1,alabama,Esophagus,299.152569,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
3,"Autauga County(6,10)",1001,9.1,6,county,18,1,alabama,Stomach,299.152569,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
4,"Autauga County(6,10)",1001,52.2,32,county,20,1,alabama,Colon & Rectum,299.152569,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72054,"Weston County(6,10)",56045,0.0,3,county,86,56,wyoming,Non-Hodgkin Lymphoma,34.618906,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72055,"Weston County(6,10)",56045,0.0,3,county,90,56,wyoming,Leukemia,34.618906,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72056,"Weston County(6,10)",56045,0.0,3,county,400,56,wyoming,Breast (in situ) (Female),34.618906,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72057,"Weston County(6,10)",56045,0.0,3,county,515,56,wyoming,"Childhood (Ages <20, All Sites)",34.618906,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143


#### Right join: Keep all "fips" values from the industry dataset

In [28]:
# Right join: every fips of factorsPerFips is kept; fips codes without cancer
# rows get missing values in the cancer columns.
df_right = pd.merge(cancer_data, factorsPerFips, on='fips', how='right')

In [29]:
# Distinct fips codes after the right join
df_right["fips"].nunique(dropna=False)

3140

#### Left join: Keep all "fips" values from the cancer dataset

In [30]:
# Left join: every cancer row is kept; rows whose fips has no industry data get
# missing values in the indicator columns.
df_left = pd.merge(cancer_data, factorsPerFips, on='fips', how='left')

In [31]:
# Distinct fips codes after the left join
df_left["fips"].nunique(dropna=False)

3190

## Part 2: analysis

For now we'll work with the inner-join result (`df_inner`), because it is the most complete: every row has both cancer data and industry data.

Names of environmental factors: ['Acid Rain', 'Energy', 'Freshwater Aquatic Ecotoxicity', 'Eutrophication', 'Food Waste', 'Global Climate Change', 'Hazardous Air Pollutants', 'Hazardous Waste', 'Human Health Cancer', 'Human Health Noncancer', 'Human Health - Respiratory Effects', 'Human Health Cancer and Noncancer', 'Employment', 'Land', 'Metals', 'Minerals and Metals', 'Municipal Solid Waste', 'Nonrenewable Energy', 'Ozone Depletion', 'Pesticides', 'Renewable Energy', 'Smog Formation', 'Value Added', 'Water']

In [32]:
df_inner.columns

Index(['locale', 'fips', 'annual_count_avg', 'areatype', 'cancer', 'stateFIPS',
       'state', 'cancer_description', 'ACID', 'ENRG', 'ETOX', 'EUTR', 'FOOD',
       'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX', 'JOBS', 'LAND',
       'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN', 'SMOG', 'VADD',
       'WATR'],
      dtype='object')

In [33]:
df_inner

Unnamed: 0,locale,fips,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description,ACID,ENRG,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
1,"Autauga County(6,10)",1001,8,county,3,1,alabama,Oral Cavity & Pharynx,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
2,"Autauga County(6,10)",1001,3,county,17,1,alabama,Esophagus,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
3,"Autauga County(6,10)",1001,6,county,18,1,alabama,Stomach,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
4,"Autauga County(6,10)",1001,32.0,county,20,1,alabama,Colon & Rectum,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72054,"Weston County(6,10)",56045,3,county,86,56,wyoming,Non-Hodgkin Lymphoma,34.618906,428737.854052,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72055,"Weston County(6,10)",56045,3,county,90,56,wyoming,Leukemia,34.618906,428737.854052,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72056,"Weston County(6,10)",56045,3,county,400,56,wyoming,Breast (in situ) (Female),34.618906,428737.854052,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72057,"Weston County(6,10)",56045,3,county,515,56,wyoming,"Childhood (Ages <20, All Sites)",34.618906,428737.854052,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143


In [34]:
df_inner['stateFIPS'].unique()

array([ 1,  2,  4,  5,  6,  8,  9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56],
      dtype=int64)

In [35]:
df_inner

Unnamed: 0,locale,fips,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description,ACID,ENRG,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
1,"Autauga County(6,10)",1001,8,county,3,1,alabama,Oral Cavity & Pharynx,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
2,"Autauga County(6,10)",1001,3,county,17,1,alabama,Esophagus,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
3,"Autauga County(6,10)",1001,6,county,18,1,alabama,Stomach,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
4,"Autauga County(6,10)",1001,32.0,county,20,1,alabama,Colon & Rectum,299.152569,728644.544118,...,0.050200,0.000000,2149.369209,435169.166702,0.002923,0.034974,293475.377416,2949.632736,79914.159702,7851.498268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72054,"Weston County(6,10)",56045,3,county,86,56,wyoming,Non-Hodgkin Lymphoma,34.618906,428737.854052,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72055,"Weston County(6,10)",56045,3,county,90,56,wyoming,Leukemia,34.618906,428737.854052,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72056,"Weston County(6,10)",56045,3,county,400,56,wyoming,Breast (in situ) (Female),34.618906,428737.854052,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143
72057,"Weston County(6,10)",56045,3,county,515,56,wyoming,"Childhood (Ages <20, All Sites)",34.618906,428737.854052,...,0.000817,17113.628433,348.003649,428660.792817,0.000395,0.000943,77.061234,1321.995548,12995.136488,47.904143


In [40]:
# Save the joined cancer + indicator table to disk.
# NOTE(review): to_csv writes the row index by default, which shows up as an
# extra unnamed first column when the file is read back -- confirm that
# downstream readers expect it.
df_inner.to_csv('factorsCancerperCounty.csv')