# Clustering

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import numpy as np
import random

## Part 1: Importing the data and exploring it

### Importing cancer data

In [47]:
cancer_data = pd.read_csv("cancer_by_type.csv")
cancer_data.head()

Unnamed: 0,locale,fips,met_health_obj,incidence rate_per_100000,incidence rate_lower_95_confidence,incidence rate_upper_95_confidence,annual_count_avg,recent_trend_str,trend_last_5,trend_last_5_lower_95_confidence,...,cancer,file_name,race,sex,source_url,stage,stateFIPS,type,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,46,incd,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,2,incd,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,33,incd,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,17,incd,illinois,All Cancer Sites


In [48]:
cancer_data.columns

Index(['locale', 'fips', 'met_health_obj', 'incidence rate_per_100000',
       'incidence rate_lower_95_confidence',
       'incidence rate_upper_95_confidence', 'annual_count_avg',
       'recent_trend_str', 'trend_last_5', 'trend_last_5_lower_95_confidence',
       'trend_last_5_upper_95_confidence', 'age', 'areatype', 'cancer',
       'file_name', 'race', 'sex', 'source_url', 'stage', 'stateFIPS', 'type',
       'state', 'cancer_description'],
      dtype='object')

In [49]:
cancer_data = cancer_data[['locale', 'fips', 'incidence rate_per_100000', 'annual_count_avg','areatype', 'cancer', 'stateFIPS',
       'state', 'cancer_description']]
cancer_data.head()

Unnamed: 0,locale,fips,incidence rate_per_100000,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,29,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,46,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,2,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,33,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,17,illinois,All Cancer Sites


In [50]:
cancer_data.tail()

Unnamed: 0,locale,fips,incidence rate_per_100000,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description
74492,"Weston County(6,10)",56045.0,*,3 or fewer,county,86,56,wyoming,Non-Hodgkin Lymphoma
74493,"Weston County(6,10)",56045.0,*,3 or fewer,county,90,56,wyoming,Leukemia
74494,"Weston County(6,10)",56045.0,*,3 or fewer,county,400,56,wyoming,Breast (in situ) (Female)
74495,"Weston County(6,10)",56045.0,*,3 or fewer,county,515,56,wyoming,"Childhood (Ages <20, All Sites)"
74496,"Weston County(6,10)",56045.0,*,3 or fewer,county,516,56,wyoming,"Childhood (Ages <15, All Sites)"


In [53]:
cancer_data['incidence rate_per_100000'] = np.where((cancer_data['annual_count_avg'] == '3 or fewer'),'0',cancer_data['incidence rate_per_100000'])
cancer_data

Unnamed: 0,locale,fips,incidence rate_per_100000,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,29,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,46,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,448,1638110,country,1,2,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,33,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,448.0,1638110.0,country,1,17,illinois,All Cancer Sites
...,...,...,...,...,...,...,...,...,...
74492,"Weston County(6,10)",56045.0,0,3 or fewer,county,86,56,wyoming,Non-Hodgkin Lymphoma
74493,"Weston County(6,10)",56045.0,0,3 or fewer,county,90,56,wyoming,Leukemia
74494,"Weston County(6,10)",56045.0,0,3 or fewer,county,400,56,wyoming,Breast (in situ) (Female)
74495,"Weston County(6,10)",56045.0,0,3 or fewer,county,515,56,wyoming,"Childhood (Ages <20, All Sites)"


In [55]:
cancer_data = cancer_data[['locale', 'fips', 'incidence rate_per_100000','areatype', 'cancer', 'stateFIPS',
       'state', 'cancer_description']]
cancer_data

Unnamed: 0,locale,fips,incidence rate_per_100000,areatype,cancer,stateFIPS,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,448.0,country,1,29,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,448.0,country,1,46,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,448,country,1,2,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,448.0,country,1,33,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,448.0,country,1,17,illinois,All Cancer Sites
...,...,...,...,...,...,...,...,...
74492,"Weston County(6,10)",56045.0,0,county,86,56,wyoming,Non-Hodgkin Lymphoma
74493,"Weston County(6,10)",56045.0,0,county,90,56,wyoming,Leukemia
74494,"Weston County(6,10)",56045.0,0,county,400,56,wyoming,Breast (in situ) (Female)
74495,"Weston County(6,10)",56045.0,0,county,515,56,wyoming,"Childhood (Ages <20, All Sites)"


In [56]:
# converting type of "fips" to int64
cancer_data['fips'] = cancer_data['fips'].astype("int64")

In [57]:
# number of fips codes for which cancer data is available
len(cancer_data['fips'].unique())

3190

### Importing industry data per county

In [60]:
industry_data = pd.read_csv("final_indicators_per-industry_per-county.csv")
industry_data

Unnamed: 0.1,Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,year,payann,total_compensation,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],2012,0.000,6363.0,...,0,0,0,0,0,0,0,0,0,0
1,1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],2012,0.290,12285.0,...,1.04547E-07,0,0.00446732,0,2.3968E-07,1.15031E-05,0.001257678,0.002638694,0.277337331,0.020533673
2,2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],2012,0.000,34979.0,...,0,0,0,0,0,0,0,0,0,0
3,3,8001,1,"Adams County, Colorado",2122a0,"iron, gold, silver, and other metal ores","[21221, 21222, 21229]",2012,0.000,2670.0,...,0,0,0,0,0,0,0,0,0,0
4,4,8001,1,"Adams County, Colorado",212310,dimensional stone,[21231],2012,0.000,2385.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306027,306027,32510,510,"Carson City, Nevada",812300,dry-cleaning and laundry,[8123],2012,0.000,7516.0,...,0,0,0,0,0,0,0,0,0,0
306028,306028,32510,510,"Carson City, Nevada",812900,"pet care, photofinishing, parking and other su...",[8129],2012,0.331,15275.0,...,1.56924E-08,0,0.00227838,0,1.39959E-10,3.05365E-12,2.71419E-05,8.10484E-05,0.203401573,0.000102408
306029,306029,32510,510,"Carson City, Nevada",813100,religious organizations,[8131],2012,2.441,15995.0,...,3.5744E-10,0,0.01400192,0,0,0,0,0.000669559,1.490564525,0.005461919
306030,306030,32510,510,"Carson City, Nevada",813a00,"grantmaking, giving, and social advocacy organ...","[8132, 8133]",2012,0.635,12255.0,...,1.436E-11,0,0.008325978,0,0,0,0,3.56898E-05,0.777639617,9.07612E-05


#### Removing un-necessary columns

In [61]:
industry_data.drop('Unnamed: 0', axis=1, inplace=True)

In [62]:
def remove_useless(df):
    for col in df.columns:
        if len(df[col].unique()) == 1:
            df.drop(col, axis=1, inplace=True)

In [63]:
remove_useless(industry_data)
industry_data.columns

Index(['fips', 'county', 'name', 'industry_code', 'industry_detail',
       'relevant_naics', 'payann', 'total_compensation', 'added_value ($)',
       'Local_tranforrmation_ind', 'estab', 'emp', 'ACID', 'ENRG', 'ETOX',
       'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX',
       'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN',
       'SMOG', 'VADD', 'WATR'],
      dtype='object')

In [64]:
industry_data.head()

Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,payann,total_compensation,added_value ($),Local_tranforrmation_ind,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],0.0,6363.0,8448,0.0,...,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],0.29,12285.0,16115,0.000380411,...,1.04547e-07,0,0.00446732,0,2.3968e-07,1.15031e-05,0.001257678,0.002638694,0.277337331,0.020533673
2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],0.0,34979.0,209735,0.0,...,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,8001,1,"Adams County, Colorado",2122a0,"iron, gold, silver, and other metal ores","[21221, 21222, 21229]",0.0,2670.0,10099,0.0,...,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0
4,8001,1,"Adams County, Colorado",212310,dimensional stone,[21231],0.0,2385.0,8559,0.0,...,0.0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
# Number of available fips codes
len(industry_data['fips'].unique())

3140

### Finding out diffrerences in "fips" values between the two datasets
Check the counties_fips.csv file in data_raw to find out if fips value actually corresponds to a county or if it corresponds to a state of national value.

In [66]:
industry_fips = set(industry_data['fips'].unique())
cancer_fips = set(cancer_data['fips'].unique())

#### Getting "fips" values in cancer data that don't have corresponding industry data and the other way around

In [67]:
cancer_diff_industry = cancer_fips.difference(industry_fips)
print(cancer_diff_industry)

{0, 48000, 32000, 16000, 53000, 37000, 21000, 5000, 48269, 42000, 26000, 10000, 47000, 2201, 31000, 15000, 15005, 36000, 20000, 4000, 41000, 25000, 9000, 46000, 30000, 51000, 2232, 35000, 19000, 56000, 40000, 24000, 8000, 45000, 29000, 13000, 51917, 50000, 34000, 18000, 2900, 55000, 39000, 23000, 44000, 28000, 12000, 33000, 17000, 1000, 2280, 49000, 54000, 38000, 22000, 6000, 27000}


In [68]:
county_corresp_fips = pd.read_csv('counties_fips.csv')

In [69]:
county_corresp_fips.head()

Unnamed: 0,FIPS,Name,State
0,1001,Autauga,AL
1,1003,Baldwin,AL
2,1005,Barbour,AL
3,1007,Bibb,AL
4,1009,Blount,AL


In [70]:
# getting fips values which correspond to actual counties but don't have entries in the industry dataset
missing_industry_values = set()
for fips in cancer_diff_industry:
    if not county_corresp_fips[county_corresp_fips['FIPS'] == fips].empty:
        missing_industry_values.add(fips)
print(missing_industry_values)

{2232, 2201, 2280, 48269}


In [71]:
industry_diff_cancer = industry_fips.difference(cancer_fips)
len(industry_diff_cancer)

7

In [72]:
#getting fips values that don't correspond to actual counties but have entries in the industry dataset
extra_industry_values = set()
for fips in industry_diff_cancer:
    if county_corresp_fips[county_corresp_fips['FIPS'] == fips].empty:
        extra_industry_values.add(fips)
print(extra_industry_values)

{2275, 2195, 2198, 2230, 2105}


In [77]:
industry_data.columns

Index(['fips', 'county', 'name', 'industry_code', 'industry_detail',
       'relevant_naics', 'payann', 'total_compensation', 'added_value ($)',
       'Local_tranforrmation_ind', 'estab', 'emp', 'ACID', 'ENRG', 'ETOX',
       'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX',
       'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN',
       'SMOG', 'VADD', 'WATR'],
      dtype='object')

#### aggregate factors per fips

In [78]:
industryData=industry_data[['fips','ACID', 'ENRG', 'ETOX',
       'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX',
       'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN',
       'SMOG', 'VADD', 'WATR']]

In [79]:
factorsPerFips=industryData.groupby(['fips']).sum()
factorsPerFips

Unnamed: 0_level_0,ACID,ENRG,ETOX,EUTR,FOOD,GCC,HAPS,HAZW,HC,HNC,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
fips,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,1.186566815000.732817288000000000000000000#DIV...,0.000368259000.08420957000000000000000000#DIV/...,0.001086982000.021679299000000000000000000#DIV...,0.18757811300367.4178327000000000000000000#DIV...,0.000159629000.015918997000000000000000000#DIV...,3.70702E-06000.022022336000000000000000000#DIV...,2.92089E-10005.47875E-09000000000000000000#DIV...,6.63838E-10003.33022E-08000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
1003,0.00070793100.0001215400.0076663890000.0041461...,0.00022613600.00021684109.5739014920004.275030...,0.73433626900.07365850900.0019092610000.002170...,0.00022790606.34386E-0500.0024705560000.210448...,0.00067270600.00019682400.0031984050000.003053...,0.11608736200.01220915604.6087987010001.714660...,9.87901E-0501.24809E-0600.0003095630000.000127...,2.29418E-0601.44398E-0600.0001962330000.000359...,1.80564E-1003.29252E-1203.92038E-100001.14105E...,4.10252E-1001.95453E-1101.48868E-090001.00249E...,...,5.41692E-0801.79711E-0802.04354E-060004.14913E...,000043.876243430000000000000000000000000000000...,0.00263827400.00077022800.0130627570000.011895...,00009.5737288920002.48808065900000000000000000...,3.09348E-1004.13205E-0804.95345E-090001.49559E...,1.00084E-0501.9833E-0603.41708E-100006.98892E-...,0.00022613600.00021684100.0001726020001.786949...,0.01646714300.00045494700.1596904930000.043458...,0.53808625300.04781678103.6543100320002.435226...,0.11193922400.00354028900.2729423880000.051737...
1005,0.00410537800000000000000000000000#DIV/0!000.2...,0.00131139200000000000000000000000#DIV/0!000.0...,4.25850431400000000000000000000000#DIV/0!000.1...,0.00132165600000000000000000000000#DIV/0!000.0...,0.00390110200000000000000000000000#DIV/0!000.0...,0.67320457100000000000000000000000#DIV/0!0039....,0.00057289600000000000000000000000#DIV/0!000.0...,1.33042E-0500000000000000000000000#DIV/0!000.0...,1.04754E-0900000000000000000000000#DIV/0!002.9...,2.37654E-0900000000000000000000000#DIV/0!001.1...,...,3.1414E-0700000000000000000000000#DIV/0!005.42...,000000000000000000000000#DIV/0!000000000000000...,0.0152996700000000000000000000000#DIV/0!000.32...,000000000000000000000000#DIV/0!000000000000000...,1.79236E-0900000000000000000000000#DIV/0!001.7...,5.80398E-0500000000000000000000000#DIV/0!007.8...,0.00131139200000000000000000000000#DIV/0!000.0...,0.09549494300000000000000000000000#DIV/0!007.2...,3.12042687800000000000000000000000#DIV/0!0013....,0.64914901700000000000000000000000#DIV/0!000.0...
1007,0.00227534500000.001758122000000000#DIV/0!5.34...,0.000726819000030.65417594000000000#DIV/0!000....,2.36021275800000.0079305000000000#DIV/0!1.1327...,0.00073250800000.001314199000000000#DIV/0!6.70...,0.00216212800000.000676667000000000#DIV/0!0.07...,0.37311363400000.207846994000000000#DIV/0!0.01...,0.00031751900000.000600714000000000#DIV/0!2.05...,7.37367E-0600000.006089581000000000#DIV/0!3.49...,5.80194E-1000009.71771E-10000000000#DIV/0!1.96...,1.32104E-0900002.01056E-08000000000#DIV/0!5.63...,...,1.74107E-0700009.47415E-06000000000#DIV/0!2.16...,000000000000000#DIV/0!000000000000000000000000...,0.00847961500000.02789239000000000#DIV/0!0.227...,000000000000000#DIV/0!000000000000000000000000...,9.94429E-1000004.96653E-07000000000#DIV/0!2.81...,3.21677E-0500002.2052E-07000000000#DIV/0!002.1...,0.000726819000030.65417594000000000#DIV/0!000....,0.05292665400000.072396944000000000#DIV/0!0.00...,1.72945024500001.743185895000000000#DIV/0!1.97...,0.359781200000.013411882000000000#DIV/0!0.0024...
1009,0.000376802000000000000001.73744E-053.16687E-0...,0.000120363000000000000000.0030640490.00052413...,0.390856401000000000000004.18016E-054.01895E-0...,0.000121305000000000000001.96994E-064.06892E-0...,0.000358053000000000000000.0001659160.00035799...,0.061788435000000000000000.0287971620.06332178...,5.25818E-05000000000000009.06922E-050.00013948...,1.2211E-06000000000000000.0006823780.001010821...,9.61237E-11000000000000003.73163E-126.30775E-1...,2.19066E-10000000000000002.55797E-115.32174E-1...,...,2.88106E-08000000000000001.91299E-065.68871E-0...,00000000000000000000000000000#DIV/0!0000000000...,0.001404243000000000000000.0068760490.01484353...,00000000000000000000000000000#DIV/0!0000000000...,1.64632E-10000000000000003.80686E-086.48334E-0...,5.32704E-06000000000000002.42255E-101.41823E-1...,0.000120363000000000000000.0030640490.00052413...,0.00876477000000000000000.0018395140.002780161...,0.286400748000000000000000.5109055690.86948880...,0.059580554000000000000000.0010993990.00226757...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,0.0002260650000.3661331660.0204578680.07649919...,0.000403324000457.23261431088.9423175294.68863...,0.1370048270000.0911829440.0160223490.08762452...,0.0001179960000.1179893710.0033877180.01309108...,0.0003660920000.1527501570.0052715440.24154527...,0.022709031000220.10808044.49989695850.6163650...,2.32144E-060000.0147841760.003118120.015786277...,2.6858E-060000.009371750.0001483260.0113274020...,6.12593E-120001.87542E-085.64189E-092.4131E-08...,3.63358E-110007.09906E-083.51398E-082.10048E-0...,...,3.34499E-080009.7596E-051.22013E-072.99953E-05...,00002095.45183801102.3274890000000000000000000...,0.0014326230000.6238542110.020478630.938387706...,0000457.22437121088.9423175294.675815000000000...,7.68693E-080002.36547E-071.71209E-073.85235E-0...,3.68893E-060001.62908E-0800000000000000008.128...,0.0004033240000.00824316300.012820651000000000...,0.0008462020007.6265356890.9149163452.77727731...,0.088939213000174.523388511.98502199169.920377...,0.00658493700013.035246040.0393675071.50211452...
56039,00000000000000000000000000000000#DIV/0!4.00438...,00000000000000000000000000000000#DIV/0!000.002...,00000000000000000000000000000000#DIV/0!8.48711...,00000000000000000000000000000000#DIV/0!5.0228E...,00000000000000000000000000000000#DIV/0!0.54227...,00000000000000000000000000000000#DIV/0!0.13324...,00000000000000000000000000000000#DIV/0!1.53679...,00000000000000000000000000000000#DIV/0!0.00026...,00000000000000000000000000000000#DIV/0!1.46775...,00000000000000000000000000000000#DIV/0!4.22491...,...,00000000000000000000000000000000#DIV/0!1.63235...,00000000000000000000000000000000#DIV/0!0000000...,00000000000000000000000000000000#DIV/0!1.70598...,00000000000000000000000000000000#DIV/0!0000000...,00000000000000000000000000000000#DIV/0!2.107E-...,00000000000000000000000000000000#DIV/0!003.854...,00000000000000000000000000000000#DIV/0!000.002...,00000000000000000000000000000000#DIV/0!0.00162...,00000000000000000000000000000000#DIV/0!14.7603...,00000000000000000000000000000000#DIV/0!0.01806...
56041,0000.0507120840002.20856E-060000000000000000#D...,0003509.9022760000.0005053660000000000000000#D...,0000.0580871770001.4881E-050000000000000000#DI...,0000.0086782130002.34257E-070000000000000000#D...,0000.1601227860001.37098E-050000000000000000#D...,00033.554096750000.0021118580000000000000000#D...,0000.0104648810001.00931E-050000000000000000#D...,0000.0075090490001.28397E-050000000000000000#D...,0001.60325E-080005.42273E-130000000000000000#D...,0001.39302E-070002.19922E-120000000000000000#D...,...,0001.98842E-050004.14236E-090000000000000000#D...,000730.743964100000000000000000000#DIV/0!00000...,0000.6220666350000.0005684830000000000000000#D...,0003509.89377700000000000000000000#DIV/0!00000...,0002.55377E-070007.36587E-100000000000000000#D...,00000001.76239E-100000000000000000#DIV/0!005.4...,0000.0084989380000.0005053660000000000000000#D...,0001.8410850260000.0001686860000000000000000#D...,000112.64192510000.0343595220000000000000000#D...,0000.9957668080008.31891E-050000000000000000#D...
56043,00.03092090600000000000005.68059E-060#DIV/0!00...,01652.83244300000000000001.82391E-050#DIV/0!00...,00.02437354300000000000009.14502E-060#DIV/0!00...,00.00512609700000000000001.42692E-060#DIV/0!00...,00.00165955300000000000000.0001920290#DIV/0!00...,06.2847563500000000000000.0194264840#DIV/0!000...,00.00471507300000000000003.90088E-050#DIV/0!00...,00.00079624600000000000009.99539E-050#DIV/0!00...,08.51436E-0900000000000002.73142E-120#DIV/0!00...,05.29449E-0800000000000001.62835E-110#DIV/0!00...,...,01.9607E-0700000000000001.61679E-070#DIV/0!000...,00000000000000000#DIV/0!0000000000000000000000...,00.0064492600000000000000.0072914390#DIV/0!000...,01652.832375000000000000000#DIV/0!000000000000...,02.60827E-0700000000000007.43262E-090#DIV/0!00...,05.40842E-1100000000000004.89818E-110#DIV/0!00...,06.8555E-0500000000000001.82391E-050#DIV/0!000...,01.38403140900000000000000.0006165880#DIV/0!00...,011.468525300000000000000.439657660#DIV/0!0003...,00.04691542900000000000000.0011001430#DIV/0!00...


### Joining datasets on "fips" column

#### Inner join: Only keep fips values that are in both datasets

In [80]:
df_inner = pd.merge(cancer_data, factorsPerFips, on='fips', how='inner')

In [81]:
len(df_inner["fips"].unique())

3133

#### Right join: Keep all "fips" values from the industry dataset

In [82]:
df_right = pd.merge(cancer_data, factorsPerFips, on='fips', how='right')

In [83]:
len(df_right["fips"].unique())

3140

#### Left join: Keep all "fips" values from the cancer dataset

In [84]:
df_left = pd.merge(cancer_data, factorsPerFips, on='fips', how='left')

In [85]:
len(df_left["fips"].unique())

3190

## Part 2: analysis

For now we'll work with the resulting inner_join data because it is the most complete

Names of environemental factors: ['Acid Rain', 'Energy', 'Freshwater Aquatic Ecotoxicity', 'Eutrophication', 'Food Waste', 'Global Climate Change', 'Hazardous Air Pollutants', 'Hazardous Waste', 'Human Health Cancer', 'Human Health Noncancer', 'Human Health - Respiratory Effects', 'Human Health Cancer and Noncancer', 'Employment', 'Land', 'Metals', 'Minerals and Metals', 'Muncipal Solid Waste', 'Nonrenewable Energy', 'Ozone Depletion', 'Pesticides', 'Renewable Energy', 'Smog Formation', 'Value Added', 'Water']

In [86]:
df_inner.columns

Index(['locale', 'fips', 'incidence rate_per_100000', 'areatype', 'cancer',
       'stateFIPS', 'state', 'cancer_description', 'ACID', 'ENRG', 'ETOX',
       'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX',
       'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN',
       'SMOG', 'VADD', 'WATR'],
      dtype='object')

In [87]:
df_inner

Unnamed: 0,locale,fips,incidence rate_per_100000,areatype,cancer,stateFIPS,state,cancer_description,ACID,ENRG,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,495.6,county,1,1,alabama,All Cancer Sites,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
1,"Autauga County(6,10)",1001,13.5,county,3,1,alabama,Oral Cavity & Pharynx,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
2,"Autauga County(6,10)",1001,0,county,17,1,alabama,Esophagus,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
3,"Autauga County(6,10)",1001,9.1,county,18,1,alabama,Stomach,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
4,"Autauga County(6,10)",1001,52.2,county,20,1,alabama,Colon & Rectum,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72054,"Weston County(6,10)",56045,0,county,86,56,wyoming,Non-Hodgkin Lymphoma,000.0317685900.00231560.002171150000#DIV/0!5.0...,001698.1441720123.255999150.2704140000#DIV/0!0...,...,002.01466E-0701.38021E-088.51307E-070000#DIV/0...,0000031.285542840000#DIV/0!0000000000000000000...,000.00662606400.002317950.026632710000#DIV/0!0...,001698.1441020123.255999150.27005020000#DIV/0!...,002.68022E-0701.93855E-081.09321E-080000#DIV/0...,005.55832E-110000000#DIV/0!001.45196E-122.7005...,007.04344E-05000.0003638670000#DIV/0!000.00093...,001.42197406700.103558220.0788228810000#DIV/0!...,0011.7829302501.356569434.8225698020000#DIV/0!...,000.04820159700.0044559580.0426320390000#DIV/0...
72055,"Weston County(6,10)",56045,0,county,90,56,wyoming,Leukemia,000.0317685900.00231560.002171150000#DIV/0!5.0...,001698.1441720123.255999150.2704140000#DIV/0!0...,...,002.01466E-0701.38021E-088.51307E-070000#DIV/0...,0000031.285542840000#DIV/0!0000000000000000000...,000.00662606400.002317950.026632710000#DIV/0!0...,001698.1441020123.255999150.27005020000#DIV/0!...,002.68022E-0701.93855E-081.09321E-080000#DIV/0...,005.55832E-110000000#DIV/0!001.45196E-122.7005...,007.04344E-05000.0003638670000#DIV/0!000.00093...,001.42197406700.103558220.0788228810000#DIV/0!...,0011.7829302501.356569434.8225698020000#DIV/0!...,000.04820159700.0044559580.0426320390000#DIV/0...
72056,"Weston County(6,10)",56045,0,county,400,56,wyoming,Breast (in situ) (Female),000.0317685900.00231560.002171150000#DIV/0!5.0...,001698.1441720123.255999150.2704140000#DIV/0!0...,...,002.01466E-0701.38021E-088.51307E-070000#DIV/0...,0000031.285542840000#DIV/0!0000000000000000000...,000.00662606400.002317950.026632710000#DIV/0!0...,001698.1441020123.255999150.27005020000#DIV/0!...,002.68022E-0701.93855E-081.09321E-080000#DIV/0...,005.55832E-110000000#DIV/0!001.45196E-122.7005...,007.04344E-05000.0003638670000#DIV/0!000.00093...,001.42197406700.103558220.0788228810000#DIV/0!...,0011.7829302501.356569434.8225698020000#DIV/0!...,000.04820159700.0044559580.0426320390000#DIV/0...
72057,"Weston County(6,10)",56045,0,county,515,56,wyoming,"Childhood (Ages <20, All Sites)",000.0317685900.00231560.002171150000#DIV/0!5.0...,001698.1441720123.255999150.2704140000#DIV/0!0...,...,002.01466E-0701.38021E-088.51307E-070000#DIV/0...,0000031.285542840000#DIV/0!0000000000000000000...,000.00662606400.002317950.026632710000#DIV/0!0...,001698.1441020123.255999150.27005020000#DIV/0!...,002.68022E-0701.93855E-081.09321E-080000#DIV/0...,005.55832E-110000000#DIV/0!001.45196E-122.7005...,007.04344E-05000.0003638670000#DIV/0!000.00093...,001.42197406700.103558220.0788228810000#DIV/0!...,0011.7829302501.356569434.8225698020000#DIV/0!...,000.04820159700.0044559580.0426320390000#DIV/0...


In [88]:
df_inner['stateFIPS'].unique()

array([ 1,  2,  4,  5,  6,  8,  9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56],
      dtype=int64)

In [89]:
df_inner

Unnamed: 0,locale,fips,incidence rate_per_100000,areatype,cancer,stateFIPS,state,cancer_description,ACID,ENRG,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,495.6,county,1,1,alabama,All Cancer Sites,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
1,"Autauga County(6,10)",1001,13.5,county,3,1,alabama,Oral Cavity & Pharynx,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
2,"Autauga County(6,10)",1001,0,county,17,1,alabama,Esophagus,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
3,"Autauga County(6,10)",1001,9.1,county,18,1,alabama,Stomach,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
4,"Autauga County(6,10)",1001,52.2,county,20,1,alabama,Colon & Rectum,0.0011439001.017563185000000000000000000#DIV/0...,0.000365399002609.093009000000000000000000#DIV...,...,8.74938E-08000.000168707000000000000000000#DIV...,0000000000000000000000#DIV/0!00000000000000000...,0.004263018000.084218657000000000000000000#DIV...,0001558.288943000000000000000000#DIV/0!0000000...,5.00534E-10006.75546E-06000000000000000000#DIV...,1.61719E-05002.14853E-08000000000000000000#DIV...,0.000365399001050.804066000000000000000000#DIV...,0.026608199008.594629197000000000000000000#DIV...,0.8694590190042.95321507000000000000000000#DIV...,0.1808754020026.76189974000000000000000000#DIV...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72054,"Weston County(6,10)",56045,0,county,86,56,wyoming,Non-Hodgkin Lymphoma,000.0317685900.00231560.002171150000#DIV/0!5.0...,001698.1441720123.255999150.2704140000#DIV/0!0...,...,002.01466E-0701.38021E-088.51307E-070000#DIV/0...,0000031.285542840000#DIV/0!0000000000000000000...,000.00662606400.002317950.026632710000#DIV/0!0...,001698.1441020123.255999150.27005020000#DIV/0!...,002.68022E-0701.93855E-081.09321E-080000#DIV/0...,005.55832E-110000000#DIV/0!001.45196E-122.7005...,007.04344E-05000.0003638670000#DIV/0!000.00093...,001.42197406700.103558220.0788228810000#DIV/0!...,0011.7829302501.356569434.8225698020000#DIV/0!...,000.04820159700.0044559580.0426320390000#DIV/0...
72055,"Weston County(6,10)",56045,0,county,90,56,wyoming,Leukemia,000.0317685900.00231560.002171150000#DIV/0!5.0...,001698.1441720123.255999150.2704140000#DIV/0!0...,...,002.01466E-0701.38021E-088.51307E-070000#DIV/0...,0000031.285542840000#DIV/0!0000000000000000000...,000.00662606400.002317950.026632710000#DIV/0!0...,001698.1441020123.255999150.27005020000#DIV/0!...,002.68022E-0701.93855E-081.09321E-080000#DIV/0...,005.55832E-110000000#DIV/0!001.45196E-122.7005...,007.04344E-05000.0003638670000#DIV/0!000.00093...,001.42197406700.103558220.0788228810000#DIV/0!...,0011.7829302501.356569434.8225698020000#DIV/0!...,000.04820159700.0044559580.0426320390000#DIV/0...
72056,"Weston County(6,10)",56045,0,county,400,56,wyoming,Breast (in situ) (Female),000.0317685900.00231560.002171150000#DIV/0!5.0...,001698.1441720123.255999150.2704140000#DIV/0!0...,...,002.01466E-0701.38021E-088.51307E-070000#DIV/0...,0000031.285542840000#DIV/0!0000000000000000000...,000.00662606400.002317950.026632710000#DIV/0!0...,001698.1441020123.255999150.27005020000#DIV/0!...,002.68022E-0701.93855E-081.09321E-080000#DIV/0...,005.55832E-110000000#DIV/0!001.45196E-122.7005...,007.04344E-05000.0003638670000#DIV/0!000.00093...,001.42197406700.103558220.0788228810000#DIV/0!...,0011.7829302501.356569434.8225698020000#DIV/0!...,000.04820159700.0044559580.0426320390000#DIV/0...
72057,"Weston County(6,10)",56045,0,county,515,56,wyoming,"Childhood (Ages <20, All Sites)",000.0317685900.00231560.002171150000#DIV/0!5.0...,001698.1441720123.255999150.2704140000#DIV/0!0...,...,002.01466E-0701.38021E-088.51307E-070000#DIV/0...,0000031.285542840000#DIV/0!0000000000000000000...,000.00662606400.002317950.026632710000#DIV/0!0...,001698.1441020123.255999150.27005020000#DIV/0!...,002.68022E-0701.93855E-081.09321E-080000#DIV/0...,005.55832E-110000000#DIV/0!001.45196E-122.7005...,007.04344E-05000.0003638670000#DIV/0!000.00093...,001.42197406700.103558220.0788228810000#DIV/0!...,0011.7829302501.356569434.8225698020000#DIV/0!...,000.04820159700.0044559580.0426320390000#DIV/0...


In [90]:
df_inner.to_csv('Final_factorsCancerperCounty.csv')