# Clustering

In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import numpy as np
import random

## Part 1: Importing the data and exploring it

### Importing cancer data

In [33]:
# Read the cancer dataset from a CSV file (path relative to the working directory)
# and preview its first rows.
cancer_data = pd.read_csv("cancer_by_type.csv")
cancer_data.head()

Unnamed: 0,locale,fips,met_health_obj,incidence rate_per_100000,incidence rate_lower_95_confidence,incidence rate_upper_95_confidence,annual_count_avg,recent_trend_str,trend_last_5,trend_last_5_lower_95_confidence,...,cancer,file_name,race,sex,source_url,stage,stateFIPS,type,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,46,incd,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,2,incd,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,33,incd,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,17,incd,illinois,All Cancer Sites


In [34]:
# List every available column before deciding which ones to keep.
cancer_data.columns

Index(['locale', 'fips', 'met_health_obj', 'incidence rate_per_100000',
       'incidence rate_lower_95_confidence',
       'incidence rate_upper_95_confidence', 'annual_count_avg',
       'recent_trend_str', 'trend_last_5', 'trend_last_5_lower_95_confidence',
       'trend_last_5_upper_95_confidence', 'age', 'areatype', 'cancer',
       'file_name', 'race', 'sex', 'source_url', 'stage', 'stateFIPS', 'type',
       'state', 'cancer_description'],
      dtype='object')

In [35]:
# Keep only the location identifiers, the case count and the cancer-type columns.
# .copy() turns the subset into an independent frame, so later cells that assign
# to its columns do not raise pandas' SettingWithCopyWarning.
cancer_data = cancer_data[['locale', 'fips', 'annual_count_avg', 'areatype', 'cancer', 'stateFIPS',
       'state', 'cancer_description']].copy()
cancer_data.head()

Unnamed: 0,locale,fips,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,1638110.0,country,1,29,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,1638110.0,country,1,46,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,1638110.0,country,1,2,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,1638110.0,country,1,33,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,1638110.0,country,1,17,illinois,All Cancer Sites


In [36]:
# Look at the last rows as well: they show the textual '3 or fewer' counts.
cancer_data.tail()

Unnamed: 0,locale,fips,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description
74492,"Weston County(6,10)",56045.0,3 or fewer,county,86,56,wyoming,Non-Hodgkin Lymphoma
74493,"Weston County(6,10)",56045.0,3 or fewer,county,90,56,wyoming,Leukemia
74494,"Weston County(6,10)",56045.0,3 or fewer,county,400,56,wyoming,Breast (in situ) (Female)
74495,"Weston County(6,10)",56045.0,3 or fewer,county,515,56,wyoming,"Childhood (Ages <20, All Sites)"
74496,"Weston County(6,10)",56045.0,3 or fewer,county,516,56,wyoming,"Childhood (Ages <15, All Sites)"


In [39]:
# Counts of three or fewer are recorded as the text '3 or fewer'; map them to '3'.
# Series.replace only touches the matching cells (np.where would cast a numeric
# column entirely to strings), and assign() returns a new frame instead of writing
# into a column subset of the raw data.
# NOTE(review): the column still holds mixed text/number objects — convert it with
# pd.to_numeric before numeric analysis, after confirming no other sentinel strings exist.
cancer_data = cancer_data.assign(
    annual_count_avg=cancer_data['annual_count_avg'].replace('3 or fewer', '3')
)
cancer_data

Unnamed: 0,locale,fips,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,1638110.0,country,1,29,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,1638110.0,country,1,46,southdakota,All Cancer Sites
2,"US (SEER+NPCR)(1,10)",0.0,1638110,country,1,2,alaska,All Cancer Sites
3,"US (SEER+NPCR)(1,10)",0.0,1638110.0,country,1,33,newhampshire,All Cancer Sites
4,"US (SEER+NPCR)(1,10)",0.0,1638110.0,country,1,17,illinois,All Cancer Sites
...,...,...,...,...,...,...,...,...
74492,"Weston County(6,10)",56045.0,3,county,86,56,wyoming,Non-Hodgkin Lymphoma
74493,"Weston County(6,10)",56045.0,3,county,90,56,wyoming,Leukemia
74494,"Weston County(6,10)",56045.0,3,county,400,56,wyoming,Breast (in situ) (Female)
74495,"Weston County(6,10)",56045.0,3,county,515,56,wyoming,"Childhood (Ages <20, All Sites)"


In [40]:
# FIPS codes are read as floats (e.g. 56045.0); store them as int64, like the integer
# codes of the industry dataset.
# astype with a column mapping returns a new frame, which avoids assigning into a slice.
cancer_data = cancer_data.astype({'fips': 'int64'})

In [41]:
# How many distinct FIPS codes the cancer data covers
cancer_data['fips'].nunique(dropna=False)

3190

### Importing industry data per county

In [42]:
# Read the per-county, per-industry indicator table from CSV and display it
# (pandas truncates the rendering of large frames).
industry_data = pd.read_csv("indicators_per-industry_per-county.csv")
industry_data

Unnamed: 0.1,Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,year,payann,estab,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],2012,0,1,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
1,1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],2012,290,6,...,7.965438e-05,0.0,3.405586,0.0,1.827157e-04,8.769207e-03,0.958770,2.011564,211.423463,15.653502
2,2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],2012,0,6,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
3,3,8001,1,"Adams County, Colorado",2122a0,"iron, gold, silver, and other metal ores","[21221, 21222, 21229]",2012,0,1,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
4,4,8001,1,"Adams County, Colorado",212310,dimensional stone,[21231],2012,0,4,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306027,306027,32510,510,"Carson City, Nevada",812300,dry-cleaning and laundry,[8123],2012,0,4,...,0.000000e+00,0.0,0.000000,0.0,0.000000e+00,0.000000e+00,0.000000,0.000000,0.000000,0.000000
306028,306028,32510,510,"Carson City, Nevada",812900,"pet care, photofinishing, parking and other su...",[8129],2012,331,4,...,1.483303e-05,0.0,2.148818,0.0,1.318396e-07,2.876990e-09,0.025598,0.076440,191.834961,0.096584
306029,306029,32510,510,"Carson City, Nevada",813100,religious organizations,[8131],2012,2441,13,...,1.670618e-07,0.0,6.541864,0.0,0.000000e+00,0.000000e+00,0.000000,0.312826,696.409510,2.551873
306030,306030,32510,510,"Carson City, Nevada",813a00,"grantmaking, giving, and social advocacy organ...","[8132, 8133]",2012,635,9,...,8.127071e-09,0.0,4.713797,0.0,0.000000e+00,0.000000e+00,0.000000,0.020206,440.264876,0.051385


#### Removing unnecessary columns

In [26]:
# 'Unnamed: 0' merely repeats the row index (see the table above), so drop it.
# errors='ignore' keeps the cell re-runnable: a second execution no longer raises KeyError.
industry_data = industry_data.drop(columns='Unnamed: 0', errors='ignore')

In [27]:
def remove_useless(df):
    """Drop, in place, every column of ``df`` that holds a single distinct value.

    A column is dropped when it contains exactly one distinct value; a column
    made only of NaN counts as having one value. Columns of an empty frame
    (zero distinct values) are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    None
    """
    # Collect the constant columns first and drop them in one call, rather than
    # rebuilding the frame once per dropped column inside the loop.
    constant_cols = [col for col in df.columns if df[col].nunique(dropna=False) == 1]
    df.drop(columns=constant_cols, inplace=True)

In [28]:
# Strip the single-valued columns (remove_useless modifies industry_data in place)
# and show which columns remain.
remove_useless(industry_data)
industry_data.columns

Index(['fips', 'county', 'name', 'industry_code', 'industry_detail',
       'relevant_naics', 'payann', 'estab', 'emp', 'ACID', 'ENRG', 'ETOX',
       'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX',
       'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON', 'PEST', 'REN',
       'SMOG', 'VADD', 'WATR'],
      dtype='object')

In [29]:
# Preview the cleaned industry table.
industry_data.head()

Unnamed: 0,fips,county,name,industry_code,industry_detail,relevant_naics,payann,estab,emp,ACID,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,8001,1,"Adams County, Colorado",113000,timber and raw forest products,[113],0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8001,1,"Adams County, Colorado",115000,agriculture and forestry support,[115],290,6,10,0.537394,...,8e-05,0.0,3.405586,0.0,0.000183,0.008769,0.95877,2.011564,211.423463,15.653502
2,8001,1,"Adams County, Colorado",211000,unrefined oil and gas,[211],0,6,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8001,1,"Adams County, Colorado",2122a0,"iron, gold, silver, and other metal ores","[21221, 21222, 21229]",0,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8001,1,"Adams County, Colorado",212310,dimensional stone,[21231],0,4,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# How many distinct FIPS codes the industry data covers
industry_data['fips'].nunique(dropna=False)

3140

### Finding out differences in "fips" values between the two datasets
Check the counties_fips.csv file in data_raw to find out whether a fips value actually corresponds to a county or to a state-level or national-level value.

In [31]:
# Distinct FIPS codes of each dataset, as sets, ready for set arithmetic
industry_fips, cancer_fips = (
    set(frame['fips'].unique()) for frame in (industry_data, cancer_data)
)

#### Getting "fips" values in cancer data that don't have corresponding industry data and the other way around

In [32]:
# FIPS codes that occur in the cancer data but not in the industry data
cancer_diff_industry = cancer_fips - industry_fips
print(cancer_diff_industry)

{0, 48000, 32000, 16000, 53000, 37000, 21000, 5000, 48269, 42000, 26000, 10000, 47000, 2201, 31000, 15000, 15005, 36000, 20000, 4000, 41000, 25000, 9000, 46000, 30000, 51000, 2232, 35000, 19000, 56000, 40000, 24000, 8000, 45000, 29000, 13000, 51917, 50000, 34000, 18000, 2900, 55000, 39000, 23000, 44000, 28000, 12000, 33000, 17000, 1000, 2280, 49000, 54000, 38000, 22000, 6000, 27000}


In [33]:
# Reference table of county FIPS codes, used below to tell real counties apart
# from other codes.
county_corresp_fips = pd.read_csv('counties_fips.csv')

In [34]:
# Preview the reference table (columns FIPS, Name, State).
county_corresp_fips.head()

Unnamed: 0,FIPS,Name,State
0,1001,Autauga,AL
1,1003,Baldwin,AL
2,1005,Barbour,AL
3,1007,Bibb,AL
4,1009,Blount,AL


In [35]:
# FIPS codes that are listed in the county reference table (i.e. actual counties)
# but have no entries in the industry dataset.
# A set of the reference codes gives one constant-time lookup per code instead of
# filtering the whole reference DataFrame once per code.
county_fips = set(county_corresp_fips['FIPS'])
missing_industry_values = {fips for fips in cancer_diff_industry if fips in county_fips}
print(missing_industry_values)

{2232, 2201, 2280, 48269}


In [36]:
# FIPS codes that occur in the industry data but not in the cancer data, and how many
industry_diff_cancer = industry_fips - cancer_fips
len(industry_diff_cancer)

7

In [37]:
# FIPS codes that have entries in the industry dataset but are NOT listed in the
# county reference table (i.e. do not correspond to actual counties).
# A set of the reference codes gives one constant-time lookup per code instead of
# filtering the whole reference DataFrame once per code.
county_fips = set(county_corresp_fips['FIPS'])
extra_industry_values = {fips for fips in industry_diff_cancer if fips not in county_fips}
print(extra_industry_values)

{2275, 2195, 2198, 2230, 2105}


### Joining datasets on "fips" column

#### Inner join: Only keep fips values that are in both datasets

In [52]:
# Inner join on 'fips'. Both tables repeat each FIPS code, so every cancer row of a
# county is paired with every industry row of that county.
df_inner = cancer_data.merge(industry_data, how='inner', on='fips')

In [53]:
# Distinct FIPS codes that survive the inner join
df_inner["fips"].nunique(dropna=False)

3133

#### Right join: Keep all "fips" values from the industry dataset

In [40]:
# Right join on 'fips': every industry row is kept, matched to cancer rows where available
df_right = cancer_data.merge(industry_data, how='right', on='fips')

In [41]:
# Distinct FIPS codes in the right-joined frame
df_right["fips"].nunique(dropna=False)

3140

#### Left join: Keep all "fips" values from the cancer dataset

In [42]:
# Left join on 'fips': every cancer row is kept, matched to industry rows where available
df_left = cancer_data.merge(industry_data, how='left', on='fips')

In [43]:
# Distinct FIPS codes in the left-joined frame
df_left["fips"].nunique(dropna=False)

3190

## Part 2: analysis

For now we'll work with the resulting inner_join data because it is the most complete

Names of environmental factors: ['Acid Rain', 'Energy', 'Freshwater Aquatic Ecotoxicity', 'Eutrophication', 'Food Waste', 'Global Climate Change', 'Hazardous Air Pollutants', 'Hazardous Waste', 'Human Health Cancer', 'Human Health Noncancer', 'Human Health - Respiratory Effects', 'Human Health Cancer and Noncancer', 'Employment', 'Land', 'Metals', 'Minerals and Metals', 'Municipal Solid Waste', 'Nonrenewable Energy', 'Ozone Depletion', 'Pesticides', 'Renewable Energy', 'Smog Formation', 'Value Added', 'Water']

In [44]:
# Columns of the merged frame: cancer columns first, then the industry columns.
df_inner.columns

Index(['locale', 'fips', 'annual_count_avg', 'areatype', 'cancer', 'stateFIPS',
       'state', 'cancer_description', 'county', 'name', 'industry_code',
       'industry_detail', 'relevant_naics', 'payann', 'estab', 'emp', 'ACID',
       'ENRG', 'ETOX', 'EUTR', 'FOOD', 'GCC', 'HAPS', 'HAZW', 'HC', 'HNC',
       'HRSP', 'HTOX', 'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN', 'OZON',
       'PEST', 'REN', 'SMOG', 'VADD', 'WATR'],
      dtype='object')

In [45]:
# Preview the first rows of the merged frame.
df_inner.head()

Unnamed: 0,locale,fips,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description,county,name,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,6.6e-05,0.0,3.210888,0.0,3.767853e-07,0.012181,0.275217,20.04119,654.873075,136.234634
1,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.047113,0.0,23.518978,435169.166702,0.001886537,6e-06,293448.484963,2400.143851,11995.153334,7473.552102
4,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# Which state FIPS codes are present in the merged frame.
df_inner['stateFIPS'].unique()

array([ 1,  2,  4,  5,  6,  8,  9, 10, 12, 13, 15, 16, 17, 18, 19, 20, 21,
       22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
       39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 53, 54, 55, 56],
      dtype=int64)

In [54]:
# Display the merged frame; pandas shows only the first and last rows.
df_inner

Unnamed: 0,locale,fips,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description,county,name,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,6.592757e-05,0.0,3.210888,0.000000,3.767853e-07,0.012181,0.275217,20.041190,654.873075,136.234634
1,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,4.711330e-02,0.0,23.518978,435169.166702,1.886537e-03,0.000006,293448.484963,2400.143851,11995.153334,7473.552102
4,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7030313,"Weston County(6,10)",56045,3 or fewer,county,516,56,wyoming,"Childhood (Ages <15, All Sites)",45,"Weston County, Wyoming",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
7030314,"Weston County(6,10)",56045,3 or fewer,county,516,56,wyoming,"Childhood (Ages <15, All Sites)",45,"Weston County, Wyoming",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
7030315,"Weston County(6,10)",56045,3 or fewer,county,516,56,wyoming,"Childhood (Ages <15, All Sites)",45,"Weston County, Wyoming",...,1.211386e-08,0.0,0.474359,0.000000,0.000000e+00,0.000000,0.000000,0.022683,50.497535,0.185040
7030316,"Weston County(6,10)",56045,3 or fewer,county,516,56,wyoming,"Childhood (Ages <15, All Sites)",45,"Weston County, Wyoming",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000


In [55]:
# Same '3 or fewer' -> '3' clean-up as applied to cancer_data, here on the merged frame.
# Series.replace only touches the matching cells, whereas np.where(cond, '3', column)
# would turn every value into a string if the column were numeric.
# NOTE(review): this is a no-op when cancer_data was already cleaned before the merge.
df_inner['annual_count_avg'] = df_inner['annual_count_avg'].replace('3 or fewer', '3')
df_inner

Unnamed: 0,locale,fips,annual_count_avg,areatype,cancer,stateFIPS,state,cancer_description,county,name,...,METL,MINE,MSW,NREN,OZON,PEST,REN,SMOG,VADD,WATR
0,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,6.592757e-05,0.0,3.210888,0.000000,3.767853e-07,0.012181,0.275217,20.041190,654.873075,136.234634
1,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
2,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
3,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,4.711330e-02,0.0,23.518978,435169.166702,1.886537e-03,0.000006,293448.484963,2400.143851,11995.153334,7473.552102
4,"Autauga County(6,10)",1001,304.0,county,1,1,alabama,All Cancer Sites,1,"Autauga County, Alabama",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7030313,"Weston County(6,10)",56045,3,county,516,56,wyoming,"Childhood (Ages <15, All Sites)",45,"Weston County, Wyoming",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
7030314,"Weston County(6,10)",56045,3,county,516,56,wyoming,"Childhood (Ages <15, All Sites)",45,"Weston County, Wyoming",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000
7030315,"Weston County(6,10)",56045,3,county,516,56,wyoming,"Childhood (Ages <15, All Sites)",45,"Weston County, Wyoming",...,1.211386e-08,0.0,0.474359,0.000000,0.000000e+00,0.000000,0.000000,0.022683,50.497535,0.185040
7030316,"Weston County(6,10)",56045,3,county,516,56,wyoming,"Childhood (Ages <15, All Sites)",45,"Weston County, Wyoming",...,0.000000e+00,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,0.000000


In [None]:
# The bare name `cancer` is never defined and raised NameError on Restart & Run All.
# NOTE(review): presumably the intent was to inspect the 'cancer' type codes — confirm.
df_inner['cancer'].unique()

In [56]:
# Indicator columns taken from the industry data, followed by the cancer case count
factor_columns = [
    'ACID', 'ENRG', 'ETOX', 'EUTR', 'FOOD', 'GCC',
    'HAPS', 'HAZW', 'HC', 'HNC', 'HRSP', 'HTOX',
    'JOBS', 'LAND', 'METL', 'MINE', 'MSW', 'NREN',
    'OZON', 'PEST', 'REN', 'SMOG', 'VADD', 'WATR',
    'annual_count_avg',
]
df_factors = df_inner[factor_columns]