### NOTE: This notebook is kept for example purposes only.  The 'real' query process has been moved to 'cancer_data_query.py'

In [1]:
import os
import re
import pathlib
import requests
import pandas as pd
pd.options.display.max_columns=999

from tqdm import tqdm, tqdm_notebook

from functools import partial

from cancer_data_query import CancerDataQuery, query_state_data

In [2]:
# Resolve the directory layout relative to the current working directory:
# the repository is the parent of the cwd and the project is its parent.
repo_dir = pathlib.Path.cwd().parent
project_dir = repo_dir.parent
raw_data_dir = repo_dir.joinpath('data_raw')

In [3]:
# Display the resolved raw-data directory to confirm the path.
raw_data_dir

PosixPath('/Users/merrelbook/Projects/HealthDataVizGA/DataVisualAnalytics_Industries-cancer/data_raw')

In [4]:
# Read the comma-separated list of state names shipped with the raw data.
with open(raw_data_dir / 'county_cancer_stats' / 'states.txt') as f:
    states_str = f.read()

# Normalise every name to the lower-case, space-free form used below for the
# per-state directory names and the `statename` URL parameter
# (e.g. "South Dakota" -> "southdakota").
# Splitting on ',' and stripping each piece (rather than splitting on ', ')
# keeps a trailing newline or irregular spacing in the file from leaking into
# a state name; empty pieces (e.g. from a trailing comma) are dropped.
states_list = [
    name.strip().lower().replace(' ', '')
    for name in states_str.split(',')
]
states_list = [name for name in states_list if name]

In [5]:
# Spot-check one normalised entry (index 40).
states_list[40]

'southdakota'

In [6]:
# cdq = CancerDataQuery(states_list[0])

In [7]:
# cdq.download_cancer_data('demographics')

In [8]:
#
# THE FOLLOWING IS NOW MULTITHREADED IN cancer_data_query.py
#

# for state in states_list[0:5]:
#     cdq = CancerDataQuery(state, data_dir=raw_data_dir)
#     for data_type in cdq.data_types:
#         cdq.download_cancer_data(data_type)

In [9]:
# cdq.data_keys

In [10]:
# url = "https://www.statecancerprofiles.cancer.gov/quick-profiles/index.php?statename=georgia"
# url = "https://www.statecancerprofiles.cancer.gov/quick-profiles/index.php?stateFIPS=13"

# Quick-profile page for the first state in the list; the links to the
# downloadable tables are extracted from its HTML in the following cells.
url = "https://www.statecancerprofiles.cancer.gov/quick-profiles/index.php?statename=" + states_list[0]


headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36',
    'Content-Type': 'text/html',
}

# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error status instead of silently scraping an error page.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
html = response.text

In [11]:
###
# Collect the href of every "icn icn-table" link found in the page HTML.
regex = 'class="icn icn-table" href="(.*?)"'
url_all = [found.group(1) for found in re.finditer(regex, html)]

In [12]:
###############
# Group the icn-table links by data-set type

# The data-set type is the path segment that follows the first '/' of each
# link (e.g. '/demographics/index.php?...' -> 'demographics').
data_types = list({link.split('/')[1] for link in url_all})

# Map every type to the links whose text contains that type name.
url_by_type = {
    kind: [link for link in url_all if kind in link]
    for kind in data_types
}

data_types

['prevalence', 'incidencerates', 'demographics', 'risk', 'deathrates']

In [13]:
# Preview the first five demographics table links.
url_by_type['demographics'][0:5]

['/demographics/index.php?stateFIPS=01&areatype=county&topic=crowd&demo=00027&race=00&sex=0&age=001&type=manyareacensus#results',
 '/demographics/index.php?stateFIPS=01&areatype=county&topic=ed&demo=00004&race=00&sex=0&age=081&type=manyareacensus#results',
 '/demographics/index.php?stateFIPS=01&areatype=county&topic=ed&demo=00005&race=00&sex=0&age=081&type=manyareacensus#results',
 '/demographics/index.php?stateFIPS=01&areatype=county&topic=ed&demo=00006&race=00&sex=0&age=081&type=manyareacensus#results',
 '/demographics/index.php?stateFIPS=01&areatype=county&topic=inc&demo=00010&race=00&sex=0&age=001&type=manyareacensus#results']

In [14]:
### OLD URL SPLITTING APPROACH

# url_deathrates = [u for u in url_all if 'deathrates' in u]
# url_risk = [u for u in url_all if 'risk' in u]
# url_prevalence = [u for u in url_all if 'prevalence' in u]
# url_demographics = [u for u in url_all if 'demographics' in u]
# url_incidencerates = [u for u in url_all if 'incidencerates' in u]

# for url in url_deathrates[:5]:
#     details = url.split('?')[1].split('&')
#     details = [d.split('#results')[0] for d in details]
#     print(details)

# for url in url_prevalence:
#     details = url.split('?')[1].split('&')
#     details = [d.split('#results')[0] for d in details]
#     print(details)

In [15]:
# raw_data_dir = pathlib.Path('/Users/merrelbook/Projects/HealthDataVizGA/raw_data/CDC_CancerIncidence-GAbyCounty')

# base_url = 'https://www.statecancerprofiles.cancer.gov'

# dl_text = '&sortVariableName=rate&sortOrder=desc&output=1'
# dl_url = base_url + url_incidencerates[0].replace('#results', dl_text)
# dl_url

In [16]:
# download_cancer_data('demographics', url_by_type)

In [17]:
# d.split['='][0]: d.split['='][1]

---
## Cleaning per data type

In [18]:
def make_meta_df(idx_max, subset_key):
    """Yield ``idx_max`` one-row DataFrames, each holding ``subset_key``.

    The n-th frame (n = 0 .. idx_max - 1) contains the values of
    ``subset_key`` laid out as a single row whose index label is ``n``, so
    concatenating the frames gives one copy of the key per data row.

    Args:
        idx_max: Number of frames to produce.
        subset_key: The record to repeat (a pandas Series in this notebook).

    Yields:
        A fresh single-row DataFrame for each index label.
    """
    def _row_for(label):
        # Series -> single column -> transposed into a single row.
        frame = pd.DataFrame(subset_key).T
        frame.index = [label]
        return frame

    yield from map(_row_for, range(idx_max))


def skip_to(fle, starters, **kwargs):
    '''Read a CSV whose header is preceded by an unknown number of lines.

    Lines are consumed from the top of ``fle`` until one starts with any of
    the ``starters`` prefixes; that line is treated as the CSV header and
    everything from it onwards is handed to ``pandas.read_csv``.

    Source: https://stackoverflow.com/questions/34028511/skipping-unknown-number-of-lines-to-read-the-header-python-pandas

    Args:
        fle: Path of the file to read.
        starters: Iterable of string prefixes that identify the header line.
        **kwargs: Passed through unchanged to ``pandas.read_csv``.

    Returns:
        The DataFrame parsed from the header line to the end of the file.

    Raises:
        ValueError: If the file is empty, or if no line starts with any of
            ``starters``.
    '''
    if os.stat(fle).st_size == 0:
        raise ValueError("File is empty")

    # str.startswith accepts a tuple of prefixes, so one call covers them all.
    prefixes = tuple(starters)

    with open(fle, errors='replace') as f:
        pos = 0
        cur_line = f.readline()

        while not cur_line.startswith(prefixes):
            # readline() returns '' at end of file; without this check a file
            # that has no header line would loop forever.
            if not cur_line:
                raise ValueError(
                    f"No line starting with any of {list(prefixes)} found in {fle}"
                )
            pos = f.tell()
            cur_line = f.readline()

        # Rewind to the start of the header line so read_csv sees it.
        f.seek(pos)

        return pd.read_csv(f, **kwargs)


def gen_subsets(df_key, raw_data_dir, cols_to_clean=None):
    """Yield one cleaned DataFrame per row of ``df_key``.

    Each row of ``df_key`` names a raw CSV (its ``file_name`` field) inside
    ``raw_data_dir``.  For every row the file is read from its header line
    (the first line starting with 'County', 'Parish' or 'Borough'), column
    names are stripped and lower-cased, rows without a ``fips`` value are
    dropped, and the key row is attached to every remaining data row as extra
    columns.

    Processing is best-effort: a file that cannot be handled (missing, empty,
    no header line, no ``fips`` column, no data rows left, ...) is reported on
    stdout and skipped.

    Args:
        df_key: DataFrame whose rows describe the raw files; must provide a
            ``file_name`` column.
        raw_data_dir: ``pathlib.Path`` of the directory holding the raw files.
        cols_to_clean: Optional full list of replacement column names, applied
            positionally to the joined frame (data columns followed by the key
            columns).  On a length mismatch the file name and the actual
            columns are printed and the frame is yielded with its original
            column names.

    Yields:
        The cleaned, metadata-annotated DataFrame for each usable file.
    """
    for idx, subset_key in tqdm(df_key.iterrows(), total=len(df_key)):

        try:
            df_subset = skip_to(raw_data_dir / subset_key.file_name, ['County', 'Parish', 'Borough'])

            df_subset.columns = [c.strip().lower() for c in df_subset.columns]

            # Renumber the surviving rows 0..n-1: the metadata frames built
            # below are labelled 0..n-1, so any gap left by dropna would
            # otherwise leave some rows without metadata after the join.
            df_subset = df_subset.dropna(subset=['fips']).reset_index(drop=True)

            # Join with metadata
            df_subset = df_subset.join(
                pd.concat(make_meta_df(len(df_subset), subset_key))
            )

            # Rename the columns
            if cols_to_clean is not None:
                try:
                    df_subset.columns = cols_to_clean
                except ValueError:
                    print(subset_key.file_name)
                    print('---')
                    print(df_subset.columns)
                    print('---')

        except Exception as err:
            # Deliberately broad (best-effort), but unlike a bare `except:` it
            # lets KeyboardInterrupt and GeneratorExit through, and it tells
            # the user which file was skipped and why.
            print(f'Skipping {getattr(subset_key, "file_name", idx)}: {err!r}')
        else:
            # Yield outside the try block so closing the generator (which
            # raises GeneratorExit at the yield) is never intercepted.
            yield df_subset

In [23]:
# Pick one state (index 9) for the exploratory cells that read the global `state`.
state = states_list[9]

---
### Process Death Rate Data

Processing Steps
1. Load Each Pre-downloaded data
2. Clean (rename) columns
3. Join metadata to each row of subset
4. Combine datasets
5. Drop all rows with NaN
6. Change areatype for state and country entries

In [24]:
# Root directory of the downloaded per-state cancer data.
cancer_data_dir = raw_data_dir / 'CDC_CancerByCounty'

# Sanity check: has the 'ohio' sub-directory been downloaded?
(cancer_data_dir/'ohio').exists()

True

In [114]:
# Exploratory cell: inspect the DATA_KEY rows that belong to one death-rate subset.

# Row-index ranges of DATA_KEY.csv that make up each subset.
subset_key_dict = {
    'cancer_by_type': range(0,22),
    'cancer_by_race': range(22,30),
    'cancer_by_sex': range(30,33),
    'cancer_by_age': range(33,38),
    'cancer_all': range(38,49),
}


data_type = 'deathrates'

# Set the data path
cancer_data_dir = raw_data_dir / 'CDC_CancerByCounty'
this_data_dir = cancer_data_dir / state / data_type

# Load the data key
df_key = pd.read_csv(this_data_dir / 'DATA_KEY.csv')
# df_key

# Show only the key rows whose index falls inside the chosen subset's range.
subset_range = subset_key_dict['cancer_all']
df_key[df_key.index.isin(subset_range)]

Unnamed: 0,age,areatype,cancer,file_name,race,sex,source_url,stateFIPS,type
38,1,county,1,deathrates_038.csv,0,0,https://www.statecancerprofiles.cancer.gov/dea...,56,death


In [116]:
def process_deathrates_raw(state):
    """Combine one state's raw death-rate CSVs into one cleaned CSV per subset.

    Reads ``DATA_KEY.csv`` from
    ``raw_data_dir / 'CDC_CancerByCounty' / state / 'deathrates'`` (using the
    notebook-level ``raw_data_dir``), and for every subset in
    ``subset_key_dict`` loads the files listed in the matching key rows,
    renames the columns, concatenates the results, relabels the state-level
    and national rows, and writes ``<subset_name>-deathrates.csv`` into the
    same directory.

    Args:
        state: Name of the state's data directory (e.g. ``'alabama'``,
            ``'southdakota'``).
    """
    # Row-index ranges of DATA_KEY.csv that make up each subset.
    subset_key_dict = {
        'cancer_by_type': range(0,22),
        'cancer_by_race': range(22,30),
        'cancer_by_sex': range(30,33),
        'cancer_by_age': range(33,38),
        'cancer_all': range(38,49),
    }

    data_type = 'deathrates'

    # Set the data path
    cancer_data_dir = raw_data_dir / 'CDC_CancerByCounty'
    this_data_dir = cancer_data_dir / state / data_type

    # Load the data key
    df_key = pd.read_csv(this_data_dir / 'DATA_KEY.csv')

    # Replacement column names, applied positionally by gen_subsets: the data
    # columns first, then the key columns joined onto every row.
    # NOTE(review): the key-column part must match the column order of
    # DATA_KEY.csv -- confirm against the current key files.
    cols_to_clean = ['locale', 'fips', 'met_health_obj',
                     'deathrate_per_100000', 'deathrate_lower_95_confidence', 'deathrate_upper_95_confidence',
                     'annual_count_avg', 'recent_trend_str',
                     'trend_last_5', 'trend_last_5_lower_95_confidence', 'trend_last_5_upper_95_confidence',
                     'stateFIPS', 'areatype', 'cancer', 'race', 'sex', 'age', 'type','file_name', 'source_url']

    # State names in this notebook are lower-cased with spaces removed, so
    # str.title(state) cannot reproduce a multi-word label such as
    # "South Dakota".  Compare space-free, lower-case forms instead.
    state_token = state.replace(' ', '').lower()

    for subset_name, subset_range in subset_key_dict.items():

        print(f'Processing data for:\t{subset_name}')

        subset_key = df_key[df_key.index.isin(subset_range)]

        frames = list(gen_subsets(subset_key, this_data_dir, cols_to_clean))
        if not frames:
            # pd.concat raises on an empty sequence; skip this subset rather
            # than aborting the remaining ones.
            print(f'No usable files for {subset_name} in {state}; nothing written')
            continue

        df = pd.concat(frames)

        # Change areatype for state and country entries
        locale_token = df.locale.astype(str).str.replace(' ', '', regex=False).str.lower()
        df.loc[locale_token == state_token, 'areatype'] = "state"
        df.loc[df.locale == 'United States', 'areatype'] = "country"

        # Save out the cleaned and joined dataset
        df.to_csv(this_data_dir / f'{subset_name}-deathrates.csv',
                  index=False
                 )

        print(f'Data Joined and cleaned for data type {data_type} in {state}')

In [117]:
# Process the death-rate data for the first three states only.
# Note: the loop variable rebinds the notebook-level `state`.
for state in states_list[0:3]:
    process_deathrates_raw(state)

  5%|▍         | 1/22 [00:00<00:02,  8.30it/s]

Processing data for:	cancer_by_type


100%|██████████| 22/22 [00:02<00:00,  8.26it/s]
 12%|█▎        | 1/8 [00:00<00:00,  8.50it/s]

Data Joined and cleaned for data type deathrates in alabama
Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:00<00:00,  8.13it/s]
 33%|███▎      | 1/3 [00:00<00:00,  8.30it/s]

Data Joined and cleaned for data type deathrates in alabama
Processing data for:	cancer_by_sex


100%|██████████| 3/3 [00:00<00:00,  8.22it/s]
 20%|██        | 1/5 [00:00<00:00,  7.93it/s]

Data Joined and cleaned for data type deathrates in alabama
Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00,  8.01it/s]
100%|██████████| 1/1 [00:00<00:00,  8.17it/s]
  0%|          | 0/22 [00:00<?, ?it/s]

Data Joined and cleaned for data type deathrates in alabama
Processing data for:	cancer_all
Data Joined and cleaned for data type deathrates in alabama
Processing data for:	cancer_by_type


100%|██████████| 22/22 [00:01<00:00, 17.43it/s]
 25%|██▌       | 2/8 [00:00<00:00, 17.20it/s]

Data Joined and cleaned for data type deathrates in alaska
Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:00<00:00, 17.07it/s]
100%|██████████| 3/3 [00:00<00:00, 17.71it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Data Joined and cleaned for data type deathrates in alaska
Processing data for:	cancer_by_sex
Data Joined and cleaned for data type deathrates in alaska
Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00, 17.83it/s]
100%|██████████| 1/1 [00:00<00:00, 17.45it/s]
 14%|█▎        | 3/22 [00:00<00:00, 28.63it/s]

Data Joined and cleaned for data type deathrates in alaska
Processing data for:	cancer_all
Data Joined and cleaned for data type deathrates in alaska
Processing data for:	cancer_by_type


100%|██████████| 22/22 [00:00<00:00, 27.39it/s]
 38%|███▊      | 3/8 [00:00<00:00, 27.49it/s]

Data Joined and cleaned for data type deathrates in arizona
Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:00<00:00, 28.12it/s]
100%|██████████| 3/3 [00:00<00:00, 27.99it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Data Joined and cleaned for data type deathrates in arizona
Processing data for:	cancer_by_sex
Data Joined and cleaned for data type deathrates in arizona
Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00, 26.56it/s]
100%|██████████| 1/1 [00:00<00:00, 25.39it/s]

Data Joined and cleaned for data type deathrates in arizona
Processing data for:	cancer_all
Data Joined and cleaned for data type deathrates in arizona





---
### Process Incidence Data

Processing Steps
1. Load Each Pre-downloaded data
2. Drop Duplicates from DATA KEY
3. Split data key to process ALL STAGE and LATE STAGE data sets separately (different column names)
4. Drop all rows with NaN
5. Clean (rename) columns
6. Join metadata to each row of subset
7. Combine data subsets
8. Re-join ALL STAGE and LATE STAGE data sets

In [66]:
#
# PROCESS INCIDENCE RATE DATA
#

# Row-index ranges of the incidence-rate DATA_KEY.csv that make up each subset.
# 'cancer_latestage_by_type' is kept separate from the other subsets.
subset_key_dict = {
    'cancer_by_type': range(0,23),
    'cancer_by_race': range(23,31),
    'cancer_by_sex': range(31,34),
    'cancer_by_age': range(34,39),
    'cancer_all': range(39,40),
    'cancer_latestage_by_type': range(40,59),
}

def process_incidencerates_raw(state):
    """Combine one state's raw incidence-rate CSVs into one cleaned CSV per subset.

    Reads ``DATA_KEY.csv`` from
    ``raw_data_dir / 'CDC_CancerByCounty' / state / 'incidencerates'`` (using
    the notebook-level ``raw_data_dir``).  For every subset the matching key
    rows are de-duplicated (ignoring ``file_name``), the listed files are
    loaded and renamed, the results are concatenated, the state-level and
    national rows are relabelled, and ``<subset_name>-incidencerates.csv`` is
    written into the same directory.

    The late-stage subset has a different set of data columns from the
    all-stage subsets, so it gets its own list of column names.

    Args:
        state: Name of the state's data directory (e.g. ``'alabama'``,
            ``'southdakota'``).
    """
    # Kept local (as in the death-rate and demographics functions) so the
    # ranges cannot be replaced by another cell rebinding a notebook-level
    # `subset_key_dict` before this function is called.
    subset_key_dict = {
        'cancer_by_type': range(0,23),
        'cancer_by_race': range(23,31),
        'cancer_by_sex': range(31,34),
        'cancer_by_age': range(34,39),
        'cancer_all': range(39,40),
        'cancer_latestage_by_type': range(40,59),
    }

    data_type = 'incidencerates'

    # Set the data path
    cancer_data_dir = raw_data_dir / 'CDC_CancerByCounty'
    this_data_dir = cancer_data_dir / state / data_type

    # Load the data key
    df_key = pd.read_csv(this_data_dir / 'DATA_KEY.csv')

    # Replacement column names, applied positionally by gen_subsets.
    leading_cols = ['locale', 'fips', 'met_health_obj',
                    'incidence rate_per_100000', 'incidence rate_lower_95_confidence', 'incidence rate_upper_95_confidence',
                    'annual_count_avg']
    key_cols = ['age', 'areatype', 'cancer', 'file_name', 'race', 'sex', 'source_url',
                'stage', 'stateFIPS', 'type']
    all_stage_cols = (leading_cols
                      + ['recent_trend_str',
                         'trend_last_5', 'trend_last_5_lower_95_confidence', 'trend_last_5_upper_95_confidence']
                      + key_cols)
    late_stage_cols = leading_cols + ['late_stage_%'] + key_cols

    # Space-free, lower-case form of the state name used for matching below.
    state_token = state.replace(' ', '').lower()

    for subset_name, subset_range in subset_key_dict.items():

        print(f'Processing data for:\t{subset_name}')

        subset_key = df_key[df_key.index.isin(subset_range)]

        # Drop all the duplicates from data key
        subset_key = subset_key.drop_duplicates(
            subset=[c for c in df_key.columns if 'file_name' not in c]
            # Ignore the file_name column for deduplication
        )

        # Must split 'all stage' and 'late stage' data sets due to difference in columns
        if subset_name != 'cancer_latestage_by_type':
            cols_to_clean = all_stage_cols
        else:
            cols_to_clean = late_stage_cols

        frames = list(gen_subsets(subset_key, this_data_dir, cols_to_clean=cols_to_clean))
        if not frames:
            # pd.concat raises on an empty sequence; skip this subset rather
            # than aborting the remaining ones.
            print(f'No usable files for {subset_name} in {state}; nothing written')
            continue

        df = pd.concat(frames, sort=False)

        # Relabel state and US level data.
        # A substring test against str.title(state) never matches multi-word
        # states ('southdakota' -> 'Southdakota') and also hits counties named
        # after their state (e.g. "Arkansas County").  Compare the whole name
        # instead, after removing any parenthesised suffix and all whitespace.
        # NOTE(review): assumes locale labels may carry a footnote suffix such
        # as "(6,10)", inferred from the national label used below -- confirm.
        locale_text = df.locale.astype(str)
        locale_token = (locale_text
                        .str.replace(r'\(.*$', '', regex=True)
                        .str.replace(r'\s+', '', regex=True)
                        .str.lower())
        df.loc[locale_token == state_token, 'areatype'] = "state"
        # Prefix match still covers 'US (SEER+NPCR)(1,10)' but no longer
        # depends on the exact footnote numbers.
        df.loc[locale_text.str.startswith('US (SEER+NPCR)'), 'areatype'] = "country"

        # Save out the cleaned and joined dataset
        df.to_csv(this_data_dir / f'{subset_name}-incidencerates.csv',
                  index=False
                 )

In [64]:
# process_incidencerates_raw('alabama')

In [65]:
# Process the incidence-rate data for the states from index 40 to the end of the list.
# Note: the loop variable rebinds the notebook-level `state`.
for state in states_list[40:]:
    process_incidencerates_raw(state)

  4%|▍         | 1/23 [00:00<00:02,  7.94it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:02<00:00,  8.01it/s]
 12%|█▎        | 1/8 [00:00<00:00,  8.52it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:00<00:00, 10.86it/s]
 33%|███▎      | 1/3 [00:00<00:00,  7.98it/s]

Processing data for:	cancer_by_sex


100%|██████████| 3/3 [00:00<00:00,  8.01it/s]
 20%|██        | 1/5 [00:00<00:00,  8.19it/s]

Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00,  7.92it/s]
100%|██████████| 1/1 [00:00<00:00,  8.16it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

Processing data for:	cancer_all
Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:02<00:00,  8.23it/s]
  4%|▍         | 1/23 [00:00<00:03,  5.90it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:03<00:00,  5.79it/s]
 12%|█▎        | 1/8 [00:00<00:01,  5.83it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:01<00:00,  5.70it/s]
 33%|███▎      | 1/3 [00:00<00:00,  6.01it/s]

Processing data for:	cancer_by_sex


100%|██████████| 3/3 [00:00<00:00,  5.94it/s]
 20%|██        | 1/5 [00:00<00:00,  6.08it/s]

Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00,  5.87it/s]
100%|██████████| 1/1 [00:00<00:00,  5.93it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

Processing data for:	cancer_all
Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:03<00:00,  5.65it/s]
  0%|          | 0/23 [00:00<?, ?it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:10<00:00,  2.29it/s]
  0%|          | 0/8 [00:00<?, ?it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:03<00:00,  2.22it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

Processing data for:	cancer_by_sex


100%|██████████| 3/3 [00:01<00:00,  2.18it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:02<00:00,  2.18it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Processing data for:	cancer_all


100%|██████████| 1/1 [00:00<00:00,  2.14it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:08<00:00,  2.19it/s]
  9%|▊         | 2/23 [00:00<00:01, 16.70it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:01<00:00, 16.40it/s]
 25%|██▌       | 2/8 [00:00<00:00, 16.42it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:00<00:00, 15.88it/s]
100%|██████████| 3/3 [00:00<00:00, 16.85it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Processing data for:	cancer_by_sex
Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00, 16.31it/s]
100%|██████████| 1/1 [00:00<00:00, 16.16it/s]
 11%|█         | 2/19 [00:00<00:00, 17.92it/s]

Processing data for:	cancer_all
Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:01<00:00, 16.54it/s]
 13%|█▎        | 3/23 [00:00<00:00, 26.70it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:00<00:00, 27.55it/s]
 38%|███▊      | 3/8 [00:00<00:00, 29.26it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:00<00:00, 28.56it/s]
100%|██████████| 3/3 [00:00<00:00, 29.22it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Processing data for:	cancer_by_sex
Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00, 27.30it/s]
100%|██████████| 1/1 [00:00<00:00, 26.60it/s]
 16%|█▌        | 3/19 [00:00<00:00, 29.26it/s]

Processing data for:	cancer_all
Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:00<00:00, 28.85it/s]
  0%|          | 0/23 [00:00<?, ?it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:05<00:00,  4.20it/s]
  0%|          | 0/8 [00:00<?, ?it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:01<00:00,  4.11it/s]
  0%|          | 0/3 [00:00<?, ?it/s]

Processing data for:	cancer_by_sex


100%|██████████| 3/3 [00:00<00:00,  4.25it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:01<00:00,  4.20it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Processing data for:	cancer_all


100%|██████████| 1/1 [00:00<00:00,  4.12it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:04<00:00,  4.13it/s]
  9%|▊         | 2/23 [00:00<00:01, 12.51it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:01<00:00, 12.88it/s]
 25%|██▌       | 2/8 [00:00<00:00, 12.15it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:00<00:00, 12.34it/s]
 67%|██████▋   | 2/3 [00:00<00:00, 12.87it/s]

Processing data for:	cancer_by_sex


100%|██████████| 3/3 [00:00<00:00, 12.47it/s]
 40%|████      | 2/5 [00:00<00:00, 11.90it/s]

Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00, 11.97it/s]
100%|██████████| 1/1 [00:00<00:00, 12.50it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

Processing data for:	cancer_all
Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:01<00:00, 12.61it/s]
  4%|▍         | 1/23 [00:00<00:02,  8.87it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:02<00:00,  9.25it/s]
 12%|█▎        | 1/8 [00:00<00:00,  7.83it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:00<00:00,  8.81it/s]
 33%|███▎      | 1/3 [00:00<00:00,  9.94it/s]

Processing data for:	cancer_by_sex


100%|██████████| 3/3 [00:00<00:00,  9.53it/s]
 20%|██        | 1/5 [00:00<00:00,  9.24it/s]

Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00,  9.70it/s]
100%|██████████| 1/1 [00:00<00:00,  9.27it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

Processing data for:	cancer_all
Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:01<00:00,  9.79it/s]
  4%|▍         | 1/23 [00:00<00:02,  7.58it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:03<00:00,  7.20it/s]
 12%|█▎        | 1/8 [00:00<00:01,  6.94it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:01<00:00,  6.78it/s]
 33%|███▎      | 1/3 [00:00<00:00,  6.51it/s]

Processing data for:	cancer_by_sex


100%|██████████| 3/3 [00:00<00:00,  6.74it/s]
 20%|██        | 1/5 [00:00<00:00,  6.72it/s]

Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00,  6.65it/s]
100%|██████████| 1/1 [00:00<00:00,  6.46it/s]
  0%|          | 0/19 [00:00<?, ?it/s]

Processing data for:	cancer_all
Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:02<00:00,  6.81it/s]
  9%|▊         | 2/23 [00:00<00:01, 19.49it/s]

Processing data for:	cancer_by_type


100%|██████████| 23/23 [00:01<00:00, 19.48it/s]
 25%|██▌       | 2/8 [00:00<00:00, 19.54it/s]

Processing data for:	cancer_by_race


100%|██████████| 8/8 [00:00<00:00, 19.98it/s]
100%|██████████| 3/3 [00:00<00:00, 20.04it/s]
  0%|          | 0/5 [00:00<?, ?it/s]

Processing data for:	cancer_by_sex
Processing data for:	cancer_by_age


100%|██████████| 5/5 [00:00<00:00, 20.66it/s]
100%|██████████| 1/1 [00:00<00:00, 19.03it/s]
 11%|█         | 2/19 [00:00<00:00, 19.93it/s]

Processing data for:	cancer_all
Processing data for:	cancer_latestage_by_type


100%|██████████| 19/19 [00:00<00:00, 20.30it/s]


In [108]:
# ###################################
# Queries to Split incidence datasets

# # Split out the data by stage keys
# df[~df.stage.isna()]

# # Split out the data by age keys
# criterion = ((df.age!=1) & (df.cancer==1))

# # Split out the data by sex keys
# criterion = ((df.sex!=0) & (df.cancer==1))

# # Split out the data by race keys
# criterion = ((df.race!=0))

# # Split out the data by cancer keys
# criterion = ((df.cancer!=1))

# # All cancer, race, sex, age, stage
# df_key_by_all = df_key

In [127]:
# Exploratory cell: inspect the DATA_KEY rows that belong to one demographics subset.
# NOTE: this rebinds the notebook-level `subset_key_dict`; any code that reads
# that global at call time will see these demographics ranges from here on.

# Row-index ranges of the demographics DATA_KEY.csv that make up each subset.
subset_key_dict = {
    'crowding': range(0,1),
    'education': range(1,4),
    'income': range(4,6),
    'uninsured': range(6,7),
    'language_isloation': range(7,8),
    'mobility': range(8,13),
    'age': range(13,19),
    'race': range(19,26),
    'sex': range(26,28),
    'poverty': range(28,31),
    'unemployed': range(31,32),
}


data_type = 'demographics'

# Set the data path
cancer_data_dir = raw_data_dir / 'CDC_CancerByCounty'
this_data_dir = cancer_data_dir / state / data_type

# Load the data key
df_key = pd.read_csv(this_data_dir / 'DATA_KEY.csv')
# df_key

# Show only the key rows whose index falls inside the chosen subset's range.
subset_range = subset_key_dict['poverty']
df_key[df_key.index.isin(subset_range)]

Unnamed: 0,age,areatype,demo,file_name,race,sex,source_url,stateFIPS,topic,type
28,1,county,7,demographics_028.csv,0,0,https://www.statecancerprofiles.cancer.gov/dem...,4,pov,manyareacensus
29,1,county,8,demographics_029.csv,0,0,https://www.statecancerprofiles.cancer.gov/dem...,4,pov,manyareacensus
30,1,county,9,demographics_030.csv,0,0,https://www.statecancerprofiles.cancer.gov/dem...,4,pov,manyareacensus


---
### Process Demographic Data

Processing Steps
1. Load Each Pre-downloaded data
2. Drop Duplicates from DATA KEY

In [130]:
#
# PROCESS DEMOGRAPHICS DATA
#

def process_demographics_raw(state):
    """Combine one state's raw demographics CSVs into one cleaned CSV per subset.

    Reads ``DATA_KEY.csv`` from
    ``raw_data_dir / 'CDC_CancerByCounty' / state / 'demographics'`` (using
    the notebook-level ``raw_data_dir``).  For every subset the matching key
    rows are de-duplicated (ignoring ``file_name``), the listed files are
    loaded and concatenated, the area-name column is renamed to ``locale``,
    the state-level and national rows are relabelled, and
    ``<subset_name>-demographics.csv`` is written into the same directory.

    Args:
        state: Name of the state's data directory (e.g. ``'alabama'``,
            ``'southdakota'``).
    """
    # Row-index ranges of DATA_KEY.csv that make up each subset.
    # NOTE(review): 'language_isloation' is misspelled, but the key is part of
    # the output file name, so it is left unchanged here.
    subset_key_dict = {
        'crowding': range(0,1),
        'education': range(1,4),
        'income': range(4,6),
        'uninsured': range(6,7),
        'language_isloation': range(7,8),
        'mobility': range(8,13),
        'age': range(13,19),
        'race': range(19,26),
        'sex': range(26,28),
        'poverty': range(28,31),
        'unemployed': range(31,32),
    }

    data_type = 'demographics'

    # Set the data path
    cancer_data_dir = raw_data_dir / 'CDC_CancerByCounty'
    this_data_dir = cancer_data_dir / state / data_type

    # Load the data key
    df_key = pd.read_csv(this_data_dir / 'DATA_KEY.csv')

    # Space-free, lower-case form of the state name used for matching below.
    state_token = state.replace(' ', '').lower()

    for subset_name, subset_range in subset_key_dict.items():

        print(f'Processing data for:\t{subset_name}')

        subset_key = df_key[df_key.index.isin(subset_range)]

        # Drop all the duplicates from data key
        subset_key = subset_key.drop_duplicates(
            subset=[c for c in df_key.columns if 'file_name' not in c]
            # Ignore the file_name column for deduplication
        )

        # NOTE(review): message text looks carried over from the incidence
        # workflow; kept as-is because it is runtime output.
        print('Processing data for all cancer stages')

        frames = list(gen_subsets(subset_key, this_data_dir, cols_to_clean=None))
        if not frames:
            # pd.concat raises on an empty sequence; skip this subset rather
            # than aborting the remaining ones.
            print(f'No usable files for {subset_name} in {state}; nothing written')
            continue

        df_demographics = pd.concat(frames, sort=False).reset_index(drop=True)

        # The area-name column is called differently depending on the state.
        df_demographics.rename(columns={
            'county': 'locale',
            'borough or census area': 'locale',
            'parish': 'locale'
        },
                               inplace=True)

        # Change areatype for state and country entries.
        # A substring test against str.title(state) never matches multi-word
        # states ('southdakota' -> 'Southdakota') and also hits counties named
        # after their state (e.g. "Arkansas County").  Compare the whole name
        # instead, after removing any parenthesised suffix and all whitespace.
        locale_token = (df_demographics.locale.astype(str)
                        .str.replace(r'\(.*$', '', regex=True)
                        .str.replace(r'\s+', '', regex=True)
                        .str.lower())
        df_demographics.loc[locale_token == state_token, 'areatype'] = "state"
        df_demographics.loc[df_demographics.locale == 'United States', 'areatype'] = "country"

        # Save out the cleaned and joined dataset.
        # (Previously this wrote the unrelated name `df` instead of the frame
        # built above.)
        df_demographics.to_csv(this_data_dir / f'{subset_name}-demographics.csv',
                               index=False
                              )

In [12]:
# Process the demographics data for every state in the list.
# Note: the loop variable rebinds the notebook-level `state`.
for state in states_list:
    process_demographics_raw(state)

  3%|▎         | 1/32 [00:00<00:03,  8.90it/s]

Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  8.35it/s]
  6%|▋         | 2/32 [00:00<00:01, 18.87it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 18.41it/s]
  9%|▉         | 3/32 [00:00<00:00, 29.04it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 28.26it/s]
  3%|▎         | 1/32 [00:00<00:04,  7.43it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:04<00:00,  7.35it/s]
  3%|▎         | 1/32 [00:00<00:03,  9.57it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  9.79it/s]
  3%|▎         | 1/32 [00:00<00:03,  8.47it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  8.68it/s]
 16%|█▌        | 5/32 [00:00<00:00, 42.61it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:00<00:00, 42.26it/s]
 22%|██▏       | 7/32 [00:00<00:00, 64.22it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:00<00:00, 66.69it/s]
  3%|▎         | 1/32 [00:00<00:03,  8.20it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  8.38it/s]
  0%|          | 0/32 [00:00<?, ?it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:08<00:00,  3.85it/s]
 19%|█▉        | 6/32 [00:00<00:00, 53.57it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:00<00:00, 55.79it/s]
  6%|▋         | 2/32 [00:00<00:02, 11.88it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:02<00:00, 12.26it/s]
  3%|▎         | 1/32 [00:00<00:05,  5.57it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:05<00:00,  5.81it/s]
  3%|▎         | 1/32 [00:00<00:04,  6.20it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:05<00:00,  5.97it/s]
  3%|▎         | 1/32 [00:00<00:05,  5.91it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:05<00:00,  5.81it/s]
  3%|▎         | 1/32 [00:00<00:05,  5.65it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:05<00:00,  5.60it/s]
  3%|▎         | 1/32 [00:00<00:05,  5.20it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:06<00:00,  4.96it/s]
  3%|▎         | 1/32 [00:00<00:03,  8.65it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  8.77it/s]
  9%|▉         | 3/32 [00:00<00:01, 28.61it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 27.31it/s]
  9%|▉         | 3/32 [00:00<00:01, 20.61it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 20.65it/s]
  9%|▉         | 3/32 [00:00<00:00, 29.52it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 30.00it/s]
  3%|▎         | 1/32 [00:00<00:04,  6.71it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:04<00:00,  6.85it/s]
  3%|▎         | 1/32 [00:00<00:04,  6.51it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:04<00:00,  6.63it/s]
  3%|▎         | 1/32 [00:00<00:04,  6.35it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:04<00:00,  6.92it/s]
  3%|▎         | 1/32 [00:00<00:05,  5.21it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:06<00:00,  5.23it/s]
  3%|▎         | 1/32 [00:00<00:03,  9.39it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  9.95it/s]
  3%|▎         | 1/32 [00:00<00:05,  6.14it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:05<00:00,  6.27it/s]
  9%|▉         | 3/32 [00:00<00:01, 25.26it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 26.06it/s]
 12%|█▎        | 4/32 [00:00<00:00, 38.91it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:00<00:00, 37.30it/s]
  9%|▉         | 3/32 [00:00<00:01, 22.98it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 22.59it/s]
  6%|▋         | 2/32 [00:00<00:01, 15.56it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:02<00:00, 15.47it/s]
  3%|▎         | 1/32 [00:00<00:03,  8.41it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  8.86it/s]
  3%|▎         | 1/32 [00:00<00:05,  5.36it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:05<00:00,  5.84it/s]
  0%|          | 0/32 [00:00<?, ?it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00, 10.46it/s]
  3%|▎         | 1/32 [00:00<00:04,  6.27it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:04<00:00,  6.44it/s]
  3%|▎         | 1/32 [00:00<00:04,  7.29it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:04<00:00,  7.41it/s]
  6%|▋         | 2/32 [00:00<00:02, 14.44it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:02<00:00, 14.66it/s]
  3%|▎         | 1/32 [00:00<00:03,  8.49it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  8.09it/s]
 16%|█▌        | 5/32 [00:00<00:00, 49.27it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:00<00:00, 50.76it/s]
  6%|▋         | 2/32 [00:00<00:02, 11.24it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:02<00:00, 11.38it/s]
  3%|▎         | 1/32 [00:00<00:03,  8.59it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  8.41it/s]
  3%|▎         | 1/32 [00:00<00:05,  5.53it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:05<00:00,  5.98it/s]
  0%|          | 0/32 [00:00<?, ?it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:14<00:00,  2.27it/s]
  6%|▋         | 2/32 [00:00<00:01, 15.56it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 16.80it/s]
  9%|▉         | 3/32 [00:00<00:00, 29.23it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 30.39it/s]
  0%|          | 0/32 [00:00<?, ?it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:07<00:00,  4.39it/s]
  6%|▋         | 2/32 [00:00<00:02, 13.72it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:02<00:00, 13.39it/s]
  3%|▎         | 1/32 [00:00<00:03,  9.38it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:03<00:00,  9.78it/s]
  3%|▎         | 1/32 [00:00<00:04,  7.68it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:04<00:00,  7.58it/s]
  6%|▋         | 2/32 [00:00<00:01, 19.52it/s]

Data Joined and cleaned for data type:
 -> demographics
Processing data for all cancer stages


100%|██████████| 32/32 [00:01<00:00, 20.35it/s]


Data Joined and cleaned for data type:
 -> demographics


In [132]:
# df_key.head(1)
# Display the current value of `state`.
state

'arizona'

In [14]:
# df_demographics.columns

In [15]:
# print('Processing data for all cancer stages')


# # df_demographics = pd.concat(
# #     gen_subsets(df_key, raw_data_dir, cols_to_clean=None),
# #     sort=False
# # ).reset_index(drop=True)

# df_demographics.rename(columns={'county': 'locale'}, inplace=True)

# # # Change areatype for state and country entries
# df_demographics.loc[df_demographics.locale == 'Georgia', 'areatype'] = "state"
# df_demographics.loc[df_demographics.locale == 'United States', 'areatype'] = "country"

# df_demographics.to_csv(project_data_dir / 'raw_data' / 'CDC_CancerIncidence-GAbyCounty' / data_type / 'data_join_clean.csv',
#           index=False
#          )
# print(f'Data Joined and cleaned for data type:\n -> {data_type}')
# df_demographics.head()

In [41]:
# Row filters for splitting `df_demographics` into its sub-datasets.
# Each filter keeps the rows in which at least one of the sub-dataset's
# measure columns is populated.
# NOTE: every assignment below rebinds `criterion`, so when the cell is run
# top to bottom only the final (Workforce/Unemployment) filter remains bound.

# Subdataset 1 - Demographics: Crowding
# - Also include 'value (percent)', 'rank within us'
criterion = df_demographics['households (with >1 person per room)'].notna()

# Subdataset 2 - Demographics: Education
# - Also include 'value (percent)', 'rank within us'
# NOTE(review): the second column name has no space before '(' unlike its
# siblings; kept exactly as written -- confirm against df_demographics.columns.
criterion = df_demographics[[
    'people (education: less than 9th grade)',
    'people(education: less than high school)',
    'people (education: at least bachelors degree)',
]].notna().any(axis=1)

# Subdataset 3 - Demographics: Income
# - Split Further by
#   - demo=10 --> Median family income, 2013-2017
#   - demo=11 --> Median household income, 2013-2017
criterion = df_demographics["value (dollars)"].notna()

# Subdataset 4 - Demographics: Insurance
# - Also include 'value (percent)', 'rank within us'
# - Percent uninsured in demographic group, people at or below 138% of poverty, 2017, Ages <65
criterion = df_demographics['people (uninsured)'].notna()

# Subdataset 5 - Demographics: Language
# - Also include 'value (percent)', 'rank within us'
# - Language isolation, 2013-2017:  https://www.statecancerprofiles.cancer.gov/dictionary.php#non-english
criterion = df_demographics['households (language isolation)'].notna()

# Subdataset 6 - Demographics: Mobility
# - Also include 'value (percent)', 'rank within us'
criterion = df_demographics[[
    "people (haven't moved)",
    "people (moved within county)",
    "people (moved from different county in same state)",
    "people (moved from different state)",
    "people (moved from outside us)",
]].notna().any(axis=1)


# Subdataset 7 - Demographics: Population - Ages
# - Also include 'value (percent)', 'rank within us'
criterion = df_demographics[[
    "people (age under 18)",
    "people (age 18-39)",
    "people (age 40-64)",
    "people (age 40 and over)",
    "people (age 50 and over)",
    "people (age 65 and over)",
]].notna().any(axis=1)


# Subdataset 8 - Demographics: Population - Race/Ethnicity
# - Also include 'value (percent)', 'rank within us'
criterion = df_demographics[[
    "people (ai/an)",
    "people (api)",
    "people (black)",
    "people (foreign born)",
    "people (hispanic)",
    "people (non-hispanic [origin recode])",
    "people (white)",
]].notna().any(axis=1)


# Subdataset 9 - Demographics: Population - Sex
# - Also include 'value (percent)', 'rank within us'
criterion = df_demographics[[
    'people (male)',
    'people (female)',
]].notna().any(axis=1)


# Subdataset 10 - Demographics: Poverty
# - Also include 'value (percent)', 'rank within us'
criterion = df_demographics[[
    'families (below poverty)',
    'people (below poverty)',
    'people (<150% of poverty)',
]].notna().any(axis=1)

# Subdataset 11 - Demographics: Workforce/Unemployment
# - Also include 'value (percent)', 'rank within us'
criterion = df_demographics['people (unemployed)'].notna()

---
### Join all data per type across all states

In [16]:
# Folder of downloaded CDC data; its entries are iterated as per-state
# directories by `combine_states` below.
cancer_data_dir = raw_data_dir.joinpath('CDC_CancerByCounty')

cancer_data_dir

PosixPath('/Users/merrelbook/Projects/HealthDataVizGA/DataVisualAnalytics_Industries-cancer/data_raw/CDC_CancerByCounty')

In [68]:
def combine_states(cancer_data_dir, data_type, data_subset):
    """Yield one incidence-rate DataFrame per state directory.

    Every entry of ``cancer_data_dir`` whose name does not start with a dot
    is treated as a state directory. For each one the file
    ``<entry>/<data_type>/<data_subset>-incidencerates.csv`` is read and a
    ``state`` column holding the lower-cased directory name is added.

    Parameters
    ----------
    cancer_data_dir : pathlib.Path
        Directory containing one sub-directory per state.
    data_type : str
        Name of the sub-directory inside each state directory.
    data_subset : str
        Prefix of the CSV file to read.

    Yields
    ------
    pandas.DataFrame
        The CSV contents with the extra ``state`` column.
    """
    csv_name = f'{data_subset}-incidencerates.csv'

    # Hidden entries (e.g. dot-files) are not state directories.
    visible_entries = (
        entry for entry in cancer_data_dir.iterdir()
        if not entry.name.startswith('.')
    )

    for entry in visible_entries:
        frame = pd.read_csv(entry / data_type / csv_name)
        frame['state'] = entry.name.lower()
        yield frame


# def combine_states(cancer_data_dir, data_type):
#     for state_dir in cancer_data_dir.iterdir():

#         if state_dir.name.startswith('.'):
#             # skip
#             continue
#         else:
#             df = pd.read_csv(state_dir / data_type / f'all_{data_type}.csv')
#             df['state'] = str.lower(state_dir.name)

#         yield df

In [104]:
# Consolidate every incidence-rate subset across all states into one clean
# CSV per subset, annotated with a human-readable cancer description.

# Subset name -> range. Only the subset names are used in this cell.
# NOTE(review): the ranges presumably index entries of the data key -- confirm.
subset_key_dict = {
    'cancer_by_type': range(0,23),
    'cancer_by_race': range(23,31),
    'cancer_by_sex': range(31,34),
    'cancer_by_age': range(34,39),
    'cancer_all': range(39,40),
    'cancer_latestage_by_type': range(40,59),
}

# Load the cancer_id key (Cancer_ID -> Cancer_Description)
cancer_id_key = pd.read_csv(repo_dir / 'data_raw' / 'cancer_ID_list.csv')
cancer_key_dict = cancer_id_key.set_index('Cancer_ID')['Cancer_Description'].to_dict()

# Clean cancer data dir; exist_ok makes re-running the cell safe.
clean_data_dir = repo_dir / 'data_clean' / 'CDC_CancerByCounty' / 'incidencerates'
clean_data_dir.mkdir(parents=True, exist_ok=True)

# `combine_states` takes (cancer_data_dir, data_type, data_subset); calling it
# with only two arguments raises TypeError.
# NOTE(review): 'incidencerates' is assumed to be the per-state sub-directory
# name, matching the '-incidencerates.csv' files and the output folder --
# confirm against the on-disk layout.
INCIDENCE_DATA_TYPE = 'incidencerates'

for subset_name in subset_key_dict:
    df_consolidated = pd.concat(
        combine_states(cancer_data_dir, INCIDENCE_DATA_TYPE, subset_name)
    )

    # Each state file is concatenated in full; keep one row per
    # (locale, cancer) pair after ordering by fips and cancer id.
    df_consolidated = (df_consolidated
                       .sort_values(by=['fips', 'cancer'])
                       .drop_duplicates(subset=['locale', 'cancer'])
    )

    # Direct dict lookup on purpose: an unknown cancer id raises KeyError
    # instead of silently producing a missing description.
    df_consolidated['cancer_description'] = df_consolidated.cancer.apply(
        lambda cancer_id: cancer_key_dict[cancer_id]
    )

    df_consolidated.to_csv(clean_data_dir / f'{subset_name}.csv', index=False)


In [103]:
# Load the consolidated by-type incidence rates. The path is built from
# `repo_dir` rather than a machine-specific absolute path so the notebook
# runs from any checkout location.
df = pd.read_csv(
    repo_dir / 'data_clean' / 'CDC_CancerByCounty' / 'cancer_by_type-incidencerates.csv'
)
df

Unnamed: 0,locale,fips,met_health_obj,incidence rate_per_100000,incidence rate_lower_95_confidence,incidence rate_upper_95_confidence,annual_count_avg,recent_trend_str,trend_last_5,trend_last_5_lower_95_confidence,trend_last_5_upper_95_confidence,age,areatype,cancer,file_name,race,sex,source_url,stage,stateFIPS,type,state,cancer_description
0,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,-0.1,1,country,1,incidencerates_000.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,All Cancer Sites
1,"US (SEER+NPCR)(1,10)",0.0,***,11.7,11.7,11.8,43864,stable,0.5,-0.7,1.6,1,country,3,incidencerates_016.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,Oral Cavity & Pharynx
2,"US (SEER+NPCR)(1,10)",0.0,***,4.5,4.5,4.6,17084,stable,-0.8,-1.6,0.0,1,country,17,incidencerates_009.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,Esophagus
3,"US (SEER+NPCR)(1,10)",0.0,***,6.6,6.5,6.6,23871,falling,-1.3,-2.0,-0.5,1,country,18,incidencerates_020.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,Stomach
4,"US (SEER+NPCR)(1,10)",0.0,Yes,38.7,38.6,38.8,140982,falling,-1.3,-2.1,-0.6,1,country,20,incidencerates_008.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,Colon & Rectum
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52656,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,*,1,county,86,incidencerates_015.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,Non-Hodgkin Lymphoma
52657,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,*,1,county,90,incidencerates_011.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,Leukemia
52658,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,*,1,county,400,incidencerates_004.csv,0,2,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,Breast (in situ) (Female)
52659,"Weston County(6,10)",56045.0,***,*,*,*,3 or fewer,*,*,*,*,15,county,515,incidencerates_007.csv,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,56,incd,wyoming,"Childhood (Ages <20, All Sites)"


In [84]:
# df.sort_values(by=['fips', 'cancer']).drop_duplicates(subset=['locale'])

# # Drop all the duplicates from data key
# subset_key = subset_key.drop_duplicates(
#     subset=[c for c in df_key.columns if 'file_name' not in c]
#     # Ignore the file_name column for deduplication
# )

# Inspect the rows of `df` whose fips code is 56045.
# NOTE(review): the recorded output for this cell is a single number, so it
# was presumably last run as a row count rather than a bare query -- confirm.
df.query('fips==56045')

23

In [79]:
# List the distinct values of the `cancer` column present in `df`.
df.cancer.unique()

array([  1, 515,   3,  17,  57,  35,  80,  18,  61])