In [1]:
import pathlib
import pandas as pd
import numpy as np

### Goal: Process Cancer Incidence Data --> .tsv for choropleth

Current US choropleth example takes .tsv with the following format:

```
id	rate
01001	5.1
01003	4.9
01005	8.6
01007	6.2
```

where id is the county FIPS and rate is some value for coloring the choropleth

In [2]:
# Functions
def fips_to_str(fips):
    fips_str = str(int(fips))

    while len(fips_str) < 5:
        fips_str = '0' + fips_str

    return fips_str

def extract_state_fips(fips_str):
    return int(fips_str[0:2])


In [3]:
base_dir = pathlib.Path.cwd()

# Load list of all county fips
county_fips_file = base_dir.parent.parent / 'data_raw' / 'counties_fips.csv'
counties = pd.read_csv(county_fips_file)
counties['fips_str'] = counties.FIPS.apply(fips_to_str)

# Load the cancer incidence data
data_dir = base_dir.parent.parent / 'data_clean' / 'CDC_CancerByCounty'

incidence_data_csv = data_dir / 'incidencerates.csv'

# Load ALL incidence rate data to dataframe
df = pd.read_csv(incidence_data_csv)

# Some initial additions
df['fips_str'] = df.fips.apply(fips_to_str)
df['fips_state'] = df.fips_str.apply(extract_state_fips)

df.head()

Unnamed: 0,locale,fips,met_health_obj,incidence rate_per_100000,incidence rate_lower_95_confidence,incidence rate_upper_95_confidence,annual_count_avg,recent_trend_str,trend_last_5,trend_last_5_lower_95_confidence,...,race,sex,source_url,stage,stateFIPS,type,late_stage_%,state,fips_str,fips_state
0,"Missouri(6,10)",29000.0,***,455.2,452.9,457.4,32947.0,stable,-0.7,-1.7,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,,missouri,29000,29
1,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,,missouri,0,0
2,"Iron County(6,10)",29093.0,***,543.7,490.1,602.1,82.0,stable,-0.1,-15.0,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,,missouri,29093,29
3,"Henry County(6,10)",29083.0,***,540.8,503.6,580.4,170.0,stable,1.1,-6.2,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,,missouri,29083,29
4,"Mississippi County(6,10)",29133.0,***,527.8,479.8,579.5,92.0,stable,5.9,-4.5,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,,missouri,29133,29


In [4]:
counties

Unnamed: 0,FIPS,Name,State,fips_str
0,1001,Autauga,AL,01001
1,1003,Baldwin,AL,01003
2,1005,Barbour,AL,01005
3,1007,Bibb,AL,01007
4,1009,Blount,AL,01009
...,...,...,...,...
3227,72151,Yabucoa,PR,72151
3228,72153,Yauco,PR,72153
3229,78010,St. Croix,VI,78010
3230,78020,St. John,VI,78020


In [5]:
# ###################################
# Queries to Split incidence datasets

# # Split out the data by stage keys
# df[~df.stage.isna()]

# # Split out the data by age keys
# criterion = ((df.age!=1) & (df.cancer==1))

# # Split out the data by sex keys
# criterion = ((df.sex!=0) & (df.cancer==1))

# # Split out the data by race keys
# criterion = ((df.race!=0))

# # Split out the data by cancer keys
# criterion = ((df.cancer!=1))

# # All cancer, race, sex, age, stage
# df_key_by_all = df_key

In [6]:
# # Split out the data by cancer keys and all states
# criterion = ((df.cancer!=1) & (df.stage.isna()))

# columns = ['locale', 'fips', 'incidence rate_per_100000', 'cancer', 'state', 'fips_state']

# df_cancer_type = df[criterion][columns]

# # Some initial cleaning
# df_cancer_type.rename(columns={'incidence rate_per_100000': 'rate'}, inplace=True)
# df_cancer_type.fips = df_cancer_type.fips.apply(int)
# # df.met_health_obj = pd.to_numeric(df.met_health_obj, errors='coerce')

# # df_cancer_type.head()

# df_cancer_type[
#     (df_cancer_type.fips.isin(counties.fips_str.tolist())) & 
#     (df_cancer_type.state == 'alaska') & 
#     (df_cancer_type.cancer==71)
# ]

In [15]:
def write_cancer_data(cancer_id):
    # Split out the data by cancer keys and all states
    criterion = ((df.cancer!=1) & (df.stage.isna()))
#     criterion = (df.stage.isna())


    columns = ['fips', 'fips_str', 'incidence rate_per_100000', 'cancer']

    df_cancer_type = df[criterion][columns]

    # Sort values by fips
    df_cancer_type.sort_values(by='fips', inplace=True)

    # Some initial cleaning
    df_cancer_type.rename(columns={'incidence rate_per_100000': 'rate'}, inplace=True)
    df_cancer_type.fips = df_cancer_type.fips.apply(int)
    # df.met_health_obj = pd.to_numeric(df.met_health_obj, errors='coerce')


    df_single_cancer = df_cancer_type[
        (df_cancer_type.fips.isin(counties.fips_str.tolist())) & 
        (df_cancer_type.cancer==cancer_id)
    ][['fips_str', 'rate']]

    df_single_cancer.rate = (pd
                             .to_numeric(df_single_cancer.rate, errors='coerce')
                             .replace(np.nan, 0, regex=True)
                            )
    # rename columns
    df_single_cancer.rename(columns={'fips_str': 'id'}, inplace=True)

    # Save cancer data to tsv
    tsv_out_path = base_dir / f'cancer.tsv'
    df_single_cancer.to_csv(tsv_out_path, sep='\t', index=False)

    df_single_cancer.head()

In [16]:
df.cancer.unique()

array([  1,  71,  76,  55, 400,  57, 516, 515,  20,  17,  72,  90,  35,
        47,  53,  86,   3,  61,  40,  66,  18,  80,  58])

In [22]:
# Select a cancer
cancer_id = 516

write_cancer_data(cancer_id)

In [24]:
# Save cancer data to tsv
tsv_out_path = base_dir / f'cancer.tsv'
df_single_cancer.to_csv(tsv_out_path, sep='\t', index=False)

In [29]:
cancer_id

criterion = ((df.stage.isna()))

columns = ['fips', 'fips_str', 'incidence rate_per_100000', 'cancer']

df_cancer_type = df[criterion][columns]

# Sort values by fips
df_cancer_type.sort_values(by=['cancer','fips'], inplace=True)

# Some initial cleaning
df_cancer_type.rename(columns={'incidence rate_per_100000': 'rate'}, inplace=True)
df_cancer_type.fips = df_cancer_type.fips.apply(int)
# df.met_health_obj = pd.to_numeric(df.met_health_obj, errors='coerce')


# df_single_cancer = df_cancer_type[
#     (df_cancer_type.fips.isin(counties.fips_str.tolist())) & 
#     (df_cancer_type.cancer==cancer_id)
# ][['fips_str', 'rate']]

# df_single_cancer.rate = (pd
#                          .to_numeric(df_single_cancer.rate, errors='coerce')
#                          .replace(np.nan, 0, regex=True)
#                         )


df_cancer_rates = df_cancer_type[df_cancer_type.fips.isin(counties.fips_str.tolist())]

df_cancer_rates.rate = (pd
                         .to_numeric(df_cancer_rates.rate, errors='coerce')
                         .replace(np.nan, 0, regex=True)
                        )

# rename columns
df_cancer_rates.rename(columns={'fips_str': 'id'}, inplace=True)
df_cancer_rates.drop('fips', inplace=True, axis=1)

# Save cancer data to tsv
tsv_out_path = data_dir.parent / f'cancer_all_types.tsv'
df_cancer_rates.to_csv(tsv_out_path, sep='\t', index=False)

df_cancer_rates.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,id,rate,cancer
47280,1001,495.6,1
48873,1001,484.6,1
48947,1001,0.0,1
49011,1001,489.2,1
49077,1001,522.0,1
