In [1]:
import pathlib
import pandas as pd
import numpy as np

### Goal: Process Cancer Incidence Data --> .tsv for choropleth

Current US choropleth example takes .tsv with the following format:

```
id	rate
01001	5.1
01003	4.9
01005	8.6
01007	6.2
```

where id is the county FIPS and rate is some value for coloring the choropleth

In [2]:
# Functions
def fips_to_str(fips):
    fips_str = str(int(fips))

    while len(fips_str) < 5:
        fips_str = '0' + fips_str

    return fips_str


def extract_state_fips(fips_str):
    return int(fips_str[0:2])


def compute_delta(row):
    national_rate = df_us_cancer_rates.loc[row.cancer, 'rate']
    return row.rate - national_rate


def compute_delta_percent(row):
    national_rate = df_us_cancer_rates.loc[row.cancer, 'rate']
    percent = 100 * ((row.rate - national_rate)/national_rate)
    return percent

In [5]:
base_dir = pathlib.Path.cwd()

# Load list of all county fips
county_fips_file = base_dir.parent.parent / 'data_raw' / 'counties_fips.csv'
counties = pd.read_csv(county_fips_file)
counties['fips_str'] = counties.FIPS.apply(fips_to_str)

# Load the cancer incidence data
data_dir = base_dir.parent.parent / 'data_clean' / 'CDC_CancerByCounty' / 'incidencerates'

incidence_data_csv = data_dir / 'cancer_by_type.csv'

# Load ALL incidence rate data to dataframe
df = pd.read_csv(incidence_data_csv)

# Some initial additions
df['fips_str'] = df.fips.apply(fips_to_str)
df['fips_state'] = df.fips_str.apply(extract_state_fips)
df['rate'] = df['incidence rate_per_100000']

#
# Some initial cleaning
#

# Exclude Kansas and Minnesota due to missing data
df = df[~df.state.isin(['kansas', 'minnesota'])]

# Handle cases where rate value includes trailing ' #'
df['rate'] = df['rate'].apply(
    lambda x: float(x.strip(' #')) if '#' in x else x
)

# Handle all remaining non-numeric entries
df.rate = (pd
             .to_numeric(df.rate, errors='coerce')
             .replace(np.nan, 0, regex=True)
          )

In [7]:
df.columns

Index(['locale', 'fips', 'met_health_obj', 'incidence rate_per_100000',
       'incidence rate_lower_95_confidence',
       'incidence rate_upper_95_confidence', 'annual_count_avg',
       'recent_trend_str', 'trend_last_5', 'trend_last_5_lower_95_confidence',
       'trend_last_5_upper_95_confidence', 'age', 'areatype', 'cancer',
       'file_name', 'race', 'sex', 'source_url', 'stage', 'stateFIPS', 'type',
       'state', 'cancer_description', 'fips_str', 'fips_state', 'rate'],
      dtype='object')

In [6]:
df[(df.state=='newmexico') & (df.cancer==1)].reset_index(drop=True)

Unnamed: 0,locale,fips,met_health_obj,incidence rate_per_100000,incidence rate_lower_95_confidence,incidence rate_upper_95_confidence,annual_count_avg,recent_trend_str,trend_last_5,trend_last_5_lower_95_confidence,...,sex,source_url,stage,stateFIPS,type,state,cancer_description,fips_str,fips_state,rate
0,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,0,0,448.0
1,"New Mexico(7,8)",35000.0,***,374.3,370.8,377.9,9116.0,falling,-1.7,-2.2,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,35000,35,374.3
2,"Bernalillo County(7,8)",35001.0,***,390.0,383.6,396.5,2957.0,falling,-0.8,-0.9,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,35001,35,390.0
3,"Catron County(7,8)",35003.0,***,254.0,203.5,321.4,21.0,falling,-8.4,-12.7,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,35003,35,254.0
4,"Chaves County(7,8)",35005.0,***,367.2,347.5,387.7,272.0,falling,-3.0,-4.8,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,35005,35,367.2
5,"Cibola County(7,8)",35006.0,***,338.7,309.7,369.8,104.0,stable,-0.4,-1.0,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,35006,35,338.7
6,"Colfax County(7,8)",35007.0,***,366.6,326.8,410.6,73.0,stable,-1.8,-4.6,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,35007,35,366.6
7,"Curry County(7,8)",35009.0,***,370.8,346.3,396.5,176.0,falling,-0.6,-1.1,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,35009,35,370.8
8,"De Baca County(7,8)",35011.0,***,419.3,310.2,558.8,12.0,stable,0.2,-1.4,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,35011,35,419.3
9,"Dona Ana County(7,8)",35013.0,***,390.7,379.0,402.6,883.0,stable,0.4,-0.4,...,0,https://www.statecancerprofiles.cancer.gov/inc...,,35,incd,newmexico,All Cancer Sites,35013,35,390.7


### Process the National Level Cancer Data

In [11]:
df.query('fips==0')

columns = ['fips', 'fips_str', 'rate', 'cancer']

df_us_cancer_rates = df.query('fips==0')[columns]
df_us_cancer_rates.fips = df_us_cancer_rates.fips.apply(int)

# rename columns
df_us_cancer_rates.rename(columns={'fips_str': 'id'}, inplace=True)
df_us_cancer_rates.drop(['fips', 'id'], inplace=True, axis=1)
df_us_cancer_rates.set_index('cancer', inplace=True)
df_us_cancer_rates.drop_duplicates(inplace=True)

# Save cancer data to tsv
tsv_out_path = data_dir.parent / f'cancer_US_byType.tsv'
# df_us_cancer_rates.to_csv(tsv_out_path, sep='\t', index=False)

df_us_cancer_rates.head()

Unnamed: 0_level_0,rate
cancer,Unnamed: 1_level_1
1,448.0
3,11.7
17,4.5
18,6.6
20,38.7


### Process the county Level Data

In [12]:
# cancer_id

criterion = ((df.stage.isna()))

columns = ['fips', 'fips_str', 'rate', 'cancer', 'state']

df_cancer_type = df[criterion][columns]

# Sort values by fips
df_cancer_type.sort_values(by=['cancer','fips'], inplace=True)

# Some initial cleaning
# df_cancer_type.rename(columns={'incidence rate_per_100000': 'rate'}, inplace=True)
df_cancer_type.fips = df_cancer_type.fips.apply(int)

df_county_cancer_rates = df_cancer_type[df_cancer_type.fips.isin(counties.fips_str.tolist())]

# df_county_cancer_rates.rate = (pd
#                          .to_numeric(df_county_cancer_rates.rate, errors='coerce')
#                          .replace(np.nan, 0, regex=True)
#                         )

# rename columns
df_county_cancer_rates.rename(columns={'fips_str': 'id'}, inplace=True)
df_county_cancer_rates.drop('fips', inplace=True, axis=1)


# Process the delta form national average
df_county_cancer_rates['rate_delta'] = df_county_cancer_rates.apply(compute_delta, axis=1)
df_county_cancer_rates['rate_delta_percent'] = df_county_cancer_rates.apply(compute_delta_percent, axis=1)

# Save cancer data to tsv
tsv_out_path = data_dir.parent / f'cancer_byCounty_byType.tsv'
df_county_cancer_rates.to_csv(tsv_out_path, sep='\t', index=False)

df_county_cancer_rates.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,id,rate,cancer,state,rate_delta,rate_delta_percent
1173,1001,495.6,1,alabama,47.6,10.625
1196,1003,445.0,1,alabama,-3.0,-0.669643
1219,1005,429.8,1,alabama,-18.2,-4.0625
1242,1007,478.3,1,alabama,30.3,6.763393
1265,1009,440.0,1,alabama,-8.0,-1.785714
