In [1]:
import pathlib
import pandas as pd
import numpy as np

### Goal: Process Cancer Incidence Data --> .tsv for choropleth

Current US choropleth example takes .tsv with the following format:

```
id	rate
01001	5.1
01003	4.9
01005	8.6
01007	6.2
```

where id is the county FIPS and rate is some value for coloring the choropleth

In [2]:
# Functions
def fips_to_str(fips):
    fips_str = str(int(fips))

    while len(fips_str) < 5:
        fips_str = '0' + fips_str

    return fips_str


def extract_state_fips(fips_str):
    return int(fips_str[0:2])


def compute_delta(row):
    national_rate = df_us_cancer_rates.loc[row.cancer, 'rate']
    return row.rate - national_rate


def compute_delta_percent(row):
    national_rate = df_us_cancer_rates.loc[row.cancer, 'rate']
    percent = 100 * ((row.rate - national_rate)/national_rate)
    return percent

In [3]:
base_dir = pathlib.Path.cwd()

# Load list of all county fips
county_fips_file = base_dir.parent.parent / 'data_raw' / 'counties_fips.csv'
counties = pd.read_csv(county_fips_file)
counties['fips_str'] = counties.FIPS.apply(fips_to_str)

# Load the cancer incidence data
data_dir = base_dir.parent.parent / 'data_clean' / 'CDC_CancerByCounty' / 'incidencerates'

incidence_data_csv = data_dir / 'cancer_by_type.csv'

# Load ALL incidence rate data to dataframe
df = pd.read_csv(incidence_data_csv)

# Some initial additions
df['fips_str'] = df.fips.apply(fips_to_str)
df['fips_state'] = df.fips_str.apply(extract_state_fips)

df.head()

Unnamed: 0,locale,fips,met_health_obj,incidence rate_per_100000,incidence rate_lower_95_confidence,incidence rate_upper_95_confidence,annual_count_avg,recent_trend_str,trend_last_5,trend_last_5_lower_95_confidence,...,race,sex,source_url,stage,stateFIPS,type,state,cancer_description,fips_str,fips_state
0,"US (SEER+NPCR)(1,10)",0.0,***,448.0,447.7,448.4,1638110.0,falling,-0.9,-1.6,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,All Cancer Sites,0,0
1,"US (SEER+NPCR)(1,10)",0.0,***,11.7,11.7,11.8,43864.0,stable,0.5,-0.7,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,Oral Cavity & Pharynx,0,0
2,"US (SEER+NPCR)(1,10)",0.0,***,4.5,4.5,4.6,17084.0,stable,-0.8,-1.6,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,Esophagus,0,0
3,"US (SEER+NPCR)(1,10)",0.0,***,6.6,6.5,6.6,23871.0,falling,-1.3,-2.0,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,Stomach,0,0
4,"US (SEER+NPCR)(1,10)",0.0,Yes,38.7,38.6,38.8,140982.0,falling,-1.3,-2.1,...,0,0,https://www.statecancerprofiles.cancer.gov/inc...,,29,incd,missouri,Colon & Rectum,0,0


### Process the National Level Cancer Data

In [5]:
df.query('fips==0')

columns = ['fips', 'fips_str', 'incidence rate_per_100000', 'cancer']

df_cancer_us = df.query('fips==0')[columns]

df_cancer_us.rename(columns={'incidence rate_per_100000': 'rate'}, inplace=True)
df_cancer_us.fips = df_cancer_us.fips.apply(int)

df_us_cancer_rates = df_cancer_us

df_us_cancer_rates.rate = (pd
                         .to_numeric(df_us_cancer_rates.rate, errors='coerce')
                         .replace(np.nan, 0, regex=True)
                        )

# rename columns
df_us_cancer_rates.rename(columns={'fips_str': 'id'}, inplace=True)
df_us_cancer_rates.drop(['fips', 'id'], inplace=True, axis=1)
df_us_cancer_rates.set_index('cancer', inplace=True)

# Save cancer data to tsv
tsv_out_path = data_dir.parent / f'cancer_US_byType.tsv'
# df_us_cancer_rates.to_csv(tsv_out_path, sep='\t', index=False)

df_us_cancer_rates.head()

Unnamed: 0_level_0,rate
cancer,Unnamed: 1_level_1
1,448.0
3,11.7
17,4.5
18,6.6
20,38.7


### Process the county Level Data

In [10]:
# cancer_id

criterion = ((df.stage.isna()))

columns = ['fips', 'fips_str', 'incidence rate_per_100000', 'cancer']

df_cancer_type = df[criterion][columns]

# Sort values by fips
df_cancer_type.sort_values(by=['cancer','fips'], inplace=True)

# Some initial cleaning
df_cancer_type.rename(columns={'incidence rate_per_100000': 'rate'}, inplace=True)
df_cancer_type.fips = df_cancer_type.fips.apply(int)

df_county_cancer_rates = df_cancer_type[df_cancer_type.fips.isin(counties.fips_str.tolist())]

df_county_cancer_rates.rate = (pd
                         .to_numeric(df_county_cancer_rates.rate, errors='coerce')
                         .replace(np.nan, 0, regex=True)
                        )

# rename columns
df_county_cancer_rates.rename(columns={'fips_str': 'id'}, inplace=True)
df_county_cancer_rates.drop('fips', inplace=True, axis=1)


# Process the delta form national average
df_county_cancer_rates['rate_delta'] = df_county_cancer_rates.apply(compute_delta, axis=1)
df_county_cancer_rates['rate_delta_percent'] = df_county_cancer_rates.apply(compute_delta_percent, axis=1)

# Save cancer data to tsv
tsv_out_path = data_dir.parent / f'cancer_byCounty_byType.tsv'
df_county_cancer_rates.to_csv(tsv_out_path, sep='\t', index=False)

df_county_cancer_rates.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value

Unnamed: 0,id,rate,cancer,rate_delta,rate_delta_percent
46,1001,495.6,1,47.6,10.625
69,1003,445.0,1,-3.0,-0.669643
92,1005,429.8,1,-18.2,-4.0625
115,1007,478.3,1,30.3,6.763393
138,1009,440.0,1,-8.0,-1.785714
