In [1]:
import os

import numpy as np
import pandas as pd

from IPython.display import display

# Directory holding the input CSV files. Override it by setting the CANCER_DATA_DIR
# environment variable; the original hard-coded location remains the default.
DIR = os.environ.get('CANCER_DATA_DIR', r'c://downloads')

In [2]:
# Read the per-state table of new cancer cases (counts use ',' as a thousands separator,
# rows are indexed by the 'State' column), then discard the last row and the first
# remaining column.
# NOTE(review): presumably the dropped row/column are aggregate totals - confirm against the CSV.
cases_csv_path = os.path.join(DIR, 'new_cancer_cases_by_state.csv')
new_cancer_cases_by_state = pd.read_csv(cases_csv_path, thousands = ',', index_col = 'State').iloc[:-1, 1:]
display(new_cancer_cases_by_state)

Unnamed: 0_level_0,Female Breast,Uterine Cervix,Colon & Rectum,Uterine Corpus,Leukemia,Lung & Bronchus,Melanoma of the Skin,Non-Hodgkin Lymphoma,Prostate,Urinary Bladder
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,3660,210.0,2350,650,690,4160,1320,990,3760,990
Alaska,450,,280,100,100,430,90,140,530,150
Arizona,4520,210.0,2560,910,950,4280,1430,1320,4390,1490
Arkansas,2050,140.0,1500,400,480,2660,490,660,2240,640
California,26130,1550.0,13930,5650,5650,18780,8440,7770,23010,7210
Colorado,3780,160.0,1720,750,870,2540,1400,1060,3680,1040
Connecticut,3160,120.0,1650,790,610,2730,1090,920,3120,1170
Delaware,760,,420,180,150,790,290,220,800,260
Dist. of Columbia,430,,250,100,60,320,80,100,510,80
Florida,15480,960.0,10230,3410,3810,17960,5320,5050,16590,5800


In [3]:
from scipy.stats import fisher_exact, chi2_contingency
from statsmodels.stats.multitest import multipletests

# Chi-squared test of independence between state and cancer type.
# States with any missing count are excluded from this test (dropna removes the whole row).
complete_rows = new_cancer_cases_by_state.dropna()
_, pval, _, expected_data = chi2_contingency(complete_rows)

# Fraction of expected cell frequencies that reach the thresholds 5 and 1.
share_reaching_5 = np.mean(expected_data >= 5)
share_reaching_1 = np.mean(expected_data >= 1)
print('%.2f of the expected values are above 5, %.2f are above 1' % (share_reaching_5, share_reaching_1))
print('Chi-squared p-value: %e' % pval)

1.00 of the expected values are above 5, 1.00 are above 1
Chi-squared p-value: 0.000000e+00


In [4]:
from ipywidgets import FloatProgress

# For every (cancer type, state) pair, build a 2x2 table
#
#                  this cancer type | all other cancer types
#   this state            a         |           b
#   other states          c         |           d
#
# and record the relative risk RR = (a / (a + b)) / (c / (c + d)) together with the
# Fisher exact test p-value. Pairs whose table contains a missing value are skipped.
test_cancer_types, test_states, test_RRs, test_pvals = [], [], [], []

# One progress tick per (cancer type, state) cell of the input table.
progress_bar = FloatProgress(max = new_cancer_cases_by_state.size)
display(progress_bar)

all_columns = new_cancer_cases_by_state.columns

for cancer_type in all_columns:

    # Collapse all remaining cancer types into one 'Other' count per state.
    # sum() skips missing values, so a missing count contributes nothing to 'Other'.
    remaining_total = new_cancer_cases_by_state.loc[:, all_columns != cancer_type].sum(axis = 1).rename('Other')
    type_vs_rest = pd.concat([new_cancer_cases_by_state[cancer_type], remaining_total], axis = 1)

    for state in new_cancer_cases_by_state.index:

        progress_bar.value += 1

        # Row 0: the state under test; row 1: every other state pooled together.
        pooled_rest = type_vs_rest[remaining_total.index != state].sum().rename('Other')
        table = pd.DataFrame([type_vs_rest.loc[state], pooled_rest])

        if table.isnull().values.any():
            continue

        # Share of this cancer type within each row, then the ratio state / other states.
        risks = table.iloc[:, 0] / table.sum(axis = 1)
        RR = risks.iloc[0] / risks.iloc[1]
        _, pval = fisher_exact(table)

        test_cancer_types.append(cancer_type)
        test_states.append(state)
        test_RRs.append(RR)
        test_pvals.append(pval)

test_summary = pd.DataFrame({
    'cancer_type': test_cancer_types,
    'state': test_states,
    'RR': test_RRs,
    'pval': test_pvals,
})

FloatProgress(value=0.0, max=510.0)

In [5]:
# Correct the Fisher p-values for multiple comparisons with the Bonferroni method.
# 'significance' receives the reject flag and 'qval' the Bonferroni-adjusted p-value
# (first two values returned by multipletests; the remaining two are discarded).
test_summary['significance'], test_summary['qval'], _, _ = multipletests(test_summary['pval'], method = 'bonferroni')

# Sorting by max{RR, 1 / RR}, i.e. by the strength of the effect in either direction.
# The key is computed vectorised: for RR == 0 the reciprocal evaluates to inf (so the row
# sorts first) instead of raising ZeroDivisionError as a per-element Python division would.
relative_risk = test_summary['RR']
test_summary = test_summary.assign(absolute_RR = np.maximum(relative_risk, 1 / relative_risk))\
        .sort_values('absolute_RR', ascending = False).drop('absolute_RR', axis = 1)

# Show only the tests that remain significant after the correction.
display(test_summary[test_summary['significance']])

Unnamed: 0,cancer_type,state,RR,pval,significance,qval
289,Lung & Bronchus,Utah,0.481162,2.097583e-117,True,1.048791e-114
340,Melanoma of the Skin,Utah,1.691588,3.951260e-44,True,1.975630e-41
297,Melanoma of the Skin,Alaska,0.606716,1.538582e-07,True,7.692912e-05
304,Melanoma of the Skin,Dist. of Columbia,0.634412,7.395569e-06,True,3.697785e-03
457,Urinary Bladder,Dist. of Columbia,0.646488,2.187035e-05,True,1.093517e-02
339,Melanoma of the Skin,Texas,0.658463,8.247214e-152,True,4.123607e-149
299,Melanoma of the Skin,Arkansas,0.664275,3.017647e-23,True,1.508824e-20
71,Uterine Cervix,Minnesota,0.668900,1.202174e-06,True,6.010871e-04
314,Melanoma of the Skin,Louisiana,0.683422,5.824079e-30,True,2.912039e-27
86,Uterine Cervix,Texas,1.455509,8.820529e-31,True,4.410264e-28


Note: the fact that RR = 0.66 for (Melanoma of the Skin, Texas) does **NOT mean** that people in Texas are only 66% as likely to develop melanoma as people in other US states. It means that, **GIVEN that a person already has one of the specified types of cancer**, the probability that it is melanoma (and not one of the other types) is only 66% of the corresponding probability in the other states.