In [6]:
! pip install pandas google-cloud-bigquery lifelines db-dtypes

Defaulting to user installation because normal site-packages is not writeable
Collecting google-cloud-bigquery
  Using cached google_cloud_bigquery-3.18.0-py2.py3-none-any.whl (230 kB)
Collecting db-dtypes
  Using cached db_dtypes-1.2.0-py2.py3-none-any.whl (14 kB)
Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 (from google-cloud-bigquery)
  Using cached google_api_core-2.17.1-py3-none-any.whl (137 kB)
Collecting google-cloud-core<3.0.0dev,>=1.6.0 (from google-cloud-bigquery)
  Using cached google_cloud_core-2.4.1-py2.py3-none-any.whl (29 kB)
Collecting google-resumable-media<3.0dev,>=0.6.0 (from google-cloud-bigquery)
  Using cached google_resumable_media-2.7.0-py2.py3-none-any.whl (80 kB)
Collecting pyarrow>=3.0.0 (from db-dtypes)
  Downloading pyarrow-15.0.1-cp311-cp311-win_amd64.whl (24.8 MB)
                                              0.0/24.8 MB ? eta -:--:--
                                              0.0/24.8 MB ? eta -:--:--
                  

In [30]:
import os
import pandas as pd
from lifelines import CoxPHFitter
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "nhanesgcp-93d0f7365fdc.json"

In [31]:
from google.cloud import bigquery
client = bigquery.Client()

In [32]:
query = """
WITH SmokingData AS (
    SELECT
        respondent_sequence_number,
        smoked_at_least_100_cigarettes_in_life
    FROM
        `nhanesgcp.dbt.smoking_cigarette_use_questionnaire`
),
AlcoholData AS (
    SELECT
        respondent_sequence_number,
        had_at_least_12_alcohol_drinks_1_yr
    FROM
        `nhanesgcp.dbt.alcohol_use_questionnaire`
),
SmokingStatus AS (
    SELECT
        respondent_sequence_number,
        CASE
            WHEN smoked_at_least_100_cigarettes_in_life IS NULL THEN NULL
            WHEN smoked_at_least_100_cigarettes_in_life = 'Yes' THEN 'Smoker'
            ELSE 'Non-Smoker'
        END AS smoking_status
    FROM
        SmokingData
),
AlcoholDrinkingStatus AS (
    SELECT
        respondent_sequence_number,
        CASE
            WHEN had_at_least_12_alcohol_drinks_1_yr IS NULL THEN NULL
            WHEN had_at_least_12_alcohol_drinks_1_yr = 'Yes' THEN 'Drinker'
            ELSE 'Non-Drinker'
        END AS alcohol_drinking_status
    FROM
        AlcoholData
),
PhysicalActivityData AS (
    SELECT
        respondent_sequence_number,
        COALESCE(SUM(CASE
                WHEN vigorous_activity_over_past_30_days = 'Yes' THEN of_times_past_30_days * how_long_each_time_minutes
                ELSE 0
            END), 0) +
        COALESCE(SUM(CASE
                WHEN moderate_activity_over_past_30_days = 'Yes' THEN of_times_past_30_days * how_long_each_time_minutes
                ELSE 0
            END), 0) AS total_activity_duration_minutes
    FROM
        `nhanesgcp.dbt.physical_activity_questionnaire`
    GROUP BY
        respondent_sequence_number
),
SII_Calculation AS (
    SELECT 
        respondent_sequence_number,
        (segmented_neutrophils_num_1000_cell_ul * platelet_count_1000_cells_ul) / lymphocyte_number_1000_cells_ul AS SII
    FROM 
        `nhanesgcp.dbt.complete_blood_count_with_5_part_differential_whole_blood_laboratory`
),
Filtered_Demographics AS (
    SELECT 
        respondent_sequence_number,
        age_in_years_at_screening AS age,
        gender,
        race_hispanic_origin AS race
    FROM 
        `nhanesgcp.dbt.demographic_variables_sample_weights_demographics`
    WHERE 
        end_year <= 2015
        AND 
        CAST(age_in_years_at_screening AS INT64) >= 40
),
BodyMeasurements AS (
    SELECT 
        respondent_sequence_number,
        weight_kg,
        standing_height_cm,
        weight_kg / (standing_height_cm / 100) AS bmi
    FROM 
        `nhanesgcp.dbt.body_measures_examination`
),
PhysicalActivityStatus AS (
    SELECT
        respondent_sequence_number,
        CASE
            WHEN COALESCE(total_activity_duration_minutes, 0) >= 150 THEN 'Physically Active'
            ELSE 'Physically Inactive'
        END AS physical_activity_status
    FROM
        PhysicalActivityData
),
MortalityData AS (
    SELECT
        respondent_sequence_number,
        ucod_leading,
        mort_stat,
        permth_exm
    FROM
        `nhanesgcp.nhanes_mort_1999_2018.nhanes_mort`
),
SII_Stats AS (
    SELECT
        APPROX_QUANTILES(SII, 4) AS quartiles
    FROM
        SII_Calculation
),
SII_Groups AS (
    SELECT
        SII_Calculation.*,
        CASE
            WHEN SII IS NULL THEN NULL
            WHEN SII <= (SELECT quartiles[OFFSET(1)] FROM SII_Stats) THEN 'Low'
            WHEN SII <= (SELECT quartiles[OFFSET(2)] FROM SII_Stats) THEN 'Middle'
            ELSE 'High'
        END AS SII_Group
    FROM
        SII_Calculation
    CROSS JOIN
        SII_Stats
)
SELECT 
    fd.respondent_sequence_number,
    fd.age,
    fd.gender,
    fd.race,
    bm.bmi,
    sii.SII,
    ss.smoking_status,
    ads.alcohol_drinking_status,
    pa.total_activity_duration_minutes AS physical_activity_duration_minutes,
    pas.physical_activity_status,
    md.ucod_leading,
    md.mort_stat,
    md.permth_exm,
    sg.SII_Group
FROM 
    Filtered_Demographics fd
LEFT JOIN 
    BodyMeasurements bm ON fd.respondent_sequence_number = bm.respondent_sequence_number
LEFT JOIN 
    SII_Calculation sii ON fd.respondent_sequence_number = sii.respondent_sequence_number
LEFT JOIN 
    SmokingStatus ss ON fd.respondent_sequence_number = ss.respondent_sequence_number
LEFT JOIN 
    AlcoholDrinkingStatus ads ON fd.respondent_sequence_number = ads.respondent_sequence_number
LEFT JOIN 
    SmokingData smoking ON fd.respondent_sequence_number = smoking.respondent_sequence_number
LEFT JOIN 
    AlcoholData alcohol ON fd.respondent_sequence_number = alcohol.respondent_sequence_number
LEFT JOIN
    PhysicalActivityData pa ON fd.respondent_sequence_number = pa.respondent_sequence_number
LEFT JOIN
    PhysicalActivityStatus pas ON fd.respondent_sequence_number = pas.respondent_sequence_number
LEFT JOIN
    MortalityData md ON fd.respondent_sequence_number = md.respondent_sequence_number
LEFT JOIN
    SII_Groups sg ON fd.respondent_sequence_number = sg.respondent_sequence_number
WHERE
    sii.SII IS NOT NULL
ORDER BY
    fd.respondent_sequence_number;

"""

In [33]:
df = client.query(query).to_dataframe()

In [34]:
df.head(50)

Unnamed: 0,respondent_sequence_number,age,gender,race,bmi,SII,smoking_status,alcohol_drinking_status,physical_activity_duration_minutes,physical_activity_status,ucod_leading,mort_stat,permth_exm,SII_Group
0,2,77.0,Male,Non-Hispanic White,43.333333,682.125,,,0.0,Physically Inactive,6.0,1,177,High
1,5,49.0,Male,Non-Hispanic White,51.878856,294.5,,,2040.0,Physically Active,,0,244,Low
2,7,59.0,Female,Non-Hispanic Black,47.882136,222.782609,,,12.0,Physically Inactive,,0,236,Low
3,10,43.0,Male,Non-Hispanic Black,58.811152,175.789474,,,0.0,Physically Inactive,1.0,1,231,Low
4,13,70.0,Male,Mexican American,40.32974,1459.2,,,0.0,Physically Inactive,1.0,1,16,High
5,14,81.0,Male,Non-Hispanic White,45.427196,337.777778,,,1080.0,Physically Active,3.0,1,136,Middle
6,16,85.0,Female,Non-Hispanic Black,28.779473,500.571429,,,780.0,Physically Active,2.0,1,62,High
7,24,53.0,Female,Non-Hispanic White,42.570037,438.351351,,,0.0,Physically Inactive,2.0,1,160,Middle
8,25,42.0,Female,Non-Hispanic White,62.687463,531.423077,,,0.0,Physically Inactive,10.0,1,142,High
9,29,62.0,Male,Non-Hispanic White,64.608348,4930.0,,,0.0,Physically Inactive,3.0,1,26,High


In [35]:
df = df.dropna()

In [36]:
cph = CoxPHFitter()

In [37]:
# Asscoiation between SII and total mortality

df['SII_Group'] = pd.Categorical(df['SII_Group'], categories=['Low', 'Middle', 'High'], ordered=True)

cph.fit(df, duration_col='permth_exm', event_col='mort_stat', formula="SII_Group")

cph.summary

Unnamed: 0_level_0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SII_Group[T.Middle],-0.093138,0.911068,0.064772,-0.220089,0.033814,0.802447,1.034392,0.0,-1.437924,0.150456,2.73259
SII_Group[T.High],-0.056702,0.944876,0.053151,-0.160876,0.047472,0.851397,1.048617,0.0,-1.066808,0.286059,1.805617


In [38]:
# Asscoiation between SII and cause-specific mortality (CVD)

cause_specific_df = df[df['ucod_leading'] == 1]
cph.fit(cause_specific_df, duration_col='permth_exm', event_col='mort_stat', formula="SII_Group")
hazard_ratios = cph.summary['exp(coef)']
cph.summary

Unnamed: 0_level_0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SII_Group[T.Middle],0.023758,1.024043,0.138296,-0.247298,0.294814,0.780908,1.342876,0.0,0.171791,0.863602,0.211562
SII_Group[T.High],0.065243,1.067418,0.113292,-0.156806,0.287292,0.85487,1.332813,0.0,0.575883,0.564694,0.824458


In [40]:
# Asscoiation between SII and cause-specific chronic lower respiratory diseases

cause_specific_df = df[df['ucod_leading'] == 3]
cph.fit(cause_specific_df, duration_col='permth_exm', event_col='mort_stat', formula="SII_Group")
hazard_ratios = cph.summary['exp(coef)']
cph.summary

Unnamed: 0_level_0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SII_Group[T.Middle],0.768939,2.157477,0.344325,0.094076,1.443803,1.098643,4.236778,0.0,2.233182,0.025537,5.29127
SII_Group[T.High],0.249507,1.283393,0.283685,-0.306506,0.80552,0.736014,2.23786,0.0,0.879521,0.379119,1.399277


In [41]:
# Asscoiation between SII and all other causes (residual)

cause_specific_df = df[df['ucod_leading'] == 10]
cph.fit(cause_specific_df, duration_col='permth_exm', event_col='mort_stat', formula="SII_Group")
hazard_ratios = cph.summary['exp(coef)']
cph.summary

Unnamed: 0_level_0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,cmp to,z,p,-log2(p)
covariate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
SII_Group[T.Middle],-0.325675,0.72204,0.125894,-0.572423,-0.078926,0.564157,0.924108,0.0,-2.586888,0.009685,6.690076
SII_Group[T.High],-0.305408,0.736823,0.102138,-0.505595,-0.105221,0.603146,0.900125,0.0,-2.990151,0.002788,8.486348


In [42]:
# Asscoiation between SII and cause-specific Cancer 

# cause_specific_df = df[df['ucod_leading'] == 2]
# cph.fit(cause_specific_df, duration_col='permth_exm', event_col='mort_stat', formula="SII_Group")
# hazard_ratios = cph.summary['exp(coef)']
# cph.summary