In [1]:
import pandas as pd
from diet_analysis import prepare_panel, ewma_exposure
from diet_analysis import fit_gee_binomial,fit_gee_gaussian_ar1, extract_gee_results
import warnings
warnings.filterwarnings("ignore")

In [2]:
### Define column groups and load the data
idcol = 'R-ID'      # column used as the subject/cluster identifier downstream
datecol = '수진일'   # column used as the date column downstream

demographics = ['체중', '성별', '생년월', '신장']

# '활동량' is intentionally left out of the lifestyle variables for now.
lifestyles = ['일반담배_흡연여부', '음주']

# Dietary questionnaire items; these are the columns that get EWMA-smoothed later.
ffqs = [
    '간식빈도', '고지방 육류', '곡류', '과일', '단맛', '단백질류', '물',
    '밥 양', '식사 빈도', '식사량', '외식빈도', '유제품', '음료류',
    '인스턴트 가공식품', '짠 간', '짠 식습관', '채소', '커피', '튀김',
]

medications = ['고혈압_투약여부', '당뇨_투약여부', '고지혈증_투약여부']

# Continuous outcomes (modelled with the Gaussian GEE further down).
biomarkers = [
    'SBP', 'DBP', 'CHOL.', 'TG', 'LDL CHOL.', 'HDL CHOL.',
    'GLUCOSE', 'HBA1C', 'eGFR', '허리둘레(WAIST)', '체질량지수',
]

# Binary outcomes (modelled with the binomial GEE further down).
analysis = [
    '고혈압_통합', '당뇨_통합', '고지혈증_통합', '협심증/심근경색증_통합',
    '뇌졸중(중풍)_통합', '비만', 'Chronic kidney disease (eGFR<60)',
]

# Keep only the columns named above, in this order; a missing column raises KeyError.
selected_columns = [
    idcol, datecol,
    *demographics, *lifestyles, *ffqs, *medications, *biomarkers, *analysis,
]

fn = 'total_again.xlsx'
df = pd.read_excel(fn)[selected_columns]

In [3]:
## Preprocess data
# NOTE(review): '나이' is passed as an extra numeric column although it is not one
# of the columns selected when the workbook is loaded; presumably prepare_panel
# derives it (e.g. from '생년월') -- confirm.
EWMA_HALFLIFE_DAYS = 365.25 * 3  # 3-year half-life

df2 = prepare_panel(
    df, idcol, datecol, ffqs, biomarkers, lifestyles, medications,
    extra_numeric=['나이', '성별'],
    exclude=['활동량'],
)
df_processed = ewma_exposure(
    df2, idcol, datecol,
    cols=ffqs,
    halflife_days=EWMA_HALFLIFE_DAYS,
)

#### Analysis ####

In [9]:
# Base names eligible to become predictors: every processed column except the
# identifier, the date column and the helper columns listed here.
# The id/date names come from `idcol` / `datecol` instead of being repeated as
# literals, so changing them in the configuration cell cannot leave them in the
# predictor pool by accident.
_non_predictor_cols = {
    idcol, datecol,
    'visit_index', 'days_since_prev', 'interval_years',
    'med_any', 'med_any_change',
}
base_predictors = [col for col in df_processed.columns if col not in _non_predictor_cols]

In [10]:
# Result tables (one DataFrame per fitted model) collected across the analysis cells.
all_results = []


def _ewma_predictors_for(outcome):
    """Return the ``<name>_ewma`` columns to use as predictors for ``outcome``.

    A base name is kept when its ``_ewma`` column exists in ``df_processed`` and
    the name does not contain the outcome name (or its space->underscore /
    dot-stripped variants), so variables related to the outcome are left out.
    """
    exclude_patterns = [outcome, outcome.replace(' ', '_'), outcome.replace('.', '')]
    return [f'{f}_ewma' for f in base_predictors
            if f'{f}_ewma' in df_processed.columns
            and not any(pattern in f for pattern in exclude_patterns)]


def _fit_continuous(outcome, predictors):
    """Fit the Gaussian GEE for a continuous outcome; returns (fit, variable_names)."""
    return fit_gee_gaussian_ar1(df_processed, idcol,
                                outcome=outcome, predictors=predictors, datecol=datecol)


def _fit_binary(outcome, predictors):
    """Fit the binomial GEE for a binary outcome; returns (fit, variable_names)."""
    return fit_gee_binomial(df_processed, idcol, outcome=outcome, predictors=predictors)


def _run_ewma_models(outcomes, family, fit):
    """Fit one EWMA-exposure model per outcome and append its table to ``all_results``.

    Parameters
    ----------
    outcomes : list of str
        Outcome column names; names missing from ``df_processed`` are skipped silently.
    family : str
        Label forwarded to ``extract_gee_results`` ('gaussian' or 'binomial').
    fit : callable
        ``fit(outcome, predictors) -> (fitted_model, variable_names)``.

    A failure for one outcome is reported on stdout and does not stop the
    remaining outcomes (best-effort batch run).
    """
    for y in outcomes:
        if y not in df_processed.columns:
            continue
        print(f'\nOutcome: {y}')
        try:
            predictors = _ewma_predictors_for(y)

            if len(predictors) == 0:
                print(f'No predictors available for {y}')
                continue

            fitted, variable_names = fit(y, predictors)
            result_df = extract_gee_results(fitted, y, family)
            result_df['variable_name'] = variable_names
            result_df['type'] = 'ewma'
            all_results.append(result_df)
        except Exception as e:
            print(f'Error for {y}: {e}')


# EWMA analysis
print('\nType: ewma')

# Continuous biomarkers
_run_ewma_models(biomarkers, 'gaussian', _fit_continuous)

# Binary outcomes
_run_ewma_models(analysis, 'binomial', _fit_binary)



Type: ewma

Outcome: SBP

Outcome: DBP

Outcome: CHOL.

Outcome: TG

Outcome: LDL CHOL.

Outcome: HDL CHOL.

Outcome: GLUCOSE

Outcome: HBA1C

Outcome: eGFR

Outcome: 허리둘레(WAIST)

Outcome: 체질량지수

Outcome: 고혈압_통합

Outcome: 당뇨_통합

Outcome: 고지혈증_통합

Outcome: 협심증/심근경색증_통합

Outcome: 뇌졸중(중풍)_통합

Outcome: 비만

Outcome: Chronic kidney disease (eGFR<60)


In [11]:
print('\nType: delta')

# Keep this cell re-runnable: it appends to the `all_results` list created by the
# EWMA cell, so drop any delta tables left over from a previous execution first.
# Otherwise running the cell twice would duplicate rows in the saved workbook.
all_results = [r for r in all_results
               if not ('type' in r.columns and r['type'].eq('delta').any())]

# NOTE(review): in the saved run every biomarker printed "No delta predictors
# available", i.e. df_processed contained no '<name>_delta' column for any FFQ or
# lifestyle variable, so this cell contributed no models. Confirm whether the
# preprocessing step is expected to create those columns.
for y in biomarkers:
    if y not in df_processed.columns:
        continue
    print(f'\nOutcome: {y}')
    try:
        # Keep only the delta variables that actually exist, leaving out any
        # whose name contains the outcome name (or its normalised variants).
        exclude_patterns = [y, y.replace(' ', '_'), y.replace('.', '')]
        delta_predictors = [f'{f}_delta' for f in ffqs + lifestyles
                            if f'{f}_delta' in df_processed.columns
                            and not any(pattern in f for pattern in exclude_patterns)]

        if len(delta_predictors) == 0:
            print(f'No delta predictors available for {y}')
            continue

        res, variable_names = fit_gee_gaussian_ar1(df_processed, idcol,
                                                   outcome=y, predictors=delta_predictors, datecol=datecol)
        result_df = extract_gee_results(res, y, 'gaussian')
        result_df['variable_name'] = variable_names
        result_df['type'] = 'delta'
        all_results.append(result_df)
    except Exception as e:
        # Best-effort batch: report the failure and continue with the next outcome.
        print(f'Error for {y}: {e}')



Type: delta

Outcome: SBP
No delta predictors available for SBP

Outcome: DBP
No delta predictors available for DBP

Outcome: CHOL.
No delta predictors available for CHOL.

Outcome: TG
No delta predictors available for TG

Outcome: LDL CHOL.
No delta predictors available for LDL CHOL.

Outcome: HDL CHOL.
No delta predictors available for HDL CHOL.

Outcome: GLUCOSE
No delta predictors available for GLUCOSE

Outcome: HBA1C
No delta predictors available for HBA1C

Outcome: eGFR
No delta predictors available for eGFR

Outcome: 허리둘레(WAIST)
No delta predictors available for 허리둘레(WAIST)

Outcome: 체질량지수
No delta predictors available for 체질량지수


In [12]:
# Write all collected model tables to one Excel workbook, or report that
# there is nothing to save.
n_models = len(all_results)
if n_models == 0:
    print('\n저장할 결과가 없습니다.')
else:
    combined_results = pd.concat(all_results, ignore_index=True)
    combined_results.to_excel('gee_results_all_outcomes.xlsx', index=False)
    print(f'\n결과 저장 완료: {n_models}개 모델')


결과 저장 완료: 18개 모델
