In [1]:
import pandas as pd
import numpy as np

data_path = '../data/total_again_an.xlsx'
df = pd.read_excel(data_path).drop(columns=['Unnamed: 0', '생년월'])

In [2]:
# 성별에 따른 허리둘레 기준 설정
df['Increased waist circumference'] = (
    ((df['성별'] == 'M') & (df['허리둘레(WAIST)'].astype(float) >= 90)) | 
    ((df['성별'] == 'F') & (df['허리둘레(WAIST)'].astype(float) >= 85))
)

# 혈압 기준 설정
df['Elevated blood pressure'] = (
    ((df['SBP'].astype(float) >= 130) | 
     (df['DBP'].astype(float) >= 85)) | 
    (df['고혈압_투약여부'] == 1)
)

# 공복혈당 기준 설정
df['Impaired fasting glucose'] = (
    (df['GLUCOSE'].astype(float) >= 100) | 
    (df['당뇨_투약여부'] == 1)
)

# 중성지방 기준 설정
df['Elevated triglycerides'] = (
    (df['TG'].astype(float) >= 150) | 
    (df['고지혈증_투약여부'] == 1)
)

# 성별에 따른 HDL 콜레스테롤 기준 설정
df['Decreased HDL-C'] = (
    ((df['성별'] == 'M') & (df['HDL CHOL.'].astype(float) < 40)) | 
    ((df['성별'] == 'F') & (df['HDL CHOL.'].astype(float) < 50))
)

mets_tf = ['Increased waist circumference', 'Elevated blood pressure', 'Impaired fasting glucose', 'Elevated triglycerides', 'Decreased HDL-C']
df['MetS'] = (df[mets_tf] == 1).sum(axis=1).apply(lambda x: 1 if x >= 3 else 0)

df['일반담배_흡연여부'] = df['일반담배_흡연여부'].replace('Missing value', 0)
df['활동량'] = df['활동량'].replace('Missing value', 1)
df['음주'] = df['음주'].replace('Missing value', 1)
df['성별'] = df['성별'].replace('M', 0).replace('F', 1)

df['수진일'] = pd.to_datetime(df['수진일'])

In [3]:
demo_cols = ['수진일', '성별', '나이', '신장']  # 기본 인구통계학적 변수
lifestyle_cols = ['흡연', '활동량', '음주']  # 생활습관 변수
ffq_cols = ['간식빈도', '고지방 육류', '곡류', '과일', '단맛', '단백질류', '물', '밥 양','커피', '튀김',
                '식사 빈도', '식사량', '외식빈도', '유제품', '음료류', '인스턴트 가공식품', '짠 간', '짠 식습관', '채소']
biomarker_cols = ['SBP', 'DBP', 'WAIST', '체중', 'BMI', 'GLUCOSE', 'HBA1C', 'TG', 'HDL CHOL', 'LDL CHOL', 'eGFR']
mets_cols = ['Increased waist circumference', 'Elevated blood pressure', 'Impaired fasting glucose', 'Elevated triglycerides', 'Decreased HDL-C']

for col in mets_cols:
    if col in df.columns:
        df[col] = df[col].astype(int)

In [4]:
optimal_data = []

for patient_id, patient_data in df.groupby('R-ID'):
    patient_data = patient_data.sort_values('수진일')
    
    if len(patient_data) < 2:
        continue
    
    max_disease_change = 0
    best_pair = None
    
    for i in range(len(patient_data)):
        for j in range(i+1, len(patient_data)):
            visit1 = patient_data.iloc[i]
            visit2 = patient_data.iloc[j]
            
            disease_changes = 0
            for col in mets_cols:
                if col in patient_data.columns:
                    disease_changes += abs(visit2[col] - visit1[col])
            
            if disease_changes > max_disease_change:
                max_disease_change = disease_changes
                best_pair = (visit1, visit2)

    if best_pair is None:
        best_pair = (patient_data.iloc[0], patient_data.iloc[-1])

    visit1, visit2 = best_pair
    days_between = (visit2['수진일'] - visit1['수진일']).days
    
    row = {'R-ID': patient_id, 'days_between': days_between}
    
    for col in visit2.index:
        if col not in ['수진일', 'R-ID']:  # 제외할 컬럼들
            row[f'{col}'] = visit2[col]
    
    # 질병 상태 변화 (3클래스)
    for col in mets_cols:
        if col in visit1.index and col in visit2.index:
            change = visit2[col] - visit1[col]
            if change == -1:    # 개선
                target_class = 0
            elif change == 0:   # 유지
                target_class = 1
            else:               # 악화
                target_class = 2
            row[f'{col}_delta'] = target_class
    
    optimal_data.append(row)

df = pd.DataFrame(optimal_data)

In [5]:
korean_to_paper = {
    # Basic information
    'R-ID': 'R-ID',
    '나이': 'Age',

    '성별': 'Sex',
    '최종학력': 'Education Level',
    '결혼상태': 'Marital Status', 
    '가계수입': 'Household Income',
    
    'BMI category' : 'BMI category', 
    'WC (M>=90, F>=85)' : 'WC (M>=90, F>=85)',
    
    # Anthropometric measures
    '신장': 'Height (cm)',
    '체중': 'Weight (kg)',
    '체질량지수': 'BMI(kg/m2)',
    '허리둘레(WAIST)': 'Waist circumference (cm)',
        
    # Disease status and medication
    '고혈압_통합': 'Hypertension',
    '고혈압_투약여부': 'Antihypertensive Medication',
    '당뇨_통합': 'Diabetes Mellitus',
    '당뇨_투약여부': 'Antidiabetic Medication',
    '고지혈증_통합': 'Hyperlipidemia',
    '고지혈증_투약여부': 'Statin Medication',
    '협심증/심근경색증_통합': 'Angina/Myocardial Infarction',
    '뇌졸중(중풍)_통합': 'Stroke',
    'Chronic kidney disease (eGFR<60)': 'Chronic kidney disease (eGFR<60)',

    # Clinical measures
    'SBP': 'Systolic blood pressure (mmHg)',
    'DBP': 'Diastolic Blood Pressure (mmHg)',
    'CHOL.' : 'Total Cholesterol (mg/dL)',
    'TG': 'Triglycerides (mg/dL)',
    'LDL CHOL.': 'LDL-C (mg/dL)',
    'HDL CHOL.': 'HDL-C (mg/dL)',
    'nonHDLC': 'Non-HDL-C (mg/dL)',
    'GLUCOSE': 'Fasting glucose (mg/dL)',
    'HBA1C': 'HbA1c (%)',
    'eGFR': 'eGFR (mL/min/1.73m2)',

    # Lifestyle risk component
    '비만' : 'Obese', # Obese (BMI ≥ 25 kg/m2), Overweight (BMI 23-24.9 kg/m2), Normal (BMI < 23 kg/m2)
    '활동량': 'Physical activity', # Inactive (0), Minimally active (1), Active (2)
    '음주': 'Alcohol Consumption', # Non-drinker (0), Moderate drinker (1), Heavy drinker (2)
    '일반담배_흡연여부': 'Current smoking', # Non-smoker (0), Current smoker (1)

    # Dietary patterns
    '식사 빈도': 'Meal Frequency',
    '식사량': 'Meal Portion Size',
    '외식빈도': 'Eating Out Frequency',
    '밥 양': 'Rice Portion Size',
    '간식빈도': 'Snacking Frequency',
    
    # Food groups
    '곡류': 'Grain Products',
    '단백질류': 'Protein Foods',
    '채소': 'Vegetables',
    '유제품': 'Dairy Products',
    '과일': 'Fruits',
    '튀김': 'Fried Foods',
    '고지방 육류': 'High Fat Meat',
    '인스턴트 가공식품': 'Processed Foods',
    
    # Beverages
    '물': 'Water Intake',
    '커피': 'Coffee Consumption',
    '음료류': 'Sugar-Sweetened Beverages',
    
    # Taste preferences
    '짠 간': 'Additional Salt Use',
    '짠 식습관': 'Salty Food Consumption',
    '단맛': 'Sweet Food Consumption',

    # MetS
    'Increased waist circumference' : 'Increased waist circumference', 
    'Elevated blood pressure' : 'Elevated blood pressure', 
    'Impaired fasting glucose' : 'Impaired fasting glucose', 
    'Elevated triglycerides' : 'Elevated triglycerides', 
    'Decreased HDL-C' : 'Decreased HDL-C',
    'MetS': 'MetS'
}

df.rename(columns=korean_to_paper, inplace=True)

In [7]:
df = df[['R-ID', 'Age', 'Sex', 'Education Level', 'Marital Status', 
       'Household Income', 'BMI category', 'WC (M>=90, F>=85)', 'Height (cm)',
       'Weight (kg)', 'BMI(kg/m2)', 'Waist circumference (cm)', 'Hypertension',
       'Antihypertensive Medication', 'Diabetes Mellitus',
       'Antidiabetic Medication', 'Hyperlipidemia', 'Statin Medication',
       'Angina/Myocardial Infarction', 'Stroke',
       'Chronic kidney disease (eGFR<60)', 'Systolic blood pressure (mmHg)',
       'Diastolic Blood Pressure (mmHg)', 'Total Cholesterol (mg/dL)',
       'Triglycerides (mg/dL)', 'LDL-C (mg/dL)', 'HDL-C (mg/dL)',
       'Non-HDL-C (mg/dL)', 'Fasting glucose (mg/dL)', 'HbA1c (%)',
       'eGFR (mL/min/1.73m2)', 'Obese', 'Physical activity',
       'Alcohol Consumption', 'Current smoking', 'Meal Frequency',
       'Meal Portion Size', 'Eating Out Frequency', 'Rice Portion Size',
       'Snacking Frequency', 'Grain Products', 'Protein Foods', 'Vegetables',
       'Dairy Products', 'Fruits', 'Fried Foods', 'High Fat Meat',
       'Processed Foods', 'Water Intake', 'Coffee Consumption',
       'Sugar-Sweetened Beverages', 'Additional Salt Use',
       'Salty Food Consumption', 'Sweet Food Consumption',
       'Increased waist circumference', 'Elevated blood pressure',
       'Impaired fasting glucose', 'Elevated triglycerides', 'Decreased HDL-C',
       'MetS']]

In [8]:
age_col = ['Age'] 
AM_col = ['Height (cm)', 'Weight (kg)', 'BMI(kg/m2)', 'Waist circumference (cm)']
CM_col = ['Systolic blood pressure (mmHg)', 'Diastolic Blood Pressure (mmHg)', 'Triglycerides (mg/dL)',
          'HDL-C (mg/dL)', 'LDL-C (mg/dL)', 'Non-HDL-C (mg/dL)', 'Total Cholesterol (mg/dL)', 'Fasting glucose (mg/dL)', 'HbA1c (%)', 'eGFR (mL/min/1.73m2)']
mets_cols = ['Increased waist circumference', 'Elevated blood pressure', 'Impaired fasting glucose', 'Elevated triglycerides', 'Decreased HDL-C']
disease_cols = ['Hypertension', 'Antihypertensive Medication', 'Diabetes Mellitus', 'Antidiabetic Medication', 'Hyperlipidemia', 
                'Statin Medication', 'Angina/Myocardial Infarction', 'Stroke', 'Chronic kidney disease (eGFR<60)']

continuous_cols = age_col + AM_col + CM_col
categorical_cols = list(set(df.columns[1:]) - set(age_col) - set(AM_col) - set(CM_col)) + mets_cols + ['MetS']

df[continuous_cols] = df[continuous_cols].replace('Missing value', np.nan).fillna(0).astype(float)

In [9]:
all_df = df
men_df = df[df['Sex'] == 'M']
women_df = df[df['Sex'] == 'F']

In [10]:
results_by_category = {}
categorized_data = {}
base_df = df

In [11]:
def clean_and_convert(x):
    if isinstance(x, str):
        match = re.search(r'[-+]?\d*\.?\d+', x)
        if match:
            return float(match.group())
    return x

def format_numeric(values):
    if len(values) > 0:
        return f"{values.mean():.2f}±{values.std():.2f}"
    return ""

def format_numeric2(values):
    if len(values) > 0:
        median = np.median(values)
        q1 = np.percentile(values, 25)
        q3 = np.percentile(values, 75)
        return f"{median:.2f} ({q1:.2f}-{q3:.2f})"
    return ""

def format_categorical(count, total):
    if total > 0:
        return f"{count} ({count/total*100:.1f}%)"
    return "0 (0.0%)"

def age_filter(df, age_range):
    if age_range == "under40":
        return df[df['Age'] < 40]
    elif age_range == "40-49":
        return df[(df['Age'] >= 40) & (df['Age'] < 50)]
    elif age_range == "50-64":
        return df[(df['Age'] >= 50) & (df['Age'] < 65)]
    elif age_range == "65up":
        return df[df['Age'] >= 65]
    else:
        return df

def get_category(col, age_col, AM_col, CM_col, mets_cols, disease_cols):
    if col in age_col:
        return 'Age'
    elif col in AM_col:
        return 'Anthropometric Measures'
    elif col in CM_col:
        return 'Clinical Measures'
    elif col in mets_cols + ['MetS']:
        return 'Metabolic syndrome'
    elif col in disease_cols:
        return 'Disease status & Medication'
    elif col == 'Sex':
        return 'Sex'
    else:
        return col
    
def move_column(ws, old_index, new_index):
    max_row = ws.max_row
    
    column_data = []
    for row in range(1, max_row + 1):
        cell = ws.cell(row=row, column=old_index)
        column_data.append(cell.value)
    
    if old_index < new_index:
        for row in range(1, max_row + 1):
            for col in range(old_index, new_index):
                ws.cell(row=row, column=col).value = ws.cell(row=row, column=col+1).value
    else:
        for row in range(1, max_row + 1):
            for col in range(old_index, new_index, -1):
                ws.cell(row=row, column=col).value = ws.cell(row=row, column=col-1).value
    
    for row in range(1, max_row + 1):
        ws.cell(row=row, column=new_index).value = column_data[row-1]

def sep_group(base_df, type='met'):
    if type == 'met':
        mets_groups = {
            "Total": base_df,
            "MetS": base_df[base_df['MetS'] == 1],
            "Non-MetS": base_df[base_df['MetS'] == 0]
        }
    else:
        mets_groups = {
            "Total": base_df,
            "Non-MetS & Non-DM": base_df[(base_df['Diabetes Mellitus'] == 0) | (base_df['MetS'] == 0)],
            "MetS & Non-DM": base_df[(base_df['Diabetes Mellitus'] == 0) | (base_df['MetS'] == 1)],
            "DM": base_df[base_df['Diabetes Mellitus'] == 1]
        }
    return mets_groups

def read_p(p_value):
    if pd.isna(p_value):
        return ""
    elif p_value == "-":
        return "-"
    else:
        return f"{p_value:.3f}"

In [12]:
mets_groups = sep_group(base_df)
for mets_status, mets_df in mets_groups.items():
    print(f'{mets_status}: {len(mets_df)}')

for category_name, base_df in [("Men", men_df), ("Women", women_df)]:    
    mets_groups = sep_group(base_df)
    for mets_status, mets_df in mets_groups.items():
        print(f'{category_name}_{mets_status}: {len(mets_df)}')

Total: 11238
MetS: 3053
Non-MetS: 8185
Men_Total: 0
Men_MetS: 0
Men_Non-MetS: 0
Women_Total: 0
Women_MetS: 0
Women_Non-MetS: 0


In [13]:
for col in list(df.columns[1:]):
    base_df = all_df
    mets_groups = sep_group(base_df)
    category = get_category(col, age_col, AM_col, CM_col, mets_cols, disease_cols)
    
    if category not in categorized_data:
        categorized_data[category] = []
        category_row = {'Variable': category}
        for mets_status in mets_groups.keys():
            category_row[mets_status] = category
        categorized_data[category].append(category_row)
    
    if col in age_col + AM_col + CM_col:
        row = {'Variable': col}
        for mets_status, mets_df in mets_groups.items():
            values = mets_df[col].apply(clean_and_convert).dropna()
            if col == 'Triglycerides (mg/dL)':
                row[mets_status] = format_numeric2(values)
            else:
                row[mets_status] = format_numeric(values)
        categorized_data[category].append(row)
        
    else:
        temp_data = {mets_status: mets_df[col].astype(str).replace('nan', 'Missing value').value_counts() for mets_status, mets_df in mets_groups.items()}
        base_counts = base_df[col].astype(str).replace('nan', 'Missing value').value_counts()
        categories = list(base_counts.keys()) if len(base_counts) > 0 else []
        binary_numeric = set(categories).issubset({'0', '1', 'Missing value'})
        binary_boolean = set(categories).issubset({'True', 'False', 'Missing value'})
        
        group_counts = {mets_status: len(mets_df) for mets_status, mets_df in mets_groups.items()}
        
        if col == 'Sex':
            row = {'Variable': 'Male'}
            for mets_status, counts in temp_data.items():
                male_count = counts.get('M', 0)
                row[mets_status] = format_categorical(male_count, group_counts[mets_status])
            categorized_data[category].append(row)
            
        elif binary_numeric or binary_boolean:
            target_value = '1' if binary_numeric else 'True'
            row = {'Variable': col}
            
            for mets_status, counts in temp_data.items():
                target_count = counts.get(target_value, 0)
                row[mets_status] = format_categorical(target_count, group_counts[mets_status])
            categorized_data[category].append(row)
            
        else:
            categories = []
            missing_count = 0
            for val, count in base_counts.items():
                if val == 'Missing value':
                    missing_count = count
                else:
                    try:
                        categories.append((float(val), val, count))
                    except:
                        categories.append((-float('inf'), val, count))
            
            categories.sort(reverse=True)
            sorted_items = [(cat[1], cat[2]) for cat in categories]
            if missing_count > 0:
                sorted_items.append(('Missing value', missing_count))
            
            for val, _ in sorted_items:
                row = {'Variable': val}
                for mets_status, counts in temp_data.items():
                    val_count = counts.get(val, 0)
                    row[mets_status] = format_categorical(val_count, group_counts[mets_status])
                categorized_data[category].append(row)

results_by_category["All"] = categorized_data

for category_name, base_df in [("Men", men_df), ("Women", women_df)]:    
    mets_groups = sep_group(base_df)
    categorized_data = {}
    
    for col in list(df.columns[1:]):
        category = get_category(col, age_col, AM_col, CM_col, mets_cols, disease_cols)
        
        if category not in categorized_data:
            categorized_data[category] = []
            category_row = {'Variable': category}
            for mets_status in mets_groups.keys():
                for age_range in ["under40", "40-49", "50-64", "65up"]:
                    col_key = f"{mets_status}_{age_range}"
                    category_row[col_key] = category
            categorized_data[category].append(category_row)
        
        if col in age_col + AM_col + CM_col:
            row = {'Variable': col}            
            for mets_status, mets_df in mets_groups.items():
                for age_range in ["under40", "40-49", "50-64", "65up"]:
                    age_df = age_filter(mets_df, age_range)
                    values = age_df[col].apply(clean_and_convert).dropna()
                    col_key = f"{mets_status}_{age_range}"
                    if col == 'Triglycerides (mg/dL)':
                        row[col_key] = format_numeric2(values)
                    else:
                        row[col_key] = format_numeric(values)
            categorized_data[category].append(row)
            
        else:
            temp_data = {}
            for mets_status, mets_df in mets_groups.items():
                for age_range in ["under40", "40-49", "50-64", "65up"]:
                    age_df = age_filter(mets_df, age_range)
                    col_key = f"{mets_status}_{age_range}"
                    temp_data[col_key] = age_df[col].astype(str).replace('nan', 'Missing value').value_counts()
            
            base_counts = base_df[col].astype(str).replace('nan', 'Missing value').value_counts()
            categories = list(base_counts.keys()) if len(base_counts) > 0 else []
            binary_numeric = set(categories).issubset({'0', '1', 'Missing value'})
            binary_boolean = set(categories).issubset({'True', 'False', 'Missing value'})
            
            group_counts = {}
            for mets_status, mets_df in mets_groups.items():
                for age_range in ["under40", "40-49", "50-64", "65up"]:
                    age_df = age_filter(mets_df, age_range)
                    col_key = f"{mets_status}_{age_range}"
                    group_counts[col_key] = len(age_df)
            
            if col == 'Sex':
                row = {'Variable': 'Male'}
                for col_key, counts in temp_data.items():
                    male_count = counts.get('M', 0)
                    row[col_key] = format_categorical(male_count, group_counts[col_key])
                categorized_data[category].append(row)
                
            elif binary_numeric or binary_boolean:
                target_value = '1' if binary_numeric else 'True'
                row = {'Variable': col}
                
                for col_key, counts in temp_data.items():
                    target_count = counts.get(target_value, 0)
                    row[col_key] = format_categorical(target_count, group_counts[col_key])
                categorized_data[category].append(row)
                
            else:
                categories = []
                missing_count = 0
                
                for val, count in base_counts.items():
                    if val == 'Missing value':
                        missing_count = count
                    else:
                        try:
                            categories.append((float(val), val, count))
                        except:
                            categories.append((-float('inf'), val, count))
                
                categories.sort(reverse=True)
                sorted_items = [(cat[1], cat[2]) for cat in categories]
                if missing_count > 0:
                    sorted_items.append(('Missing value', missing_count))
                
                for val, _ in sorted_items:
                    row = {'Variable': val}
                    for col_key, counts in temp_data.items():
                        val_count = counts.get(val, 0)
                        row[col_key] = format_categorical(val_count, group_counts[col_key])
                    categorized_data[category].append(row)
    
    results_by_category[category_name] = categorized_data

In [14]:
from scipy import stats
import re

def calculate_total_p_values(base_df):
    base_df = base_df.replace('Missing value', np.nan)

    mets_df = base_df[base_df['MetS'] == 1]
    non_mets_df = base_df[base_df['MetS'] == 0]
    
    p_values = {}
    
    def calculate_p_value(group1, group2, col):
        values1 = group1[col].apply(clean_and_convert).dropna()
        values2 = group2[col].apply(clean_and_convert).dropna()
        
        if col in age_col + AM_col + CM_col: # 연속형 변수인 경우 t-test 사용
            _, p_value = stats.ttest_ind(values1, values2, equal_var=False)
            return p_value
        elif col in categorical_cols: # 범주형 변수인 경우 chi-square test 사용
            values1_str = values1.astype(str)
            values2_str = values2.astype(str)
            
            all_unique = np.unique(np.concatenate([values1_str, values2_str]))
            
            if set(all_unique).issubset({'0', '1'}): # 이진 범주 변수 처리
                cont_table = pd.crosstab(
                    pd.Series(np.concatenate([values1_str, values2_str])),
                    pd.Series(np.concatenate([np.ones(len(values1_str)), np.zeros(len(values2_str))])))
                for category in ['0', '1']:
                    if category not in cont_table.index:
                        cont_table.loc[category] = [0, 0]
                cont_table = cont_table.fillna(0)
                if cont_table.shape[0] > 1 and cont_table.shape[1] > 1:
                    _, p_value, _, _ = stats.chi2_contingency(cont_table)
                    return p_value
                
            else: # 다범주 변수 처리
                contingency = {}
                for category in all_unique:
                    contingency[category] = [sum(values1_str == category), sum(values2_str == category)]
                cont_table = pd.DataFrame.from_dict(contingency, orient='index')                    
                if cont_table.shape[0] > 1 and not (cont_table == 0).any(axis=None):
                    _, p_value, _, _ = stats.chi2_contingency(cont_table)
                    return p_value
        return np.nan

    for col in list(base_df.columns[1:]) + mets_cols:
        p_value = calculate_p_value(mets_df, non_mets_df, col)
        p_values[col] = p_value
        
    return pd.DataFrame({'Variable': list(p_values.keys()), 'p_value': list(p_values.values())})

def calculate_gender_age_p_values(gender_df):
    gender_df = gender_df.replace('Missing value', np.nan)

    age_groups = ["under40", "40-49", "50-64", "65up"]
    results_dict = {}
    
    def perform_age_anova(df, col):
        age_data = []
        age_labels = []    
        for age_range in age_groups:
            age_df = age_filter(df, age_range)
            values = age_df[col].apply(clean_and_convert).dropna()            
            age_data.extend(values)
            age_labels.extend([age_range] * len(values))            
        
        if col in age_col + AM_col + CM_col:  # 연속형 변수는 일원배치 ANOVA
            groups = []
            for age_range in age_groups:
                group_data = [age_data[i] for i in range(len(age_data)) if age_labels[i] == age_range]
                groups.append(group_data)
            _, p_value = stats.f_oneway(*groups)
        
        elif col in categorical_cols:  # 범주형 변수는 카이제곱 검정
            age_labels_array = np.array(age_labels)
            age_data_array = np.array(age_data)
            unique_ages = np.unique(age_labels_array)
            unique_values = np.unique(age_data_array)

            contingency_table = np.zeros((len(unique_values), len(unique_ages)))
            for i, val in enumerate(unique_values):
                for j, age in enumerate(unique_ages):
                    contingency_table[i, j] = np.sum((age_data_array == val) & (age_labels_array == age))
            if contingency_table.shape[0] > 1 and contingency_table.shape[1] > 1:
                _, p_value, _, _ = stats.chi2_contingency(contingency_table)
            else:
                p_value = '-'

        else:
            p_value = '해당없는 변수'

        return p_value

    for col in list(df.columns[1:]) + mets_cols:   
        if col not in results_dict:
            results_dict[col] = {}
            
        results_dict[col]['Total'] = perform_age_anova(gender_df, col)
        
        mets_df = gender_df[gender_df['MetS'] == 1]
        results_dict[col]['MetS'] = perform_age_anova(mets_df, col)
        
        non_mets_df = gender_df[gender_df['MetS'] == 0]
        results_dict[col]['NonMetS'] = perform_age_anova(non_mets_df, col)
    
    return pd.DataFrame.from_dict(results_dict, orient='index')

total_p_values = calculate_total_p_values(df)
men_p_values = calculate_gender_age_p_values(men_df)
women_p_values = calculate_gender_age_p_values(women_df)



In [15]:
from openpyxl import Workbook
from openpyxl.styles import Alignment, Border, Side, Font

wb = Workbook()
default_sheet = wb.active
wb.remove(default_sheet)

thin_border = Border(left=Side(style='thin'), right=Side(style='thin'), top=Side(style='thin'), bottom=Side(style='thin'))
bold_font = Font(bold=True)

In [16]:
# All 시트 생성 (연령대 구분 없음)
if "All" in wb.sheetnames:
    del wb["All"]
ws = wb.create_sheet("All")

# 헤더 생성 - Category 열 제거
ws.merge_cells(start_row=1, start_column=1, end_row=2, end_column=1)
cell = ws.cell(row=1, column=1, value="Variable")
cell.alignment = Alignment(horizontal='center', vertical='center')
cell.border = thin_border
cell.font = bold_font

# Total, MetS, Non-MetS 헤더
for i, status in enumerate(["Total", "MetS", "Non-MetS"], 2):
    cell = ws.cell(row=1, column=i, value=status)
    ws.merge_cells(start_row=1, start_column=i, end_row=2, end_column=i)
    cell.alignment = Alignment(horizontal='center', vertical='center')
    cell.border = thin_border
    cell.font = bold_font

# p-value 헤더 추가
cell = ws.cell(row=1, column=5, value="p-value")
ws.merge_cells(start_row=1, start_column=5, end_row=2, end_column=5)
cell.alignment = Alignment(horizontal='center', vertical='center')
cell.border = thin_border
cell.font = bold_font

# 데이터 행 추가
row_idx = 3

p_value_dict = dict(zip(total_p_values['Variable'], total_p_values['p_value']))
for category, rows in results_by_category["All"].items():
    cell = ws.cell(row=row_idx, column=1, value=category)
    cell.font = bold_font
    cell.border = thin_border
    
    # p-value 추가
    p_value = p_value_dict.get(category, np.nan)
    p_value_str = read_p(p_value)
    cell = ws.cell(row=row_idx, column=6, value=p_value_str)
    cell.border = thin_border

    ws.merge_cells(start_row=row_idx, start_column=1, end_row=row_idx, end_column=4)
    
    row_idx += 1
    
    for i, row_data in enumerate(rows):
        if i == 0 and row_data['Variable'] == row_data.get('Total', '') == category:
            continue
            
        # 변수 이름 추가
        variable_name = row_data['Variable']
        ws.cell(row=row_idx, column=1, value=variable_name).border = thin_border
        
        # Total, MetS, Non-MetS 데이터 추가
        for col_idx, status in enumerate(["Total", "MetS", "Non-MetS"], 2):
            cell = ws.cell(row=row_idx, column=col_idx, value=row_data.get(status, ""))
            cell.border = thin_border

        # p-value 추가
        p_value = p_value_dict.get(variable_name, np.nan)
        p_value_str = read_p(p_value)
        cell = ws.cell(row=row_idx, column=6, value=p_value_str)            
        cell.border = thin_border

        row_idx += 1

# 열 너비 조정
ws.column_dimensions['A'].width = 38
for col_letter in ['B', 'C', 'D', 'E']:
    ws.column_dimensions[col_letter].width = 15

In [17]:
# 남여 시트 생성 (연령대 구분 있음)
for category_name, p_values_df in [("Men", men_p_values), ("Women", women_p_values)]:
    if category_name in wb.sheetnames:
        del wb[category_name]
    ws = wb.create_sheet(category_name)
    
    # 헤더 생성 - Category 열 제거
    ws.merge_cells(start_row=1, start_column=1, end_row=2, end_column=1)
    cell = ws.cell(row=1, column=1, value="Variable")
    cell.alignment = Alignment(horizontal='center', vertical='center')
    cell.border = thin_border
    cell.font = bold_font
    
    # 메타볼릭 신드롬 상태 헤더
    cell = ws.cell(row=1, column=2, value="Total")
    cell.alignment = Alignment(horizontal='center')
    cell.border = thin_border
    cell.font = bold_font
    
    cell = ws.cell(row=1, column=6, value="MetS")
    cell.alignment = Alignment(horizontal='center')
    cell.border = thin_border
    cell.font = bold_font
    
    cell = ws.cell(row=1, column=10, value="Non-MetS")
    cell.alignment = Alignment(horizontal='center')
    cell.border = thin_border
    cell.font = bold_font
    
    # p-value 헤더 추가
    cell = ws.cell(row=1, column=14, value="")
    cell.alignment = Alignment(horizontal='center')
    cell.border = thin_border
    cell.font = bold_font
    
    # p-value 서브헤더 추가
    for i, status in enumerate(["P-value", "P-value", "P-value"], 0):
        col = 14 + i
        cell = ws.cell(row=2, column=col, value=status)
        cell.alignment = Alignment(horizontal='center')
        cell.border = thin_border
        cell.font = bold_font
    
    # 연령대 헤더
    age_labels = ["age < 40", "40 ≤ age < 50", "50 ≤ age < 65", "65 ≤"]
    for base_col, prefix in [(2, "Total"), (6, "MetS"), (10, "Non-MetS")]:
        for i, age_label in enumerate(age_labels):
            cell = ws.cell(row=2, column=base_col + i, value=age_label)
            cell.alignment = Alignment(horizontal='center')
            cell.border = thin_border
            cell.font = bold_font
    
    # 데이터 행 추가
    row_idx = 3
    
    for category, rows in results_by_category[category_name].items():
        cell = ws.cell(row=row_idx, column=1, value=category)
        cell.font = bold_font
        cell.border = thin_border
                
        # 카테고리에 대한 p-value 추가
        if category in p_values_df.index:
            for i, status in enumerate(['Total', 'MetS', 'NonMetS']):
                p_value = p_values_df.loc[category].get(status, np.nan)
                if pd.isna(p_value):
                    p_value_str = ""
                elif p_value == "-":
                    p_value_str = "-"
                else:
                    p_value_str = f"{p_value:.3f}"
                
                cell = ws.cell(row=row_idx, column=14 + i, value=p_value_str)
                cell.border = thin_border

        row_idx += 1
        
        for i, row_data in enumerate(rows):
            if i == 0:
                is_category_row = True
                for mets_status in ["Total", "MetS", "Non-MetS"]:
                    for age_range in ["under40", "40-49", "50-64", "65up"]:
                        col_key = f"{mets_status}_{age_range}"
                        if row_data.get(col_key, "") != category:
                            is_category_row = False
                            break
                    if not is_category_row:
                        break
                
                if is_category_row:
                    continue
            
            # 변수 이름 추가
            variable_name = row_data['Variable']
            ws.cell(row=row_idx, column=1, value=variable_name).border = thin_border
            
            # 데이터 추가
            col_idx = 2
            for mets_status in ["Total", "MetS", "Non-MetS"]:
                for age_range in ["under40", "40-49", "50-64", "65up"]:
                    col_key = f"{mets_status}_{age_range}"
                    cell = ws.cell(row=row_idx, column=col_idx, value=row_data.get(col_key, ""))
                    cell.border = thin_border
                    col_idx += 1
            
            # 수정된 p-value 처리 방식 적용
            if variable_name in p_values_df.index:
                for i, status in enumerate(['Total', 'MetS', 'NonMetS']):
                    p_value = p_values_df.loc[variable_name].get(status, np.nan)
                    
                    if pd.isna(p_value):
                        p_value_str = ""
                    elif p_value == "-":
                        p_value_str = "-"
                    else:
                        p_value_str = f"{p_value:.3f}"
                    
                    cell = ws.cell(row=row_idx, column=14 + i, value=p_value_str)
                    cell.border = thin_border
            
            row_idx += 1
    
    # 열 너비 조정
    ws.column_dimensions['A'].width = 38
    for col_letter in ['B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']:
        ws.column_dimensions[col_letter].width = 15
    
    move_column(ws, 14, 6)
    move_column(ws, 15, 11)

    ws.merge_cells(start_row=1, start_column=2, end_row=1, end_column=6)
    ws.merge_cells(start_row=1, start_column=7, end_row=1, end_column=11)
    ws.merge_cells(start_row=1, start_column=12, end_row=1, end_column=16)

In [18]:
excel_filename = 'analysis_results_by_category_MetS.xlsx'
wb.save(excel_filename)