In [18]:
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt 
import seaborn as sns
from IPython.display import display

# Notebook-wide pandas display settings: show every column and every row,
# and render floats with four decimal places.
display_options = {
    "display.max_columns": None,
    "display.max_rows": None,
    "display.float_format": "{:.4f}".format,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Locations of the Excel workbooks, relative to the notebook's working directory.
input_data_path = "../data/input_data.xlsx"
final_data_path = "../data/final_data.xlsx"
imputed_data_path = "../data/imputed_data.xlsx"
nutrition_data_path = "../data/nutrition_data.xlsx"

# pd.read_excel opens the workbook, parses the requested sheet and closes the
# file handle again; the previous pd.ExcelFile(...).parse(...) form left every
# workbook handle open for the lifetime of the kernel.
# The trailing shapes are the values noted by the notebook author.
input_df = pd.read_excel(input_data_path, sheet_name="Sheet1")
final_df = pd.read_excel(final_data_path, sheet_name="Sheet2")
imputed_df = pd.read_excel(imputed_data_path, sheet_name="Sheet1")  # (341, 38)
nutrition_df = pd.read_excel(nutrition_data_path, sheet_name="Sheet1")  # (2191, 64)

In [19]:
# ATTACH THE SUBJECT NUMBER TO THE IMPUTED DATA

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on re-execution; the guard keeps the cell idempotent.
if "sub_id" not in imputed_df.columns:
    # The Series is aligned on the row index when inserted.
    # NOTE(review): assumes imputed_df and final_df list subjects in the same
    # row order with the same index — confirm.
    imputed_df.insert(0, "sub_id", final_df["number"])

In [20]:
# STRIP THE LEADING "TPLS_" PREFIX FROM THE SUBJECT IDS

tpls_prefix_pattern = r"^TPLS_"

# Rows whose id starts with the prefix (case-insensitive). Non-string / missing
# ids evaluate to False and are therefore left untouched.
has_tpls_prefix = nutrition_df["Number"].str.contains(
    tpls_prefix_pattern, case=False, na=False
)

# The replacement must use the same case-insensitive matching as the mask;
# previously it was case-sensitive, so ids such as "tpls_001" were selected by
# the mask but written back unchanged.
stripped_numbers = nutrition_df["Number"].str.replace(
    tpls_prefix_pattern, "", case=False, regex=True
)
nutrition_df.loc[has_tpls_prefix, "Number"] = stripped_numbers[has_tpls_prefix]

In [21]:
# PREPARE IMPUTED_SUBSET

# Rename the blood cholesterol column so it cannot be confused with the dietary
# cholesterol column of the nutrition table.
imputed_df = imputed_df.rename(columns={"cholesterol": "cholesterol_blood"})

removed_cols = ["waist", "hip", "SBP", "DBP"]

imputed_subset = imputed_df.drop(columns=removed_cols).copy()

# Insulin-resistance flag: index = insulin * glucose_ac / 405, "no" when the
# index is <= 3.16, otherwise "yes".
# NOTE(review): this has the form of the HOMA-IR index (insulin in uU/mL,
# fasting glucose in mg/dL) — confirm the units and the 3.16 cut-off.
HOMA_IR_DIVISOR = 405
HOMA_IR_CUTOFF = 3.16

homa_ir = (imputed_subset["insulin"] * imputed_subset["glucose_ac"]) / HOMA_IR_DIVISOR
insulin_re = pd.Series(
    np.where(homa_ir <= HOMA_IR_CUTOFF, "no", "yes"),
    index=homa_ir.index,
    dtype="object",
)
# A missing insulin or glucose value gives a missing flag. The previous
# row-wise lambda labelled such rows "yes" because a comparison with NaN is
# False.
imputed_subset["insulin_re"] = insulin_re.where(homa_ir.notna())

# The raw inputs of the index are not needed once the flag exists.
imputed_subset = imputed_subset.drop(columns=["insulin", "glucose_ac"])

imputed_subset.columns

Index(['sub_id', 'age', 'zbmi', 'cholesterol_blood', 'TG', 'HDL', 'cortisol',
       'CPP', 'sex', 'family_income', 'pregnancy_smoking', 'GDM',
       'gestational_weight_gain', 'preterm_birth', 'gestational_age_week',
       'birth_weight_gram', 'exclusive_breastfeeding_month',
       'mixed_breastfeeding_month', 'father_diabetes', 'mother_diabetes',
       'education_level', 'father_BMI', 'mother_BMI',
       'sedentary_lifestyle_hour_day', 'low_physical_activity_hour_day',
       'sleep_duration', 'PSQI_score', 'snoring_times_week',
       'sleep_apnea_times_week', 'score_5A', 'PSS', 'score_5B', 'CES_D',
       'score_5C', 'RSE', 'insulin_re'],
      dtype='object')

In [22]:
# ASSESS HOW MUCH NUTRITION DATA IS MISSING

nutrition_cols = nutrition_df.columns

# Percentage of missing values per column, rounded to two decimals.
missing_rate_pct = nutrition_df.isna().mean().mul(100).round(2)

# Columns whose missing rate exceeds 40 %.
high_missing_cols = missing_rate_pct[missing_rate_pct > 40].index.tolist()

# Inspect `missing_rate_pct` to see the missing-data rate of every column.


In [23]:
# FILL MISSING NUTRITION VALUES WITH THE COLUMN MEDIAN

# Medians are computed for numeric columns only; non-numeric columns are left
# as they are because they have no entry in `numeric_medians`.
numeric_medians = nutrition_df.median(numeric_only=True)
nutrition_df = nutrition_df.fillna(value=numeric_medians)

# TODO: document the reasoning for choosing median imputation.

In [24]:
# RENAME THE NUTRITION TABLE COLUMNS

# Raw workbook header -> snake_case name used by the rest of the notebook.
column_names_map = {
    # identifier
    "Number": "sub_id",
    # energy, macronutrients and water
    "Energy(kcal)": "energy_kcal",
    "Crude_Protein(g)": "crude_protein_g",
    "Crude_Fat(g)": "crude_fat_g",
    "Total_Carbohydrate(g)": "total_carbohydrate_g",
    "Water(g)": "water_g",
    # food groups
    "Fruits": "fruits",
    "Vegetables": "vegetables",
    "WholeGrainsAndRoots": "whole_grains_roots",
    "protein(LowFat)": "protein_low_fat",
    "protein(MediumFat)": "protein_medium_fat",
    "protein(HighFat)": "protein_high_fat",
    "protein(SuperHighFat)": "protein_superhigh_fat",
    "Dairy(Skim)": "dairy_skim",
    "Dairy(LowFat)": "dairy_low_fat",
    "Dairy(WholeFat)": "dairy_whole_fat",
    "Oils_Nuts_And_Seeds": "oils_nuts_and_seeds",
    # fatty acids
    "Total_Polyunsaturated_Fatty_Acids(g)": "total_polyunsaturated_fatty_acids_g",
    "Total_Monounsaturated_Fatty_Acids(g)": "total_monounsaturated_fatty_acids_g",
    "Total_Saturated_Fatty_Acids(g)": "total_saturated_fatty_acids_g",
    # fiber and sugars
    "CrudeFiber(g)": "crude_fiber_g",
    "Dietary_Fiber(g)": "dietary_fiber_g",
    "Total_Sugar(g)": "total_sugar_g",
    "Glucose(g)": "glucose_g",
    "Fructose(g)": "fructose_g",
    "Maltose(g)": "maltose_g",
    "Sucrose(g)": "sucrose_g",
    "Lactose(g)": "lactose_g",
    # cholesterol and minerals
    "Cholesterol(mg)": "cholesterol_mg",
    "Sodium(mg)": "sodium_mg",
    "Potassium(mg)": "potassium_mg",
    "Calcium(mg)": "calcium_mg",
    "Magnesium(mg)": "magnesium_mg",
    "Phosphorus(mg)": "phosphorus_mg",
    "Iron(mg)": "iron_mg",
    "Zinc(mg)": "zinc_mg",
    # vitamins
    "VitaminB1(Thiamin)(mg)": "vitamin_b1_mg",
    "VitaminB2(Riboflavin)(mg)": "vitamin_b2_mg",
    "Niacin(mg)": "niacin_mg",
    "VitaminB6(mg)": "vitamin_b6_mg",
    "VitaminB12(ug)": "vitamin_b12_ug",
    "FolicAcid(ug)": "folicacid_ug",
    "VitaminC(mg)": "vitaminc_mg",
    "TotalVitaminA(I.U.)": "totalVitaminA_ui",
    "TotalVitaminE(mg)": "totalVitaminE_mg",
    # trans fat (name unchanged)
    "trans_fat": "trans_fat",
    # amino acids
    "Threonine(mg)": "threonine_mg",
    "Valine(mg)": "valine_mg",
    "Methionine(mg)": "methionine_mg",
    "Isoleucine(mg)": "isoleucine_mg",
    "Leucine(mg)": "leucine_mg",
    "Phenylalanine(mg)": "phenylalanine_mg",
    "Lysine(mg)": "lysine_mg",
    "Histidine(mg)": "histidine_mg",
    "Tryptophan(mg)": "tryptophan_mg",
    "Alanine(mg)": "alanine_mg",
    "Arginine(mg)": "arginine_mg",
    "AsparticAcid(mg)": "aspartic_mg",
    "GlutamicAcid(mg)": "glutamic_mg",
    "Glycine(mg)": "glycine_mg",
    "Proline(mg)": "proline_mg",
    "Serine(mg)": "serine_mg",
    "Tyrosine(mg)": "tyrosine_mg",
    "Cystine(mg)": "cystine_mg",
}

nutrition_df = nutrition_df.rename(columns=column_names_map)

# len(column_names_map) - 1 == 63 nutrient columns (the "Number" entry excluded).

In [25]:
# SOME PROBLEMATIC COLUMNS

# These two columns do not appear in the pie chart, so they are dropped.
excluded_nutrient_cols = ["total_sugar_g", "crude_fiber_g"]
nutrition_df = nutrition_df.drop(columns=excluded_nutrient_cols)
# Open question: Cu (copper) appears in the pie chart but not in the database.
# Open question: which column feeds "calories" in the pie chart — energy_kcal
# in nutrition_df or calories(kcal) in imputed_subset?


In [26]:
# COMBINE RELATED COLUMNS INTO FAMILY-LEVEL AGGREGATES

# Aggregate column -> member columns that are summed row-wise.
family_members = {
    # EAA
    "EAA": [
        "threonine_mg", "valine_mg", "methionine_mg", "isoleucine_mg",
        "leucine_mg", "phenylalanine_mg", "lysine_mg", "histidine_mg",
        "tryptophan_mg",
    ],
    # NEAA
    "NEAA": [
        "alanine_mg", "arginine_mg", "aspartic_mg", "glutamic_mg",
        "glycine_mg", "proline_mg", "serine_mg", "tyrosine_mg", "cystine_mg",
    ],
    # PLMF: low-fat + medium-fat protein
    "PLMF": ["protein_low_fat", "protein_medium_fat"],
    # PHSF: high-fat + super-high-fat protein
    "PHSF": ["protein_high_fat", "protein_superhigh_fat"],
    # monosaccharides
    "monosaccharides": ["glucose_g", "fructose_g"],
    # disaccharides
    "disaccharides": ["maltose_g", "sucrose_g", "lactose_g"],
    # dairy (skim & low fat)
    "dairy_skim_low_fat": ["dairy_skim", "dairy_low_fat"],
}

# skipna=False keeps the semantics of a chained `+`: if any member is missing,
# the aggregate is missing as well.
for family_name, member_cols in family_members.items():
    nutrition_df[family_name] = nutrition_df[member_cols].sum(axis=1, skipna=False)

# VitB: B12 and folic acid are recorded in micrograms while the other B
# vitamins are in milligrams. Convert the microgram columns to milligrams
# before summing so that the result really is in mg, as the column name states
# (previously the raw ug values were added to the mg values unchanged).
UG_PER_MG = 1000
vitamin_b_mg_cols = ["vitamin_b1_mg", "vitamin_b2_mg", "niacin_mg", "vitamin_b6_mg"]
vitamin_b_ug_cols = ["vitamin_b12_ug", "folicacid_ug"]
nutrition_df["totalVitaminB_mg"] = (
    nutrition_df[vitamin_b_mg_cols].sum(axis=1, skipna=False)
    + nutrition_df[vitamin_b_ug_cols].sum(axis=1, skipna=False) / UG_PER_MG
)

In [27]:
# PREPARE NUTRI_SUBSET

# Columns kept for the exposure table, grouped by family. Dict order is the
# final column order.
nutri_subset_groups = {
    "id": ["sub_id"],
    "energy": ["energy_kcal"],
    # protein group
    "protein": ["crude_protein_g", "EAA", "NEAA", "PLMF", "PHSF"],
    # fat group
    "fat": [
        "crude_fat_g",
        "trans_fat",
        "total_saturated_fatty_acids_g",
        "total_monounsaturated_fatty_acids_g",
        "total_polyunsaturated_fatty_acids_g",
        "cholesterol_mg",
        "oils_nuts_and_seeds",
    ],
    # carbohydrate group
    "carbohydrates": [
        "total_carbohydrate_g",
        "whole_grains_roots",
        "monosaccharides",
        "disaccharides",
    ],
    # water, fruit and vegetable group
    "water_fruit_vegetables": ["water_g", "fruits", "vegetables", "dietary_fiber_g"],
    # dairy group
    "dairy": ["dairy_whole_fat", "dairy_skim_low_fat"],
    # ion group
    "ions": [
        "sodium_mg",
        "potassium_mg",
        "calcium_mg",
        "magnesium_mg",
        "phosphorus_mg",
        "iron_mg",
        "zinc_mg",
    ],
    # vitamin group
    "vitamins": ["totalVitaminA_ui", "totalVitaminB_mg", "vitaminc_mg", "totalVitaminE_mg"],
}

nutri_subset_cols = [
    column_name
    for group_cols in nutri_subset_groups.values()
    for column_name in group_cols
]

# Independent copy so later edits do not write back into nutrition_df.
nutri_subset = nutrition_df[nutri_subset_cols].copy()

In [28]:
# RENAME COLUMNS

# snake_case column name -> short label used for the exposure variables.
# "sub_id" is not listed and therefore keeps its name.
nutri_subset_names_mapper = {
    "energy_kcal": "Calories",
    # protein group
    "crude_protein_g": "Crude_protein",
    "EAA": "EAA",
    "NEAA": "NEAA",
    "PLMF": "PLMF",
    "PHSF": "PHSF",
    # fat group
    "crude_fat_g": "Crude_fat",
    "trans_fat": "Trans_fat",
    "total_saturated_fatty_acids_g": "TFA_S",
    "total_monounsaturated_fatty_acids_g": "TFA_M",
    "total_polyunsaturated_fatty_acids_g": "TFA_P",
    "cholesterol_mg": "Cholesterol",
    "oils_nuts_and_seeds": "Seeds",
    # carbohydrate group
    "total_carbohydrate_g": "Total_carbohydrate",
    "whole_grains_roots": "Whole_Rhizome",
    "monosaccharides": "Monosaccharides",
    "disaccharides": "Disaccharides",
    # water, fruit and vegetable group
    "water_g": "Water",
    "fruits": "Fruit",
    "vegetables": "Vegetable",
    "dietary_fiber_g": "Dietary_fiber",
    # dairy group
    "dairy_whole_fat": "Dairy_fullfat",
    "dairy_skim_low_fat": "Dairy_skim_lowfat",
    # ion group
    "sodium_mg": "Sodium",
    "potassium_mg": "Potassium",
    "calcium_mg": "Calcium",
    "magnesium_mg": "Magnesium",
    "phosphorus_mg": "Phosphorus",
    "iron_mg": "Iron",
    "zinc_mg": "Zinc",
    # vitamin group
    "totalVitaminA_ui": "VitA",
    "totalVitaminB_mg": "VitB",
    "vitaminc_mg": "VitC",
    "totalVitaminE_mg": "VitE",
}

nutri_subset = nutri_subset.rename(columns=nutri_subset_names_mapper)



In [29]:
# MERGE DATA

# Inner join on the subject id: only subjects present in both the imputed
# table and the nutrition table are kept.
merged_df = (
    imputed_subset
    .merge(nutri_subset, how="inner", on="sub_id")
    .reset_index(drop=True)
)

In [30]:
# phenotype.columns

### Main components for rexposome data

# This cell used to contain R syntax (c(...), list(name = ...)), which is a
# SyntaxError under the Python kernel. The same definitions are expressed here
# as a Python list and dict so the notebook runs top to bottom.

# Columns treated as phenotype variables.
pheno_cols = ["age", "zbmi", "sex", "insulin_re"]

# Exposure family -> member columns (insertion order is preserved).
# NOTE(review): the family labels "Biomaker" and "Diary" look like typos of
# "Biomarker" and "Dairy"; they are kept as written in case other code refers
# to them by these names — confirm before renaming.
families = {
    "Prenatal": ["pregnancy_smoking", "GDM", "gestational_weight_gain"],
    "Perinatal": ["preterm_birth", "gestational_age_week", "birth_weight_gram"],
    "Postnatal": ["CPP", "exclusive_breastfeeding_month", "mixed_breastfeeding_month"],
    "Biomaker": ["cholesterol_blood", "TG", "HDL", "cortisol"],
    "Family": [
        "family_income", "father_diabetes", "mother_diabetes",
        "education_level", "father_BMI", "mother_BMI",
    ],
    "Activity": ["sedentary_lifestyle_hour_day", "low_physical_activity_hour_day"],
    "Sleep": ["sleep_duration", "PSQI_score", "snoring_times_week", "sleep_apnea_times_week"],
    "Psycho": ["score_5A", "PSS", "score_5B", "CES_D", "score_5C", "RSE"],

    "Calories": ["Calories"],
    "Protein": ["Crude_protein", "EAA", "NEAA", "PLMF", "PHSF"],
    "Fat": ["Crude_fat", "Trans_fat", "TFA_S", "TFA_M", "TFA_P", "Cholesterol", "Seeds"],
    "Carbohydrates": ["Total_carbohydrate", "Whole_Rhizome", "Monosaccharides", "Disaccharides"],
    # "Dietary_fiber" (with underscore) is the column name produced by the
    # rename step; the earlier spelling "Dietary fiber" matched no column.
    "Water & fruits & vegetables": ["Water", "Fruit", "Vegetable", "Dietary_fiber"],
    "Diary": ["Dairy_fullfat", "Dairy_skim_lowfat"],
    "Ions": ["Sodium", "Potassium", "Calcium", "Magnesium", "Phosphorus", "Iron", "Zinc"],
    "Vitamins": ["VitA", "VitB", "VitC", "VitE"],
}


In [31]:
# RECODE THE LABELS OF THE CATEGORICAL VARIABLES

cpp_code = {0: "no", 1: "yes"}
sex_code = {0: "female", 1: "male"}
pregnancy_smoking_code = {0: "no", 1: "yes"}
# GDM_code = {"no-dia": 1, "gmd": 2, "other-dia": 3}
GDM_code = {1: "no", 2: "yes", 3: "no"}
preterm_birth_code = {0: "no", 1: "yes"}
father_diabetes_code = {0: "no", 1: "yes"}
mother_diabetes_code = {0: "no", 1: "yes"}
# education_level_code = {"senior high": 1, "vocational": 2, "graduate": 3, "college": 4}
education_level_code = {1: "low", 2: "low", 3: "high", 4: "medium"}

# Column -> code dictionary for the unordered (nominal) variables.
nominal_codes = {
    "CPP": cpp_code,
    "sex": sex_code,
    "pregnancy_smoking": pregnancy_smoking_code,
    "GDM": GDM_code,
    "preterm_birth": preterm_birth_code,
    "father_diabetes": father_diabetes_code,
    "mother_diabetes": mother_diabetes_code,
}

for column_name, code_map in nominal_codes.items():
    # Skip columns that are already categorical: mapping the text labels through
    # the integer-keyed dictionaries a second time would turn every value into
    # NaN, which is what happened when this cell was re-executed.
    if isinstance(merged_df[column_name].dtype, pd.CategoricalDtype):
        continue
    # Values without an entry in the code dictionary become NaN.
    merged_df[column_name] = merged_df[column_name].map(code_map).astype("category")

# Education level is ordinal: low < medium < high.
custom_categories = ["low", "medium", "high"]
if not isinstance(merged_df["education_level"].dtype, pd.CategoricalDtype):
    merged_df["education_level"] = pd.Categorical(
        merged_df["education_level"].map(education_level_code),
        categories=custom_categories,
        ordered=True
    )

In [32]:
# CHECK SOME COLUMNS SUSPECTED OF BEING NOMINAL VARIABLES

# Columns whose value distribution should be inspected.
test_cols = [
    "sedentary_lifestyle_hour_day",
    "PSQI_score",
    "snoring_times_week",
    "sleep_apnea_times_week",
]

# To inspect them once `phenotype` exists:
# for col in test_cols:
#     display(phenotype[col].value_counts())

In [33]:
# SPLIT THE MERGED TABLE

# Both tables keep "sub_id" because it is a column of nutri_subset and of
# imputed_subset.
# NOTE(review): the shapes below are the values recorded by the author and may
# be stale — re-check with .shape.
exposure_columns = nutri_subset.columns
phenotype_columns = imputed_subset.columns

exposures = merged_df[exposure_columns]  # recorded shape: (237, 35)
# exposures.to_csv("../data/exposome_data/exposures.csv", sep=",", index=False, encoding="utf-8")

phenotype = merged_df[phenotype_columns]  # recorded shape: (237, 6)
# phenotype.to_csv("../data/exposome_data/phenotype.csv", sep=",", index=False, encoding="utf-8")

# display(exposures.head(5))
# display(phenotype.head(5))

# merged_df.shape  # recorded shape: (237, 67)

In [None]:
# XUẤT FILE MERGED DATA

# merged_df.to_csv("../data/exposome_data/total_exwas_data.csv", sep=",", index=False, encoding="utf-8")