In [72]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json

# Demographic variable names; presumably the columns to retain from the
# demographics data — not used by the statements in this cell (TODO confirm usage).
DEMOGRAPHICS_TO_KEEP = [
    "SEQN",
    "SDDSRVYR",
    "RIAGENDR",
    "RIDAGEYR",
    "RIDAGEMN",
    "RIDAGEEX",
    "RIDRETH1",
    "DMDBORN2",
    "DMDMARTL",
    "DMDHHSIZ",
    "DMDFMSIZ",
    "INDHHIN2",
    "INDFMIN2",
]

# Load the JSON mapping that drives the splitting of the examination files.
with open("splitting/split_examination.json") as json_file:
    splitting_examination = json.load(json_file)

# Listing of the examination data files (one row per file entry).
files_examination = pd.read_feather("../extraction/raw_data/files_examination.feather")

# Discard entries that have no data file name, as well as P_BPXO
# (original note: file not accessible and covered by other file names),
# then index the listing by the data file description.
has_no_file_name = files_examination["data_file_name"].isna()
is_excluded_file = files_examination["data_file_name"].isin(["P_BPXO"])
files_examination = (
    files_examination
    .loc[~(has_no_file_name | is_excluded_file)]
    .set_index("data_file_description")
)

In [83]:
# Fuse, for one examination category, every raw CSV file of that category into
# a single table indexed by SEQN and export it as a feather file.

# Categories this cell does not process.
# NOTE(review): presumably these were already exported by earlier runs of this
# cell — confirm before relying on the existing feather files.
categories_to_skip = ["body", "eyes", "ears", "nose_mouth", "lungs", "heart", "liver", "bones", "nerves", "muscles", "skin", "physical_activity"]

# Folder holding one CSV per examination data file.
examination_dir = "../extraction/raw_data/examination/"

# Columns removed from every raw file except the SPXRAW ones.
extraction_metadata_columns = ["file_name", "cycle", "begin_year", "end_year"]

for category, data_file_description in splitting_examination.items():
    if category in categories_to_skip:
        continue
    print(category)
    file_names = files_examination.loc[data_file_description, "data_file_name"].drop_duplicates()

    # First pass: find the smallest and largest SEQN over all files of the
    # category so the fused table can be allocated once.
    min_seqn = float("inf")
    max_seqn = -float("inf")

    for file_name in tqdm(file_names):
        raw_data = pd.read_csv(examination_dir + file_name + ".csv")

        if "SEQN" not in raw_data.columns:
            continue

        # min()/max() keep the running bound unless the file value is
        # strictly smaller/larger, so a NaN result is ignored.
        min_seqn = min(min_seqn, raw_data["SEQN"].min())
        max_seqn = max(max_seqn, raw_data["SEQN"].max())

    # No file contributed a SEQN value: int(float("inf")) below would raise
    # OverflowError, so there is nothing to fuse for this category.
    if min_seqn > max_seqn:
        print(f"No SEQN found for category {category}, skipping")
        continue

    # One empty row per integer SEQN between the smallest and largest value seen.
    data_category = pd.DataFrame(None, index=pd.Index(range(int(min_seqn), int(max_seqn) + 1), name="SEQN"))

    # Second pass: write each file's columns into the rows of its SEQN values.
    for file_name in tqdm(file_names):
        raw_data = pd.read_csv(examination_dir + file_name + ".csv")

        if "SEQN" not in raw_data.columns:
            continue

        raw_data.set_index("SEQN", inplace=True)
        if "SPXRAW" not in file_name:  # Spirometry - Raw Curve Data does not contain extra columns
            raw_data.drop(columns=extraction_metadata_columns, inplace=True)

        # Columns not yet present are added; a column shared by several files
        # takes the value of the later file for the rows that file covers.
        data_category.loc[raw_data.index, raw_data.columns] = raw_data

    # Drop the filler rows (SEQN present in no file) BEFORE the string cast:
    # on pandas versions where astype(str) turns NaN into the string "nan",
    # casting first makes those rows non-null and they are never removed.
    data_category.dropna(how="all", inplace=True)

    # Cast object columns to str so each of them holds a single value type
    # when written to feather. Remaining missing values in these columns may
    # be stored as the literal string "nan", as in the original code.
    object_columns = data_category.columns[data_category.dtypes == "object"]
    if len(object_columns) > 0:
        data_category[object_columns] = data_category[object_columns].astype(str)

    data_category.reset_index().to_feather(f"fusionned_data/examination/{category}.feather")
    # NOTE(review): only the first non-skipped category is processed per run;
    # remove this break (and prune the skip list) to export every category.
    break

100%|██████████| 1/1 [00:00<00:00, 21.64it/s]
100%|██████████| 1/1 [00:00<00:00, 51.99it/s]

other



