In [1]:
import pandas as pd

In [2]:
## DATA FORM REFERENCE

DATA_PATH = '../Obesity-Prediction/data/train.csv'

df = pd.read_csv(DATA_PATH)
df = df.drop('id', axis=1)
print(f'Sample size: {len(df)}, Data shape: {df.shape}')
df.head()

Sample size: 20758, Data shape: (20758, 17)


Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,24.443011,1.699998,81.66995,yes,yes,2.0,2.983297,Sometimes,no,2.763573,no,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,Female,18.0,1.56,57.0,yes,yes,2.0,3.0,Frequently,no,2.0,no,1.0,1.0,no,Automobile,Normal_Weight
2,Female,18.0,1.71146,50.165754,yes,yes,1.880534,1.411685,Sometimes,no,1.910378,no,0.866045,1.673584,no,Public_Transportation,Insufficient_Weight
3,Female,20.952737,1.71073,131.274851,yes,yes,3.0,3.0,Sometimes,no,1.674061,no,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,Male,31.641081,1.914186,93.798055,yes,yes,2.679664,1.971472,Sometimes,no,1.979848,no,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


In [10]:
import pandas as pd
import requests
from io import BytesIO

url_list = {
    'Demographic Variables': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DEMO_L.xpt',
    'Total Nutrient Intakes, First Day': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DR1TOT_L.xpt',
    'Body Measures': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/BMX_L.xpt',
    'Individual Foods, First Day': 'https://wwwn.cdc.gov/Nchs/Data/Nhanes/Public/2021/DataFiles/DR1IFF_L.xpt',
}

df = pd.DataFrame()

for dataset_name, url in url_list.items():
    response = requests.get(url)
    if response.status_code == 200:
        xpt_data = pd.read_sas(BytesIO(response.content), format='xport')

        if 'DR1ILINE' in xpt_data:
            valid_meals = [1, 2, 3] # Breakfast, Lunch, Dinner Code
            filtered_df = xpt_data[xpt_data['DR1_030Z'].isin(valid_meals)]
            meal_count_df = filtered_df.groupby('SEQN')['DR1ILINE'].count().reset_index(name='MealCount')
            print(meal_count_df)

        # Merge on SEQN if df already contains data
        if df.empty:
            df = xpt_data
        else:
            df = pd.merge(df, xpt_data, on='SEQN', how='outer')
        print(f'{dataset_name:<40}:{xpt_data.shape}')
    else:
        print(f"Failed to load {url}. Status code: {response.status_code}")

print(df.shape)


Demographic Variables                   :(11933, 27)
Total Nutrient Intakes, First Day       :(8860, 168)
Body Measures                           :(8860, 22)
          SEQN  MealCount
0     130378.0         13
1     130379.0         13
2     130380.0         14
3     130381.0         14
4     130382.0         12
...        ...        ...
6177  142303.0          6
6178  142304.0          6
6179  142307.0          7
6180  142309.0          4
6181  142310.0         15

[6182 rows x 2 columns]
Individual Foods, First Day             :(100116, 84)
(105298, 298)


In [4]:
print(df['MealCount'])

KeyError: 'MealCount'

In [9]:
expected_columns = [
    'Gender', 'Age', 'Height', 'Weight', #'family_history_with_overweight',
    'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 
    'TUE', 'CALC', 'MTRANS'
]

df_renamed = df.rename(columns={
    "RIAGENDR"  : "Gender",
    "RIDAGEYR"  : "Age",
    "BMXWT"     : "Weight",
    "BMXHT"     : "Height",
    "DR1TKCAL"  : "FAVC",
    "MealCount" : "NCP"
}, inplace=False)

# Check if all expected columns are in the renamed DataFrame
missing_columns = [col for col in expected_columns if col not in df_renamed.columns]

if not missing_columns:
    print("All expected columns are present.")
else:
    print(f"Missing columns: {', '.join(missing_columns)}")

Missing columns: Gender, Age, FAVC, FCVC, NCP, CAEC, SMOKE, CH2O, SCC, FAF, TUE, CALC, MTRANS
