In [2]:
# NHANES Hackathon - Full Notebook

# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier

# Step 2: Load Data
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")
sample_submission = pd.read_csv("Sample_Submission.csv")

# Step 3: Basic EDA
print("Train Shape:", train.shape)
print("Test Shape:", test.shape)
print(train['age_group'].value_counts(normalize=True))

# Step 4: Preprocessing
# Drop rows with missing target
train = train.dropna(subset=['age_group'])

# Target encoding
target_mapping = {'Adult': 0, 'Senior': 1}
train['age_group_num'] = train['age_group'].map(target_mapping)

# Columns
features = ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']
target = 'age_group_num'

# Imputation
numeric_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']

num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

train[numeric_cols] = num_imputer.fit_transform(train[numeric_cols])
train[categorical_cols] = cat_imputer.fit_transform(train[categorical_cols])
test[numeric_cols] = num_imputer.transform(test[numeric_cols])
test[categorical_cols] = cat_imputer.transform(test[categorical_cols])

# Step 5: Feature Engineering
train['BMI_bin'] = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile').fit_transform(train[['BMXBMI']])
test['BMI_bin'] = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile').fit(train[['BMXBMI']]).transform(test[['BMXBMI']])

train['GLU_INS_ratio'] = train['LBXGLU'] / (train['LBXIN'] + 1e-3)
test['GLU_INS_ratio'] = test['LBXGLU'] / (test['LBXIN'] + 1e-3)

features_adv = ['RIAGENDR', 'PAQ605', 'DIQ010', 'BMI_bin', 'LBXGLU', 'LBXGLT', 'LBXIN', 'GLU_INS_ratio']

# Step 6: Prepare Data
X = train[features_adv]
y = train[target]
X_test = test[features_adv]

# Split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Step 7: Modeling
# SMOTE for imbalance
smote = SMOTE(random_state=42)

# XGBoost Classifier
xgb = XGBClassifier(
    n_estimators=500, max_depth=6, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8,
    scale_pos_weight=len(y[y==0])/len(y[y==1]),
    random_state=42, n_jobs=-1, use_label_encoder=False, eval_metric='logloss'
)

# Pipeline
pipeline = ImbPipeline(steps=[
    ('smote', smote),
    ('xgb', xgb)
])

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='f1', n_jobs=-1)
print("CV F1 Score:", cv_scores.mean())

# Step 8: Train on Full Data
pipeline.fit(X, y)

# Step 9: Predict on Test
test_preds = pipeline.predict(X_test)

# Step 10: Submission
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv("NHANES_final_submission.csv", index=False)
print("Submission saved as NHANES_final_submission.csv")


Train Shape: (1966, 9)
Test Shape: (312, 8)
age_group
Adult     0.839139
Senior    0.160861
Name: proportion, dtype: float64
CV F1 Score: 0.35068704282331187


Parameters: { "use_label_encoder" } are not used.



Submission saved as NHANES_final_submission.csv
