In [123]:
# ====================
# Step 1: Import Libraries
# ====================
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import xgboost as xgb

In [124]:
# ====================
# Step 2: Load Data
# ====================
train_df = pd.read_csv("Train_Data.csv")
test_df = pd.read_csv("Test_Data.csv")
sample_submission = pd.read_csv("Sample_Submission.csv")

In [125]:
# ====================
# Step 3: Clean Target Column
# ====================
# Drop rows with missing target
train_df = train_df.dropna(subset=['age_group']).copy()

In [126]:
# Map 'Adult' → 0, 'Senior' → 1
age_map = {'Adult': 0, 'Senior': 1}
train_df['age_group'] = train_df['age_group'].map(age_map)

In [127]:
# ====================
# Step 4: Drop ID Column if exists
# ====================
id_column = 'SEQN'
if id_column in train_df.columns:
    train_df.drop(columns=[id_column], inplace=True)
    test_df.drop(columns=[id_column], inplace=True)

In [128]:
# ====================
# Step 5: Encode Gender
# ====================
gender_map = {1: 0, 2: 1}  # 1=Male, 2=Female
train_df['RIAGENDR'] = train_df['RIAGENDR'].map(gender_map)
test_df['RIAGENDR'] = test_df['RIAGENDR'].map(gender_map)

In [129]:
# ====================
# Step 6: Define Features
# ====================
features = ['RIAGENDR', 'PAQ605', 'BMXBMI', 'LBXGLU', 'DIQ010', 'LBXGLT', 'LBXIN']

In [130]:
# ====================
# Step 7: Combine Train+Test for Consistent Imputation
# ====================
combined = pd.concat([train_df[features], test_df[features]], axis=0).reset_index(drop=True)

In [131]:
# Impute missing with median
imputer = SimpleImputer(strategy='median')
combined_imputed = pd.DataFrame(imputer.fit_transform(combined), columns=features)

In [132]:
# ====================
# Step 8: Split Back to Train and Test
# ====================
X_train = combined_imputed.iloc[:len(train_df)].copy()
X_test = combined_imputed.iloc[len(train_df):].copy()

In [133]:
# ====================
# Step 9: Create Safe Feature: GLU_IN_RATIO
# ====================
X_train['GLU_IN_RATIO'] = X_train['LBXGLU'] / X_train['LBXIN'].replace(0, np.nan)
X_test['GLU_IN_RATIO'] = X_test['LBXGLU'] / X_test['LBXIN'].replace(0, np.nan)

In [134]:
# Replace infs and NaNs with 0
X_train['GLU_IN_RATIO'] = X_train['GLU_IN_RATIO'].replace([np.inf, -np.inf], np.nan).fillna(0)
X_test['GLU_IN_RATIO'] = X_test['GLU_IN_RATIO'].replace([np.inf, -np.inf], np.nan).fillna(0)

In [135]:
# ====================
# Step 10: Scale Features
# ====================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [136]:
# ====================
# Step 11: Set Target
# ====================
y = train_df['age_group']

In [137]:
# ================================
# Step 11: Compute scale_pos_weight
# ================================
class_counts = y.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]
print("scale_pos_weight:", round(scale_pos_weight, 2))


scale_pos_weight: 5.22


In [168]:
# ================================
# Step 12: Train XGBoost Model
# ================================
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=5
)

xgb_model.fit(X_scaled, y)


In [169]:
# ================================
# Step 13: Cross-Validated F1 Score
# ================================
cv_f1 = cross_val_score(xgb_model, X_scaled, y, cv=5, scoring='f1')
print("XGBoost Cross-validated F1 Score:", round(cv_f1.mean(), 4))

XGBoost Cross-validated F1 Score: 0.4263


In [170]:
# ================================
# Step 14: Predict and Save Submission
# ================================
xgb_preds = xgb_model.predict(X_test_scaled)

submission = pd.DataFrame({'age_group': xgb_preds})
submission.to_csv("xgb_submission.csv", index=False)
print("✅ Saved submission to xgb_submission.csv")

✅ Saved submission to xgb_submission.csv
