In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from google.colab import files

In [4]:
train = pd.read_csv("Train_Data.csv")
test = pd.read_csv("Test_Data.csv")

train.drop(columns=["SEQN"], inplace=True)
test_ids = test["SEQN"]
test.drop(columns=["SEQN"], inplace=True)
train["age_group"] = train["age_group"].map({"Adult": 0, "Senior": 1})
train.dropna(subset=["age_group"], inplace=True)

In [5]:
# Feature Engineering
def add_features(df):
    df["GLU_INS_RATIO"] = df["LBXGLU"] / (df["LBXIN"] + 1e-5)
    df["BMI_GLU"] = df["BMXBMI"] * df["LBXGLU"]
    df["BMI_OVER_GLU"] = df["BMXBMI"] / (df["LBXGLU"] + 1e-5)
    df["INS_X_GLU"] = df["LBXIN"] * df["LBXGLU"]
    df["IS_OBESE"] = (df["BMXBMI"] > 30).astype(int)
    df["IS_HIGHGLU"] = (df["LBXGLU"] > 125).astype(int)
    df["ACTIVE_FLAG"] = (df["PAQ605"] == 1).astype(int)
    df["DIABETIC_FLAG"] = ((df["LBXGLU"] > 125) | (df["DIQ010"] == 1)).astype(int)
    df["INSULIN_BIN"] = pd.cut(df["LBXIN"], bins=[-1, 2, 5, 10, 20, 1000], labels=False)
    df["BMI_CATEGORY"] = pd.cut(df["BMXBMI"], bins=[0, 18.5, 25, 30, 35, 100], labels=False)
    return df

train = add_features(train)
test = add_features(test)

In [6]:
#Preprocessing
X = train.drop(columns=["age_group"])
y = train["age_group"]

imp = SimpleImputer(strategy="median")
scaler = StandardScaler()

X = imp.fit_transform(X)
X = scaler.fit_transform(X)
X_test = imp.transform(test)
X_test = scaler.transform(X_test)

X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [7]:
rf = RandomForestClassifier(n_estimators=150, max_depth=6, class_weight='balanced', random_state=42)
xgb = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.07, scale_pos_weight=5, use_label_encoder=False, eval_metric='logloss', random_state=42)
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
gb = GradientBoostingClassifier(n_estimators=150, learning_rate=0.05, max_depth=4, random_state=42)

voting = VotingClassifier(estimators=[('rf', rf), ('xgb', xgb), ('lr', lr), ('gb', gb)], voting='soft')
voting.fit(X_train, y_train)

val_probs = voting.predict_proba(X_val)[:, 1]
best_f1 = 0
best_thresh = 0.5
for t in np.arange(0.1, 0.9, 0.05):
    preds = (val_probs >= t).astype(int)
    f1 = f1_score(y_val, preds)
    print(f"Threshold: {t:.2f} → F1 Score: {f1:.4f}")
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

Parameters: { "use_label_encoder" } are not used.



Threshold: 0.10 → F1 Score: 0.2940
Threshold: 0.15 → F1 Score: 0.3140
Threshold: 0.20 → F1 Score: 0.3366
Threshold: 0.25 → F1 Score: 0.3676
Threshold: 0.30 → F1 Score: 0.3540
Threshold: 0.35 → F1 Score: 0.3756
Threshold: 0.40 → F1 Score: 0.3735
Threshold: 0.45 → F1 Score: 0.3310
Threshold: 0.50 → F1 Score: 0.3206
Threshold: 0.55 → F1 Score: 0.2703
Threshold: 0.60 → F1 Score: 0.2574
Threshold: 0.65 → F1 Score: 0.2444
Threshold: 0.70 → F1 Score: 0.1772
Threshold: 0.75 → F1 Score: 0.1600
Threshold: 0.80 → F1 Score: 0.1143
Threshold: 0.85 → F1 Score: 0.0312


In [8]:
print(f"\n📌 Best Threshold: {best_thresh:.2f} → F1: {best_f1:.4f}")
val_preds = (val_probs >= best_thresh).astype(int)
print("\nClassification Report on Validation Set:")
print(classification_report(y_val, val_preds))


📌 Best Threshold: 0.35 → F1: 0.3756

Classification Report on Validation Set:
              precision    recall  f1-score   support

         0.0       0.90      0.70      0.79       328
         1.0       0.28      0.59      0.38        63

    accuracy                           0.69       391
   macro avg       0.59      0.65      0.58       391
weighted avg       0.80      0.69      0.72       391



In [9]:
test_probs = voting.predict_proba(X_test)[:, 1]
test_preds = (test_probs >= best_thresh).astype(int)
submission = pd.DataFrame({"age_group": test_preds})
submission.to_csv("submission.csv", index=False, encoding='utf-8-sig', lineterminator='\n')
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>