In [4]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# from sklearn.impute import SimpleImputer
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, accuracy_score

# # Load Data
# train_df = pd.read_csv("Train_Data 1.csv")
# test_df = pd.read_csv("Test_Data 1.csv")

# # Encode target variable
# # train_df['age_group'] = train_df['age_group'].map({'Adult': 0, 'Senior': 1})
# # Strip whitespace, drop NaNs, then map
# train_df['age_group'] = train_df['age_group'].astype(str).str.strip()
# train_df = train_df[train_df['age_group'].isin(['Adult', 'Senior'])]  # Keep only valid classes
# train_df['age_group'] = train_df['age_group'].map({'Adult': 0, 'Senior': 1})


# # Save test IDs
# test_ids = test_df['SEQN']

# # Drop SEQN
# train_df.drop(columns=['SEQN'], inplace=True)
# test_df.drop(columns=['SEQN'], inplace=True)

# # Define columns
# categorical_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']
# numerical_cols = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']

# # Impute missing categorical with mode
# cat_imputer = SimpleImputer(strategy='most_frequent')
# train_df[categorical_cols] = cat_imputer.fit_transform(train_df[categorical_cols])
# test_df[categorical_cols] = cat_imputer.transform(test_df[categorical_cols])

# # Impute missing numerical with median
# num_imputer = SimpleImputer(strategy='median')
# train_df[numerical_cols] = num_imputer.fit_transform(train_df[numerical_cols])
# test_df[numerical_cols] = num_imputer.transform(test_df[numerical_cols])

# # Split features and target
# X = train_df.drop(columns=['age_group'])
# y = train_df['age_group']

# # Split for evaluation
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# # Train Random Forest Classifier
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X_train, y_train)

# # Evaluate
# val_preds = clf.predict(X_val)
# print("\nValidation Accuracy:", accuracy_score(y_val, val_preds))
# print("\nClassification Report:\n", classification_report(y_val, val_preds, target_names=['Adult', 'Senior']))

# # Predict on test set
# test_preds = clf.predict(test_df)

# # Create submission
# submission = pd.DataFrame({'age_group': test_preds})
# submission.to_csv("submission.csv", index=False)
# print("\n✅ submission.csv has been created.")


In [6]:
# ---------------------------------------------
# Machine Learning Hackathon - Full Pipeline
# Logistic Regression with Tuning and Feature Engineering
# ---------------------------------------------

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# -------------------------
# Load Data
# -------------------------
train_df = pd.read_csv("Train_Data 1.csv")
test_df = pd.read_csv("Test_Data 1.csv")

# Clean and encode target
train_df['age_group'] = train_df['age_group'].astype(str).str.strip()
train_df = train_df[train_df['age_group'].isin(['Adult', 'Senior'])]
train_df['age_group'] = train_df['age_group'].map({'Adult': 0, 'Senior': 1})

# -------------------------
# Feature Engineering (Train)
# -------------------------
def feature_engineering(df):
    df = df.copy()
    df["IN_GL_Ratio"] = df["LBXIN"] / (df["LBXGLU"] + 1)
    df["IsActive"] = df["PAQ605"].apply(lambda x: 1 if x == 1 else 0)
    df["BMI_Category"] = pd.cut(df["BMXBMI"],
                                 bins=[0, 18.5, 24.9, 29.9, 100],
                                 labels=["Underweight", "Normal", "Overweight", "Obese"])
    df = pd.get_dummies(df, columns=["BMI_Category"], drop_first=True)
    df["GLU_DIABETES"] = df["LBXGLU"] * df["DIQ010"]
    df["BMI_GLU"] = df["BMXBMI"] * df["LBXGLU"]
    df["IsHighRisk"] = ((df["LBXGLU"] > 125) & (df["DIQ010"] == 1)).astype(int)
    return df

train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

# -------------------------
# Prepare Features
# -------------------------
X = train_df.drop(columns=['SEQN', 'age_group'])
y = train_df['age_group']
test_ids = test_df['SEQN']
test_df = test_df.drop(columns=['SEQN'])

# Ensure consistent columns
for col in X.columns:
    if col not in test_df.columns:
        test_df[col] = 0
test_df = test_df[X.columns]  # reorder

# -------------------------
# Imputation & Scaling
# -------------------------
cat_cols = ['RIAGENDR', 'PAQ605', 'DIQ010']
num_cols = [col for col in X.columns if col not in cat_cols]

imputer = SimpleImputer(strategy='most_frequent')
X[cat_cols] = imputer.fit_transform(X[cat_cols])
test_df[cat_cols] = imputer.transform(test_df[cat_cols])

imputer_num = SimpleImputer(strategy='median')
X[num_cols] = imputer_num.fit_transform(X[num_cols])
test_df[num_cols] = imputer_num.transform(test_df[num_cols])

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test_df)

# -------------------------
# Train-Test Split
# -------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# -------------------------
# GridSearchCV for Best C
# -------------------------
param_grid = {'C': [0.01, 0.1, 0.5, 1, 5, 10]}
grid = GridSearchCV(LogisticRegression(max_iter=1000, class_weight='balanced'), param_grid, cv=5)
grid.fit(X_train, y_train)
model = grid.best_estimator_
print("Best C:", grid.best_params_)

# -------------------------
# Evaluation
# -------------------------
y_val_pred = model.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:\n", classification_report(y_val, y_val_pred, target_names=['Adult', 'Senior']))

# -------------------------
# Predict on Test Set with Threshold Tuning
# -------------------------
test_probs = model.predict_proba(test_scaled)[:, 1]
test_preds = (test_probs > 0.45).astype(int)  # tuned threshold

# -------------------------
# Submission
# -------------------------
submission = pd.DataFrame({'age_group': test_preds})
submission.to_csv("submission2.csv", index=False)
print("\n✅ submission.csv saved with optimized logistic regression!")

Best C: {'C': 0.01}

Validation Accuracy: 0.7212276214833759

Classification Report:
               precision    recall  f1-score   support

       Adult       0.90      0.75      0.82       328
      Senior       0.30      0.56      0.39        63

    accuracy                           0.72       391
   macro avg       0.60      0.65      0.61       391
weighted avg       0.80      0.72      0.75       391


✅ submission.csv saved with optimized logistic regression!
