In [2]:
# --- Section 0: Initial Setup and Library Imports ---
import pandas as pd
import numpy as np

# FIX: Import to enable experimental IterativeImputer
from sklearn.experimental import enable_iterative_imputer

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
import warnings
import gc # Garbage collection for memory management

# Silence every warning category for the rest of the session so the cell
# output stays readable. Note this also hides library deprecation and
# convergence warnings.
warnings.simplefilter('ignore')

# One seed reused by every stochastic component below (imputer, classifier,
# CV splitter, randomized search) so reruns are reproducible.
GLOBAL_RANDOM_STATE = 42

print("Libraries loaded and global random state set.")

# --- Section 1: Data Loading ---
print("\n--- Section 1: Data Loading ---")
# Edit PRIMARY_DATA_DIR to point at the folder that holds the two CSV files.
# If a file is missing there, BOTH frames are re-read from the current working
# directory so train and test always come from the same location.
PRIMARY_DATA_DIR = 'C:/Users/Janvi/myproject/Summer Analytics 2025'
TRAIN_FILE_NAME = 'Train_Data.csv'
TEST_FILE_NAME = 'Test_Data.csv'
try:
    train_df = pd.read_csv(f"{PRIMARY_DATA_DIR}/{TRAIN_FILE_NAME}")
    test_df = pd.read_csv(f"{PRIMARY_DATA_DIR}/{TEST_FILE_NAME}")
    # Report the directory that was actually read (the previous message named
    # a 'MultipleFiles' folder that the code never touched).
    print(f"Datasets loaded successfully from '{PRIMARY_DATA_DIR}'.")
except FileNotFoundError:
    print(f"Files not found in '{PRIMARY_DATA_DIR}'. Trying current directory...")
    train_df = pd.read_csv(TRAIN_FILE_NAME)
    test_df = pd.read_csv(TEST_FILE_NAME)
    print("Datasets loaded successfully from current directory.")

# Display initial data info
print("\nTrain Data Head:\n", train_df.head())
print("\nTest Data Head:\n", test_df.head())
print("\nTrain Data Shape:", train_df.shape)
print("Test Data Shape:", test_df.shape)

# Rows without a target label cannot be used for supervised training, so drop
# them before any further processing. Reassigning (instead of inplace=True)
# keeps the step explicit; the original row index is preserved.
initial_train_rows = train_df.shape[0]
train_df = train_df.dropna(subset=['age_group'])
rows_after_drop = train_df.shape[0]
print(f"\nDropped {initial_train_rows - rows_after_drop} rows from training data due to missing 'age_group'.")
print(f"New Train Data Shape after target NaN drop: {train_df.shape}")


# --- Section 2: Exploratory Data Analysis (EDA) & Initial Observations ---
# Quick data-quality checks whose results inform the preprocessing choices below.
print("\n--- Section 2: Exploratory Data Analysis (EDA) ---")

# Per-column missing-value counts for each split (train is reported after the
# rows with a missing target were removed).
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(f"\nMissing values in {split_name} Data:")
    print(frame.isna().sum())

# Class balance of the target.
print("\nValue counts for 'age_group' in Train Data:")
print(train_df['age_group'].value_counts())

# Columns stored as numbers but holding only a few discrete codes; list the
# distinct values seen in each split to confirm they behave like categories.
categorical_like_cols_initial = ['RIAGENDR', 'PAQ605', 'DIQ010']
for feature_name in categorical_like_cols_initial:
    for lead, split_name, frame in (("\n", "Train", train_df), ("", "Test", test_df)):
        print(f"{lead}Unique values for {feature_name} in {split_name} Data: {frame[feature_name].unique()}")

# --- Section 3: Data Preprocessing & Feature Engineering ---
print("\n--- Section 3: Data Preprocessing & Feature Engineering ---")

# 3.1 Target Variable Encoding (for training data only)
# Map 'Adult' to 0 and 'Senior' to 1 as per instructions
age_group_mapping = {'Adult': 0, 'Senior': 1}
encoded_age_group = train_df['age_group'].map(age_group_mapping)

# Series.map yields NaN for any label missing from the mapping. Fail here with
# the offending labels instead of letting NaN targets reach model fitting.
unmapped_mask = encoded_age_group.isna()
if unmapped_mask.any():
    unexpected_labels = train_df.loc[unmapped_mask, 'age_group'].unique().tolist()
    raise ValueError(
        f"Unexpected 'age_group' values that cannot be encoded: {unexpected_labels}. "
        f"Expected one of {list(age_group_mapping)}."
    )

# Every row is mapped at this point, so the column can be stored as integers.
train_df['age_group'] = encoded_age_group.astype(int)
print("Target variable 'age_group' encoded (Adult=0, Senior=1).")

# 3.2 Define Feature Engineering Function
# This function will be applied to both training and test sets consistently.
def apply_feature_engineering(df):
    """Return a copy of ``df`` with engineered feature columns appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN',
        'PAQ605', 'DIQ010' and 'RIAGENDR'. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which the four measurement columns are coerced to
        numeric (unparseable entries become NaN) and these columns are added:
        'BMI_Category_FE', 'GLU_IN_Interaction_FE', 'GLU_BMI_Ratio_FE',
        'GLT_GLU_Ratio_FE', 'PAQ605_Active_FE', 'DIQ010_Diabetes_FE',
        'RIAGENDR_Binary_FE'. Missing or unrecognised inputs yield NaN so a
        downstream imputer can fill them.

    Raises
    ------
    KeyError
        If any required column is absent from ``df``.
    """
    df_copy = df.copy()  # never mutate the caller's frame

    # Coerce measurements to numeric before arithmetic or binning.
    for col in ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']:
        df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')

    # 1. BMI bucket (0=Underweight, 1=Normal, 2=Overweight, 3-5=Obese I-III).
    # Intervals are right-closed, so a value equal to a bin edge falls in the
    # lower bucket; NaN or negative BMI produces NaN.
    df_copy['BMI_Category_FE'] = pd.cut(df_copy['BMXBMI'],
                                       bins=[0, 18.5, 24.9, 29.9, 34.9, 39.9, np.inf],
                                       labels=[0, 1, 2, 3, 4, 5],
                                       right=True,
                                       include_lowest=True)

    # 2. Glucose x insulin interaction; NaN in either operand propagates.
    df_copy['GLU_IN_Interaction_FE'] = df_copy['LBXGLU'] * df_copy['LBXIN']

    # 3./4. Ratio features. A zero denominator makes pandas return +/-inf,
    # which imputers and scalers reject, so map those results to NaN and let
    # the imputation step handle them like any other missing value.
    non_finite = [np.inf, -np.inf]
    df_copy['GLU_BMI_Ratio_FE'] = (
        df_copy['LBXGLU'] / df_copy['BMXBMI']
    ).replace(non_finite, np.nan)
    df_copy['GLT_GLU_Ratio_FE'] = (
        df_copy['LBXGLT'] / df_copy['LBXGLU']
    ).replace(non_finite, np.nan)

    # 5.-7. Binary recodes of the survey codes. Series.map leaves every value
    # that is not a key of the dict (including NaN) as NaN.
    # PAQ605: 1.0 -> 1, 2.0 -> 0 (assumed 1=active, 2=not active)
    df_copy['PAQ605_Active_FE'] = df_copy['PAQ605'].map({1.0: 1, 2.0: 0})
    # DIQ010: 1.0 -> 1, 2.0/3.0 -> 0 (assumed 1=yes, 2=no, 3=borderline)
    df_copy['DIQ010_Diabetes_FE'] = df_copy['DIQ010'].map({1.0: 1, 2.0: 0, 3.0: 0})
    # RIAGENDR: 1.0 -> 0, 2.0 -> 1 (assumed 1=male, 2=female)
    df_copy['RIAGENDR_Binary_FE'] = df_copy['RIAGENDR'].map({1.0: 0, 2.0: 1})

    return df_copy

# Strip the identifier (and, for train, the label) before engineering features
# so both splits pass through exactly the same transformation.
train_features_raw = train_df.drop(columns=['SEQN', 'age_group'])
test_features_raw = test_df.drop(columns='SEQN')

X = apply_feature_engineering(train_features_raw)
X_test = apply_feature_engineering(test_features_raw)
y = train_df['age_group']  # encoded target, row-aligned with X via the shared index

print("Feature Engineering applied to both datasets.")

# 3.3 Define Column Types for Preprocessing Pipelines
# Columns are grouped by the preprocessing branch they feed:
#   numerical   -> imputed, then scaled
#   categorical -> imputed, then one-hot encoded

# Raw columns as they appear in the CSV files.
original_numerical_features = ['BMXBMI', 'LBXGLU', 'LBXGLT', 'LBXIN']
original_categorical_features = ['RIAGENDR', 'PAQ605', 'DIQ010']

# Columns added by apply_feature_engineering; all carry the '_FE' suffix.
engineered_numerical_features = [
    f"{stem}_FE"
    for stem in ('GLU_IN_Interaction', 'GLU_BMI_Ratio', 'GLT_GLU_Ratio')
]
engineered_categorical_features = [
    f"{stem}_FE"
    for stem in ('BMI_Category', 'PAQ605_Active', 'DIQ010_Diabetes', 'RIAGENDR_Binary')
]

# Final column selections handed to the ColumnTransformer (raw first, then engineered).
all_numerical_features = [*original_numerical_features, *engineered_numerical_features]
all_categorical_features = [*original_categorical_features, *engineered_categorical_features]

# 3.4 Define Preprocessing Steps (Pipelines for Numerical and Categorical)

# Numerical branch: iterative multivariate imputation (seeded, starting from
# column means, at most 10 rounds) followed by standardisation.
numeric_steps = [
    ('imputer', IterativeImputer(initial_strategy='mean', max_iter=10, random_state=GLOBAL_RANDOM_STATE)),
    ('scaler', StandardScaler()),
]
numerical_transformer = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown='ignore' makes categories unseen during fit encode as
# all zeros instead of raising at predict time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each column group through its branch; columns not listed in either
# group are discarded (remainder='drop').
column_routes = [
    ('num', numerical_transformer, all_numerical_features),
    ('cat', categorical_transformer, all_categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')

print("Preprocessing pipelines defined for all features (original + engineered).")

# --- Section 4: Model Selection & Pipeline Creation ---
print("\n--- Section 4: Model Selection & Pipeline Creation ---")

# LightGBM gradient-boosted trees with a binary objective.
# subsample_freq=1 turns on row bagging at every boosting iteration. LightGBM
# ignores `subsample` (bagging fraction) while subsample_freq is 0, its
# default, so without this the `classifier__subsample` values explored by the
# hyperparameter search would have no effect. With the default subsample=1.0
# the setting alone does not change the model.
lgbm_model = lgb.LGBMClassifier(
    objective='binary',
    subsample_freq=1,
    random_state=GLOBAL_RANDOM_STATE,
    n_jobs=-1,
)

# Chain preprocessing and the classifier so that imputation, scaling and
# encoding are re-fit inside every cross-validation fold (no leakage from
# validation rows). The step name 'classifier' is the prefix used by the
# search-space keys.
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lgbm_model)
])

print("Full ML pipeline created.")

# --- Section 5: Hyperparameter Tuning (RandomizedSearchCV with StratifiedKFold) ---
print("\n--- Section 5: Hyperparameter Tuning (RandomizedSearchCV) ---")

# Search space: discrete candidate values per LightGBM hyperparameter.
# The 'classifier__' prefix routes each value to the 'classifier' step of full_pipeline.
param_distributions = {
    'classifier__n_estimators': [200, 300, 500, 700, 1000], # Number of boosting rounds
    'classifier__learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1], # Step size shrinkage
    'classifier__num_leaves': [20, 31, 40, 50, 60, 80], # Max tree leaves for base learners
    'classifier__max_depth': [-1, 5, 8, 10, 12, 15], # Max tree depth (-1 means no limit)
    'classifier__min_child_samples': [10, 20, 30, 40, 50, 60], # Minimum data in a leaf
    'classifier__subsample': [0.7, 0.8, 0.9, 1.0], # Row subsample ratio. NOTE(review): LightGBM only applies it when subsample_freq > 0 -- confirm the classifier sets it
    'classifier__colsample_bytree': [0.7, 0.8, 0.9, 1.0], # Subsample ratio of columns when constructing each tree
    'classifier__reg_alpha': [0, 0.01, 0.1, 0.5, 1, 2], # L1 regularization term
    'classifier__reg_lambda': [0, 0.01, 0.1, 0.5, 1, 2], # L2 regularization term
    'classifier__boosting_type': ['gbdt', 'dart'] # Gradient Boosting Decision Tree or DART
}

# Use StratifiedKFold for cross-validation to maintain class balance in each fold.
# Shuffling is seeded, so the fold assignment is reproducible.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=GLOBAL_RANDOM_STATE)

# RandomizedSearchCV setup
# n_iter: number of parameter settings that are sampled; each is fitted once per CV fold.
# scoring: candidates are ranked by mean validation accuracy across the folds.
random_search = RandomizedSearchCV(
    full_pipeline,
    param_distributions=param_distributions,
    n_iter=150, # Number of sampled candidates (adjust based on time/resources)
    cv=cv_strategy, # Use StratifiedKFold
    scoring='accuracy',
    random_state=GLOBAL_RANDOM_STATE, # Seeds the candidate sampling
    n_jobs=-1, # Use all available CPU cores
    verbose=2, # Print per-fit progress
    error_score='raise' # Raise errors to debug issues during search
)

# Fit RandomizedSearchCV on the full training data (X, y).
# The whole pipeline (preprocessing + classifier) is re-fit for every candidate and fold.
print(f"Starting RandomizedSearchCV with {random_search.n_iter} iterations and {cv_strategy.n_splits}-fold CV...")
random_search.fit(X, y)

print(f"\nBest parameters found: {random_search.best_params_}")
print(f"Best cross-validation accuracy: {random_search.best_score_:.4f}")

# Keep a reference to the winning pipeline before the search object is discarded
best_model = random_search.best_estimator_

# Release the search object (it holds the per-candidate CV results); best_model keeps its own reference
del random_search
gc.collect()

print("Hyperparameter tuning complete. Best model selected.")

# --- Section 6: Final Evaluation on Training Data (for insight, not for submission) ---
print("\n--- Section 6: Final Evaluation on Training Data ---")
# These scores are computed on the same rows the model was fitted on, so they
# are an optimistic sanity check rather than an estimate of held-out performance.
y_pred_train = best_model.predict(X)
train_accuracy = accuracy_score(y, y_pred_train)

# Second probability column = positive class (label 1).
train_positive_proba = best_model.predict_proba(X)[:, 1]
train_roc_auc = roc_auc_score(y, train_positive_proba)

for metric_label, metric_value in (("Accuracy", train_accuracy), ("ROC AUC", train_roc_auc)):
    print(f"{metric_label} on full training data: {metric_value:.4f}")
print("\nClassification Report on full training data:")
print(classification_report(y, y_pred_train))

# --- Section 7: Make Predictions on Test Data ---
print("\n--- Section 7: Making Predictions on Test Data ---")

# The fitted pipeline applies its own preprocessing to X_test before classifying.
test_predictions = best_model.predict(X_test)

print("Predictions generated for the test set.")

# --- Section 8: Prepare Submission File ---
print("\n--- Section 8: Prepare Submission File ---")

# Single-column frame of integer class labels (0 or 1), one row per test
# record in the order of X_test.
submission_df = pd.DataFrame({'age_group': test_predictions}).astype({'age_group': int})

# Write without the index so the CSV contains only the 'age_group' column.
submission_file_path = 'submission.csv'
submission_df.to_csv(submission_file_path, index=False)

print(f"\nSubmission file created successfully at: {submission_file_path}")
print("Submission Head:\n", submission_df.head())
print("Submission Value Counts:\n", submission_df['age_group'].value_counts())

print("\n--- Workflow Complete ---")


Libraries loaded and global random state set.

--- Section 1: Data Loading ---
Datasets loaded successfully from 'MultipleFiles' directory.

Train Data Head:
       SEQN  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN age_group
0  73564.0       2.0     2.0    35.7   110.0     2.0   150.0  14.91     Adult
1  73568.0       2.0     2.0    20.3    89.0     2.0    80.0   3.85     Adult
2  73576.0       1.0     2.0    23.2    89.0     2.0    68.0   6.14     Adult
3  73577.0       1.0     2.0    28.9   104.0     NaN    84.0  16.15     Adult
4  73580.0       2.0     1.0    35.9   103.0     2.0    81.0  10.92     Adult

Test Data Head:
       SEQN  RIAGENDR  PAQ605  BMXBMI  LBXGLU  DIQ010  LBXGLT  LBXIN
0  77017.0       1.0     1.0    32.2    96.0     2.0   135.0  15.11
1  75580.0       2.0     2.0    26.3   100.0     2.0   141.0  15.26
2  73820.0       1.0     2.0    28.6   107.0     2.0   136.0   8.82
3  80489.0       2.0     1.0    22.1    93.0     2.0   111.0  12.13
4  82047.0     