In [21]:
import pandas as pd
import os

# 1. Load the datasets
# Assuming the files are in the same directory as your script/notebook
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

# 2. Basic sanity check - Print the size of the datasets
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# 3. Look at the first few rows to understand the features
print("\n--- Train Data Head ---")
print(train_df.head())

print("\n--- Submission Format Example ---")
print(submission_df.head())

Train shape: (700000, 26)
Test shape: (300000, 25)

--- Train Data Head ---
   id  age  alcohol_consumption_per_week  physical_activity_minutes_per_week  \
0   0   31                             1                                  45   
1   1   50                             2                                  73   
2   2   32                             3                                 158   
3   3   54                             3                                  77   
4   4   54                             1                                  55   

   diet_score  sleep_hours_per_day  screen_time_hours_per_day   bmi  \
0         7.7                  6.8                        6.1  33.4   
1         5.7                  6.5                        5.8  23.8   
2         8.5                  7.4                        9.1  24.1   
3         4.6                  7.0                        9.2  26.6   
4         5.7                  6.2                        5.1  28.8   

   waist_to_hip_

In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# 1. Separate Target and ID
target = 'diagnosed_diabetes'
train_df = train_df.dropna(subset=[target]) 

y = train_df[target].values # The labels for training
train_ids = train_df['id']
test_ids = test_df['id']

# 2. Drop unnecessary columns
train_features = train_df.drop(['id', 'diagnosed_diabetes'], axis=1)
test_features = test_df.drop(['id'], axis=1)

# 3. Combine temporarily for consistent preprocessing
all_features = pd.concat([train_features, test_features], axis=0)

# --- START FEATURE ENGINEERING ---

# 1. Interaction Features (Continuous)
# Multiply features that amplify each other's risk
all_features['age_bmi'] = all_features['age'] * all_features['bmi']
all_features['age_bp'] = all_features['age'] * all_features['systolic_bp']
all_features['bmi_bp'] = all_features['bmi'] * all_features['systolic_bp']

# 2. Threshold Features (Binary/Categorical)
# Create explicit flags for medical risk groups
all_features['is_obese'] = (all_features['bmi'] >= 30).astype(int)
all_features['is_overweight'] = ((all_features['bmi'] >= 25) & (all_features['bmi'] < 30)).astype(int)
all_features['high_bp'] = (all_features['systolic_bp'] >= 130).astype(int)
all_features['is_senior'] = (all_features['age'] >= 65).astype(int)

# 3. Lifestyle Risk Score (Ordinal)
# Combine negative factors: High BMI + Low Activity + Low Sleep
# We cast booleans to int (True=1, False=0) and sum them up
all_features['lifestyle_risk'] = (
    (all_features['bmi'] > 30).astype(int) + 
    (all_features['physical_activity_minutes_per_week'] < 60).astype(int) +
    (all_features['sleep_hours_per_day'] < 6).astype(int)
)

# --- END FEATURE ENGINEERING ---

# --- IDENTIFY COLUMNS ---

# Update Numerical columns to include the NEW continuous interaction features
numerical_cols = [
    'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
    'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day',
    'bmi', 'waist_to_hip_ratio', 'systolic_bp',
    # NEW FEATURES
    'age_bmi', 'age_bp', 'bmi_bp', 'lifestyle_risk'
]

# Categorical columns (Text/Strings) - Unchanged
categorical_cols = [
    'gender', 'ethnicity', 'education_level', 
    'income_level', 'smoking_status', 'employment_status'
]

# Binary/Already Numeric columns (0/1)
# The new binary features (is_obese, etc.) are already 0/1, so we don't scale or encode them.
# We just let them pass through.

# --- PREPROCESSING ---

# A. Handle Categorical Data (One-Hot Encoding)
all_features = pd.get_dummies(all_features, columns=categorical_cols, drop_first=True)

# B. Handle Numerical Data (Scaling)
# We now include the new interaction terms in the scaling so they don't dominate gradients
scaler = StandardScaler()
all_features[numerical_cols] = scaler.fit_transform(all_features[numerical_cols])

# C. Handle Missing Values
all_features[numerical_cols] = all_features[numerical_cols].fillna(all_features[numerical_cols].mean())
all_features = all_features.fillna(0)



In [23]:
# --- SPLIT BACK TO TRAIN / TEST ---
X = all_features.iloc[:len(train_df)].values
X_kaggle_test = all_features.iloc[len(train_df):].values

# Validation Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Final Input Shape: {X_train.shape}")

Final Input Shape: (560000, 44)


In [24]:
# 1. Check what columns are causing the issue (Debugging)
print("Checking for non-numeric columns...")
# Convert back to DataFrame just to see dtypes
temp_df = pd.DataFrame(X_train)
# Print columns that are of type 'object'
print(temp_df.select_dtypes(include=['object']).head())

# 2. THE FIX: Force conversion to float32
# This will turn Booleans (True/False) into 1.0/0.0
# And if there are strings like '1', it converts them. 
# If there are strings like 'Male', it will crash and tell us exactly which one is wrong.
try:
    X_train = X_train.astype(np.float32)
    X_val = X_val.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_val = y_val.astype(np.float32)
    print("SUCCESS: Data converted to float32.")
except ValueError as e:
    print("ERROR: You still have text in your data that cannot be converted!")
    print(e)

Checking for non-numeric columns...
         0         1         2         3         4         5         6   \
0 -0.373175 -0.073644 -0.106648  0.368804 -1.657032  0.092278  0.635062   
1  0.989543   0.87543 -0.379345 -0.380158  1.213476  1.468802  0.844062   
2 -1.565553 -0.073644 -0.197547 -1.537644 -0.994607  1.173833  0.112563   
3  0.819204 -1.022719 -0.233907  0.232629  0.551051  1.321318 -0.758268   
4  0.734034 -0.073644 -0.342985   2.34334 -0.884203  0.928025  0.983395   

         7         8   9   ...     34     35     36     37     38     39  \
0  1.865628  0.061517  62  ...  False  False   True  False  False  False   
1  1.079128  0.061517  86  ...  False  False  False   True  False   True   
2  0.554794  0.693086  64  ...  False  False  False   True  False  False   
3 -0.231706 -1.291843  70  ...  False  False  False  False   True  False   
4  1.341294  0.422414  69  ...  False  False  False   True  False  False   

      40     41     42     43  
0  False  False  False  

In [26]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

print("--- Starting XGBoost Training ---")

# 1. Define the Model
# We use standard "starter" hyperparameters that usually work well
xgb_model_v2 = XGBClassifier(
    n_estimators=3000,          # Increased, but we rely on early_stopping
    learning_rate=0.01,         # Slower learning = better generalization
    
    # --- Regularization Parameters ---
    max_depth=4,                # Reduced from 6. Forces simpler trees.
    min_child_weight=5,         # Requires more data to make a split (prevents isolating outliers)
    gamma=0.2,                  # Minimum loss reduction required to make a split
    reg_alpha=1.0,              # L1 Regularization (Lasso) - kills useless features
    reg_lambda=1.5,             # L2 Regularization (Ridge) - reduces weights
    subsample=0.7,              # Use less data per tree to add randomness
    colsample_bytree=0.7,       # Use fewer columns per tree
    
    objective='binary:logistic',
    eval_metric='auc',
    random_state=42,
    early_stopping_rounds=100,  # Give it more patience since LR is lower
    n_jobs=-1
)

print("Training Reguralized Model...")
xgb_model_v2.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=200
)

# 3. Check Final Score
val_preds_xgb = xgb_model_v2.predict_proba(X_val)[:, 1] # Get probabilities, not just 0/1
final_auc = roc_auc_score(y_val, val_preds_xgb)

print(f"\nFinal XGBoost Validation AUC: {final_auc:.5f}")

--- Starting XGBoost Training ---
Training Reguralized Model...
[0]	validation_0-auc:0.67332	validation_1-auc:0.67452
[200]	validation_0-auc:0.69753	validation_1-auc:0.69722
[400]	validation_0-auc:0.70513	validation_1-auc:0.70446
[600]	validation_0-auc:0.70934	validation_1-auc:0.70814
[800]	validation_0-auc:0.71231	validation_1-auc:0.71060
[1000]	validation_0-auc:0.71507	validation_1-auc:0.71287
[1200]	validation_0-auc:0.71760	validation_1-auc:0.71495
[1400]	validation_0-auc:0.71959	validation_1-auc:0.71649
[1600]	validation_0-auc:0.72110	validation_1-auc:0.71760
[1800]	validation_0-auc:0.72246	validation_1-auc:0.71859
[2000]	validation_0-auc:0.72381	validation_1-auc:0.71954
[2200]	validation_0-auc:0.72491	validation_1-auc:0.72028
[2400]	validation_0-auc:0.72584	validation_1-auc:0.72084
[2600]	validation_0-auc:0.72681	validation_1-auc:0.72144
[2800]	validation_0-auc:0.72764	validation_1-auc:0.72192
[2999]	validation_0-auc:0.72841	validation_1-auc:0.72232

Final XGBoost Validation AUC: 

In [27]:
# 1. Predict on Kaggle Test Set
# Note: XGBoost uses predict_proba to get the score between 0 and 1
# We take [:, 1] because that is the probability of class "1" (Diabetes)
test_probs_xgb = xgb_model_v2.predict_proba(X_kaggle_test)[:, 1]

# 2. Create Submission DataFrame
submission_xgb = pd.DataFrame({
    'id': test_ids,
    'diagnosed_diabetes': test_probs_xgb
})

# 3. Save
submission_xgb.to_csv('submission_xgb.csv', index=False)
print("submission_xgb.csv saved!")

submission_xgb.csv saved!
