In [1]:
import pandas as pd
import os

# 1. Load the datasets
# Assuming the files are in the same directory as your script/notebook
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

# 2. Basic sanity check - Print the size of the datasets
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# 3. Look at the first few rows to understand the features
print("\n--- Train Data Head ---")
print(train_df.head())

print("\n--- Submission Format Example ---")
print(submission_df.head())

Train shape: (700000, 26)
Test shape: (300000, 25)

--- Train Data Head ---
   id  age  alcohol_consumption_per_week  physical_activity_minutes_per_week  \
0   0   31                             1                                  45   
1   1   50                             2                                  73   
2   2   32                             3                                 158   
3   3   54                             3                                  77   
4   4   54                             1                                  55   

   diet_score  sleep_hours_per_day  screen_time_hours_per_day   bmi  \
0         7.7                  6.8                        6.1  33.4   
1         5.7                  6.5                        5.8  23.8   
2         8.5                  7.4                        9.1  24.1   
3         4.6                  7.0                        9.2  26.6   
4         5.7                  6.2                        5.1  28.8   

   waist_to_hip_

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# 1. Separate Target and ID
target = 'diagnosed_diabetes'
# Drop rows in train where target might be missing (just in case)
train_df = train_df.dropna(subset=[target]) 

y = train_df[target].values # The labels for training
train_ids = train_df['id']
test_ids = test_df['id']

# 2. Drop unnecessary columns
# We drop 'id' because it's just an index, not a feature
# We drop 'diagnosed_diabetes' from train_df to match test_df structure for processing
train_features = train_df.drop(['id', 'diagnosed_diabetes'], axis=1)
test_features = test_df.drop(['id'], axis=1)

# 3. Combine temporarily for consistent preprocessing
# This ensures that if 'test' has a category 'train' doesn't (or vice versa), the columns still match
all_features = pd.concat([train_features, test_features], axis=0)

# --- IDENTIFY COLUMNS ---

# Numerical columns (Continuous values)
numerical_cols = [
    'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
    'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day',
    'bmi', 'waist_to_hip_ratio', 'systolic_bp'
]

# Categorical columns (Text/Strings)
categorical_cols = [
    'gender', 'ethnicity', 'education_level', 
    'income_level', 'smoking_status', 'employment_status'
]

# Binary/Already Numeric columns (0/1) - We usually leave these alone
# (family_history_diabetes, hypertension_history, cardiovascular_history)

# --- PREPROCESSING ---

# A. Handle Categorical Data (One-Hot Encoding)
# drop_first=True helps reduce redundancy (e.g., if is_Male=0, we know is_Female=1)
all_features = pd.get_dummies(all_features, columns=categorical_cols, drop_first=True)

# B. Handle Numerical Data (Scaling)
scaler = StandardScaler()
all_features[numerical_cols] = scaler.fit_transform(all_features[numerical_cols])

# C. Handle Missing Values (Simple Imputation)
# Fill numeric NaNs with Mean, others with 0
all_features[numerical_cols] = all_features[numerical_cols].fillna(all_features[numerical_cols].mean())
all_features = all_features.fillna(0)



In [3]:
# --- SPLIT BACK TO TRAIN / TEST ---

# Split back using the original length of the train dataframe
X = all_features.iloc[:len(train_df)].values
X_kaggle_test = all_features.iloc[len(train_df):].values

# --- CREATE VALIDATION SET ---
# Essential for Neural Nets to stop training before overfitting
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Final Input Shape: {X_train.shape}")
print(f"Target Shape: {y_train.shape}")
print(f"Validation Shape: {X_val.shape}")

Final Input Shape: (560000, 36)
Target Shape: (560000,)
Validation Shape: (140000, 36)


In [5]:
# 1. Check what columns are causing the issue (Debugging)
print("Checking for non-numeric columns...")
# Convert back to DataFrame just to see dtypes
temp_df = pd.DataFrame(X_train)
# Print columns that are of type 'object'
print(temp_df.select_dtypes(include=['object']).head())

# 2. THE FIX: Force conversion to float32
# This will turn Booleans (True/False) into 1.0/0.0
# And if there are strings like '1', it converts them. 
# If there are strings like 'Male', it will crash and tell us exactly which one is wrong.
try:
    X_train = X_train.astype(np.float32)
    X_val = X_val.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_val = y_val.astype(np.float32)
    print("SUCCESS: Data converted to float32.")
except ValueError as e:
    print("ERROR: You still have text in your data that cannot be converted!")
    print(e)

Checking for non-numeric columns...
         0         1         2         3         4         5         6   \
0 -0.373175 -0.073644 -0.106648  0.368804 -1.657032  0.092278  0.635062   
1  0.989543   0.87543 -0.379345 -0.380158  1.213476  1.468802  0.844062   
2 -1.565553 -0.073644 -0.197547 -1.537644 -0.994607  1.173833  0.112563   
3  0.819204 -1.022719 -0.233907  0.232629  0.551051  1.321318 -0.758268   
4  0.734034 -0.073644 -0.342985   2.34334 -0.884203  0.928025  0.983395   

         7         8   9   ...     26     27     28     29     30     31  \
0  1.865628  0.061517  62  ...  False  False   True  False  False  False   
1  1.079128  0.061517  86  ...  False  False  False   True  False   True   
2  0.554794  0.693086  64  ...  False  False  False   True  False  False   
3 -0.231706 -1.291843  70  ...  False  False  False  False   True  False   
4  1.341294  0.422414  69  ...  False  False  False   True  False  False   

      32     33     34     35  
0  False  False  False  

In [14]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

print("--- Starting XGBoost Training ---")

# 1. Define the Model
# We use standard "starter" hyperparameters that usually work well
xgb_model = XGBClassifier(
    n_estimators=2000,          # Maximum number of trees
    learning_rate=0.02,         # Slow learning to avoid overfitting
    max_depth=6,                # Depth of each tree
    subsample=0.8,              # Use 80% of rows per tree
    colsample_bytree=0.8,       # Use 80% of columns per tree
    objective='binary:logistic',# For binary classification
    eval_metric='auc',          # The metric we care about
    random_state=42,
    early_stopping_rounds=50,   # Stop if validation score doesn't improve for 50 rounds
    n_jobs=-1                   # Use all CPU cores
)

# 2. Train the Model
# We pass the validation set here so it can calculate AUC live
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=100  # Print progress every 100 rounds
)

# 3. Check Final Score
val_preds_xgb = xgb_model.predict_proba(X_val)[:, 1] # Get probabilities, not just 0/1
final_auc = roc_auc_score(y_val, val_preds_xgb)

print(f"\nFinal XGBoost Validation AUC: {final_auc:.5f}")

--- Starting XGBoost Training ---
[0]	validation_0-auc:0.62971	validation_1-auc:0.62635
[100]	validation_0-auc:0.70705	validation_1-auc:0.70466
[200]	validation_0-auc:0.71558	validation_1-auc:0.71193
[300]	validation_0-auc:0.72085	validation_1-auc:0.71547
[400]	validation_0-auc:0.72512	validation_1-auc:0.71805
[500]	validation_0-auc:0.72829	validation_1-auc:0.71960
[600]	validation_0-auc:0.73118	validation_1-auc:0.72086
[700]	validation_0-auc:0.73373	validation_1-auc:0.72193
[800]	validation_0-auc:0.73608	validation_1-auc:0.72283
[900]	validation_0-auc:0.73818	validation_1-auc:0.72352
[1000]	validation_0-auc:0.74008	validation_1-auc:0.72400
[1100]	validation_0-auc:0.74184	validation_1-auc:0.72434
[1200]	validation_0-auc:0.74343	validation_1-auc:0.72461
[1300]	validation_0-auc:0.74503	validation_1-auc:0.72486
[1400]	validation_0-auc:0.74664	validation_1-auc:0.72502
[1500]	validation_0-auc:0.74824	validation_1-auc:0.72526
[1600]	validation_0-auc:0.74973	validation_1-auc:0.72543
[1700]	va

In [16]:
# 1. Predict on Kaggle Test Set
# Note: XGBoost uses predict_proba to get the score between 0 and 1
# We take [:, 1] because that is the probability of class "1" (Diabetes)
test_probs_xgb = xgb_model.predict_proba(X_kaggle_test)[:, 1]

# 2. Create Submission DataFrame
submission_xgb = pd.DataFrame({
    'id': test_ids,
    'diagnosed_diabetes': test_probs_xgb
})

# 3. Save
submission_xgb.to_csv('submission_xgb.csv', index=False)
print("submission_xgb.csv saved!")

submission_xgb.csv saved!
