In [1]:
import pandas as pd
import os

# 1. Load the datasets
# Assuming the files are in the same directory as your script/notebook
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

# 2. Basic sanity check - Print the size of the datasets
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# 3. Look at the first few rows to understand the features
print("\n--- Train Data Head ---")
print(train_df.head())

print("\n--- Submission Format Example ---")
print(submission_df.head())

Train shape: (700000, 26)
Test shape: (300000, 25)

--- Train Data Head ---
   id  age  alcohol_consumption_per_week  physical_activity_minutes_per_week  \
0   0   31                             1                                  45   
1   1   50                             2                                  73   
2   2   32                             3                                 158   
3   3   54                             3                                  77   
4   4   54                             1                                  55   

   diet_score  sleep_hours_per_day  screen_time_hours_per_day   bmi  \
0         7.7                  6.8                        6.1  33.4   
1         5.7                  6.5                        5.8  23.8   
2         8.5                  7.4                        9.1  24.1   
3         4.6                  7.0                        9.2  26.6   
4         5.7                  6.2                        5.1  28.8   

   waist_to_hip_

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# 1. Separate Target and ID
target = 'diagnosed_diabetes'
# Drop rows in train where target might be missing (just in case)
train_df = train_df.dropna(subset=[target]) 

y = train_df[target].values # The labels for training
train_ids = train_df['id']
test_ids = test_df['id']

# 2. Drop unnecessary columns
# We drop 'id' because it's just an index, not a feature
# We drop 'diagnosed_diabetes' from train_df to match test_df structure for processing
train_features = train_df.drop(['id', 'diagnosed_diabetes'], axis=1)
test_features = test_df.drop(['id'], axis=1)

# 3. Combine temporarily for consistent preprocessing
# This ensures that if 'test' has a category 'train' doesn't (or vice versa), the columns still match
all_features = pd.concat([train_features, test_features], axis=0)

# --- IDENTIFY COLUMNS ---

# Numerical columns (Continuous values)
numerical_cols = [
    'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week',
    'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day',
    'bmi', 'waist_to_hip_ratio', 'systolic_bp'
]

# Categorical columns (Text/Strings)
categorical_cols = [
    'gender', 'ethnicity', 'education_level', 
    'income_level', 'smoking_status', 'employment_status'
]

# Binary/Already Numeric columns (0/1) - We usually leave these alone
# (family_history_diabetes, hypertension_history, cardiovascular_history)

# --- PREPROCESSING ---

# A. Handle Categorical Data (One-Hot Encoding)
# drop_first=True helps reduce redundancy (e.g., if is_Male=0, we know is_Female=1)
all_features = pd.get_dummies(all_features, columns=categorical_cols, drop_first=True)

# B. Handle Numerical Data (Scaling)
scaler = StandardScaler()
all_features[numerical_cols] = scaler.fit_transform(all_features[numerical_cols])

# C. Handle Missing Values (Simple Imputation)
# Fill numeric NaNs with Mean, others with 0
all_features[numerical_cols] = all_features[numerical_cols].fillna(all_features[numerical_cols].mean())
all_features = all_features.fillna(0)



In [3]:
# --- SPLIT BACK TO TRAIN / TEST ---

# Split back using the original length of the train dataframe
X = all_features.iloc[:len(train_df)].values
X_kaggle_test = all_features.iloc[len(train_df):].values

# --- CREATE VALIDATION SET ---
# Essential for Neural Nets to stop training before overfitting
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Final Input Shape: {X_train.shape}")
print(f"Target Shape: {y_train.shape}")
print(f"Validation Shape: {X_val.shape}")

Final Input Shape: (560000, 36)
Target Shape: (560000,)
Validation Shape: (140000, 36)


In [5]:
# 1. Check what columns are causing the issue (Debugging)
print("Checking for non-numeric columns...")
# Convert back to DataFrame just to see dtypes
temp_df = pd.DataFrame(X_train)
# Print columns that are of type 'object'
print(temp_df.select_dtypes(include=['object']).head())

# 2. THE FIX: Force conversion to float32
# This will turn Booleans (True/False) into 1.0/0.0
# And if there are strings like '1', it converts them. 
# If there are strings like 'Male', it will crash and tell us exactly which one is wrong.
try:
    X_train = X_train.astype(np.float32)
    X_val = X_val.astype(np.float32)
    y_train = y_train.astype(np.float32)
    y_val = y_val.astype(np.float32)
    print("SUCCESS: Data converted to float32.")
except ValueError as e:
    print("ERROR: You still have text in your data that cannot be converted!")
    print(e)

Checking for non-numeric columns...
         0         1         2         3         4         5         6   \
0 -0.373175 -0.073644 -0.106648  0.368804 -1.657032  0.092278  0.635062   
1  0.989543   0.87543 -0.379345 -0.380158  1.213476  1.468802  0.844062   
2 -1.565553 -0.073644 -0.197547 -1.537644 -0.994607  1.173833  0.112563   
3  0.819204 -1.022719 -0.233907  0.232629  0.551051  1.321318 -0.758268   
4  0.734034 -0.073644 -0.342985   2.34334 -0.884203  0.928025  0.983395   

         7         8   9   ...     26     27     28     29     30     31  \
0  1.865628  0.061517  62  ...  False  False   True  False  False  False   
1  1.079128  0.061517  86  ...  False  False  False   True  False   True   
2  0.554794  0.693086  64  ...  False  False  False   True  False  False   
3 -0.231706 -1.291843  70  ...  False  False  False  False   True  False   
4  1.341294  0.422414  69  ...  False  False  False   True  False  False   

      32     33     34     35  
0  False  False  False  

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# 1. Define a "Deeper & Wider" Architecture
class DiabetesNN_V2(nn.Module):
    def __init__(self, input_dim):
        super(DiabetesNN_V2, self).__init__()
        
        # Layer 1: Input -> 128 (Wider)
        self.layer1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.2) # Lower dropout slightly to allow more learning
        
        # Layer 2: 128 -> 64
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.2)
        
        # Layer 3: 64 -> 32
        self.layer3 = nn.Linear(64, 32)
        self.bn3 = nn.BatchNorm1d(32)
        self.dropout3 = nn.Dropout(0.1) # Very low dropout before output
        
        # Layer 4: 32 -> 1
        self.output = nn.Linear(32, 1)
        
        # Activation: LeakyReLU (Slope 0.01 for negative values)
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.01)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.dropout1(self.leaky_relu(self.bn1(self.layer1(x))))
        x = self.dropout2(self.leaky_relu(self.bn2(self.layer2(x))))
        x = self.dropout3(self.leaky_relu(self.bn3(self.layer3(x))))
        x = self.sigmoid(self.output(x))
        return x
    

# 2. Re-Initialize
model = DiabetesNN_V2(input_dim=X_train.shape[1]).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# FIX: Remove 'verbose=True'
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

# 3. Training Loop with Scheduler
epochs = 50 
batch_size = 2048 

print(f"--- Training V2 on {device} ---")
best_auc = 0

for epoch in range(epochs):
    model.train()
    epoch_loss = 0
    
    permutation = torch.randperm(X_train_tensor.size()[0])
    for i in range(0, X_train_tensor.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch_x, batch_y = X_train_tensor[indices].to(device), y_train_tensor[indices].to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_x)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        
    # Validation
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val_tensor.to(device)).cpu().numpy()
        val_auc = roc_auc_score(y_val, val_preds)
    
    # Update Scheduler
    before_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_auc)
    after_lr = optimizer.param_groups[0]['lr']
    
    # Print progress
    if val_auc > best_auc:
        best_auc = val_auc
        print(f"Epoch {epoch+1} | Loss: {epoch_loss/(len(X_train)//batch_size):.4f} | Val AUC: {val_auc:.4f} (New Best!)")
    elif (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1} | Loss: {epoch_loss/(len(X_train)//batch_size):.4f} | Val AUC: {val_auc:.4f}")
        
    # Manual Verbose Check: If LR changed, print it
    if before_lr != after_lr:
        print(f"    -> Scheduler: Learning Rate reduced to {after_lr:.6f}")

print(f"Final Best AUC: {best_auc:.4f}")

--- Training V2 on cpu ---
Epoch 1 | Loss: 0.6216 | Val AUC: 0.6856 (New Best!)
Epoch 2 | Loss: 0.6084 | Val AUC: 0.6936 (New Best!)
Epoch 5 | Loss: 0.6065 | Val AUC: 0.6944 (New Best!)
Epoch 9 | Loss: 0.6058 | Val AUC: 0.6952 (New Best!)
Epoch 10 | Loss: 0.6055 | Val AUC: 0.6946
    -> Scheduler: Learning Rate reduced to 0.002500
Epoch 14 | Loss: 0.6046 | Val AUC: 0.6958 (New Best!)
Epoch 15 | Loss: 0.6044 | Val AUC: 0.6960 (New Best!)
    -> Scheduler: Learning Rate reduced to 0.001250
Epoch 20 | Loss: 0.6035 | Val AUC: 0.6953
Epoch 23 | Loss: 0.6035 | Val AUC: 0.6962 (New Best!)
Epoch 25 | Loss: 0.6032 | Val AUC: 0.6954
Epoch 26 | Loss: 0.6032 | Val AUC: 0.6962 (New Best!)
    -> Scheduler: Learning Rate reduced to 0.000625
Epoch 30 | Loss: 0.6027 | Val AUC: 0.6964 (New Best!)
Epoch 32 | Loss: 0.6025 | Val AUC: 0.6965 (New Best!)
    -> Scheduler: Learning Rate reduced to 0.000313
Epoch 35 | Loss: 0.6025 | Val AUC: 0.6962
    -> Scheduler: Learning Rate reduced to 0.000156
Epoch 40 

In [13]:
import pandas as pd
import numpy as np

# 1. Prepare the Kaggle Test Data
# Apply the exact same fix we used on X_train
try:
    X_kaggle_test = X_kaggle_test.astype(np.float32)
    print("Test data converted to float32.")
except ValueError:
    print("Error converting test data.")

# 2. Convert to Tensor
X_test_tensor = torch.tensor(X_kaggle_test, dtype=torch.float32).to(device)

# 3. Predict
model.eval()
with torch.no_grad():
    # Get raw probabilities
    test_predictions = model(X_test_tensor).cpu().numpy()

# 4. Create Submission DataFrame
submission = pd.DataFrame({
    'id': test_ids,
    'diagnosed_diabetes': test_predictions.flatten()
})

# 5. Save
submission.to_csv('submission.csv', index=False)
print("submission.csv saved! Ready for upload.")

Test data converted to float32.
submission.csv saved! Ready for upload.
