In [11]:
import pandas as pd
import os

# 1. Load the datasets
# Assuming the files are in the same directory as your script/notebook
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
submission_df = pd.read_csv('sample_submission.csv')

# 2. Basic sanity check - Print the size of the datasets
print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

# 3. Look at the first few rows to understand the features
print("\n--- Train Data Head ---")
print(train_df.head())

print("\n--- Submission Format Example ---")
print(submission_df.head())

Train shape: (700000, 26)
Test shape: (300000, 25)

--- Train Data Head ---
   id  age  alcohol_consumption_per_week  physical_activity_minutes_per_week  \
0   0   31                             1                                  45   
1   1   50                             2                                  73   
2   2   32                             3                                 158   
3   3   54                             3                                  77   
4   4   54                             1                                  55   

   diet_score  sleep_hours_per_day  screen_time_hours_per_day   bmi  \
0         7.7                  6.8                        6.1  33.4   
1         5.7                  6.5                        5.8  23.8   
2         8.5                  7.4                        9.1  24.1   
3         4.6                  7.0                        9.2  26.6   
4         5.7                  6.2                        5.1  28.8   

   waist_to_hip_

In [12]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import pandas as pd
import numpy as np

# 2. Re-Apply Feature Engineering (Exact same as before)
train_df['is_train'] = 1
test_df['is_train'] = 0
test_df['diagnosed_diabetes'] = np.nan 
all_data = pd.concat([train_df, test_df], axis=0)

# --- Feature Engineering ---
all_data['age_bmi'] = all_data['age'] * all_data['bmi']
all_data['age_bp'] = all_data['age'] * all_data['systolic_bp']
all_data['bmi_bp'] = all_data['bmi'] * all_data['systolic_bp']
all_data['is_obese'] = (all_data['bmi'] >= 30).astype(int)
all_data['is_overweight'] = ((all_data['bmi'] >= 25) & (all_data['bmi'] < 30)).astype(int)
all_data['high_bp'] = (all_data['systolic_bp'] >= 130).astype(int)
all_data['lifestyle_risk'] = (
    (all_data['bmi'] > 30).astype(int) + 
    (all_data['physical_activity_minutes_per_week'] < 60).astype(int) +
    (all_data['sleep_hours_per_day'] < 6).astype(int)
)

# 3. Handle Missing Values (Basic fill)
numerical_cols = ['age', 'bmi', 'systolic_bp', 'diet_score', 'sleep_hours_per_day', 
                  'physical_activity_minutes_per_week', 'waist_to_hip_ratio',
                  'age_bmi', 'age_bp', 'bmi_bp', 'lifestyle_risk']
                  
all_data[numerical_cols] = all_data[numerical_cols].fillna(all_data[numerical_cols].mean())
all_data = all_data.fillna('Missing') # Fill text missings with "Missing" string



In [13]:
# 4. Split Back
train_final = all_data[all_data['is_train'] == 1].drop(['id', 'is_train', 'diagnosed_diabetes'], axis=1)
test_final = all_data[all_data['is_train'] == 0].drop(['id', 'is_train', 'diagnosed_diabetes'], axis=1)
y = train_df['diagnosed_diabetes'].values

# 5. Identify Categorical Columns Indices
# CatBoost needs to know WHICH columns are text
cat_features = ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']

print("Data Ready for CatBoost.")

Data Ready for CatBoost.


In [16]:
# --- CATBOOST CV LOOP ---
folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

oof_preds = np.zeros(len(train_final))
test_preds = np.zeros(len(test_final))

print(f"Starting CatBoost CV...")

for fold, (train_idx, val_idx) in enumerate(skf.split(train_final, y)):
    X_train_fold = train_final.iloc[train_idx]
    y_train_fold = y[train_idx]
    X_val_fold = train_final.iloc[val_idx]
    y_val_fold = y[val_idx]
    
    # Define Model
    # iterations=2000 is usually enough with early stopping
    model = CatBoostClassifier(
        iterations=5000,
        learning_rate=0.02,
        depth=8,
        eval_metric='AUC',
        random_seed=42,
        bagging_temperature=0.2,
        od_type='Iter',
        od_wait=50,
        verbose=200
    )
    
    model.fit(
        X_train_fold, y_train_fold,
        eval_set=(X_val_fold, y_val_fold),
        cat_features=cat_features, # CRITICAL: Tell it which cols are text
        use_best_model=True
    )
    
    val_preds = model.predict_proba(X_val_fold)[:, 1]
    oof_preds[val_idx] = val_preds
    
    test_preds += model.predict_proba(test_final)[:, 1] / folds
    
    print(f"Fold {fold+1} AUC: {roc_auc_score(y_val_fold, val_preds):.5f}")

print(f"\nOverall CatBoost CV AUC: {roc_auc_score(y, oof_preds):.5f}")

Starting CatBoost CV...
0:	test: 0.6843861	best: 0.6843861 (0)	total: 318ms	remaining: 26m 28s
200:	test: 0.7056378	best: 0.7056378 (200)	total: 58.2s	remaining: 23m 9s
400:	test: 0.7108423	best: 0.7108423 (400)	total: 2m 23s	remaining: 27m 30s
600:	test: 0.7142913	best: 0.7142913 (600)	total: 3m 18s	remaining: 24m 13s
800:	test: 0.7182151	best: 0.7182151 (800)	total: 4m 15s	remaining: 22m 18s
1000:	test: 0.7210998	best: 0.7210998 (1000)	total: 5m 13s	remaining: 20m 53s
1200:	test: 0.7225078	best: 0.7225078 (1200)	total: 6m 12s	remaining: 19m 37s
1400:	test: 0.7234717	best: 0.7234736 (1398)	total: 7m 10s	remaining: 18m 26s
1600:	test: 0.7242936	best: 0.7242937 (1598)	total: 8m 10s	remaining: 17m 20s
1800:	test: 0.7247703	best: 0.7247703 (1800)	total: 9m 9s	remaining: 16m 15s
2000:	test: 0.7252023	best: 0.7252023 (2000)	total: 10m 7s	remaining: 15m 10s
2200:	test: 0.7255408	best: 0.7255408 (2200)	total: 11m 5s	remaining: 14m 6s
2400:	test: 0.7258984	best: 0.7258984 (2400)	total: 12m 3s	

In [17]:
# Save
submission_cat = pd.DataFrame({
    'id': test_df['id'],
    'diagnosed_diabetes': test_preds
})
submission_cat.to_csv('submission_catboost.csv', index=False)
print("submission_catboost.csv saved!")


submission_catboost.csv saved!
