In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, roc_curve
import xgboost as xgb
import lightgbm as lgb
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the competition's labelled training set and unlabelled test set,
# then report the (rows, columns) shape of each frame.
train = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s5e12/train.csv')
test = pd.read_csv(filepath_or_buffer='/kaggle/input/playground-series-s5e12/test.csv')
print(f"Train shape: {train.shape}, Test shape: {test.shape}")

Train shape: (700000, 26), Test shape: (300000, 25)


# FEATURE ENGINEERING

In [3]:
print("\nPerforming feature engineering...")

def create_features(df):
    """Return a copy of ``df`` with engineered columns appended.

    Each feature group is built only when its source column(s) exist in
    ``df``; missing columns are skipped silently. The input frame is never
    modified.

    Columns that may be added:
      * ``age_squared``, ``age_log`` (``log1p`` of age), ``age_cubed``
      * ``bmi_category`` (binned BMI), ``bmi_squared``, ``bmi_cubed``
      * ``bp_category`` (binned ``blood_pressure``)
      * ``age_bmi_interaction`` (``age * bmi``)
      * ``<col>_squared`` / ``<col>_cubed`` for ``glucose``,
        ``blood_pressure`` and ``cholesterol``

    Args:
        df: pandas DataFrame holding the raw feature columns.

    Returns:
        A new DataFrame with the original columns followed by the
        engineered ones.
    """
    df_copy = df.copy()
    # Squared columns already produced by the dedicated sections below, so the
    # polynomial loop does not compute the same power a second time.
    squared_done = set()

    # Age-related features
    if 'age' in df_copy.columns:
        df_copy['age_squared'] = df_copy['age'] ** 2
        df_copy['age_log'] = np.log1p(df_copy['age'])
        squared_done.add('age_squared')

    # BMI-related features (if available)
    if 'bmi' in df_copy.columns:
        # The last bin is open-ended so that very large values land in the top
        # category instead of silently becoming NaN.
        df_copy['bmi_category'] = pd.cut(
            df_copy['bmi'],
            bins=[0, 18.5, 25, 30, 35, np.inf],
            labels=['Underweight', 'Normal', 'Overweight', 'Obese', 'Severely Obese'],
        )
        df_copy['bmi_squared'] = df_copy['bmi'] ** 2
        squared_done.add('bmi_squared')

    # Blood pressure features (if available)
    if 'blood_pressure' in df_copy.columns:
        # Open-ended top bin, for the same reason as the BMI bins above.
        df_copy['bp_category'] = pd.cut(
            df_copy['blood_pressure'],
            bins=[0, 90, 120, 130, 140, np.inf],
            labels=['Low', 'Normal', 'Elevated', 'High1', 'High2'],
        )

    # Interaction feature
    if all(col in df_copy.columns for col in ['age', 'bmi']):
        df_copy['age_bmi_interaction'] = df_copy['age'] * df_copy['bmi']

    # Polynomial features for key numerical columns
    numerical_cols = ['age', 'bmi', 'glucose', 'blood_pressure', 'cholesterol']
    for col in numerical_cols:
        if col not in df_copy.columns:
            continue
        squared_name = f'{col}_squared'
        if squared_name not in squared_done:
            df_copy[squared_name] = df_copy[col] ** 2
        df_copy[f'{col}_cubed'] = df_copy[col] ** 3

    return df_copy

# Run the same feature-engineering routine over both frames so that train and
# test end up with matching engineered columns.
train, test = create_features(train), create_features(test)


Performing feature engineering...


# DATA PREPROCESSING

In [4]:
print("Preprocessing data...")

# Fail fast when the label column is absent from the training frame.
if 'diagnosed_diabetes' not in train.columns:
    raise ValueError("Target column 'diagnosed_diabetes' not found in training data")

# Predictors are everything except the row id and the label.
y = train['diagnosed_diabetes']
X = train.drop(columns=['id', 'diagnosed_diabetes'])

# Keep the test ids aside for the submission file; they are not a feature.
test_ids = test['id']
X_test = test.drop(columns=['id'])

# Partition the predictor columns by dtype.
categorical_cols = list(X.select_dtypes(include=['object', 'category']).columns)
numerical_cols = list(X.select_dtypes(include=['int64', 'float64']).columns)

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {len(numerical_cols)}")


Preprocessing data...
Categorical columns: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status', 'bmi_category']
Numerical columns: 24


# ENCODING CATEGORICAL VARIABLES

In [5]:
# Integer-encode each categorical column. One LabelEncoder per column is fitted
# on the string-cast values of train and test stacked together, so both frames
# share a single mapping. The fitted encoders are kept in `label_encoders`.
# NOTE(review): the string cast presumably turns missing values into a literal
# 'nan' category, so they would not reach the imputer later — confirm.
label_encoders = {}
for col in categorical_cols:
    if col not in X.columns:
        continue

    train_as_str = X[col].astype(str)
    test_as_str = X_test[col].astype(str)

    combined = pd.concat([train_as_str, test_as_str], ignore_index=True)
    le = LabelEncoder()
    le.fit(combined)

    X[col] = le.transform(train_as_str)
    X_test[col] = le.transform(test_as_str)
    label_encoders[col] = le

# HANDLE MISSING VALUES

In [6]:
# Median imputation. The imputer is fitted on every column of X (not only the
# originally numerical ones) and the learned medians are reused for X_test.
imputer = SimpleImputer(strategy='median')
imputer.fit(X)
X_imputed, X_test_imputed = imputer.transform(X), imputer.transform(X_test)

# Wrap the imputed arrays back into DataFrames under the original column names
# (the rebuilt frames get a fresh default index).
X = pd.DataFrame(data=X_imputed, columns=X.columns)
X_test = pd.DataFrame(data=X_test_imputed, columns=X_test.columns)

# SCALING FEATURES

In [7]:
print("Scaling features...")
# Learn the standardisation statistics from the full training matrix, then
# apply that same transform to both the training and the test matrices.
# NOTE(review): the scaler is fitted before the validation split, so the
# validation rows contribute to its statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

Scaling features...


# TRAIN-TEST SPLIT FOR VALIDATION

In [8]:
# Hold out 20% of the scaled training rows for validation, stratified on the
# label so both parts keep the same class proportions; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the sizes and per-class counts of the two parts.
print(f"\nTraining set: {X_train.shape}, Validation set: {X_val.shape}")
print(f"Class distribution - Train: {np.bincount(y_train)}")
print(f"Class distribution - Val: {np.bincount(y_val)}")


Training set: (560000, 31), Validation set: (140000, 31)
Class distribution - Train: [210954 349046]
Class distribution - Val: [52739 87261]


# MODEL TRAINING - XGBOOST

In [9]:
# Train XGBoost on the training split, using the validation split for early stopping.
# NOTE(review): the hyperparameters below are hand-picked; no tuning run is
# shown in this notebook.
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='auc',
    # Early stopping is configured on the estimator: passing it to fit() is
    # deprecated and rejected by newer XGBoost releases.
    early_stopping_rounds=20,
    # Weight the positive class by the negative/positive count ratio.
    scale_pos_weight=float((y_train == 0).sum()) / float((y_train == 1).sum()),
)

# Fit the model; training halts once validation AUC has not improved for 20 rounds.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

# Score the validation split with the second predict_proba column (class 1).
# NOTE(review): the same validation rows drive early stopping and this AUC, so
# the reported score may be slightly optimistic.
y_pred_xgb = xgb_model.predict_proba(X_val)[:, 1]
xgb_auc = roc_auc_score(y_val, y_pred_xgb)
print(f"XGBoost Validation AUC: {xgb_auc:.4f}")

XGBoost Validation AUC: 0.7196


# MODEL TRAINING - LIGHTGBM

In [10]:
# Train LightGBM on the training split and monitor AUC on the validation split.
lgb_model = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    num_leaves=31,
    # NOTE(review): per the LightGBM parameter docs, row subsampling only takes
    # effect when subsample_freq > 0 (its default is 0) — confirm, and set
    # subsample_freq=1 if bagging is actually intended.
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    class_weight='balanced'
)

lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    # Callback form of the former `early_stopping_rounds=20, verbose=False`
    # fit arguments, which recent LightGBM versions no longer accept.
    callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)],
)

# Score the validation split with the second predict_proba column (class 1).
y_pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
lgb_auc = roc_auc_score(y_val, y_pred_lgb)
print(f"LightGBM Validation AUC: {lgb_auc:.4f}")

[LightGBM] [Info] Number of positive: 349046, number of negative: 210954
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.042421 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2566
[LightGBM] [Info] Number of data points in the train set: 560000, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
LightGBM Validation AUC: 0.7194


# MODEL TRAINING - RANDOM FOREST

In [11]:
# Random forest: 200 depth-limited trees with balanced class weights, built
# in parallel on all cores and fitted on the training split.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)

# Score the validation split with the second predict_proba column (class 1).
y_pred_rf = rf_model.predict_proba(X_val)[:, 1]
rf_auc = roc_auc_score(y_val, y_pred_rf)
print(f"Random Forest Validation AUC: {rf_auc:.4f}")


Random Forest Validation AUC: 0.6970


# MODEL ENSEMBLING (Weighted Average)

In [12]:
# Weight each model by its validation AUC, normalised so the weights sum to 1.
ensemble_weights = {
    'xgb': xgb_auc,
    'lgb': lgb_auc,
    'rf': rf_auc
}
total_weight = sum(ensemble_weights.values())
ensemble_weights = {
    name: auc / total_weight for name, auc in ensemble_weights.items()
}

print("Model weights for ensemble:")
for model, weight in ensemble_weights.items():
    print(f"  {model.upper()}: {weight:.4f}")

# Get predictions from all models on test data
print("\nGenerating test predictions...")

# Refit every model on the full (train + validation) data before predicting.
print("Retraining models on full dataset...")
# X_scaled already holds every training row, so it is used directly; stacking
# it with itself as the only element would just copy the whole matrix.
X_full_scaled = X_scaled
y_full = y.values

# Final XGBoost: same settings as the validation run, without early stopping
# because there is no held-out split at this stage.
final_xgb = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='auc',
    # Weight the positive class by the negative/positive count ratio.
    scale_pos_weight=len(y_full[y_full==0]) / len(y_full[y_full==1])
)
final_xgb.fit(X_full_scaled, y_full)

# Final LightGBM: same settings as the validation run.
final_lgb = lgb.LGBMClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    class_weight='balanced'
)
final_lgb.fit(X_full_scaled, y_full)

# Final random forest: same settings as the validation run.
final_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)
final_rf.fit(X_full_scaled, y_full)

# Test-set scores from each final model (second predict_proba column, class 1).
xgb_preds = final_xgb.predict_proba(X_test_scaled)[:, 1]
lgb_preds = final_lgb.predict_proba(X_test_scaled)[:, 1]
rf_preds = final_rf.predict_proba(X_test_scaled)[:, 1]

# Weighted average of the three score vectors using the normalised AUC weights.
ensemble_preds = (
    xgb_preds * ensemble_weights['xgb'] +
    lgb_preds * ensemble_weights['lgb'] +
    rf_preds * ensemble_weights['rf']
)


Model weights for ensemble:
  XGB: 0.3369
  LGB: 0.3368
  RF: 0.3263

Generating test predictions...
Retraining models on full dataset...
[LightGBM] [Info] Number of positive: 436307, number of negative: 263693
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.172808 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2565
[LightGBM] [Info] Number of data points in the train set: 700000, number of used features: 31
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


# POST-PROCESS PREDICTIONS (fixed sigmoid rescaling, not a fitted calibration)

In [13]:
# Pass the ensemble scores through a fixed logistic curve centred on 0.5 with a
# slope factor of 3. Nothing here is fitted to data, and the transform is
# monotone in the input score, so the ordering of predictions is unchanged.
centered_preds = ensemble_preds - 0.5
calibrated_preds = 1 / (1 + np.exp(-centered_preds * 3))

# Mix 70% of the transformed scores with 30% of the raw ensemble scores.
final_predictions = 0.7 * calibrated_preds + 0.3 * ensemble_preds

# Keep every value inside [0.001, 0.999].
final_predictions = final_predictions.clip(0.001, 0.999)

# CREATE SUBMISSION FILE

In [14]:
# Pair each test id with its final score in the two-column submission layout.
submission = pd.DataFrame({'id': test_ids}).assign(
    diagnosed_diabetes=final_predictions
)

# Write the submission without the index column.
submission.to_csv('submission_ensemble.csv', index=False)

# Quick sanity summary of what was written.
print(f"\n✓ Submission file saved: submission_ensemble.csv")
print(f"✓ Number of predictions: {len(submission)}")
print(f"✓ Prediction range: [{final_predictions.min():.4f}, {final_predictions.max():.4f}]")
print(f"✓ Mean prediction: {final_predictions.mean():.4f}")


✓ Submission file saved: submission_ensemble.csv
✓ Number of predictions: 300000
✓ Prediction range: [0.1711, 0.8514]
✓ Mean prediction: 0.4991


# ANALYZE PREDICTIONS

In [15]:
print("\nPrediction Statistics:")
print(submission['diagnosed_diabetes'].describe())

# Count the final predictions in ten equal-width buckets covering [0, 1] and
# print each bucket's count with its share of all predictions.
bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
hist, bin_edges = np.histogram(final_predictions, bins=bins)
print("\nPrediction Distribution:")
for i, count in enumerate(hist):
    print(f"  {bin_edges[i]:.1f}-{bin_edges[i+1]:.1f}: {count} ({count/len(final_predictions)*100:.1f}%)")


Prediction Statistics:
count    300000.000000
mean          0.499084
std           0.141955
min           0.171094
25%           0.390425
50%           0.484698
75%           0.594552
max           0.851393
Name: diagnosed_diabetes, dtype: float64

Prediction Distribution:
  0.0-0.1: 0 (0.0%)
  0.1-0.2: 188 (0.1%)
  0.2-0.3: 19978 (6.7%)
  0.3-0.4: 61833 (20.6%)
  0.4-0.5: 80156 (26.7%)
  0.5-0.6: 65578 (21.9%)
  0.6-0.7: 37356 (12.5%)
  0.7-0.8: 31004 (10.3%)
  0.8-0.9: 3907 (1.3%)
  0.9-1.0: 0 (0.0%)


# FEATURE IMPORTANCE

In [16]:
# Tabulate the final XGBoost model's per-feature importances against the
# feature names, sorted from highest to lowest importance.
feature_importance = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': final_xgb.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
)

print("\nTop 20 Most Important Features:")
print(feature_importance.head(20).to_string(index=False))

# Persist the full ranking next to the submission file.
feature_importance.to_csv('feature_importance_xgb.csv', index=False)
print("\n✓ Feature importance saved to: feature_importance_xgb.csv")


Top 20 Most Important Features:
                           feature  importance
           family_history_diabetes    0.713453
               age_bmi_interaction    0.082993
physical_activity_minutes_per_week    0.041983
                       age_squared    0.027045
                               age    0.018223
                     triglycerides    0.013319
                   ldl_cholesterol    0.010475
                           age_log    0.008856
                        diet_score    0.007119
                   hdl_cholesterol    0.006629
            cardiovascular_history    0.005920
                        heart_rate    0.005736
                 cholesterol_total    0.005004
                               bmi    0.004736
                       systolic_bp    0.004385
         screen_time_hours_per_day    0.003954
                waist_to_hip_ratio    0.003901
                       bmi_squared    0.003669
                         bmi_cubed    0.003477
      alcohol_consumption_p