# Setup

In [None]:
import pandas as pd
import os

In [None]:
# Show every file available under the Kaggle input directory.
for folder, _, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# Load the competition data; the `id` column becomes the row index of each frame.
training = pd.read_csv(
    '/kaggle/input/playground-series-s5e12/train.csv',
    index_col='id',
)
testing = pd.read_csv(
    '/kaggle/input/playground-series-s5e12/test.csv',
    index_col='id',
)

# Preview the first five training rows (5 is the default for `head`).
training.head()

# EDA

In [None]:
# Per-column count of missing values in the training data.
# `DataFrame.isnull` is an alias of `DataFrame.isna`, so printing both would
# only show the same table twice; one report is enough.
print(training.isna().sum())

In [None]:
# Print the structural summary of the training frame.
training.info()

# Descriptive statistics across all columns (include="all"); kept as the
# cell's final expression so the notebook displays the resulting table.
summary_stats = training.describe(include="all")
summary_stats

In [None]:
def get_vars(df, exclude_cols=None):
    """
    Group the columns of ``df`` by dtype and by number of distinct values.

    Args:
        df: DataFrame whose columns are classified.
        exclude_cols: Column names left out of every group (for example the
            prediction target). ``None`` excludes nothing.

    Returns:
        A dict of lists of column names:
            "numerical_vars": columns with a numeric dtype.
            "categorical_vars": object / category / bool columns.
            "continuous_vars": numeric columns not classified as discrete.
            "discrete_vars": numeric columns with at most 10 distinct values,
                or integer columns whose distinct-value count is below 1% of
                the row count; two-valued columns are listed under
                "binary_vars" instead.
            "binary_vars": such columns with exactly two distinct non-null
                values.
            "true_numerical_vars": continuous_vars + discrete_vars.
            "true_categorical_vars": categorical_vars + binary_vars.
            "all_vars": true_numerical_vars + true_categorical_vars.
    """
    skipped = [] if exclude_cols is None else exclude_cols

    def _columns_of(dtypes):
        # Columns of the requested dtypes, in frame order, minus exclusions.
        selected = df.select_dtypes(include=dtypes).columns.tolist()
        return [name for name in selected if name not in skipped]

    # Base groups, decided by dtype alone.
    numerical_vars = _columns_of(["number"])
    categorical_vars = _columns_of(["object", "category", "bool"])

    # Split numeric columns into continuous and low-cardinality ones.
    total_rows = len(df)
    continuous_vars = []
    low_cardinality = []

    for name in numerical_vars:
        column = df[name]
        distinct = column.nunique(dropna=True)

        if distinct <= 10:
            low_cardinality.append(name)
        elif column.dtype.kind in "iu" and distinct / total_rows < 0.01:
            # Only reached when distinct > 10, so total_rows is never 0 here.
            low_cardinality.append(name)
        else:
            continuous_vars.append(name)

    # Two-valued columns are reported as binary rather than discrete.
    binary_vars = [
        name for name in low_cardinality
        if len(df[name].dropna().unique()) == 2
    ]
    discrete_vars = [name for name in low_cardinality if name not in binary_vars]

    # Final groups: binary columns are treated as categorical.
    true_numerical_vars = continuous_vars + discrete_vars
    true_categorical_vars = categorical_vars + binary_vars

    return {
        "numerical_vars": numerical_vars,
        "categorical_vars": categorical_vars,
        "continuous_vars": continuous_vars,
        "discrete_vars": discrete_vars,
        "binary_vars": binary_vars,
        "true_numerical_vars": true_numerical_vars,
        "true_categorical_vars": true_categorical_vars,
        "all_vars": true_numerical_vars + true_categorical_vars,
    }

# Classify every training column except the prediction target.
vars_dict = get_vars(training, exclude_cols=["diagnosed_diabetes"])

# Splitting and Preparing the Data

In [None]:
from sklearn.model_selection import train_test_split

# Single seed reused by every random step so runs are reproducible.
SEED = 42

# Features are the columns grouped by get_vars; the target is kept separately.
X = training[vars_dict["all_vars"]]
y = training["diagnosed_diabetes"]

# Hold out 30% of the rows for evaluation. Stratifying on the target keeps the
# class ratio identical in both partitions, so the hold-out metrics are not
# skewed by an unlucky split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=SEED, stratify=y
)

print(X_train.shape, X_test.shape)
print(y_train.value_counts())

In [None]:
from sklearn.utils import resample

# Replace the original `id` index with a fresh 0..n-1 index on both objects
# (they stay row-aligned), then join them so rows can be resampled as a unit.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
train_data = pd.concat([X_train, y_train], axis=1)

# Decide which class is the larger one from the data rather than assuming
# label 1 is the majority: with the roles swapped, resampling would shrink the
# larger class instead of upsampling the smaller one.
positives = train_data[train_data["diagnosed_diabetes"] == 1]
negatives = train_data[train_data["diagnosed_diabetes"] == 0]
if len(positives) >= len(negatives):
    majority, minority = positives, negatives
else:
    majority, minority = negatives, positives

# Sample the minority class with replacement until it matches the majority.
minority_upsampled = resample(minority,
                              replace=True,
                              n_samples=len(majority),
                              random_state=SEED)

# Recombine and shuffle so the two classes are interleaved.
train_data_balanced = pd.concat([majority, minority_upsampled])
train_data_balanced = train_data_balanced.sample(frac=1, random_state=SEED).reset_index(drop=True)

X_train = train_data_balanced.drop('diagnosed_diabetes', axis=1)
y_train = train_data_balanced['diagnosed_diabetes']

print(f"Balanced Train Counts:\n{y_train.value_counts()}")

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardised.
numeric_cols = vars_dict["true_numerical_vars"]

scaler = StandardScaler()

# Fit on the training rows only, then rebuild a labelled frame on the same index.
X_train_scaled_num = scaler.fit_transform(X_train[numeric_cols])
X_train_scaled_df = pd.DataFrame(X_train_scaled_num, columns=numeric_cols, index=X_train.index)

# The hold-out rows are only transformed with the already fitted scaler.
X_test_scaled_num = scaler.transform(X_test[numeric_cols])
X_test_scaled_df = pd.DataFrame(X_test_scaled_num, columns=numeric_cols, index=X_test.index)

In [None]:
categorical_cols = vars_dict["true_categorical_vars"]

# One-hot encode the categorical columns. Only the training frame drops the
# first level of each variable. The hold-out frame keeps every level and is then
# trimmed to the training columns below; dropping "the first level"
# independently on each frame could remove a different level from the hold-out
# set whenever it lacks one of the training levels, silently zeroing real
# category information.
train_dummies = pd.get_dummies(X_train[categorical_cols], drop_first=True)
test_dummies = pd.get_dummies(X_test[categorical_cols], drop_first=False)

# Keep exactly the training columns: extra hold-out columns are discarded and
# columns the hold-out set never produced are filled with 0.
X_train_encoded, X_test_encoded = train_dummies.align(
    test_dummies,
    join='left',
    axis=1,
    fill_value=0
)

print(f"Encoded Shape Train: {X_train_encoded.shape}, Test: {X_test_encoded.shape}")

# Final feature matrices: scaled numeric columns followed by encoded categoricals.
X_train = pd.concat([X_train_scaled_df, X_train_encoded], axis=1)
X_test = pd.concat([X_test_scaled_df, X_test_encoded], axis=1)

print(f"Final Feature Shape Train: {X_train.shape}, Test: {X_test.shape}")

# Evaluation Function

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score, average_precision_score, roc_curve, precision_recall_curve

def evaluate_and_graph(model, X_train, y_train, X_test, y_test, name):
    """
    Fit ``model`` on the training data and report train / test metrics.

    The model is fitted in place, so the caller's object is trained when this
    returns. Scores come from ``predict_proba`` (second column) when the model
    provides it, otherwise from ``decision_function``.

    Args:
        model: Estimator exposing ``fit``, ``predict`` and either
            ``predict_proba`` or ``decision_function``.
        X_train, y_train: Data the model is fitted and re-scored on.
        X_test, y_test: Held-out data used for the test metrics.
        name: Label used in the printed report and in the returned dict.

    Returns:
        dict with keys "model", "accuracy", "auc", "average_precision"
        (test-set metrics) and "train_accuracy", "train_auc".
    """
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # Continuous scores are needed for the ranking metrics (AUC, AP).
    if hasattr(model, "predict_proba"):
        y_prob_test = model.predict_proba(X_test)[:, 1]
        y_prob_train = model.predict_proba(X_train)[:, 1]
    else:
        y_prob_test = model.decision_function(X_test)
        y_prob_train = model.decision_function(X_train)

    test_acc = accuracy_score(y_test, y_pred_test)
    test_auc = roc_auc_score(y_test, y_prob_test)
    test_ap = average_precision_score(y_test, y_prob_test)

    train_acc = accuracy_score(y_train, y_pred_train)
    train_auc = roc_auc_score(y_train, y_prob_train)

    print(f"--- {name} ---")
    print(f"Train Accuracy: {train_acc:.4f} | Train AUC: {train_auc:.4f}")
    print(f"Test  Accuracy: {test_acc:.4f} | Test  AUC: {test_auc:.4f}")

    # Flag a train/test accuracy gap of more than five points.
    if (train_acc - test_acc) > 0.05:
        print("⚠️ Warning: Signs of Overfitting (Train is much better than Test)")
    else:
        print("✅ Model seems balanced")
    print("-" * 30)

    # The ROC / precision-recall curve points were previously computed here but
    # never plotted or returned, so that dead computation has been removed.

    return {
        "model": name,
        "accuracy": test_acc,
        "auc": test_auc,
        "average_precision": test_ap,
        "train_accuracy": train_acc,
        "train_auc": train_auc
    }

# Gradient Boosting Classifier Model

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 estimators and a 0.1 learning rate, seeded with SEED.
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=SEED,
)

# evaluate_and_graph fits `gb` in place and prints the report; the metrics
# dict stays the cell's final expression so the notebook displays it.
gb_metrics = evaluate_and_graph(gb, X_train, y_train, X_test, y_test, "Gradient Boosting")
gb_metrics

# Submission for Kaggle

Manual process: review the evaluation results above and make sure the cells below call the best-performing model.

In [None]:
numeric_cols = vars_dict["true_numerical_vars"]
categorical_cols = vars_dict["true_categorical_vars"]

# Scale with the scaler already fitted on the training data (transform only).
testing_scaled_num = scaler.transform(testing[numeric_cols])
testing_scaled_df = pd.DataFrame(
    testing_scaled_num,
    columns=numeric_cols,
    index=testing.index
)

# Keep every level here (drop_first=False). The alignment below trims the
# result to the training columns, which already omit each variable's dropped
# level. Dropping the first level of this frame independently could remove a
# different level if the submission data lacks one of the training levels.
testing_dummies = pd.get_dummies(testing[categorical_cols], drop_first=False)

# Reuse the training column layout: unseen columns are discarded and columns
# missing from the submission data are filled with 0.
_, testing_encoded_aligned = X_train_encoded.align(
    testing_dummies,
    join='left',
    axis=1,
    fill_value=0
)

testing_final = pd.concat([testing_scaled_df, testing_encoded_aligned], axis=1)

# Compare against the training feature matrix (the label previously printed
# the shape of the submission data's scaled numeric columns by mistake).
print(f"Training shape: {X_train.shape}")
print(f"Testing shape:  {testing_final.shape}")

In [None]:
# Take the second column of predict_proba as the submitted score.
class_probabilities = gb.predict_proba(testing_final)
predictions = class_probabilities[:, 1]

# Two-column submission frame: the row id and the predicted score.
submission = pd.DataFrame(
    {
        'id': testing.index,
        'diagnosed_diabetes': predictions,
    }
)

# The ids are already a regular column, so the frame's own index is not written.
output_path = 'gradient_boosting_submission.csv'
submission.to_csv(output_path, index=False)

print("Submission saved successfully!")
submission.head()