In [1]:
# Imports
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

RANDOM_STATE = 42


In [2]:
# 1) Load data
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'

train_df = pd.read_csv("/kaggle/input/playground-series-s5e12/train.csv")
test_df = pd.read_csv("/kaggle/input/playground-series-s5e12/test.csv")

print('train shape:', train_df.shape)
print('test shape :', test_df.shape)
display(train_df.head())
display(test_df.head())


train shape: (700000, 26)
test shape : (300000, 25)


Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history,diagnosed_diabetes
0,0,31,1,45,7.7,6.8,6.1,33.4,0.93,112,...,Female,Hispanic,Highschool,Lower-Middle,Current,Employed,0,0,0,1.0
1,1,50,2,73,5.7,6.5,5.8,23.8,0.83,120,...,Female,White,Highschool,Upper-Middle,Never,Employed,0,0,0,1.0
2,2,32,3,158,8.5,7.4,9.1,24.1,0.83,95,...,Male,Hispanic,Highschool,Lower-Middle,Never,Retired,0,0,0,0.0
3,3,54,3,77,4.6,7.0,9.2,26.6,0.83,121,...,Female,White,Highschool,Lower-Middle,Current,Employed,0,1,0,1.0
4,4,54,1,55,5.7,6.2,5.1,28.8,0.9,108,...,Male,White,Highschool,Upper-Middle,Never,Retired,0,1,0,1.0


Unnamed: 0,id,age,alcohol_consumption_per_week,physical_activity_minutes_per_week,diet_score,sleep_hours_per_day,screen_time_hours_per_day,bmi,waist_to_hip_ratio,systolic_bp,...,triglycerides,gender,ethnicity,education_level,income_level,smoking_status,employment_status,family_history_diabetes,hypertension_history,cardiovascular_history
0,700000,45,4,100,4.3,6.8,6.2,25.5,0.84,123,...,111,Female,White,Highschool,Middle,Former,Employed,0,0,0
1,700001,35,1,87,3.5,4.6,9.0,28.6,0.88,120,...,145,Female,White,Highschool,Middle,Never,Unemployed,0,0,0
2,700002,45,1,61,7.6,6.8,7.0,28.5,0.94,112,...,184,Male,White,Highschool,Low,Never,Employed,0,0,0
3,700003,55,2,81,7.3,7.3,5.0,26.9,0.91,114,...,128,Male,White,Graduate,Middle,Former,Employed,0,0,0
4,700004,77,2,29,7.3,7.6,8.5,22.0,0.83,131,...,133,Male,White,Graduate,Low,Current,Unemployed,0,0,0


In [3]:
# 2) Define features/target
TARGET_COL = 'diagnosed_diabetes'
ID_COL = 'id'

X = train_df.drop(columns=[TARGET_COL])
y = train_df[TARGET_COL].astype(int)

print('Target positive rate:', y.mean())


Target positive rate: 0.6232957142857143


In [4]:
# 3) Identify numeric vs categorical columns
# NOTE: Some datasets store categoricals as strings; others store them as integer codes.
# We handle both by explicitly listing the known categorical feature names.
known_categoricals = [
    'gender',
    'ethnicity',
    'education_level',
    'income_level',
    'smoking_status',
    'employment_status',
]

categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
for c in known_categoricals:
    if c in X.columns and c not in categorical_cols:
        categorical_cols.append(c)

numeric_cols = [c for c in X.columns if c not in categorical_cols]

print('Categorical columns:', categorical_cols)
print('Numeric columns     :', numeric_cols)


Categorical columns: ['gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status']
Numeric columns     : ['id', 'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history']


In [5]:
# 4) Build a baseline pipeline
# - Impute missing values
# - One-hot encode categoricals (handle unseen categories in test)
# - Scale numerics (with_mean=False so it stays compatible with sparse matrices)
# - Train Logistic Regression (fast, strong baseline for tabular problems)

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

model = LogisticRegression(
    solver='saga',
    max_iter=200,
    n_jobs=-1,
)

clf = Pipeline(steps=[('preprocess', preprocess), ('model', model)])


In [6]:
# 5) Quick local validation (train/valid split)
# This gives you a rough idea of model quality before you train on all data.

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

clf.fit(X_train, y_train)
valid_pred = clf.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, valid_pred)
print(f'Validation AUC: {auc:.5f}')


Validation AUC: 0.69478


In [7]:
# 6) Train on full training data and write submission.csv
clf.fit(X, y)
test_pred = clf.predict_proba(test_df)[:, 1]

submission = pd.DataFrame({
    ID_COL: test_df[ID_COL],
    TARGET_COL: test_pred,
})

# Sanity checks
assert submission.columns.tolist() == [ID_COL, TARGET_COL]
assert submission[TARGET_COL].between(0, 1).all()

SUBMISSION_PATH = 'submission.csv'
submission.to_csv(SUBMISSION_PATH, index=False)
print('Wrote', SUBMISSION_PATH, 'with shape', submission.shape)
display(submission.head())


Wrote submission.csv with shape (300000, 2)


Unnamed: 0,id,diagnosed_diabetes
0,700000,0.566139
1,700001,0.570171
2,700002,0.662936
3,700003,0.631223
4,700004,0.778795
