In [1]:
import pandas as pd
import pickle
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

print("Training with sklearn version:", sklearn.__version__)

# Load dataset
diabetes = pd.read_csv("diabetes.csv")

# Replace invalid zeros with median
cols_to_fix = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in cols_to_fix:
    diabetes[col] = diabetes[col].replace(0, diabetes[col].median())

# Features & target
X = diabetes.drop("Outcome", axis=1)
y = diabetes["Outcome"]

# Pipeline
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", GaussianNB())
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

pipeline.fit(X_train, y_train)

print("Train Accuracy:", pipeline.score(X_train, y_train))
print("Test Accuracy:", pipeline.score(X_test, y_test))

# Save model
with open("diabetes_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("✅ Model trained & saved successfully")


Training with sklearn version: 1.8.0
Train Accuracy: 0.757328990228013
Test Accuracy: 0.6948051948051948
✅ Model trained & saved successfully
