In [None]:
# standard library
import warnings

# data manipulation
import numpy as np
import pandas as pd

# visualizations
import matplotlib.pyplot as plt
import seaborn as sns


# One seaborn theme (dark grid, pastel palette) for every figure in the notebook
sns.set_theme(palette="pastel", style="darkgrid")

# Silence every Python warning for the rest of the session
warnings.filterwarnings('ignore')

## EDA

In [None]:
# Read the diabetes CSV from the Kaggle input directory into a DataFrame
data = pd.read_csv(filepath_or_buffer='/kaggle/input/diabetes-data-set/diabetes.csv')

# Column names, dtypes, non-null counts and memory usage
data.info()

In [None]:
# Preview the first five rows (the DataFrame.head default) to sanity-check
# column names and value formats
data.head()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per numeric column
data.describe()

In [None]:
# Number of distinct values in each column; very low counts point to
# binary/categorical columns
data.nunique()

In [None]:
# Count missing (NaN/None) entries per column
# NOTE(review): this only detects real NaN values. If the file encodes missing
# measurements as 0 (check the `min` row of describe()), they are not counted
# here — confirm.
data.isna().sum()

In [None]:
# Count fully duplicated rows; the first occurrence of each row is not counted,
# only its repeats
data.duplicated().sum()

In [None]:
'''
The data is labelled and the target label is "Outcome".
'''
# Segregate columns: every column except the target is treated as a feature
target = 'Outcome'
features = data.columns.drop(target)

In [None]:
# Histogram grid with 20 bins per panel: one panel for every numeric column of
# `data`, which at this point still contains the target column
data.hist(bins=20, edgecolor='black', figsize=(12, 10))
plt.gcf().suptitle("Feature Distributions", fontsize=16)
plt.show()

In [None]:
# Box plots of every feature, split by target class, to expose outliers.
# The grid is sized from the number of features (3 panels per row) instead of a
# fixed 3x3 layout, so adding columns to the data cannot overflow the grid.
n_cols = 3
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, feature in zip(axes, features):
    sns.boxplot(x=target, y=feature, data=data, ax=ax)
    ax.set_title(f"box Plot of {feature}")
    # No legend: the boxes carry no labelled artists, the x-axis already names
    # the classes.

# Hide the panels left over when the grid has more cells than features
for ax in axes[len(features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
#  KDE plot of every feature, one density curve per target class.
# The class subsets are the same for every feature, so filter once up front
# rather than inside the loop, and use `target` instead of repeating its name.
no_diabetes = data[data[target] == 0]
diabetes = data[data[target] == 1]

# Size the grid from the number of features (3 panels per row)
n_cols = 3
n_rows = max(1, -(-len(features) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, feature in zip(axes, features):
    sns.kdeplot(no_diabetes[feature], label="No Diabetes", fill=True, alpha=0.3, ax=ax)
    sns.kdeplot(diabetes[feature], label="Diabetes", fill=True, alpha=0.3, ax=ax)
    ax.set_title(f"KDE Plot of {feature}")
    ax.legend()

# Hide the panels left over when the grid has more cells than features
for ax in axes[len(features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots of all columns, coloured by outcome class, with KDE
# curves on the diagonal
sns.pairplot(data=data, diag_kind="kde", hue="Outcome")
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns
corr_matrix = data.corr()
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Feature Correlation Heatmap")
plt.show()

### Key Features
1. Data is skewed and contains outliers                       --->  using **PowerTransformer and RobustScaler**
2. Correlation between Outcome and some features is very low  ---> using **SelectKBest features**.
3. Data volume is low, the task is classification, and features are numeric  ---> using **Perceptron**

## Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing with a fixed seed.
# Stratify on the target so train and test keep the same class ratio; the
# models below use class_weight='balanced', i.e. the classes are treated as
# imbalanced, and an unstratified split of a small data set can skew the
# minority share of the test set.
X_train, X_test, y_train, y_test = train_test_split(
    data[features],
    data[target],
    test_size=0.2,
    random_state=23,
    stratify=data[target],
)

## Perceptron

In [None]:
## pipeline creation
from functools import partial

from sklearn.pipeline import Pipeline
## preprocessing
from sklearn.preprocessing import PowerTransformer, RobustScaler
from sklearn.linear_model import Perceptron
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import classification_report,confusion_matrix

# Elastic-net regularised Perceptron with early stopping; 'balanced' class
# weights compensate for class imbalance.
model = Perceptron(
    penalty='elasticnet',
    alpha=0.000035,
    l1_ratio=0.25,
    max_iter=100,
    eta0=0.35,
    early_stopping=True,
    n_iter_no_change=5,
    class_weight='balanced'
    )

# mutual_info_classif is randomised and unseeded by default
# (random_state=None), so the k selected features could change from run to run.
# Fixing its seed makes the whole pipeline reproducible under Restart & Run All.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)

# Define pipeline
pipeline = Pipeline([
    ("power", PowerTransformer(method='yeo-johnson')),       # Handles skewed data
    ("scale", RobustScaler()),                                # Handles outliers
    ("select", SelectKBest(seeded_mutual_info, k=5)),         # Select top K best features
    ("clf", model)
])

# Fit the preprocessing steps and the classifier on the training split only
pipeline.fit(X_train,y_train)

## predictions and metrics on the held-out test split
y_pred = pipeline.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

## SGD with Perceptron loss

In [None]:
from functools import partial

from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import SGDClassifier

model_sgd = SGDClassifier(
    loss="perceptron",         # linear loss of the Perceptron algorithm (no probability estimates)
    learning_rate='adaptive',  # stay at eta0 while training improves; shrink the rate when it stalls
    eta0=0.01,                 # initial learning rate
    penalty="elasticnet",      # L1 + L2 regularization
    alpha=0.00001,             # regularization strength
    l1_ratio=0.2,              # mix between L1 and L2
    max_iter=500,
    early_stopping=True,
    class_weight="balanced",   # helps with imbalance
    random_state=42
)

# mutual_info_classif is randomised and unseeded by default
# (random_state=None), so the k selected features could change from run to run.
# Seed it with the same value as the classifier to keep the pipeline reproducible.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)

# Define pipeline
pipeline_sgd = Pipeline([
    ("power", PowerTransformer(method='yeo-johnson')),       # Handles skewed data
    ("scale", RobustScaler()),                                # Handles outliers
    ("select", SelectKBest(seeded_mutual_info, k=5)),         # Select top K best features
    ("clf", model_sgd)
])

# Fit the preprocessing steps and the classifier on the training split only
pipeline_sgd.fit(X_train,y_train)

## predictions and metrics on the held-out test split
y_pred = pipeline_sgd.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

**Why use `SGDClassifier` with `loss='perceptron'` when there is already a `Perceptron` model?**
1. You want control over learning rate (important for tuning)
2. You need finer control over early stopping, validation splits, or regularization (note: `Perceptron` also exposes `early_stopping` and `penalty`, as the model above shows)
3. You plan to switch to probabilistic loss (e.g. log_loss) later on
4. You want to integrate it into a larger pipeline or do cross-validation more easily

## Next Steps
1. Add more data  --> **Improves generalization**
2. Try powerful models (trees, boosting)
3. Try model fusion (Voting, Stacking)
4. Log experiments with **MLflow** (e.g. "Tried 5 model combinations — MLflow logs everything, so we can go back to the best model if we want")