
# Diabetes Prediction with Logistic Regression (Jupyter Notebook)

This notebook builds a **logistic regression** classifier on the classic Pima Indians Diabetes dataset (`diabetes.csv`), evaluates it with standard metrics, and visualizes results (confusion matrix and ROC curve).  
It is clean, portfolio-ready, and uses **matplotlib only** for plots.



## 1) Requirements
Run this once if needed:
```bash
pip install pandas numpy scikit-learn matplotlib
```


In [None]:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_curve, auc
)

import matplotlib.pyplot as plt

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None


## 2) Load the data

In [None]:

# Location of the CSV, relative to the notebook's working directory.
# Point this somewhere else if the file is stored in another folder.
DATA_PATH = "diabetes.csv"

# Read the whole file into memory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
df.head(n=5)


## 3) Quick data check

In [None]:

# Print the (rows, columns) tuple followed by the dtype of every column,
# then let the cell's last expression render the summary statistics.
for overview in (df.shape, df.dtypes):
    print(overview)
df.describe()


## 4) Optional: simple histograms

In [None]:

# Every column except the target label gets its own histogram.
numeric_cols = list(df.columns[df.columns != "Outcome"])

for col in numeric_cols:
    # One figure per column, drawn on an explicit Axes object.
    fig, ax = plt.subplots()
    df[col].hist(bins=30, ax=ax)
    ax.set_title(f"Distribution of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Count")
    plt.show()


## 5) Train / Test split and model pipeline

In [None]:

# Separate the target column from the predictor columns.
y = df["Outcome"]
X = df.drop("Outcome", axis=1)

# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split

# Scaling lives inside the pipeline, so the scaler is fitted on the training
# rows only and then re-applied unchanged whenever the pipeline predicts.
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=1000)),
])
pipe.fit(X_train, y_train)

# Hard class labels for the accuracy/report below, plus the probability of the
# second class (column 1 of predict_proba), which the ROC cell consumes.
y_pred = pipe.predict(X_test)
y_prob = pipe.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.3f}\n")

print("Classification report:")
print(classification_report(y_test, y_pred))


## 6) Confusion Matrix

In [None]:

# Confusion matrix of the held-out predictions: rows are the actual classes,
# columns are the predicted classes (see the axis labels set below).
cm = confusion_matrix(y_test, y_pred)

# Display names for the two classes, in the same order as the matrix axes.
classes = ["No Diabetes", "Diabetes"]
# One tick per class name, so positions and labels cannot drift apart.
tick_marks = np.arange(len(classes))

plt.figure()
# The annotation colours chosen below (white text on large counts, black text
# on small counts) are only legible on a light-to-dark sequential colormap.
# Matplotlib's default colormap runs dark-to-bright, which would put white text
# on the brightest cells and black text on the darkest, so pin "Blues" here.
plt.imshow(cm, interpolation='nearest', cmap="Blues")
plt.title("Confusion Matrix")
plt.colorbar()
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Cells whose count exceeds half of the largest count are drawn dark by the
# colormap and therefore need white text; the remaining cells get black text.
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        # imshow places column j on the x axis and row i on the y axis.
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
plt.ylabel("Actual")
plt.xlabel("Predicted")
plt.tight_layout()
plt.show()


## 7) ROC Curve & AUC

In [None]:

# False/true positive rates at each candidate decision threshold, computed from
# the predicted probabilities, and the area under the resulting curve.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
# Dashed diagonal where TPR equals FPR, drawn as a visual reference line.
ax.plot([0, 1], [0, 1], linestyle="--")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend(loc="lower right")
plt.show()


## 8) Feature Importance (Coefficients)

In [None]:

# Pull the fitted steps back out of the pipeline by the names they were given.
# `scaler` is not used further in this cell; it is bound here only so it is
# available for inspection.
scaler = pipe.named_steps["scaler"]
logreg = pipe.named_steps["logreg"]

# coef_ is flattened to 1-D. Its entries line up with the columns of X, and
# because the model was fitted on scaled inputs they are per-scaled-feature weights.
coef = logreg.coef_.ravel()

# Rank by magnitude (largest first) while keeping each coefficient's sign.
importance = pd.Series(coef, index=X.columns).sort_values(key=abs, ascending=False)

# Two-column table: the feature name and its signed coefficient.
importance_df = importance.rename_axis("feature").reset_index(name="coefficient")
importance_df


In [None]:

# Horizontal bars of the signed coefficients, in the order stored in
# `importance` (sorted by absolute value, largest first).
fig, ax = plt.subplots()
ax.barh(importance.index, importance.values)
ax.set_title("Logistic Regression Coefficients (by absolute value)")
ax.set_xlabel("Coefficient")
ax.set_ylabel("Feature")
# barh draws the first entry at the bottom; flip the axis so it ends up on top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()
