Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

Data Collection and Analysis

PIMA Diabetes Dataset

In [None]:
# Load the diabetes dataset from a CSV file in the working directory into a pandas DataFrame
df = pd.read_csv('diabetes.csv')

In [None]:
# Preview the first rows to check that the columns loaded as expected
df.head()

In [None]:
# Count NaN values per column.
# NOTE(review): this only detects NaN; if the CSV encodes missing measurements
# as 0, they will not show up here -- confirm against the data.
df.isnull().sum()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# How often each distinct value of the Pregnancies column occurs
df["Pregnancies"].value_counts()

In [None]:
# Class balance of the target column: number of rows per Outcome value
df['Outcome'].value_counts()

0 --> Non-Diabetic

1 --> Diabetic

In [None]:
# Mean of every column within each Outcome group, to compare the two classes feature by feature
df.groupby('Outcome').mean()

In [None]:
# Target vector: the Outcome column
Y = df['Outcome']
# Feature matrix: every column except the target
X = df.drop(columns='Outcome')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Bar chart of the number of rows per Outcome class
ax = sns.countplot(data=df, x='Outcome')
ax.set_title('Count of Diabetic vs Non-Diabetic')
plt.show()


In [None]:
# Histogram of the Age column in 10 bins
fig, ax = plt.subplots()
ax.hist(df['Age'], bins=10, color='skyblue')
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Glucose distribution per Outcome class, as side-by-side box plots
ax = sns.boxplot(data=df, x='Outcome', y='Glucose')
ax.set_title('Glucose Level vs Outcome')
plt.show()


In [None]:
# Annotated heatmap of the pairwise correlations between all columns
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()


In [None]:
# Pairwise scatter plots of four selected features, coloured by Outcome
pair_features = ['Glucose', 'BMI', 'Age', 'Insulin']
sns.pairplot(df, vars=pair_features, hue='Outcome')
plt.show()


In [None]:
# Histogram of Pregnancies (15 bins) with a kernel density estimate overlaid
ax = sns.histplot(df['Pregnancies'], bins=15, kde=True, color='orange')
ax.set_title('Distribution of Pregnancies')
ax.set_xlabel('Number of Pregnancies')
plt.show()


In [None]:
# Age against BMI, one point per row, coloured by Outcome
ax = sns.scatterplot(data=df, x='Age', y='BMI', hue='Outcome')
ax.set_title('BMI vs Age colored by Outcome')
plt.show()


In [None]:
# Correlation of every column with the target, strongest positive correlation first
corr = df.corr()
outcome_corr = corr[['Outcome']].sort_values(by='Outcome', ascending=False)

fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(outcome_corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation with Outcome')
plt.show()

Data Standardization

In [None]:
# Standardizer for the feature columns; this same instance is reused later
# to transform a new input record before prediction.
scaler=StandardScaler()

In [None]:
# Fit the scaler on the feature matrix and return the standardized values.
# NOTE(review): the scaler is fitted on ALL rows, before the train/test split
# below, so test-set statistics influence the scaling. Consider fitting on the
# training split only to avoid leakage.
scaled_data=scaler.fit_transform(X)

In [None]:
# Inspect the standardized feature values
print(scaled_data)

In [None]:
# From here on X refers to the standardized array; the unscaled feature
# DataFrame is no longer bound to X (re-running the scaling cell after this
# point would fit the scaler on already-scaled values).
X=scaled_data

In [None]:
# Show the standardized features followed by the labels
print(X, Y, sep='\n')

Train Test Split

In [None]:
# Hold out 20% of the rows for testing; stratifying on Y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Sanity check: shapes of the full, training and test feature arrays
print(X.shape, X_train.shape, X_test.shape)

*Training* the Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [None]:
from catboost import CatBoostClassifier

In [None]:
# Random Forest -- seeded so the grid search gives the same result on every run
rf = RandomForestClassifier(random_state=42)
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# Support Vector Classifier; the grid below searches rbf, poly and linear kernels.
# probability=True is required because predict_proba() is called on the fitted
# search later (ROC curve). The probability calibration is randomized, so it is
# seeded as well.
# NOTE: this rebinds the name `svm`, shadowing the `sklearn.svm` module imported
# at the top of the notebook.
svm = SVC(probability=True, random_state=42)
svm_params = {
    'C': [0.1, 1, 10],
    'gamma': ['scale','auto'],
    'kernel': ['rbf', 'poly', 'linear']
}

# Logistic Regression (max_iter raised to 1000 iterations for the lbfgs solver)
lr = LogisticRegression(max_iter=1000)
lr_params = {
    'C': [0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs']
}

# CatBoost -- fixed hyper-parameters (no grid search); logs every 100 iterations
cat_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    verbose=100,
    random_seed=42
)



In [None]:
# 5-fold grid search over the Random Forest hyper-parameters, scored by accuracy
grid_rf = GridSearchCV(estimator=rf, param_grid=rf_params, scoring='accuracy', cv=5)
grid_rf.fit(X_train, Y_train)

print("Best RF Parameters:", grid_rf.best_params_)
print("Best RF Accuracy (CV):", grid_rf.best_score_)

# Evaluate the fitted search on the held-out test split
y_pred_rf = grid_rf.predict(X_test)
rf_test_accuracy = accuracy_score(Y_test, y_pred_rf)
print("Test Accuracy:", rf_test_accuracy)


In [None]:
# Each model's test accuracy is printed with the model name so the three
# results can be told apart in the cell output.

# SVM: 5-fold grid search, then accuracy on the held-out test split
grid_svm = GridSearchCV(svm, svm_params, cv=5, scoring='accuracy')
grid_svm.fit(X_train, Y_train)
print("Best SVM Parameters:", grid_svm.best_params_)
print("SVM Test Accuracy:", accuracy_score(Y_test, grid_svm.predict(X_test)))

# Logistic Regression: 5-fold grid search, then accuracy on the test split
grid_lr = GridSearchCV(lr, lr_params, cv=5, scoring='accuracy')
grid_lr.fit(X_train, Y_train)
print("Best LR Parameters:", grid_lr.best_params_)
print("LR Test Accuracy:", accuracy_score(Y_test, grid_lr.predict(X_test)))

# CatBoost: trained once with its fixed hyper-parameters (no grid search)
cat_model.fit(X_train, Y_train)
print("CatBoost Test Accuracy:", accuracy_score(Y_test, cat_model.predict(X_test)))


## SVM gives the best result
*Accuracy on test set = 77%* - *without SMOTE*

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Detailed evaluation of the SVM grid search (grid_svm) on the test split
y_pred = grid_svm.predict(X_test)
# Probability of the positive class (second column of predict_proba)
y_prob = grid_svm.predict_proba(X_test)[:, 1]

# 1. Confusion matrix as an annotated heatmap
cm = confusion_matrix(Y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# 2. Per-class precision, recall and F1
print(classification_report(Y_test, y_pred))

# 3. ROC curve with the AUC in the legend and the chance diagonal for reference
fpr, tpr, thresholds = roc_curve(Y_test, y_prob)
auc_label = "AUC = {:.2f}".format(roc_auc_score(Y_test, y_prob))
plt.plot(fpr, tpr, label=auc_label)
plt.plot([0, 1], [0, 1], '--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()


#### Observation

- Majority class (0) predicted very well (recall 0.91).

- Minority class (1) recall is low (0.52) → model misses almost half of diabetic patients.

##### Next, we apply SMOTE to the training data to balance the two classes.

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Oversample with SMOTE (seeded for repeatability). Only the training split is
# resampled; X_test / Y_test are not touched here.
smote = SMOTE(random_state=42)
X_smote, Y_smote = smote.fit_resample(X_train, Y_train)

In [None]:
# Re-run the three grid searches on the SMOTE-resampled training data.
# This refits grid_lr / grid_svm / grid_rf in place, so the results they held
# from the earlier (pre-SMOTE) fit are overwritten.
# NOTE(review): the cross-validation inside each search runs on data that was
# already oversampled, so synthetic points may end up in a different fold than
# the rows they were generated from -- CV scores may be optimistic. Consider
# resampling inside the CV loop instead.
grid_lr.fit(X_smote, Y_smote)
grid_svm.fit(X_smote, Y_smote)
grid_rf.fit(X_smote, Y_smote)

In [None]:
# Test-set predictions from each of the three SMOTE-trained searches
Y_pred_lr, Y_pred_svm, Y_pred_rf = (
    grid_lr.predict(X_test),
    grid_svm.predict(X_test),
    grid_rf.predict(X_test),
)

In [None]:
# Test accuracy of each SMOTE-trained model (labels use one consistent format)
print("Test Accuracy on rf:", accuracy_score(Y_test, Y_pred_rf))
print("Test Accuracy on svm:", accuracy_score(Y_test, Y_pred_svm))
print("Test Accuracy on lr:", accuracy_score(Y_test, Y_pred_lr))

In [None]:
# Evaluate
from sklearn.metrics import classification_report
print("SMOTE Results")
# Print each label on its own line: the report text begins with an indented
# header row, so a prefix on the same line would push the column headers out
# of alignment with the rows below.
print("LR--", classification_report(Y_test, Y_pred_lr), sep="\n")
print("SVM--", classification_report(Y_test, Y_pred_svm), sep="\n")
print("RF--", classification_report(Y_test, Y_pred_rf), sep="\n")

In [None]:
# Feature importance (explainability) from the best Random Forest found by the grid search
importances = grid_rf.best_estimator_.feature_importances_
# Derive the labels the same way the feature matrix was built (all columns
# except 'Outcome') rather than assuming the target is the last column.
feature_names = df.columns.drop('Outcome')
plt.barh(feature_names, importances)
plt.title("Feature Importance (Random Forest)")
plt.xlabel("Importance Score")
plt.show()

In [None]:
def heatmap(Y_test, y_pred, title="Confusion Matrix"):
    """Plot the confusion matrix of true vs. predicted labels as an annotated heatmap.

    Parameters
    ----------
    Y_test : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``Y_test``.
    title : str, optional
        Title drawn above the plot. Defaults to "Confusion Matrix", so existing
        two-argument calls behave as before; pass a model name to tell several
        plots apart.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    cm = confusion_matrix(Y_test, y_pred)
    # fmt='d' renders the cell counts as integers
    ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()

In [None]:
# Confusion matrix for the Logistic Regression test-set predictions
print("Logistic Regression")
heatmap(Y_test, Y_pred_lr)

In [None]:
# Confusion matrix for the SVM test-set predictions
print("SVM")
heatmap(Y_test, Y_pred_svm)

In [None]:
# Confusion matrix for the Random Forest test-set predictions
print("Random Forest")
heatmap(Y_test, Y_pred_rf)

### Conclusion / Best Model

- Best model: Logistic Regression after applying SMOTE.

- Reason: Highest recall (0.67) for diabetic patients while maintaining good overall accuracy (0.75).

- RF is close but slightly weaker for minority class.

- SVM performs worst for minority class detection.

# Making a predictive model

In [None]:
# Raw measurements for one patient; assumed to be listed in the same order as
# the feature columns of df (all columns except 'Outcome').
input_data = (5,166,72,19,175,25.8,0.587,51)

# Wrap the record in a one-row DataFrame carrying the feature column names.
# `scaler` was fitted on a DataFrame, so passing a bare array would make
# scikit-learn warn that the input has no valid feature names.
feature_names = df.columns.drop('Outcome')
input_df = pd.DataFrame([input_data], columns=feature_names)

# Standardize with the scaler fitted earlier; the result is a plain 2-D array
std_data = scaler.transform(input_df)
print(std_data)

prediction = grid_lr.predict(std_data)
print(prediction)

# prediction holds one label: 0 = not diabetic, 1 = diabetic
if prediction[0] == 0:
    print('The person is not diabetic')
else:
    print('The person is diabetic')

Saving the trained model

In [None]:
import pickle
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Bundle scaling, oversampling and the classifier into one imblearn pipeline so
# that a single object can be saved and later fed raw feature values.
pipeline = ImbPipeline(steps=[
    ('scaler', StandardScaler()),       # a new, unfitted scaler (separate from the `scaler` object used earlier)
    ('smote', SMOTE(random_state=42)),  # oversampling (used only during training)
    ('model', grid_lr)                    # the Logistic Regression grid search; it is fitted again when pipeline.fit() runs
])



In [None]:
# Fit the whole pipeline (scaler -> SMOTE -> model) on the RAW training rows.
# X_train already holds standardized values, so fitting on it would make the
# pipeline's own StandardScaler learn roughly mean 0 / std 1 and leave raw
# inputs effectively unscaled at prediction time. Y_train keeps the original
# row labels of df, so the matching unscaled rows are selected through its index.
X_train_raw = df.drop(columns='Outcome').loc[Y_train.index].to_numpy()
pipeline.fit(X_train_raw, Y_train)

In [None]:
# Save the pipeline
with open("diabetes_pipeline.pkl", "wb") as f:
    pickle.dump(pipeline, f)

print("✅ Pipeline saved as diabetes_pipeline.pkl")

In [None]:
# -----------------------------
# ✅ Load and Test Pipeline
# -----------------------------
# Round-trip check: reload the pickle written above and predict one record.
# NOTE: pickle.load can execute arbitrary code -- only load files you created
# yourself or otherwise trust.
with open("diabetes_pipeline.pkl", "rb") as f:
    loaded_pipeline = pickle.load(f)

# Example input (raw values, directly without scaling); the pipeline applies
# its own scaling step before the model.
sample = [6,148,72,35,0,33.6,0.627,50]

# predict() expects a 2-D input, hence the wrapping list; [0] takes the single label
prediction = loaded_pipeline.predict([sample])[0]
print("Prediction:", "Diabetes" if prediction == 1 else "No Diabetes")
