# Import Required Libraries
Import the necessary libraries, including pandas, numpy, sklearn, and joblib.

In [19]:
# Import Required Libraries
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Load Dataset
Load the Pima Indians Diabetes Dataset using pandas.

In [20]:
# Load Dataset
# Read the Pima Indians Diabetes CSV; the relative path is resolved against the
# notebook's working directory.
dataset_path = r"Pima Indians Diabetes Dataset.csv"
df = pd.read_csv(dataset_path)

# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Preprocess Data
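Handle missing values (zeros in Glucose, BloodPressure, SkinThickness, Insulin, and BMI are treated as missing and imputed with a median) and normalize every feature to a common range with min-max scaling. All columns are numeric, so no categorical encoding is needed.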

In [21]:
# Preprocess Data

# Columns in which a 0 is treated as a missing reading
# ('Pregnancies' and 'Outcome' keep their zeros).
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Every predictor column; 'Outcome' is the label and is left unscaled.
feature_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# NOTE: run this cell once per load of `df`. After scaling, the training minimum of
# every feature is 0.0, which a second run would wrongly convert to NaN.
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

# Avoid train/test leakage: imputation medians and scaling ranges must be learned
# from training rows only. Without `stratify`, train_test_split's partition depends
# only on the number of rows and the seed, so mirroring the test_size/random_state
# of the train/test split cell selects exactly the rows that become X_train.
# Keep these two values in sync with that cell.
train_pos, _ = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

# Fill missing values with the per-column median of the training rows.
train_medians = df.iloc[train_pos].median()
df = df.fillna(train_medians)

# Fit the min-max scaler on the training rows, then apply it to every row.
# Test rows may therefore fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler().fit(df.iloc[train_pos][feature_cols])
df[feature_cols] = scaler.transform(df[feature_cols])

# Split Dataset
Split the dataset into training and testing sets using train_test_split from sklearn.

In [22]:
# Separate the label from the predictors: every column except 'Outcome' is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 20% of the rows for testing; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train SVC Model
Train a Support Vector Classifier (SVC) model using the training data.

In [23]:
# Train SVC Model

# Fit a Support Vector Classifier with scikit-learn's default hyperparameters.
# fit() returns the estimator itself, so construction and training can be chained.
svc_model = SVC().fit(X_train, y_train)

# Score the fitted model on the held-out test rows.
y_pred_svc = svc_model.predict(X_test)
accuracy_svc = accuracy_score(y_true=y_test, y_pred=y_pred_svc)
print(f"SVC Model Accuracy: {accuracy_svc:.2f}")

# Optional: persist the fitted SVC to disk (left disabled).
# joblib.dump(svc_model, 'models/svc_model.pkl')

SVC Model Accuracy: 0.76


# Train Logistic Regression Model
Train a Logistic Regression model using the training data.

In [24]:
# Train Logistic Regression Model

# Fit a logistic regression; max_iter is raised to 1000 to give the solver
# more iterations to converge.
logistic_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the fitted model on the held-out test rows.
y_pred_logistic = logistic_model.predict(X_test)
accuracy_logistic = accuracy_score(y_true=y_test, y_pred=y_pred_logistic)
print(f"Logistic Regression Model Accuracy: {accuracy_logistic:.2f}")

# Optional: persist the fitted logistic regression to disk (left disabled).
# joblib.dump(logistic_model, 'models/logistic_model.pkl')

Logistic Regression Model Accuracy: 0.78


# Stack Models
Combine the predictions of the SVC and Logistic Regression models using a stacking method.

In [25]:
# Stack Models

# Base learners whose predictions become the inputs of the final estimator.
# SVC(probability=True) derives its probability estimates from an internal,
# randomly shuffled cross-validation, so it is seeded (same seed as the
# train/test split) to make the ensemble reproducible across runs.
estimators = [
    ('svc', SVC(probability=True, random_state=42)),
    ('logistic', LogisticRegression(max_iter=1000)),
]

# A logistic regression combines the base learners' outputs into the final prediction.
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)

# Train the whole stack on the training data (the fitted estimator is the cell output).
stacking_model.fit(X_train, y_train)

# Evaluate Model Accuracy
Calculate and print the accuracy of the stacked model on the testing data.

In [26]:
# Evaluate Model Accuracy

# Score the fitted stacking ensemble on the held-out test rows.
y_pred_stacking = stacking_model.predict(X_test)
accuracy_stacking = accuracy_score(y_true=y_test, y_pred=y_pred_stacking)
print(f"Stacking Model Accuracy: {accuracy_stacking:.2f}")

Stacking Model Accuracy: 0.78


# Save the Model
Save the trained model to a folder using joblib.

In [27]:
# Save the Model

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
model_dir = Path('models')
model_dir.mkdir(parents=True, exist_ok=True)

# Persist the trained Stacking Classifier.
joblib.dump(stacking_model, model_dir / 'Pima Indians Diabetes Prediction.pkl')

# The model was trained on min-max scaled features, so the fitted scaler is
# saved next to it; raw inputs must pass through this scaler before predict().
joblib.dump(scaler, model_dir / 'Pima Indians Diabetes Scaler.pkl')

# Record the feature columns (and their order) the model expects.
feature_columns = X.columns.tolist()

print("Columns used for training the model:")
print(feature_columns)

Columns used for training the model:
['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
