# MLP Classifier for Pima Indians Diabetes Dataset
This notebook loads the Pima Indians Diabetes dataset, preprocesses it, and then builds, trains, and evaluates an MLP classifier. An optional final section tunes the hyperparameters with a grid search.

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


## Define Functions
We'll define modular functions for each step.

In [2]:
def load_data(filepath, names=None):
    """Load the Pima Indians Diabetes dataset from a CSV file.

    Parameters
    ----------
    filepath : str, path object, URL, or file-like object
        Location of the CSV; passed straight to ``pandas.read_csv``.
    names : sequence of str, optional
        Column names to assign when the file has no header row. When
        omitted, the first row of the file is used as the header.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    if names is None:
        return pd.read_csv(filepath)
    # With explicit names, pandas treats every row (including the first) as data.
    return pd.read_csv(filepath, names=names)

def split_features_target(df, target_col='Outcome'):
    """Separate a DataFrame into a feature table and a target column.

    Returns a ``(features, target)`` pair: ``features`` is ``df`` with
    ``target_col`` dropped, and ``target`` is the ``target_col`` column.
    The input frame itself is left untouched.
    """
    features = df.drop(columns=[target_col])
    return features, df[target_col]

def train_test_split_data(X, y, test_size=0.2, random_state=42):
    """Split features and target into training and test partitions.

    Thin wrapper around ``train_test_split``; ``test_size`` and
    ``random_state`` are forwarded unchanged.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    features_train, features_test, target_train, target_test = train_test_split(
        X, y, **split_options
    )
    return features_train, features_test, target_train, target_test

def scale_features(X_train, X_test):
    """Standardize both splits using statistics learned from the training split.

    The ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    ``X_train`` and ``X_test``, so the test data never influences the fit.

    Returns ``(X_train_scaled, X_test_scaled, scaler)`` where ``scaler`` is
    the fitted ``StandardScaler`` instance.
    """
    scaler = StandardScaler().fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)
    return train_scaled, test_scaled, scaler

def build_mlp(hidden_layer_sizes=(50,50), activation='relu', solver='adam',
              random_state=42, max_iter=1000):
    """Build an (unfitted) MLP classifier with the given parameters.

    Parameters
    ----------
    hidden_layer_sizes : tuple of int
        Passed to ``MLPClassifier(hidden_layer_sizes=...)``.
    activation : str
        Passed to ``MLPClassifier(activation=...)``.
    solver : str
        Passed to ``MLPClassifier(solver=...)``.
    random_state : int
        Seed forwarded to ``MLPClassifier`` so runs are repeatable.
    max_iter : int
        Passed to ``MLPClassifier(max_iter=...)``. Defaults to 1000, the
        value this function previously hard-coded.

    Returns
    -------
    MLPClassifier
        A configured estimator that has not been fitted yet.
    """
    return MLPClassifier(hidden_layer_sizes=hidden_layer_sizes,
                         activation=activation,
                         solver=solver,
                         max_iter=max_iter,
                         random_state=random_state)

def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data and hand the same object back.

    ``model`` only needs a ``fit(X, y)`` method. The object is fitted in
    place and then returned, so the call can be used in an assignment.
    """
    # Whatever ``fit`` returns is discarded; the object passed in is returned.
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """Score a fitted model on the test split and print a summary.

    Prints the accuracy, the classification report, and the confusion
    matrix (in that order), each computed from ``model.predict(X_test)``
    against ``y_test``.

    Returns the accuracy value computed by ``accuracy_score``.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Test Accuracy:", accuracy)

    report = classification_report(y_test, predictions)
    print("Classification Report:\n", report)

    matrix = confusion_matrix(y_test, predictions)
    print("Confusion Matrix:\n", matrix)

    return accuracy

def tune_mlp(X_train, y_train, param_grid=None, cv=3, max_iter=500, random_state=42):
    """Perform a grid search to tune MLP hyperparameters.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``GridSearchCV.fit``.
    param_grid : dict, optional
        Grid of ``MLPClassifier`` parameters to search. When omitted, a
        built-in grid over ``hidden_layer_sizes``, ``activation``,
        ``solver`` and ``alpha`` is used.
    cv : int
        Passed to ``GridSearchCV(cv=...)``.
    max_iter : int
        ``max_iter`` of the base ``MLPClassifier`` used for every candidate.
    random_state : int
        Seed of the base ``MLPClassifier`` so the search is repeatable.

    Returns
    -------
    MLPClassifier
        ``best_estimator_`` of the finished search.

    Notes
    -----
    Prints the best parameter combination and its score. Candidates are
    ranked with ``scoring='accuracy'`` and the search runs with
    ``n_jobs=-1``.
    """
    if param_grid is None:
        param_grid = {
            'hidden_layer_sizes': [(50,), (100,), (50,50)],
            'activation': ['tanh', 'relu'],
            'solver': ['adam', 'sgd'],
            'alpha': [0.0001, 0.001]
        }
    base_mlp = MLPClassifier(max_iter=max_iter, random_state=random_state)
    grid_search = GridSearchCV(base_mlp, param_grid, cv=cv, scoring='accuracy',
                               verbose=1, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    print("Best Score:", grid_search.best_score_)
    return grid_search.best_estimator_


## Load and Preprocess Data
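The dataset is downloaded directly from a public URL, so no local `diabetes.csv` is required (an internet connection is). The file has no header row, so the column names are supplied explicitly when it is read.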

## Build, Train and Evaluate MLP

In [5]:
# Load the dataset from the public URL; the column names are supplied
# explicitly through `names=` when the CSV is read.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
df = pd.read_csv(url, names=columns)

# Separate features from the target, hold out 20% for testing, then scale
# both splits with a scaler fitted on the training split.
X, y = split_features_target(df, target_col='Outcome')
X_train, X_test, y_train, y_test = train_test_split_data(X, y, test_size=0.2, random_state=42)
X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test)
print("Data loaded and preprocessed.")

Data loaded and preprocessed.


In [6]:
# Build a three-hidden-layer MLP, fit it on the scaled training data, and
# report metrics on the scaled test data.
mlp_model = build_mlp(hidden_layer_sizes=(64,64,32), activation='relu', solver='adam')
mlp_model = train_model(mlp_model, X_train_scaled, y_train)
# evaluate_model already prints the accuracy; capturing the return value keeps
# the cell from echoing the same number a second time as its output.
test_accuracy = evaluate_model(mlp_model, X_test_scaled, y_test)

Test Accuracy: 0.7077922077922078
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.70      0.75        99
           1       0.57      0.73      0.64        55

    accuracy                           0.71       154
   macro avg       0.70      0.71      0.70       154
weighted avg       0.73      0.71      0.71       154

Confusion Matrix:
 [[69 30]
 [15 40]]


0.7077922077922078

## (Optional) Hyperparameter Tuning
Uncomment and run the following cell to perform a grid search for better hyperparameters.

In [None]:
# The grid defined in tune_mlp has 3 x 2 x 2 x 2 = 24 parameter combinations,
# each cross-validated with cv=3 (72 model fits during the search), so this
# cell can take a while to run.
# best_mlp = tune_mlp(X_train_scaled, y_train)
# evaluate_model(best_mlp, X_test_scaled, y_test)