# 1. Load Data

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix, classification_report
import requests
import io
import matplotlib.pyplot as plt
import seaborn as sns

# Download the scaled diabetes dataset (svmlight/LIBSVM text format)
url = "https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/diabetes_scale"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
data_io = io.BytesIO(response.content)

# Parse the downloaded bytes into a feature matrix X and a label vector y
X, y = load_svmlight_file(data_io)

# Transform data to pandas DataFrame (toarray() densifies the feature matrix)
df = pd.DataFrame(X.toarray())
df['target'] = y

# Show data head
df.head()

# 2. Data pre-processing

In [None]:
# Rename the columns to readable feature names (the last column is the label)
df.columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Show the information of the data.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.describe())

# 3. Split the data

In [None]:
# Separate the label column from the feature columns
y = df['Outcome']
X = df.drop(columns='Outcome')

# Split the data
from sklearn.model_selection import train_test_split

# Step 1: hold out 15% of all rows as the test set, stratified on the label
X_bigtrain, X_test, y_bigtrain, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=11,
    stratify=y,
)

# Step 2: hold out 15% of the remaining rows as the validation set
X_train, X_val, y_train, y_val = train_test_split(
    X_bigtrain,
    y_bigtrain,
    test_size=0.15,
    random_state=11,
    stratify=y_bigtrain,
)

# Report the shape of every resulting partition
print([part.shape for part in (X_train, y_train, X_test, y_test, X_val, y_val)])

# 4. Build Model

# 4.1 Build baseline model (Perceptron)

In [None]:
# Baseline model: a Perceptron classifier trained on the training set.
# fit() returns the estimator itself, so construction and training are chained.
baseline_model = Perceptron(random_state=11, max_iter=3000).fit(X_train, y_train)

# Predict labels for the validation set
y_val_pred_baseline = baseline_model.predict(X_val)

# 4.2 Evaluate the performance of baseline model

In [None]:
# Metric helpers used to score the baseline model
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Recall is measured with label -1 treated as the positive class
baseline_recall = recall_score(y_val, y_val_pred_baseline, pos_label=-1)
baseline_accuracy = accuracy_score(y_val, y_val_pred_baseline)

# Report both scores rounded to two decimals
print('Baseline model performance on validation set:')
print(f'Accuracy: {baseline_accuracy:.2f}')
print(f'Recall: {baseline_recall:.2f}')

In [None]:
# Confusion matrix of the baseline predictions; rows/columns ordered as [-1, 1]
baseline_conf_matrix = confusion_matrix(
    y_val,
    y_val_pred_baseline,
    labels=[-1, 1],
)

# Draw the matrix as an annotated heatmap
plt.figure(figsize=(10, 7))
sns.heatmap(
    baseline_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['-1', '1'],
    yticklabels=['-1', '1'],
)
# Label the current axes in a single call
plt.gca().set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix for Baseline model',
)
plt.show()

# 4.3 Build MLP model

In [None]:
# Multi-layer Perceptron; no architecture arguments are passed, so the
# library defaults apply. fit() returns the estimator, so the calls are chained.
mlp_model = MLPClassifier(random_state=11, max_iter=3000).fit(X_train, y_train)

# Predict labels for the validation set
y_val_pred_mlp = mlp_model.predict(X_val)

# 4.4 Evaluate the performance of MLP model

In [None]:
# Metric helpers used to score the MLP model
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Recall is measured with label -1 treated as the positive class
mlp_recall = recall_score(y_val, y_val_pred_mlp, pos_label=-1)
mlp_accuracy = accuracy_score(y_val, y_val_pred_mlp)

# Report both scores rounded to two decimals
print('mlp model performance on validation set:')
print(f'Accuracy: {mlp_accuracy:.2f}')
print(f'Recall: {mlp_recall:.2f}')

In [None]:
# Confusion matrix of the MLP predictions; rows/columns ordered as [-1, 1]
mlp_conf_matrix = confusion_matrix(
    y_val,
    y_val_pred_mlp,
    labels=[-1, 1],
)

# Draw the matrix as an annotated heatmap
plt.figure(figsize=(10, 7))
sns.heatmap(
    mlp_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['-1', '1'],
    yticklabels=['-1', '1'],
)
# Label the current axes in a single call
plt.gca().set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix for mlp model',
)
plt.show()

# 5 Model optimisation

# 5.1 Define parameter and training model

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import make_scorer, recall_score
import numpy as np

# Hyperparameter search space for the MLP (6 x 4 x 3 = 72 combinations)
mlp_param_grid = {
    'hidden_layer_sizes': [(64,), (128,), (256,), (128, 64), (256, 128), (256, 128, 64)],
    'learning_rate_init': [0.0001, 0.001, 0.01, 0.1],
    'activation': ['relu', 'tanh', 'logistic'],
}

# Randomized search over 10 sampled combinations with 10-fold cross-validation.
# Both accuracy and recall (label -1 as the positive class) are recorded, and
# the final model is refit using the combination with the best recall.
# random_state fixes which combinations are sampled, so the search is
# reproducible like the other seeded steps of this notebook.
mlp_search = RandomizedSearchCV(
    MLPClassifier(max_iter=4000, random_state=11),
    mlp_param_grid,
    n_iter=10,
    scoring={'accuracy': 'accuracy', 'recall': make_scorer(recall_score, pos_label=-1)},
    refit='recall',
    cv=10,
    random_state=11,
    error_score='raise',  # surface any failing fit instead of scoring it as NaN
)

mlp_search.fit(X_train, y_train)


# 5.2 Show results for all combinations of parameters

In [None]:
# Gather the cross-validation scores of every sampled parameter combination
cv_results = mlp_search.cv_results_

results_df = pd.DataFrame({
    # One column per searched hyperparameter, read from each sampled combination
    **{
        name: [combo[name] for combo in cv_results['params']]
        for name in ('hidden_layer_sizes', 'learning_rate_init', 'activation')
    },
    # Mean and standard deviation of accuracy and recall across the CV folds
    **{
        column: cv_results[column]
        for column in (
            'mean_test_accuracy',
            'mean_test_recall',  # recall results
            'std_test_accuracy',
            'std_test_recall',
        )
    },
})

print(results_df)

In [None]:
# Plot mean CV accuracy and recall against the index of each combination
plt.figure(figsize=(10, 6))
plt.plot(
    results_df.index,
    results_df['mean_test_accuracy'],
    label='Accuracy',
    marker='o',
)
plt.plot(
    results_df.index,
    results_df['mean_test_recall'],
    label='Recall',
    marker='x',
)
# Title and axis labels are set on the current axes in a single call
plt.gca().set(
    title='Hyperparameter Combinations: Impact on Accuracy and Recall',
    xlabel='Hyperparameter Combination Index',
    ylabel='Score',
)
plt.legend()
plt.show()

# 5.3 Select the best model and evaluate the performance of best model on validation sets

In [None]:
# Take the estimator the search selected and print its configuration
bestmlp = mlp_search.best_estimator_
print(f"Best parameters for MLP: {bestmlp}")

# Score the selected model on the validation set
# (recall is measured with label -1 treated as the positive class)
best_mlp_p = bestmlp.predict(X_val)
best_mlp_recall = recall_score(y_val, best_mlp_p, pos_label=-1)
best_mlp_accuracy = accuracy_score(y_val, best_mlp_p)
print(f"Best MLP Model - Accuracy: {best_mlp_accuracy}, Recall: {best_mlp_recall}")

In [None]:
# Confusion matrix of the best MLP on the validation set; rows/columns
# are ordered as [-1, 1]
best_conf_matrix = confusion_matrix(
    y_val,
    best_mlp_p,
    labels=[-1, 1],
)

# Draw the matrix as an annotated heatmap
plt.figure(figsize=(10, 7))
sns.heatmap(
    best_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['-1', '1'],
    yticklabels=['-1', '1'],
)
# Label the current axes in a single call
plt.gca().set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix for best mlp model',
)
plt.show()


# 5.4 Visualize the training loss curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

# Loss values recorded during training of the already-fitted bestmlp
train_loss = bestmlp.loss_curve_

# Log-loss of the fitted model on the validation set. Passing the model's own
# class order keeps the probability columns aligned with the labels even if
# y_val happens to lack one of the classes.
val_pred_proba = bestmlp.predict_proba(X_val)
val_loss = log_loss(y_val, val_pred_proba, labels=bestmlp.classes_)
# Report the value so the validation loss is visible next to the curve
print(f"Validation log-loss of best MLP: {val_loss:.4f}")

# Plot the training loss curve
plt.figure(figsize=(10, 6))
plt.plot(train_loss, label="Training Loss", color='blue')
plt.title("Training Loss Curve")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend()
plt.grid(True)
plt.show()

# 5.5 Evaluate the performance of best model on test set

In [None]:

# Predict on the held-out test set with the selected model
test_pred = bestmlp.predict(X_test)

# Recall is measured with label -1 treated as the positive class
test_recall = recall_score(y_test, test_pred, pos_label=-1)
test_accuracy = accuracy_score(y_test, test_pred)
print(f"Best Model on Test Set - Accuracy: {test_accuracy}, Recall: {test_recall}")


In [None]:
# Confusion matrix of the best model on the test set.
# The label order is pinned to [-1, 1] so the matrix always matches the
# hard-coded tick labels below, consistent with the other confusion-matrix
# cells; without it the order and shape depend on which labels happen to occur.
test_cm = confusion_matrix(y_test, test_pred, labels=[-1, 1])

# Visual Confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(test_cm, annot=True, fmt='d', cmap='Blues', xticklabels=['-1', '1'], yticklabels=['-1', '1'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title("Best Model Confusion Matrix on Test Set")
plt.show()