In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import optuna
import optuna.visualization as vis
import pandas as pd
import plotly.express as px
import scipy.stats as st
import seaborn as sns

from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer, accuracy_score,  confusion_matrix, cohen_kappa_score, precision_score, recall_score, classification_report, f1_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder,  StandardScaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.utils import resample

In [None]:
# Read the resampled heart-disease dataset from disk and preview its first five rows
heart_data = "../data/clean/resampled_data.csv"
heart_df = pd.read_csv(filepath_or_buffer=heart_data)
heart_df.head(n=5)

In [None]:
# Show the dataset dimensions as (rows, columns)
heart_df.shape

In [None]:
# Build the modelling frame without the three excluded columns; heart_df itself is not modified
df_new = heart_df.drop(columns=['PhysHlth', 'DiffWalk', 'Education'])
df_new

In [None]:
# Bar chart of how many rows fall into each class of the target column
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='HeartDiseaseorAttack', data=heart_df, ax=ax)
ax.set_title('Count plot')
plt.setp(ax.get_xticklabels(), rotation=45)  # Rotate x-axis labels if needed
plt.show()

In [None]:
# Separate the label column from the predictor columns
target = df_new['HeartDiseaseorAttack']
features = df_new.drop(columns='HeartDiseaseorAttack')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.20, random_state=0
)

# Learn each column's min/max from the training split only, then apply the
# same 0-1 scaling to both splits
normalizer = MinMaxScaler().fit(x_train)

# Wrap the scaled arrays back into DataFrames that keep the original column names and row index
x_train_norm = pd.DataFrame(
    normalizer.transform(x_train), index=x_train.index, columns=x_train.columns
)
x_test_norm = pd.DataFrame(
    normalizer.transform(x_test), index=x_test.index, columns=x_test.columns
)

#### Adaptive boosting

#### Using classifier

In [None]:
# AdaBoost ensemble of 100 depth-20 decision trees. Despite its name, `ada_reg`
# is a classifier; the name is kept so that other cells referring to it still work.
# random_state makes the fitted ensemble (and the numbers below) repeatable.
ada_reg = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=20),
    n_estimators=100,
    algorithm='SAMME',
    random_state=0,
)
ada_reg.fit(x_train_norm, y_train)
pred = ada_reg.predict(x_test_norm)

# scikit-learn metrics take (y_true, y_pred) in that order
print(f"MAE, {mean_absolute_error(y_test, pred): .2f}")
print(f"MSE, {mean_squared_error(y_test, pred): .2f}")
print(f"RMSE, {root_mean_squared_error(y_test, pred): .2f}")
# For a classifier, .score() returns mean accuracy (not R2), so label it as such
print(f"Accuracy, {ada_reg.score(x_test_norm, y_test): .2f}")

In [None]:
def test_normalised_data(df_new, n_estimators_range=range(2, 25), random_state=0):
    """
    Evaluate AdaBoost on Min-Max normalised data for a range of ensemble sizes.

    The frame is split into target ('HeartDiseaseorAttack') and features, then
    into train/test (80/20, fixed split seed). A MinMaxScaler is fitted on the
    training features only and applied to both splits. For every value in
    `n_estimators_range`, an AdaBoost classifier (SAMME, decision-tree base
    with max_depth=20) is fitted on the normalised training data and evaluated
    on the normalised test data.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Data containing the 'HeartDiseaseorAttack' target column plus features.
    n_estimators_range : iterable of int, optional
        Ensemble sizes to try. Defaults to range(2, 25), i.e. 2 to 24.
    random_state : int or None, optional
        Seed passed to AdaBoostClassifier so repeated runs give the same scores.

    Returns
    -------
    pandas.DataFrame
        One row per ensemble size with columns "k" (number of estimators),
        "Accuracy" (%), "Recall" (%) and "Kappa" (Cohen's Kappa).
    """

    target = df_new['HeartDiseaseorAttack']
    features = df_new.drop('HeartDiseaseorAttack', axis=1)

    x_train, x_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=0)

    # Normalise all columns to be 0-1, using statistics from the training split only
    normalizer = MinMaxScaler()
    normalizer.fit(x_train)

    x_train_norm = pd.DataFrame(normalizer.transform(x_train), columns=x_train.columns, index=x_train.index)
    x_test_norm = pd.DataFrame(normalizer.transform(x_test), columns=x_test.columns, index=x_test.index)

    results = []

    # Iterate over the requested numbers of estimators for AdaBoost
    for i in n_estimators_range:
        ada_boost = AdaBoostClassifier(
            DecisionTreeClassifier(max_depth=20),
            n_estimators=i,  # number of estimators (trees)
            algorithm='SAMME',
            random_state=random_state,
        )

        # Fit, predict and score on the SAME (normalised) representation.
        # Previously the model was fitted on raw features but scored on
        # normalised ones, which made the printed score meaningless.
        ada_boost.fit(x_train_norm, y_train)
        y_pred = ada_boost.predict(x_test_norm)

        kappa = cohen_kappa_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred) * 100
        accuracy = accuracy_score(y_test, y_pred) * 100

        print(f"AdaBoost Test MAE: {mean_absolute_error(y_test, y_pred): .4f}")
        print(f"AdaBoost Test MSE: {mean_squared_error(y_test, y_pred): .4f}")
        print(f"AdaBoost Test RMSE: {root_mean_squared_error(y_test, y_pred): .4f}")
        # .score() on a classifier is mean accuracy, not R2
        print(f"AdaBoost Test accuracy: {ada_boost.score(x_test_norm, y_test): .4f}")
        print()

        results.append({
            "k": i,
            "Accuracy": accuracy,
            "Recall": recall,
            "Kappa": kappa
        })

    # Build the frame once, after the loop; explicit columns keep the schema
    # stable even when n_estimators_range is empty
    return pd.DataFrame(results, columns=["k", "Accuracy", "Recall", "Kappa"])

In [None]:
# Run the AdaBoost sweep and keep the per-k metrics (columns: k, Accuracy, Recall, Kappa) for plotting
scores_df =test_normalised_data(df_new)

#### Interpretation of the result

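The MAE, MSE, and RMSE values suggest relatively small prediction errors, indicating that most of the model's predictions match the actual labels. Note that for a binary (0/1) target, MAE and MSE both equal the misclassification rate (1 − accuracy), and RMSE is its square root.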

#### General Performance:

- Accuracy improved with more estimators, peaking at 74.70% at k=23 (estimators).
- Recall consistently improved, reaching a peak of 77.02% at k=20; recall is critical for detecting true positives in heart attack risk prediction.
- Cohen's Kappa achieved its highest value of 0.4941 at k=23, indicating better agreement between predictions and actual values.

#### Best Configuration:

- 23 estimators provided the best results:
- Accuracy: 74.70%
- Recall: 76.94%
- Cohen’s Kappa: 0.4941

This configuration achieved the highest accuracy and Cohen's Kappa, with recall (76.94%) within 0.1 percentage points of its peak, balancing predictive accuracy and medical relevance.

The improvement in recall is notable: it reaches a peak of 77.02% at k=20. Recall is the critical metric here because it measures how many true positives are detected in heart attack risk prediction.

In [None]:
# Reshape the scores to long format (one row per k/metric pair) so Plotly can
# draw one line per metric. Only Accuracy and Recall are selected for this chart.
df_melted = scores_df.melt(id_vars="k", value_vars=["Accuracy", "Recall"],
                    var_name="Metric", value_name="Score")

# Create the line plot with Plotly Express (px is imported with the other
# dependencies at the top of the notebook)
fig = px.line(df_melted, x="k", y="Score", color="Metric", markers=True,
              title="Model Performance Metrics vs. k (Number of Estimators)",
              labels={"k": "Number of Estimators (k)", "Score": "Metric Score"})

# Customize the layout for better presentation
fig.update_layout(
    title_font_size=20,
    xaxis_title="Number of Estimators (k)",
    yaxis_title="Metric Score",
    template="plotly_white",
    legend_title="Metrics",
    xaxis=dict(tickmode="linear"),  # show a tick for every k value
)

# Show the plot
fig.show()

#### Logistic regression

In [None]:
# Fit a logistic regression with scikit-learn's default settings on the normalised training split
log_reg = LogisticRegression()
log_reg.fit(x_train_norm, y_train)

In [None]:
# Score the model on the normalised test split (for scikit-learn classifiers, .score() is mean accuracy)
log_reg.score(x_test_norm, y_test)

In [None]:
# Predict on the held-out split and collect the classification report as a nested dict
pred = log_reg.predict(x_test_norm)
report_dict = classification_report(y_test, pred, output_dict=True)

# One row per report entry; reset_index moves the entry label into an 'index' column for plotting
report_df = pd.DataFrame(report_dict).T.reset_index()

# Keep only the per-class rows by dropping the aggregate entries
# ('accuracy', 'macro avg' and 'weighted avg')
filtered_df = report_df.loc[~report_df['index'].isin(['accuracy', 'macro avg', 'weighted avg'])]

In [None]:

# Score columns of the classification report to plot. 'support' is left out
# because it is a row count, not a 0-1 score.
metrics = ['precision', 'recall', 'f1-score']

# Prepare data for Plotly: long format with one row per (class, metric) pair
filtered_metrics_plotly = filtered_df.melt(
    id_vars='index',
    value_vars=metrics,
    var_name='Metric',
    value_name='Score'
)

# Create an interactive grouped bar chart (one group per class, one bar per metric)
fig = px.bar(
    filtered_metrics_plotly,
    x='index',
    y='Score',
    color='Metric',
    barmode='group',
    title='Logistic Regression: Per-Class Metrics',
    labels={'index': 'Class', 'Score': 'Score'}
)

# Customize the layout
fig.update_layout(
    xaxis_title="Class",
    yaxis_title="Score",
    yaxis=dict(range=[0, 1]),  # all plotted metrics are bounded by 0 and 1
    legend_title="Metric",
    title_font_size=20
)

# Show the interactive plot
fig.show()


#### Bayesian search

In [None]:
def objective(trial, confidence_level, folds):
    """
    Optuna objective: mean cross-validated recall of an AdaBoost classifier.

    Samples n_estimators, learning_rate, algorithm and the base tree's
    max_depth from `trial`, scores the resulting model with `folds`-fold
    cross-validation (scoring='recall') on the notebook-level x_train_norm and
    y_train, and stores a t-based confidence interval for the mean recall on
    the trial as the user attribute "CV_score_summary"
    ([lower, mean, upper], each rounded to 4 decimals).

    Parameters:
        trial: Optuna trial used to sample the hyperparameters.
        confidence_level: Two-sided confidence level for the interval (e.g. 0.95).
        folds: Number of cross-validation folds.

    Returns:
        The mean recall across the folds.
    """
    # Boosting hyperparameters, sampled in a fixed order
    booster_params = {
        "n_estimators": trial.suggest_int("n_estimators", 10, 200),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
        "algorithm": trial.suggest_categorical("algorithm", ["SAMME"]),
    }

    # Weak learner: a seeded decision tree whose depth is also tuned
    weak_learner = DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 10),
        random_state=123,
    )
    booster = AdaBoostClassifier(estimator=weak_learner, random_state=123, **booster_params)

    # Recall on each cross-validation fold
    fold_recalls = cross_val_score(booster, x_train_norm, y_train, cv=folds, scoring='recall')
    mean_score = np.mean(fold_recalls)
    std_error = np.std(fold_recalls, ddof=1) / np.sqrt(folds)

    # Two-sided Student-t interval around the mean, with folds-1 degrees of freedom
    t_crit = st.t.ppf(1-((1-confidence_level)/2), df=folds-1)
    margin = t_crit * std_error
    summary = [round(value, 4) for value in (mean_score - margin, mean_score, mean_score + margin)]

    # Keep the interval on the trial so it can be read back after optimisation
    trial.set_user_attr("CV_score_summary", summary)

    return mean_score


# Optuna study that maximises the mean cross-validated recall returned by
# `objective`, using 45 trials and a progress bar.
confidence_level = 0.95
folds = 10

start_time = time.time()
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyperparameters repeatable across runs (the models inside the objective are
# already seeded with 123).
study = optuna.create_study(
    direction="maximize",  # maximize recall
    sampler=optuna.samplers.TPESampler(seed=123),
)
study.optimize(lambda trial: objective(trial, confidence_level, folds), n_trials=45, show_progress_bar=True)
end_time = time.time()

# Output results
print("\n")
# Wall-clock time of the search, followed by the best hyperparameter combination
print(f"Time taken to find the best combination of hyperparameters among the given ones: {end_time - start_time: .4f} seconds")
print("\n")
print("The best combination of hyperparameters found was: ", study.best_params)
# Best mean recall achieved during the optimization
print(f"The best Recall score found was: {study.best_value: .4f}")



In [None]:
# Collect (trial index, CI lower bound, mean recall, CI upper bound) for every
# trial that recorded a CV summary, and rank the trials by MEAN recall
# (tuple position 2), the same quantity the study maximises. Trials without
# the attribute (e.g. trials that failed before it was stored) are skipped
# instead of raising KeyError.
results = sorted(
    (
        (index, *trial.user_attrs['CV_score_summary'])
        for index, trial in enumerate(study.trials)
        if 'CV_score_summary' in trial.user_attrs
    ),
    key=lambda x: x[2],
    reverse=True,
)

# results[0][1:] is (lower bound, mean, upper bound) of the top-ranked trial
print(f"The Recall confidence interval for the best combination of hyperparameters is: {results[0][1:]}")
# Visualise how the objective value evolved over the trials
vis.plot_optimization_history(study)