## Import libraries

In [None]:
import pandas as pd
from ydata_profiling import ProfileReport
# Minmaxscaler
from sklearn.preprocessing import MinMaxScaler
# Train test split
from sklearn.model_selection import train_test_split
# Resampling minority class
from sklearn.utils import resample
# Keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
# Evaluation
import seaborn as sns
import matplotlib.pyplot as plt
# Ignore warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
# Class weights
from sklearn.utils import class_weight
import numpy as np
# Tensorboard
import datetime
from tensorflow.keras.callbacks import TensorBoard
# Confusion matrix
from sklearn.metrics import confusion_matrix


%matplotlib inline
%load_ext tensorboard

## Store CSV as a DF

In [None]:
# Load heart_attack.csv (semicolon-separated); row 0 supplies the column names
csv_path = 'heart_attack.csv'
data = pd.read_csv(csv_path, sep=';', header=0)

# Quick sanity check: print the first five rows
print(data.head(5))

### Clean the data

In [None]:
# Remove the id column from the DataFrame, then preview the result
data = data.drop(columns=['id'])
data.head()

In [None]:
# Build a profiling report for the current DataFrame and render it inside the notebook
report_title = "Profiling Report"
profile = ProfileReport(data, title=report_title)
profile.to_notebook_iframe()

In [None]:
# Keep only the rows whose bmi value is present (rows with a missing bmi are removed)
has_bmi = data['bmi'].notna()
data = data.loc[has_bmi]
data.head()

## Encoding

In [None]:
# Integer code assigned to every known category, per categorical column.
# NOTE(review): Series.map yields NaN for any value that is not a key of its mapping,
# so categories missing from these tables become NaN — confirm the CSV has no others.
category_codes = {
    'gender': {'Male': 0, 'Female': 1},
    'married': {'No': 0, 'Yes': 1},
    'job': {'Unemployed': 0, 'Private': 1, 'Self-employed': 2, 'Government employee': 3, 'Minor': 4},
    'residence': {'Rural': 0, 'Urban': 1},
    'tobacco_use': {'Former smoker': 0, 'Never': 1, 'Smoker': 2, 'Unknown': 3},
}

# Replace each text column by its integer codes, in place
for column, codes in category_codes.items():
    data[column] = data[column].map(codes)

In [None]:
# Preview the first rows to check the encoded columns
data.head()

In [None]:
# Rescale every column (the label included) with a min-max scaler.
# NOTE(review): the scaler is fitted on the full dataset before any train/validation/test
# split is made, so the held-out rows influence the scaling — confirm this is acceptable.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data)

# fit_transform returns a bare array; wrap it again with the original column names
data_normalized = pd.DataFrame(scaled_values, columns=data.columns)
data_normalized.head()

In [None]:
# Profile the DataFrame again now that it has been cleaned and encoded.
# NOTE(review): this profiles `data`, not the scaled `data_normalized` — confirm which was intended.
report_title = "Profiling Report"
profile = ProfileReport(data, title=report_title)
profile.to_notebook_iframe()

## Correct imbalances

In [None]:
# Drop hypertension and coronary_heart_disease
#data_normalized = data_normalized.drop(['hypertension', 'coronary_heart_disease'], axis=1)


## MLP

### Tensorboard Configuration

In [None]:
# Each run logs into its own directory named after the current timestamp
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp

# histogram_freq=1: per the Keras docs, weight histograms are computed every epoch
tensorboard_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)

In [None]:
# 'heart_attack' is the value we want to predict; every other column is a feature
target_column = 'heart_attack'
X = data_normalized.drop(columns=[target_column])
y = data_normalized[target_column]

In [None]:
# Divide the data into training, validation and test sets.
# First 70% train / 30% held out, then the held-out part is divided 75% / 25%,
# i.e. 70% train, 22.5% validation and 7.5% test of the whole dataset.
# Both splits are stratified on the label: this notebook treats heart_attack as an
# imbalanced target, and an unstratified split can leave the small validation/test
# sets with a distorted (or empty) share of the minority class.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

# Show the class balance of the training labels
print(y_train.value_counts())


In [None]:
# Reset the index of every split so each one is numbered 0..n-1.
# The test split is included as well, so all six objects are indexed consistently
# (previously only the training and validation splits were reset).
X_train, y_train, X_val, y_val, X_test, y_test = (
    part.reset_index(drop=True)
    for part in (X_train, y_train, X_val, y_val, X_test, y_test)
)

In [None]:
# Network: 18 -> dropout -> 36 -> 36 -> 18 ReLU units, then one sigmoid output unit
n_features = X_train.shape[1]
model = Sequential([
    Dense(18, input_shape=(n_features,), activation='relu'),  # First layer, fed with the features
    Dropout(0.2),                                             # Dropout to limit overfitting
    Dense(36, activation='relu'),
    Dense(36, activation='relu'),
    Dense(18, activation='relu'),
    Dense(1, activation='sigmoid'),                           # Single sigmoid output
])

# SGD with momentum, binary cross-entropy loss, accuracy as the reported metric
optimizer = SGD(learning_rate=0.1, momentum=0.9)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train while monitoring the validation split; the TensorBoard callback logs the run
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=1000,
    batch_size=50,
    callbacks=[tensorboard_callback],
)

# Measure loss and accuracy on the test split
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {test_acc*100:.2f}%')

In [None]:
# Predict the positive-class probability for every test row
y_pred_prob = model.predict(X_test)

# Threshold at 0.5 in one vectorised step and flatten to a 1-D array of 0/1 labels.
# (The previous per-row list comprehension relied on the truthiness of one-element arrays.)
y_pred = (y_pred_prob > 0.5).astype(int).ravel()

# Calculate the confusion matrix (true labels first, predictions second)
cm = confusion_matrix(y_test, y_pred)

In [None]:
# Tick labels shown on both axes of the matrix
class_names = ['No Heart Attack', 'Heart Attack']

# Draw the confusion matrix as an annotated heatmap with integer cell counts
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

## Grid Search

In [None]:
# Divide the data into training, validation and test sets for the grid search.
# 'heart_attack' is the value we want to predict; every other column is a feature.
y = data_normalized['heart_attack']
X = data_normalized.drop('heart_attack', axis=1)

# 80% train / 20% held out, then the held-out part is divided 75% validation / 25% test.
# Both splits are stratified on the label: this notebook treats heart_attack as an
# imbalanced target, and an unstratified split can leave the small validation/test
# sets with a distorted (or empty) share of the minority class.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

In [None]:
from itertools import product

# Hyperparameter grid: every combination of the four lists below is trained once
num_hidden_layers = [1, 2, 3]  # Number of hidden layers
learning_rates = [0.001, 0.01, 0.1]  # SGD learning rates
momentums = [0.0, 0.5, 0.9]  # SGD momentum values
neurons_per_layer = [16, 32, 64]  # Number of neurons per hidden layer

# One dict per trained configuration
results = []

# product() walks the grid in the same order as four nested loops (last list varies fastest)
for layers, lr, momentum, neurons in product(
    num_hidden_layers, learning_rates, momentums, neurons_per_layer
):
    # Define the model: `layers` ReLU hidden layers of `neurons` units each
    model = Sequential()
    model.add(Dense(neurons, input_shape=(X_train.shape[1],), activation='relu'))
    for _ in range(layers - 1):  # The first hidden layer is already added above
        model.add(Dense(neurons, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Output layer

    # Compile the model
    optimizer = SGD(learning_rate=lr, momentum=momentum)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=50, verbose=1)

    # Validation accuracy is what configurations are ranked on; choosing hyperparameters
    # by test accuracy would leak the test set into model selection.
    _, val_acc = model.evaluate(X_val, y_val, verbose=0)

    # Test accuracy is still recorded because later cells plot the 'test_acc' column
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

    # Store the results
    results.append({
        'layers': layers,
        'learning_rate': lr,
        'momentum': momentum,
        'neurons': neurons,
        'val_acc': val_acc,
        'test_acc': test_acc,
    })

# Transform the results to a DataFrame and list the configurations, best validation accuracy first
results_df = pd.DataFrame(results)
print(results_df.sort_values(by='val_acc', ascending=False))

In [None]:
# Scatter of test accuracy against layer width; colour encodes the learning rate,
# marker style the momentum and marker size the number of hidden layers
fig, ax = plt.subplots(figsize=(10, 6))
plot = sns.scatterplot(
    data=results_df,
    x='neurons',
    y='test_acc',
    hue='learning_rate',
    style='momentum',
    size='layers',
    palette='viridis',
    sizes=(50, 200),
    ax=ax,
)
ax.set_title('Accuracy given a number of neurons (coloured by learning rate)')
ax.set_xlabel('Number of neurons')
ax.set_ylabel('Test accuracy')

# Anchor the legend outside the axes, to the right of the plot
ax.legend(title='Learning rate', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()  # Adjust the layout so the outside legend fits in the figure
plt.show()


In [None]:
# Filter to one specific number of layers and one momentum value to keep the plot simple
is_two_layers = results_df['layers'] == 2
is_momentum_09 = results_df['momentum'] == 0.9
filtered_df = results_df[is_two_layers & is_momentum_09]

# One line per layer width, showing test accuracy as the learning rate changes
fig, ax = plt.subplots(figsize=(10, 6))
sns.lineplot(
    data=filtered_df,
    x='learning_rate',
    y='test_acc',
    hue='neurons',
    marker='o',
    palette='coolwarm',
    ax=ax,
)
ax.set_title('Accuracy vs learning rate for a given number of neurons')
ax.set_xlabel('Learning rate')
ax.set_ylabel('Test accuracy')
ax.legend(title='Neurons')
plt.show()


In [None]:
# Mean test accuracy for every (learning rate, momentum) pair, averaged over the
# other hyperparameters; rows are learning rates, columns are momentum values
heatmap_data = pd.pivot_table(
    results_df,
    values="test_acc",
    index="learning_rate",
    columns="momentum",
    aggfunc='mean',
)

# Annotated heatmap of the averaged accuracies, shown with two decimals
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(heatmap_data, annot=True, cmap="YlGnBu", fmt=".2f", ax=ax)
ax.set_title('Accuracy given a learning rate and momentum')
ax.set_xlabel('Momentum')
ax.set_ylabel('Learning rate')
plt.show()

