In [None]:
# importar librerias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Procesamiento de Datos

In [None]:
# Load the dataset from the local CSV file
df = pd.read_csv(filepath_or_buffer="data.csv")

In [None]:
# Print the summary of the loaded frame
df.info()

In [None]:
# Preview the first 5 rows of the raw data
df.head(5)

In [None]:
# List the distinct values present in the "Diet" column
df["Diet"].unique()

In [None]:
# Report how many missing values each column contains
print(df.isnull().sum())

## Transformación de Datos


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns kept for modelling; "Patient ID" stays commented out, so it is excluded.
features = [
    # "Patient ID",
    "Age",
    "Sex",
    "Blood Pressure",
    "Smoking",
    "Cholesterol",
    "Heart Rate",
    "Diabetes",
    "Family History",
    "Alcohol Consumption",
    "Exercise Hours Per Week",
    "Diet",
    "Previous Heart Problems",
    "Sedentary Hours Per Day",
    "BMI",
    "Physical Activity Days Per Week",
    "Sleep Hours Per Day",
    "Heart Attack Risk",
    "Country",
    "Continent",
    "Hemisphere",
]
# Take an explicit copy: assigning new columns to a bare column selection
# makes pandas emit SettingWithCopyWarning.
df = df[features].copy()

# Split "Blood Pressure" on "/" into two integer columns and drop the original.
df[["Systolic Pressure", "Diastolic Pressure"]] = df["Blood Pressure"].str.split(
    "/", expand=True
)
df = df.drop(columns=["Blood Pressure"]).astype(
    {"Systolic Pressure": "int64", "Diastolic Pressure": "int64"}
)

# Label-encode each categorical column, then pop it out of the frame.
# NOTE(review): because every encoded column is popped, none of these
# categorical features remains in `df` for the models below, and `region`
# ends up holding only the encoded "Hemisphere" values (each iteration
# overwrites the previous one). Confirm that dropping them is intended.
categorical_columns = ["Sex", "Diet", "Country", "Continent", "Hemisphere"]
for column in categorical_columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    region = df.pop(column)

# Derived feature: cholesterol divided by BMI, computed column-wise
# (same values as a row-by-row apply, without the Python-level loop).
df["Chol_BMI_ratio"] = df["Cholesterol"] / df["BMI"]

In [None]:
# Correlation heatmap of every column in the transformed frame
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True, cmap="coolwarm", ax=ax)
plt.show()

# Modeling

In [None]:
# Preview the first rows of the frame that will be split for training
df.head()

## Split Data

In [None]:
# Split the dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    ConfusionMatrixDisplay,
)


# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# "Heart Attack Risk" is the target, every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["Heart Attack Risk"]),
    df["Heart Attack Risk"],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Show the shapes of the train/test feature frames and target series
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Model Perceptron


In [None]:
from sklearn.linear_model import Perceptron

In [None]:
# Train a Perceptron with default settings and score it on the held-out split
perceptron = Perceptron().fit(X_train, y_train)
predictions = perceptron.predict(X_test)
# Fraction of test rows classified correctly
acc_perceptron = accuracy_score(y_true=y_test, y_pred=predictions)
print(acc_perceptron)

## Model Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Fit Gaussian Naive Bayes on the training split, then predict the test split
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
# Test-set accuracy
acc_gnb = accuracy_score(y_true=y_test, y_pred=y_pred)
print(acc_gnb)

## Model Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression, SGDClassifier


### SGDClassifier

In [None]:
# Logistic regression trained with SGD (loss='log_loss') at a constant learning rate of 0.1.
# random_state is fixed so the fitted coefficients and the accuracies reported in the
# following cells are the same on every run, matching the seeded train/test split.
# TODO: review the parameters in the documentation and try other learning_rate settings.
logistic_model_SGD = SGDClassifier(
    loss='log_loss', learning_rate='constant', eta0=0.1, random_state=42
)
logistic_model_SGD.fit(X_train, y_train)

In [None]:
# Print the class labels seen during fitting, a blank separator, and the learned coefficients
print(
    f'Clases de la variable dependiente: {logistic_model_SGD.classes_}',
    '\n',
    'Vectores de coeficientes:',
    logistic_model_SGD.coef_,
    sep='\n',
)

In [None]:
# Predict on the training split to measure training accuracy.
# Note: this reuses the name y_pred, replacing the Naive Bayes predictions stored earlier.
y_pred = logistic_model_SGD.predict(X_train)

In [None]:
# Training-set accuracy and confusion matrix for the SGD logistic model
matriz_confusion = confusion_matrix(y_train, y_pred)
print(f'Accuracy entrenamiento: {accuracy_score(y_train, y_pred)}')
print('Matriz de confusión:')
# plot() returns the display object itself, so `disp` is still the ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(
    matriz_confusion, display_labels=logistic_model_SGD.classes_
).plot()
plt.show()

In [None]:
# Predict on the held-out test split with the SGD logistic model
y_pred_test = logistic_model_SGD.predict(X_test)

In [None]:
# Test-set accuracy and confusion matrix for the SGD logistic model
matriz_confusion_test = confusion_matrix(y_test, y_pred_test)
print(f'Accuracy testing: {accuracy_score(y_test, y_pred_test)}')
print('Matriz de confusión:')
# plot() returns the display object itself, so `disp` is still the ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(
    matriz_confusion_test, display_labels=logistic_model_SGD.classes_
).plot()
plt.show()

### LogisticRegression

In [None]:
# Fit a LogisticRegression with default settings; fit() returns the estimator itself
logistic_model = LogisticRegression().fit(X_train, y_train)
logistic_model

In [None]:
# Print the class labels seen during fitting, a blank separator, and the learned coefficients
print(
    f'Clases de la variable dependiente: {logistic_model.classes_}',
    '\n',
    'Vectores de coeficientes:',
    logistic_model.coef_,
    sep='\n',
)

In [None]:
# Predict on the training split with the LogisticRegression model
y_pred_2 = logistic_model.predict(X_train)

In [None]:
# Training-set accuracy and confusion matrix for the LogisticRegression model
matriz_confusion_2 = confusion_matrix(y_train, y_pred_2)
print(f'Accuracy entrenamiento: {accuracy_score(y_train, y_pred_2)}')
print('Matriz de confusión:')
# plot() returns the display object itself, so `disp` is still the ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(
    matriz_confusion_2, display_labels=logistic_model.classes_
).plot()
plt.show()

In [None]:
# Predict on the held-out test split with the LogisticRegression model
y_pred_test_2 = logistic_model.predict(X_test)

In [None]:
# Test-set accuracy and confusion matrix for the LogisticRegression model
matriz_confusion_test_2 = confusion_matrix(y_test, y_pred_test_2)
print(f'Accuracy testing: {accuracy_score(y_test, y_pred_test_2)}')
print('Matriz de confusión:')
# plot() returns the display object itself, so `disp` is still the ConfusionMatrixDisplay
disp = ConfusionMatrixDisplay(
    matriz_confusion_test_2, display_labels=logistic_model.classes_
).plot()
plt.show()

### ANN

In [None]:
# Use network neural
from keras.models import Sequential
from keras.layers import Dense, Input

In [None]:
# Sequential builds a network by stacking layers; here they are passed as one list:
# input sized to the feature count, two ReLU hidden layers, one sigmoid output unit.
modelsequ = Sequential(
    [
        Input(shape=(X_train.shape[1],)),
        Dense(15, activation='relu'),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
)
modelsequ.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
modelsequ.summary()

In [None]:
# Train the dense network; the test split doubles as the validation set,
# so val_loss / val_accuracy are measured on (X_test, y_test) after every epoch.
historyseq = modelsequ.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=500,
    validation_data=(X_test, y_test),
)

In [None]:
# Training vs. validation loss per epoch for the dense network
plt.plot(historyseq.history['loss'], label='Train')
plt.plot(historyseq.history['val_loss'], label='Validation')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss vs Epochs')
plt.legend(loc='upper right')
plt.show()

### CNN

In [None]:
# Import lib for convolutional neural network
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dropout

In [None]:
# (rows, feature columns) of the training set; the second value sizes the CNN input
X_train.shape

In [None]:
# 1D convolutional binary classifier: each sample is treated as a sequence of
# X_train.shape[1] steps with a single channel.
modelconv = Sequential()
# Declare the input with an explicit Input layer, the same way the dense model
# above does, instead of passing input_shape= to the first Conv1D.
modelconv.add(Input(shape=(X_train.shape[1], 1)))
modelconv.add(Conv1D(32, 3, activation="relu"))
modelconv.add(MaxPooling1D())
modelconv.add(Conv1D(64, 3, activation="relu"))
modelconv.add(MaxPooling1D())
modelconv.add(Flatten())
modelconv.add(Dense(64, activation="relu"))
# Dropout between the dense layer and the output unit
modelconv.add(Dropout(0.5))
modelconv.add(Dense(1, activation="sigmoid"))
modelconv.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
modelconv.summary()

In [None]:
# Train the convolutional network; the test split doubles as the validation set,
# so val_loss / val_accuracy are measured on (X_test, y_test) after every epoch.
historyconv = modelconv.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=500,
    validation_data=(X_test, y_test),
)