In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Read the semicolon-separated dataset from the working directory
df = pd.read_csv('cardio_train.csv', sep=';')

# Discard the 'id' column when the file has one; a file without it passes through
df = df.drop(columns=['id'], errors='ignore')

# Text label for each value of the binary 'cardio' column
status_labels = {1: 'Cardiovascular Disease', 0: 'No Cardiovascular Disease'}
df['Cardiovascular Disease Status'] = df['cardio'].map(status_labels)

# Two-column report: the raw flag next to its text label
cardio_report = df.loc[:, ['cardio', 'Cardiovascular Disease Status']]

# Show the report on stdout
print("Cardiovascular Disease Status Report:")
print(cardio_report)

# Persist the report (without the index column) and confirm where it went
cardio_report.to_csv('cardiovascular_disease_status_report.csv', index=False)
print("\nReport saved to 'cardiovascular_disease_status_report.csv'.")

# Feature matrix: every column except the target and its text label
feature_frame = df.drop(columns=['cardio', 'Cardiovascular Disease Status'])
X = feature_frame.to_numpy()
y = df['cardio'].to_numpy()

# Hold out 20% of the rows for testing; stratify=y keeps the class ratio
# the same in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Learn mean/variance from the training rows only, then apply that same
# transform to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random Forest base model: hyperparameters gathered in one place
rf_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_split': 5,
    'class_weight': 'balanced',
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params).fit(X_train_scaled, y_train)

# Hard class predictions from the fitted forest for the train and test splits
rf_train_pred, rf_test_pred = (
    rf_model.predict(split) for split in (X_train_scaled, X_test_scaled)
)

# RNN (using LSTM)
# The LSTM expects 3-D input, so every sample is presented as a sequence with
# a single timestep: (samples, 1, n_features).
n_features = X_train_scaled.shape[1]
X_train_rnn = X_train_scaled.reshape(X_train_scaled.shape[0], 1, n_features)
X_test_rnn = X_test_scaled.reshape(X_test_scaled.shape[0], 1, n_features)

# The input shape is declared with an explicit Input layer rather than the
# `input_shape=` layer argument, which current Keras warns about when used
# inside a Sequential model.
rnn_model = Sequential([
    Input(shape=(1, n_features)),
    LSTM(64, activation='tanh', return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # single sigmoid unit: binary output in [0, 1]
])

rnn_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Early stopping: halt once val_loss has not improved for 3 epochs and roll
# the model back to the weights of its best epoch.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train RNN model.
# The validation set is carved out of the training rows (validation_split)
# instead of reusing the test split. Early stopping and best-weight
# restoration select the model based on validation loss, so validating on the
# test split would leak it into model selection and inflate the test metrics
# reported afterwards.
# Keras takes the validation rows from the end of the arrays before shuffling;
# that is acceptable here because train_test_split already shuffled the rows.
history = rnn_model.fit(
    X_train_rnn, y_train,
    epochs=20,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stop],
    verbose=1
)

# Positive-class probabilities from both base models.
# Stacking on probabilities rather than thresholded 0/1 labels lets the
# meta-learner use each base model's confidence; with two binary columns it
# would only ever see four distinct inputs.
rnn_train_proba = rnn_model.predict(X_train_rnn).ravel()
rnn_test_proba = rnn_model.predict(X_test_rnn).ravel()
# Column 1 of predict_proba corresponds to rf_model.classes_[1]
# (assumed to be label 1, since 'cardio' is coded 0/1 — confirm if labels change).
rf_train_proba = rf_model.predict_proba(X_train_scaled)[:, 1]
rf_test_proba = rf_model.predict_proba(X_test_scaled)[:, 1]

# Hard RNN labels at the 0.5 threshold, kept under their original names
rnn_train_pred = (rnn_train_proba > 0.5).astype(int)
rnn_test_pred = (rnn_test_proba > 0.5).astype(int)

# Create stacked dataset: one column per base model, one row per sample
# NOTE(review): the training meta-features are in-sample predictions of the
# base models; out-of-fold predictions would give the meta-learner a less
# optimistic view of them.
stacked_train = np.column_stack((rf_train_proba, rnn_train_proba))
stacked_test = np.column_stack((rf_test_proba, rnn_test_proba))

# Meta-learner: logistic regression over the base models' outputs
meta_params = dict(
    C=0.1,
    solver='liblinear',
    class_weight='balanced',
    random_state=42,
)
stack_model = LogisticRegression(**meta_params)
stack_model.fit(stacked_train, y_train)

# Ensemble prediction for the held-out rows
final_pred = stack_model.predict(stacked_test)

# Score the stacked ensemble on the held-out split
final_accuracy = accuracy_score(y_test, final_pred)
print("\nFinal Model Performance:")
print(f"Accuracy: {final_accuracy:.4f}")

# Each remaining section is a heading followed by its sklearn summary
for heading, summary in (
    ("\nConfusion Matrix:", confusion_matrix(y_test, final_pred)),
    ("\nClassification Report:", classification_report(y_test, final_pred)),
):
    print(heading)
    print(summary)

# Per-epoch accuracy curves recorded by Keras during RNN training
plt.figure(figsize=(10, 5))
for metric_key, curve_label in (
    ('accuracy', 'Train Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('RNN Training History')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Heat-map of the ensemble's confusion matrix with per-cell counts
cm = confusion_matrix(y_test, final_pred)
plt.figure(figsize=(6, 5))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()

# Axis ticks carry the class names, listed in label order 0 then 1
classes = ['No Cardiovascular Disease', 'Cardiovascular Disease']
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

# Cells above half the largest count get white text, the rest black
thresh = cm.max() / 2.
for (i, j), count in np.ndenumerate(cm):
    text_color = "white" if count > thresh else "black"
    # Row index i is the y position and column index j the x position
    plt.text(j, i, f"{count:d}", ha="center", va="center", color=text_color)

plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.tight_layout()
plt.show()

Cardiovascular Disease Status Report:
       cardio Cardiovascular Disease Status
0           0     No Cardiovascular Disease
1           1        Cardiovascular Disease
2           1        Cardiovascular Disease
3           1        Cardiovascular Disease
4           0     No Cardiovascular Disease
...       ...                           ...
69995       0     No Cardiovascular Disease
69996       1        Cardiovascular Disease
69997       1        Cardiovascular Disease
69998       1        Cardiovascular Disease
69999       0     No Cardiovascular Disease

[70000 rows x 2 columns]

Report saved to 'cardiovascular_disease_status_report.csv'.
Epoch 1/20


  super().__init__(**kwargs)


[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6704 - loss: 0.6155 - val_accuracy: 0.7284 - val_loss: 0.5593
Epoch 2/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7340 - loss: 0.5533 - val_accuracy: 0.7271 - val_loss: 0.5568
Epoch 3/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7305 - loss: 0.5501 - val_accuracy: 0.7284 - val_loss: 0.5524
Epoch 4/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7345 - loss: 0.5463 - val_accuracy: 0.7296 - val_loss: 0.5528
Epoch 5/20
[1m875/875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.7310 - loss: 0.5487 - val_accuracy: 0.7271 - val_loss: 0.5520
Epoch 6/20
[1m  5/875[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1s[0m 2ms/step - accuracy: 0.7615 - loss: 0.5132  