In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV

In [10]:
# Read heart.csv from /content into `data` and display the resulting frame.
data = pd.read_csv('/content/heart.csv')
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat


In [12]:
from google.colab import drive

# Mount Google Drive on a directory mount point. Later cells save and load the
# model under /content/drive/MyDrive, so Drive has to be mounted at
# /content/drive (the previous target, /content/heart.csv, is the dataset file).
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

**Exploratory Data Analysis (EDA)**

In [None]:
# Peek at the last rows of the frame.
data.tail()

In [None]:
# Peek at the first rows of the frame.
data.head()

In [None]:
# Column dtypes and non-null counts.
data.info()

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Class balance of the target column.
data['HeartDisease'].value_counts()

# Target encoding used throughout the notebook:
# 1 --> defective heart
# 0 --> healthy heart

In [None]:
# Distinct categories present in ChestPainType.
data['ChestPainType'].unique()

In [None]:
# Number of distinct values per column.
data.nunique()

In [None]:
# Map every column name to the array of distinct values it contains.
{col: data[col].unique() for col in data.columns}

In [None]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

In [None]:
# Pairwise correlations between the numeric columns, drawn as an annotated heatmap.
numeric_frame = data.select_dtypes(include=np.number)
correlation_matrix = numeric_frame.corr()

plt.figure(figsize=(14, 9))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()

**1. Outlier Detection**

In [None]:
from scipy import stats

# Z-scores are only defined for numeric data, so restrict to numeric columns.
numerical_cols = data.select_dtypes(include=np.number).columns
z_scores = np.abs(stats.zscore(data[numerical_cols]))

# (row positions, column positions) of every cell whose |z| exceeds 3.
outliers = np.where(np.asarray(z_scores) > 3)

# A row with outlying values in several columns appears several times in
# outliers[0]; de-duplicate so each affected row is listed exactly once.
outlier_rows = np.unique(outliers[0])
outliers_df = data.iloc[outlier_rows]

print("Outliers detected using Z-score method:")
print(outliers_df)

In [None]:
# Recompute the per-column Z-scores (`stats` is scipy.stats, imported in the
# outlier-detection cell).
numerical_cols = data.select_dtypes(include=np.number).columns
z_scores = np.abs(stats.zscore(data[numerical_cols]))

# Boolean matrix, one column per entry of numerical_cols: True where |z| > 3.
outlier_mask = np.asarray(z_scores) > 3
outliers = np.where(outlier_mask)

# Replace only the offending cells, column by column. Using the row positions
# of *any* outlier for *every* column would overwrite perfectly valid values
# (including the HeartDisease target) with the column median.
for col_position, col in enumerate(numerical_cols):
    col_outliers = outlier_mask[:, col_position]
    if col_outliers.any():
        # A boolean mask is index-agnostic, unlike positional integers passed to .loc.
        data.loc[col_outliers, col] = data[col].median()

print("Dataset after replacing outliers with the median:")
print(data)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns of `data` to inspect visually for outliers.
numerical_features = ['Cholesterol', 'Oldpeak']
n_panels = len(numerical_features)

# One box plot per selected column, laid out side by side.
plt.figure(figsize=(15, 5))
for panel, feature in enumerate(numerical_features, start=1):
    plt.subplot(1, n_panels, panel)
    sns.boxplot(y=data[feature])
    plt.title(feature)
plt.tight_layout()
plt.show()

# Scatter plot of two numeric columns to see outliers in relation to each other.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=data, x='MaxHR', y='Oldpeak')
plt.title('MaxHR vs. Oldpeak')
plt.show()

# List the columns currently present in `data`.
print(data.columns)

**2. Dropping and Transforming Features
'Age', 'RestingBP', 'Cholesterol'**

In [None]:
# Encode age as three 0/1 indicator columns (<35, 35-59, >=60) and drop the raw column.
age = data['Age']
data['young'] = (age < 35).astype(int)
data['adult'] = ((age >= 35) & (age < 60)).astype(int)
data['elder'] = (age >= 60).astype(int)
data.drop(columns='Age', inplace=True)
data

In [None]:
# Encode resting blood pressure as three 0/1 indicator columns
# (<120, 120-139, >=140) and drop the raw column.
resting_bp = data['RestingBP']
data['lowBP'] = (resting_bp < 120).astype(int)
data['mediumBP'] = ((resting_bp >= 120) & (resting_bp < 140)).astype(int)
data['highBP'] = (resting_bp >= 140).astype(int)
data.drop(columns='RestingBP', inplace=True)
data

In [None]:
# Encode cholesterol as three 0/1 indicator columns
# (<200, 200-239, >=240) and drop the raw column.
cholesterol = data['Cholesterol']
data['low_chol'] = (cholesterol < 200).astype(int)
data['medium_chol'] = ((cholesterol >= 200) & (cholesterol < 240)).astype(int)
data['high_chol'] = (cholesterol >= 240).astype(int)
data.drop(columns='Cholesterol', inplace=True)
data

In [None]:
# (rows, columns) after the binning steps.
data.shape

In [None]:
# Display the first 116 rows of the transformed frame.
data.head(116)

**3. One-Hot Encoding:
This technique was applied to categorical features to convert them into binary (0 or 1) format.
Features: ChestPainType, RestingECG, and ST_Slope were one-hot encoded.**

In [None]:
# Build one indicator frame per categorical column; each frame's columns are
# prefixed with the source column name.
df_chestpain, df_restingecg, df_st_slope = (
    pd.get_dummies(data[column], prefix=column)
    for column in ('ChestPainType', 'RestingECG', 'ST_Slope')
)
df_chestpain.astype(int)

In [None]:
# Display the RestingECG indicator columns as integers.
df_restingecg.astype(int)

In [None]:
# Display the ST_Slope indicator columns as integers.
df_st_slope.astype(int)

In [None]:
# Append the integer indicator columns to `data`, then remove the categorical
# source columns they replace.
one_hot_frames = [frame.astype(int) for frame in (df_chestpain, df_restingecg, df_st_slope)]
df_combined = pd.concat([data, *one_hot_frames], axis=1)
df_combined = df_combined.drop(columns=['ChestPainType', 'RestingECG', 'ST_Slope'])
df_combined

In [None]:
# Column dtypes and non-null counts of the combined frame.
df_combined.info()

**4. Label Encoding:
This technique was used to convert categorical data into numerical format by assigning a unique integer to each category.
Features: 'Sex' and 'ExerciseAngina' were label encoded.**

In [None]:
# Single encoder instance; it is re-fitted for every column it is applied to.
label_encoder = LabelEncoder()

In [None]:
# Integer-encode every categorical column of `data` in place. The encoder is
# re-fitted per column, so after the loop it holds the ExerciseAngina classes.
for column in ('ChestPainType', 'RestingECG', 'ST_Slope', 'Sex', 'ExerciseAngina'):
    data[column] = label_encoder.fit_transform(data[column])
data

In [None]:
# Integer-encode the two remaining string columns of the combined frame.
for column in ('Sex', 'ExerciseAngina'):
    df_combined[column] = label_encoder.fit_transform(df_combined[column])
df_combined

In [None]:
# prompt: downloload the above output

# Export the preprocessed frame built by the preceding cells. This cell used
# to re-import every library, re-read the raw CSV and rebuild df_combined from
# scratch, which silently discarded the outlier handling done earlier and
# duplicated the whole feature-engineering pipeline.
df_combined.to_csv('processed_heart_data.csv', index=False)

# Download the file through the Colab browser session.
from google.colab import files
files.download('processed_heart_data.csv')

In [None]:
# df_combined.to_csv('heart2.csv', index=False)

In [None]:
# Re-load the processed frame from the file written by the export cell above.
# The previous path, /content/heart2.csv, is never written by any live code in
# this notebook (its to_csv call is commented out), so a fresh runtime failed here.
df = pd.read_csv('processed_heart_data.csv')
df

In [None]:
# (rows, columns) of the combined frame.
df_combined.shape

In [None]:
# Separate the target column from the feature matrix.
target_column = 'HeartDisease'
y = df_combined[target_column]
X = df_combined.drop(columns=[target_column])
X

In [None]:
# Display the target series.
y

**Data Splitting and K-Fold Cross-Validation**

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Display the training features.
X_train

In [None]:
# Display the test features.
X_test

In [None]:
# Display the training labels.
y_train

In [None]:
# Display the test labels.
y_test

**K-Fold Cross-Validation**

In [None]:
# 10-fold shuffled cross-validation of a default random forest on the full data.
clf = RandomForestClassifier()
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(clf, X, y, cv=kfold)

# Report mean and standard deviation of the fold accuracies as percentages.
mean_pct, std_pct = scores.mean() * 100, scores.std() * 100
print(f"Accuracy: {mean_pct:.2f}% ({std_pct:.2f}%)")

In [None]:
# Recursive feature elimination with 5-fold CV, removing one feature per step
# and using `clf` as the estimator.
# NOTE(review): `rfecv` is not referenced again in the visible notebook —
# confirm whether the selected features were meant to be applied to X.
rfecv = RFECV(estimator=clf, step=1, cv=5)
rfecv.fit(X, y)

In [None]:
# Fit a 100-tree random forest on the training split.
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train , y_train)

In [None]:
# Accuracy on the training split itself (not a generalisation estimate).
clf.score(X_train , y_train)

In [None]:
# Predicted labels for the held-out test split.
clf_pred = clf.predict(X_test)

In [None]:
# Accuracy of the random forest on the test split.
accuracy_score(y_test , clf_pred)

In [None]:
# classification_report returns a multi-line string; print it so the table is
# rendered with real line breaks instead of the escaped repr of the string.
print(classification_report(y_test , clf_pred))

**5. Hyperparameter Tuning:
An extensive hyperparameter grid search was conducted to find the best hyperparameter configuration.
Purpose: This step optimizes the model's performance by finding the best combination of hyperparameters.**

In [None]:
# Search space for the random forest: 3 * 4 * 3 * 3 = 108 combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive 5-fold search scored by accuracy, using all available cores.
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

In [None]:
# Evaluate the refitted best estimator from the grid search on the test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set accuracy with best model: {:.2f}".format(test_accuracy * 100))

In [None]:
# prompt: code for SAE

from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Keras expects floating-point inputs.
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# The decoder ends in a sigmoid and is trained with binary cross-entropy, so the
# reconstruction targets must lie in [0, 1]. Scale every feature to that range,
# fitting the scaler on the training split only.
sae_scaler = MinMaxScaler()
X_train_scaled = sae_scaler.fit_transform(X_train)
X_test_scaled = sae_scaler.transform(X_test)

# Define the input layer
n_features = X_train_scaled.shape[1]  # Number of features
input_layer = Input(shape=(n_features,))

# Define the encoding layers
encoding_dim1 = 64
encoding_dim2 = 32
encoded1 = Dense(encoding_dim1, activation='relu')(input_layer)
encoded2 = Dense(encoding_dim2, activation='relu')(encoded1)

# Define the decoding layers
decoded1 = Dense(encoding_dim1, activation='relu')(encoded2)
decoded2 = Dense(n_features, activation='sigmoid')(decoded1)

# Full autoencoder: input -> bottleneck -> reconstruction
sae_model = Model(input_layer, decoded2)
sae_model.compile(optimizer='adam', loss='binary_crossentropy')

# Train the autoencoder to reproduce its own (scaled) input
sae_model.fit(X_train_scaled, X_train_scaled, epochs=50, batch_size=50)

# The encoder shares the trained layers and stops at the bottleneck, so its
# output is the latent representation. Calling sae_model.predict would return
# the reconstruction instead of the encoding.
sae_encoder = Model(input_layer, encoded2)
encoded_train = sae_encoder.predict(X_train_scaled)
encoded_data = sae_encoder.predict(X_test_scaled)

In [None]:
# Upgrade xgboost and scikit-learn in the notebook environment.
pip install --upgrade xgboost scikit-learn

In [None]:
# Compare several off-the-shelf classifiers with stratified 10-fold
# cross-validation on X / y and plot their mean accuracies.
# NOTE(review): the two pins below run after scikit-learn has already been
# imported in this session and follow a cell that upgrades the same packages —
# confirm which versions are intended and whether a runtime restart is needed.
!pip install scikit-learn==1.0.2
!pip install xgboost==1.7.1
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
import warnings
from sklearn.exceptions import ConvergenceWarning # Import ConvergenceWarning

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning) # Now you can use ConvergenceWarning

# Sample Data (Replace with your dataset loading code)
# Assuming 'X' and 'y' are your features and target variables
# Example:
# from sklearn.datasets import load_iris
# data = load_iris()
# X = data.data
# y = data.target

# Define the models (only the random forest is seeded)
models = [
    RandomForestClassifier(n_estimators=100, random_state=42),
    MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=1000),
    AdaBoostClassifier(),
    XGBClassifier(),  # Ensure you have the latest version of xgboost installed
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    GaussianNB()
]

# Define the cross-validation strategy
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Evaluate each model using cross-validation; scores[i] is the mean fold
# accuracy of models[i]
scores = []
for model in models:
    print(f"Training {type(model).__name__}...")  # Debug: Indicating which model is being trained
    # Set n_jobs=1 to disable parallelism during cross-validation
    cv_scores = cross_val_score(model, X, y, cv=kfold, n_jobs=1)
    scores.append(cv_scores.mean())

# Print the average scores for each model
for model, score in zip(models, scores):
    print(f"Model: {type(model).__name__}, Score: {score:.3f}") # Use __name__ to get the class name

# Create a bar plot of the model accuracies
model_names = [type(model).__name__ for model in models]  # Use __name__ to get the class name
plt.figure(figsize=(15, 6))
bars = plt.bar(model_names, scores, color='skyblue', width=0.4)
plt.title('Model Accuracies')
plt.xlabel('Model')
plt.ylabel('Accuracy')
# The y-axis is cropped to [0.7, 0.9]; bars outside that range are clipped.
plt.ylim(0.7, 0.9)  # Adjust the y-axis limit as necessary

# Add accuracy labels (in percent) above the bars
for bar, score in zip(bars, scores):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f"{score * 100:.3f}", ha='center', va='bottom')

# Show the plot
plt.show()

In [None]:
# For each candidate latent size, run 10-fold cross-validation of a two-stage
# pipeline: (1) train an autoencoder on the standardized training fold,
# (2) train a small dense classifier on the encoder's output, then score the
# classifier on the encoded test fold.
# Side effect: X, y, X_train, X_test, y_train, y_test, scaler, encoder and
# classifier are rebound at notebook level; after the cell they hold the
# values from the last fold of the last latent size.
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Convert DataFrames to numpy arrays if necessary (positional fold indexing below)
X = X.values if isinstance(X, pd.DataFrame) else X
y = y.values if isinstance(y, pd.Series) else y

# Initialize KFold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# List of latent dimensions to evaluate
latent_dims = [100, 200, 300, 400, 500, 600]

# Per-latent-dimension results; each entry is replaced by a
# (mean_accuracy, std_accuracy) tuple once its folds have finished
accuracies_per_latent_dim = {dim: [] for dim in latent_dims}

# Perform cross-validation
for latent_dim in latent_dims:
    print(f"\nEvaluating latent_dim = {latent_dim}")

    fold_accuracies = []

    for train_index, test_index in kfold.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Standardize data; the scaler is fitted on the training fold only
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Define SAE: 128 -> 64 -> latent_dim encoder, mirrored decoder
        input_shape = (X_train.shape[1],)
        input_img = Input(shape=input_shape)
        x = Dense(128, activation='relu')(input_img)
        x = Dense(64, activation='relu')(x)
        encoded = Dense(latent_dim, activation='relu')(x)

        x = Dense(64, activation='relu')(encoded)
        x = Dense(128, activation='relu')(x)
        # Linear output so standardized (possibly negative) inputs can be reconstructed
        decoded = Dense(input_shape[0], activation='linear')(x)

        autoencoder = Model(input_img, decoded)
        autoencoder.compile(optimizer='adam', loss='mse')

        # Train the autoencoder
        autoencoder.fit(X_train, X_train, epochs=100, batch_size=128, verbose=0)

        # Encoder Model: shares the trained layers, stops at the latent layer
        encoder = Model(input_img, encoded)

        # Classifier Model: separate network that takes latent vectors as input
        encoded_input = Input(shape=(latent_dim,))
        x = Dense(128, activation='relu')(encoded_input)
        output = Dense(1, activation='sigmoid')(x)
        classifier = Model(encoded_input, output)
        classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        # Encode training data
        encoded_train_data = encoder.predict(X_train)

        # Train the classifier
        classifier.fit(encoded_train_data, y_train, epochs=95, batch_size=130, verbose=0)

        # Encode test data
        encoded_test_data = encoder.predict(X_test)

        # Evaluate classifier
        loss, accuracy = classifier.evaluate(encoded_test_data, y_test, verbose=0)
        fold_accuracies.append(accuracy)

        print(f"Fold Accuracy: {accuracy * 100:.2f}%")

    # Calculate mean and std for this latent dimension
    mean_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    accuracies_per_latent_dim[latent_dim] = (mean_accuracy, std_accuracy)

    print(f"\nLatent_dim = {latent_dim}")
    print(f"Mean Accuracy: {mean_accuracy * 100:.2f}%")
    print(f"Standard Deviation: {std_accuracy * 100:.2f}%")

# Optionally, print all accuracies for each latent dimension
for latent_dim, (mean_accuracy, std_accuracy) in accuracies_per_latent_dim.items():
    print(f"Latent_dim = {latent_dim}: Mean Accuracy = {mean_accuracy * 100:.2f}%, Std = {std_accuracy * 100:.2f}%")

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras import layers, models

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize data; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# List of latent dimensions to evaluate
latent_dims = [100, 200, 300, 400, 500, 600]

# One test accuracy per latent dimension, in the order of latent_dims.
# It must be created once, before the loop: resetting it inside the loop left
# only the last accuracy, so the plotting cell's length check always failed.
accuracies = []

# Iterate through each latent dimension size
for latent_dim in latent_dims:
    print(f"\nTraining with latent_dim = {latent_dim}")

    # Define SAE: a single L1-activity-regularised hidden layer
    input_dim = X_train.shape[1]
    input_layer = layers.Input(shape=(input_dim,))
    encoded = layers.Dense(latent_dim, activation='relu', activity_regularizer=tf.keras.regularizers.l1(1e-5))(input_layer)
    # Linear output: the inputs are standardized, so they take negative values
    # and values above 1 that a sigmoid could never reconstruct.
    decoded = layers.Dense(input_dim, activation='linear')(encoded)
    autoencoder = models.Model(input_layer, decoded)
    encoder = models.Model(input_layer, encoded)

    # Compile SAE
    autoencoder.compile(optimizer='adam', loss='mse')

    # Pre-train the SAE to reconstruct its input
    autoencoder.fit(X_train, X_train, epochs=100, batch_size=132, validation_split=0.2, verbose=0)

    # Define the MLP head that classifies latent vectors.
    # The name is kept because a later cell saves `MLPClassifier`; note that it
    # shadows sklearn's MLPClassifier class until that class is imported again.
    MLPClassifier = models.Sequential([
        layers.Input(shape=(latent_dim,)),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])

    # Combine SAE encoder and MLP head into one end-to-end model
    encoded_input = encoder(input_layer)
    classification_output = MLPClassifier(encoded_input)
    combined_model = models.Model(input_layer, classification_output)

    # Compile the combined model
    combined_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the combined model (the encoder weights are fine-tuned as well)
    combined_model.fit(X_train, y_train, epochs=95, batch_size=50, validation_split=0.2, verbose=0)

    # Evaluate the combined model: round the sigmoid outputs to hard 0/1 labels
    y_pred = combined_model.predict(X_test)
    y_pred = np.round(y_pred).astype(int).flatten()
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    print(classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy * 100:.2f}")



In [None]:
import matplotlib.pyplot as plt

# Plot only when there is exactly one accuracy per latent dimension.
if len(latent_dims) == len(accuracies):
    plt.figure(figsize=(12, 5))
    bars = plt.bar(latent_dims, accuracies, width=30, color='coral')

    plt.title('SAE + MLP Accuracy with Different Latent Dimensions')
    plt.xlabel('Latent Dimension Size')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1)  # Adjust based on accuracy range

    # Write each accuracy (in percent) just above its bar.
    for bar, score in zip(bars, accuracies):
        label_x = bar.get_x() + bar.get_width() / 2
        plt.text(label_x, bar.get_height(), f"{score * 100:.3f}",
                 ha='center', va='bottom', fontsize=12)

    plt.show()
else:
    print("Error: Number of accuracies does not match the number of latent dimensions.")

In [None]:
# Persist the Keras Sequential model bound to `MLPClassifier` by the SAE + MLP
# cell; after that loop the name refers to the head built for the last latent size.
# NOTE(review): this saves the classification head only, not the encoder or the
# scaler — confirm whether `combined_model` was the intended artifact.
MLPClassifier.save("mlp_model.h5")

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, UpSampling1D,
                                     Flatten, Dense, Reshape, Dropout, BatchNormalization)
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

# X is the tabular feature matrix. Each row is zero-padded on the right to a
# fixed length so it can be treated as a 1-D signal; the length must be
# divisible by 4 because the encoder halves it twice and the decoder doubles
# it twice to reconstruct the same shape.
max_sequence_length = 100  # Adjust based on dataset
# dtype must be given explicitly: pad_sequences defaults to int32, which would
# truncate fractional feature values (e.g. Oldpeak). np.asarray makes the call
# iterate over rows even if X is still a DataFrame.
X_padded = pad_sequences(np.asarray(X), maxlen=max_sequence_length,
                         padding='post', dtype='float32')

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Scale every column to [0, 1]; the scaler is fitted on the training split only
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = np.expand_dims(X_train, axis=-1)  # Add channel dimension for Conv1D
X_test = np.expand_dims(X_test, axis=-1)

latent_dims = [100, 200, 300, 400, 500, 600]  # Latent sizes to compare
accuracies = []  # One test accuracy per latent size, in the order of latent_dims

for latent_dim in latent_dims:
    print(f"Training with latent dimension: {latent_dim}")

    # Encoder: two Conv1D + pooling stages (length 100 -> 50 -> 25), then a dense bottleneck
    input_text = Input(shape=(max_sequence_length, 1))
    x = Conv1D(128, kernel_size=3, activation='relu', padding='same')(input_text)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Conv1D(64, kernel_size=3, activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Flatten()(x)
    encoded = Dense(latent_dim, activation='relu')(x)

    # Decoder: mirror of the encoder, upsampling back to the original length
    x = Dense((max_sequence_length // 4) * 64, activation='relu')(encoded)
    x = Reshape((max_sequence_length // 4, 64))(x)
    x = UpSampling1D(size=2)(x)
    x = Conv1D(128, kernel_size=3, activation='relu', padding='same')(x)
    x = UpSampling1D(size=2)(x)
    decoded = Conv1D(1, kernel_size=3, activation='linear', padding='same')(x)

    # Autoencoder Model
    autoencoder = Model(input_text, decoded)
    autoencoder.compile(optimizer='adam', loss='mse')

    # Halve the learning rate when the training loss stalls for 5 epochs
    lr_scheduler = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=5, verbose=1)

    # Train Autoencoder
    autoencoder.fit(X_train, X_train, epochs=100, batch_size=64, verbose=1, callbacks=[lr_scheduler])

    # Encoder Model: shares the trained layers, stops at the bottleneck
    encoder = Model(input_text, encoded)

    # Classifier Model: separate dense network operating on latent vectors
    encoded_input = Input(shape=(latent_dim,))
    x = Dense(256, activation='relu')(encoded_input)
    x = Dropout(0.4)(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x)
    classifier = Model(encoded_input, output)
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train Classifier on the encoded training data
    encoded_train_data = encoder.predict(X_train)
    classifier.fit(encoded_train_data, y_train, epochs=100, batch_size=64, verbose=1, callbacks=[lr_scheduler])

    # Evaluate Classifier on the encoded test data
    encoded_test_data = encoder.predict(X_test)
    loss, accuracy = classifier.evaluate(encoded_test_data, y_test, verbose=1)
    accuracies.append(accuracy)
    print(f"Test Accuracy for latent dimension {latent_dim}: {accuracy * 100:.2f}%")

# Summarize accuracies
for i, acc in enumerate(accuracies):
    print(f"Accuracy for latent dimension {latent_dims[i]}: {acc * 100:.2f}%")

In [None]:
# prompt: provide bar chart code for the above accuracies which contains accuracy on each bar

# Bar chart of test accuracy per latent size, with the percentage written on each bar.
fig, ax = plt.subplots(figsize=(15, 6))
bars = ax.bar(latent_dims, accuracies, width=30, color='lightgreen')
ax.set_title('CNN Accuracy with Different Latent Sizes')
ax.set_xlabel('Latent Sizes')
ax.set_ylabel('Accuracy')
ax.set_ylim(0.7, 1)
for bar, score in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width() / 2
    ax.text(label_x, bar.get_height(), f"{score * 100:.3f}", ha='center', va='bottom')

plt.show()


In [None]:
def predict_new_data(new_data):
    """Predict 0/1 labels for raw feature rows with the fitted pipeline.

    Uses the notebook-level ``scaler``, ``encoder`` and ``classifier``.

    Parameters
    ----------
    new_data : array-like of shape (n_samples, n_features) or (n_features,)
        Feature rows in the same column order as the training features.

    Returns
    -------
    numpy.ndarray of int, shape (n_samples,)
        Predicted class labels, i.e. the classifier's sigmoid output rounded.

    Raises
    ------
    ValueError
        If a row has more features than the scaler was fitted on.
    """
    samples = np.atleast_2d(np.asarray(new_data, dtype='float32'))

    # The scaler may have been fitted on zero-padded rows (CNN pipeline);
    # right-pad the new rows with zeros to the same width before scaling.
    expected_width = getattr(scaler, 'n_features_in_', samples.shape[1])
    if samples.shape[1] > expected_width:
        raise ValueError(
            f"new_data has {samples.shape[1]} features but the scaler "
            f"was fitted on {expected_width}"
        )
    if samples.shape[1] < expected_width:
        missing = expected_width - samples.shape[1]
        samples = np.pad(samples, ((0, 0), (0, missing)))

    # Standardize new data
    new_data_scaled = scaler.transform(samples)

    # A convolutional encoder takes (batch, length, channels); add the channel axis.
    encoder_input_shape = getattr(encoder, 'input_shape', None)
    if encoder_input_shape is not None and len(encoder_input_shape) == 3:
        new_data_scaled = np.expand_dims(new_data_scaled, axis=-1)

    # Encode new data using the encoder
    encoded_new_data = encoder.predict(new_data_scaled)
    # Predict using the classifier and round the probabilities to hard labels
    predictions = classifier.predict(encoded_new_data)
    predictions = np.round(predictions).astype(int).flatten()
    return predictions
# Smoke-test the helper on the first two rows of `df`, with the target column removed.
new_data = df.drop(columns=['HeartDisease']).iloc[:2].values
predictions = predict_new_data(new_data)
print("Predictions for new data:", predictions)

In [None]:
# Layer-by-layer summary of the most recently built classifier model.
classifier.summary()

In [None]:
# prompt: save the cnn model

# NOTE(review): `classifier` is the dense head that takes latent vectors; the
# encoder and scaler are not part of this file — confirm that is intended.
# The target path assumes Google Drive is mounted at /content/drive.
classifier.save('/content/drive/MyDrive/my_model.h5')  # Save the model to a HDF5 file


In [None]:
# prompt: load the saved model

from tensorflow.keras.models import load_model

# Load the model back from the HDF5 file written by the save cell
model = load_model('/content/drive/MyDrive/my_model.h5')

# Print the model summary to confirm the architecture round-tripped
model.summary()


In [None]:
# prompt: download the above model

# Download the saved HDF5 file through the Colab browser session.
from google.colab import files
files.download('/content/drive/MyDrive/my_model.h5')


In [None]:
# prompt: Comparison of our proposal multi task neural networks with the classical MLP and Random Forest
# models
from sklearn.neural_network import MLPClassifier

# Baseline models to compare
models = [
    RandomForestClassifier(),
    MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=1000),

    # Add your proposed multi task neural network model here
]

# Stratified, shuffled 10-fold cross-validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Mean fold accuracy per model, in the order of `models`
scores = [cross_val_score(model, X, y, cv=kfold).mean() for model in models]
model_names = [type(model).__name__ for model in models]

# Print the average scores
for name, score in zip(model_names, scores):
    print(f"Model: {name}, Score: {score:.3f}")

# Bar plot of the model accuracies, labelled with the percentage on each bar
plt.figure(figsize=(10, 6))
bars = plt.bar(model_names, scores, color='skyblue')
plt.title('Model Accuracies')
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # Since accuracy is between 0 and 1
for bar, score in zip(bars, scores):
    label_x = bar.get_x() + bar.get_width() / 2
    plt.text(label_x, bar.get_height(), f"{score * 100:.3f}", ha='center', va='bottom')

plt.show()


In [None]:
# prompt: provide confusion matrix

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Uses whatever y_test and y_pred are currently bound at notebook level.
# NOTE(review): no cell after the SAE + MLP loop reassigns y_pred, while y_test
# is re-split by the CNN cell — confirm which model this matrix is meant to describe.
cm = confusion_matrix(y_test, y_pred)

# Rows are actual classes, columns are predicted classes.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted 0', 'Predicted 1'],
            yticklabels=['Actual 0', 'Actual 1'])
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


In [None]:
# prompt: provide classification report for the entire code

# Per-class precision / recall / F1 for the currently bound y_test and y_pred.
# NOTE(review): same caveat as the confusion matrix — confirm y_pred belongs to
# the model being reported.
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Score the held-out samples: encode them, then take the classifier's
# probability output (not hard labels) so the ROC curve can sweep thresholds.
encoded_test_data = encoder.predict(X_test)
y_pred_prob = classifier.predict(encoded_test_data)

# ROC points and area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)

# ROC curve with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim(0.0, 1.0)
ax.set_ylim(0.0, 1.05)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc="lower right")
plt.show()

In [None]:
# prompt: provide code for ROC curve in a tabular format

from sklearn.metrics import roc_curve, auc
import pandas as pd

# Ground truth and predicted probabilities for the held-out samples
y_true = y_test
y_pred_prob = classifier.predict(encoded_test_data)  # Make sure this is the probability output

# ROC points and area under the curve
fpr, tpr, thresholds = roc_curve(y_true, y_pred_prob)
roc_auc = auc(fpr, tpr)

# One row per threshold
roc_columns = {
    'False Positive Rate': fpr,
    'True Positive Rate': tpr,
    'Thresholds': thresholds,
}
roc_df = pd.DataFrame(roc_columns)

print(roc_df)
print(f"\nAUC: {roc_auc:.2f}")
