In [1]:
import pandas as pd
import numpy as np
import random
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam
import tensorflow as tf
import seaborn as sns
from sklearn.ensemble import IsolationForest
import joblib
os.chdir('Resources/')

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('10_SP_Preprocessed_Data.csv')

X = df.drop(['HeartDisease'], axis='columns')
Y = df[['HeartDisease']]

In [3]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import joblib

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2427)

f1_scores = []
test_indices_all_folds = []  # To store test indices for all folds as a list of lists

# Open a text file to write F1-scores
with open("12_SP_F1_Scores_RF.txt", "w") as f1_file:
    for fold, (train_index, test_index) in enumerate(skf.split(X, Y)):
        print(f"Processing fold {fold + 1}")

        # Split data into train and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

        # Append the test indices for this fold
        test_indices_all_folds.append(list(X.index[test_index]))

        # Train the RandomForest model
        model = RandomForestClassifier(random_state=2427, n_estimators=93)
        model.fit(X_train, Y_train)

        # Make predictions
        Y_pred = model.predict(X_test)

        # Calculate F1 score
        f1 = f1_score(Y_test, Y_pred)
        f1_scores.append(f1)

        # Save the model
        joblib.dump(model, f'12_SP_Model_RF_{fold + 1}.joblib')

        # Write F1 score to the text file
        f1_file.write(f"F1-score for fold {fold + 1}: {f1*100:.2f}\n")

    # Calculate and write the average F1-score to the text file
    avg_f1 = sum(f1_scores) / len(f1_scores)
    std_f1 = pd.Series(f1_scores).std()
    f1_file.write(f"\nAverage F1-score: {avg_f1*100:.2f} ± {std_f1*100:.2f}\n")

# Save the list of test indices to a file
with open("12_SP_Test_Data_RF.txt", "w") as test_file:
    test_file.write(str(test_indices_all_folds))

# Print F1-scores for each fold and their average
print(f"F1-scores for each fold: {f1_scores}")
print(f"Average F1-score: {avg_f1} ± {std_f1}")

Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5
F1-scores for each fold: [0.9425028184892897, 0.9494949494949495, 0.9187358916478555, 0.9519553072625698, 0.9402480270574972]
Average F1-score: 0.9405873987904323 ± 0.013132461429435189


In [4]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import joblib

# Initialize PCA with desired number of components (e.g., 95% variance explained)
pca = PCA(n_components=0.95, random_state=378)

# Initialize Random Forest
rf_model = RandomForestClassifier(random_state=378, n_estimators=99)

# StratifiedKFold initialization
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=378)

f1_scores = []
test_indices_all_folds = []  # To store test indices for all folds as a list of lists

# Open a text file to write the F1 scores
with open("12_SP_F1_Scores_RF_PCA.txt", "w") as f1_file:
    # Loop over each fold for StratifiedKFold
    for fold, (train_index, test_index) in enumerate(skf.split(X, Y)):
        print(f"Processing fold {fold + 1}")

        # Split data into training and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

        # Append test indices for this fold
        test_indices_all_folds.append(list(X.index[test_index]))

        # Apply PCA on the training and test data
        X_train_pca = pca.fit_transform(X_train)
        X_test_pca = pca.transform(X_test)

        # Train the RandomForest model
        rf_model.fit(X_train_pca, Y_train)

        # Predict on the test set
        Y_pred = rf_model.predict(X_test_pca)

        # Compute F1 score
        f1 = f1_score(Y_test, Y_pred)
        f1_scores.append(f1)

        # Save the trained model for the current fold
        joblib.dump(rf_model, f'12_SP_Model_RF_PCA_{fold + 1}.joblib')

        # Write F1 score for the current fold to the text file
        f1_file.write(f"F1-score for fold {fold + 1}: {f1*100:.2f}\n")

    # Calculate and write the average F1-score to the text file
    avg_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)
    f1_file.write(f"\nAverage F1-score: {avg_f1*100:.2f} ± {std_f1*100:.2f}\n")

# Save the list of test indices to a file
with open("12_SP_Test_Data_RF_PCA.txt", "w") as test_file:
    test_file.write(str(test_indices_all_folds))

# Print F1-scores for each fold and their average
print(f"F1-scores for each fold: {f1_scores}")
print(f"Average F1-score: {avg_f1} ± {std_f1}")

Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5
F1-scores for each fold: [0.896, 0.903370786516854, 0.9145496535796767, 0.8871508379888268, 0.8945054945054945]
Average F1-score: 0.8991153545181705 ± 0.00927827772710779


In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import joblib

# Initialize Isolation Forest
iso_forest = IsolationForest(contamination=0.05, random_state=1649, max_samples=256)  # Adjust contamination to expected outlier fraction

# Fit Isolation Forest on the features
outlier_predictions = iso_forest.fit_predict(X)

# Identify and remove outliers (label -1 indicates outliers)
X_filtered = X[outlier_predictions != -1].copy()
Y_filtered = Y[outlier_predictions != -1].copy()

# Retain original indices
X_filtered['Original_Index'] = X.index[outlier_predictions != -1]

print(f"Original dataset size: {X.shape[0]}")
print(f"Filtered dataset size: {X_filtered.shape[0]} (outliers removed)")

# Initialize Random Forest and Stratified K-Fold
rf_model = RandomForestClassifier(random_state=1649, n_estimators=339)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1649)

f1_scores = []
test_indices_all_folds = []  # To store test indices for all folds

# Open a text file to write the results
with open("12_SP_F1_Scores_RF_IF.txt", "w") as f1_file:
    for fold, (train_index, test_index) in enumerate(skf.split(X_filtered.iloc[:, :-1], Y_filtered)):
        print(f"Processing fold {fold + 1}")

        # Split data into train and test sets
        X_train, X_test = X_filtered.iloc[train_index, :-1], X_filtered.iloc[test_index, :-1]
        Y_train, Y_test = Y_filtered.iloc[train_index], Y_filtered.iloc[test_index]

        # Append original test indices for this fold
        test_indices_all_folds.append(list(X_filtered['Original_Index'].iloc[test_index]))

        # Train the RandomForest model
        rf_model.fit(X_train, Y_train)

        # Predict on the test set
        Y_pred = rf_model.predict(X_test)

        # Compute F1 score
        f1 = f1_score(Y_test, Y_pred)
        f1_scores.append(f1)

        # Save the trained model for the current fold
        joblib.dump(rf_model, f'12_SP_Model_RF_IF_{fold + 1}.joblib')

        # Write F1 score for the current fold to the text file
        f1_file.write(f"F1-score for fold {fold + 1}: {f1*100:.2f}\n")

    # Calculate and write the average F1-score to the text file
    avg_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)
    f1_file.write(f"\nAverage F1-score: {avg_f1*100:.2f} ± {std_f1*100:.2f}\n")

# Save the original test indices of all folds to a file
with open("12_SP_Test_Data_RF_IF.txt", "w") as test_file:
    test_file.write(str(test_indices_all_folds))

# Print F1-scores for each fold and their average
print(f"F1-scores for each fold: {f1_scores}")
print(f"Average F1-score: {avg_f1} ± {std_f1}")

Original dataset size: 4044
Filtered dataset size: 3842 (outliers removed)
Processing fold 1
Processing fold 2
Processing fold 3
Processing fold 4
Processing fold 5
F1-scores for each fold: [0.964200477326969, 0.9290012033694344, 0.9359605911330049, 0.9459783913565426, 0.9360189573459715]
Average F1-score: 0.9422319241063845 ± 0.012244475422902513


In [6]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import os
import joblib

# Set random seeds for reproducibility
seed = 417
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)

# Initialize StratifiedKFold for 10-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# List to hold F1-scores for each fold
f1_scores = []

# List to store test indices for all folds
test_indices_all_folds = []

# Loop over each fold
for fold, (train_index, test_index) in enumerate(skf.split(X, Y)):
    print(f"Processing fold {fold + 1}")

    # Split data into training and test sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    Y_train, Y_test = Y.iloc[train_index], Y.iloc[test_index]

    # Append test indices for this fold
    test_indices_all_folds.append(list(X.index[test_index]))

    # Reshape data for LSTM (3D input)
    X_train_reshaped = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_test_reshaped = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

    # Build the LSTM model
    model = Sequential()
    model.add(LSTM(11, input_shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])))
    model.add(Dropout(0.01 * 5))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification

    optimizer = Adam(learning_rate=1 / 15)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Train the LSTM model
    model.fit(X_train_reshaped, Y_train, epochs=42, batch_size=40, verbose=0)

    # Use the trained LSTM to extract features (predictions)
    X_train_features = model.predict(X_train_reshaped)
    X_test_features = model.predict(X_test_reshaped)

    # Train Random Forest on LSTM features
    rf_lstm = RandomForestClassifier(n_estimators=76, random_state=seed)
    rf_lstm.fit(X_train_features, Y_train.values.ravel())

    # Predict on the test set and compute F1 score
    Y_pred = rf_lstm.predict(X_test_features)
    performance_lstm = f1_score(Y_test, Y_pred)

    f1_scores.append(performance_lstm)

    # Save both LSTM and RF model in a single joblib file per fold
    joblib.dump({'LSTM': model, 'RF': rf_lstm}, f'12_SP_Model_RF_LSTM_{fold + 1}.joblib')

# Save F1-scores to a text file (10 F1 scores for each fold)
with open('12_SP_F1_Scores_RF_LSTM.txt', 'w') as f:
    for i, score in enumerate(f1_scores):
        f.write(f"F1-Score for fold {i + 1}: {score * 100: .2f}\n")

    # Calculate and write the average F1 score at the end of the file
    avg_f1 = np.mean(f1_scores)
    std_f1 = np.std(f1_scores)
    f.write(f"\nAverage F1-score: {avg_f1*100: .2f} ± {std_f1*100: .2f}\n")

# Save test indices of all folds to a file
with open("12_SP_Test_Data_RF_LSTM.txt", "w") as test_file:
    test_file.write(str(test_indices_all_folds))

# Print F1-scores and average
print(f"F1-scores for each fold: {f1_scores}")
print(f"Average F1-score: {avg_f1} ± {std_f1}")

Processing fold 1
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Processing fold 2
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Processing fold 3
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Processing fold 4
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Processing fold 5
[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
F1-scores for each fold: [0.8711111111111111, 0.8619909502262444, 0.8529076396807298, 0.8843995510662177, 0.8834080717488789]
Average F1-score: 0.8707634647666364 ± 0.01