In [46]:
import warnings
warnings.filterwarnings("ignore")

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import fetch_openml
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

In [64]:
# Load the MNIST dataset
mnist = fetch_openml('mnist_784')

# Extract the feature matrix X and the target vector y
X = mnist.data[:3000]
y = mnist.target[:3000]

In [65]:
# Select the first 10 features
X = X.iloc[:, :30]

In [66]:
print("X shape : ",X.shape)
print("X shape : ",y.shape)

X shape :  (3000, 30)
X shape :  (3000,)


In [70]:
# Read the data from the CSV file
data = pd.read_csv('heart_data_updated.csv')
column_names = data.columns.tolist()
print(column_names)

['age', 'gender', 'blood_pressure', 'cholesterol', 'max_heart_rate', 'exercise_angina', 'st_depression', 'heart_disease', 'blood_pressure_above_120', 'chest_pain_asymptomatic', 'chest_pain_atypical', 'chest_pain_non-anginal', 'chest_pain_typical', 'rest_ecg_LVH', 'rest_ecg_ST-T', 'rest_ecg_normal']


In [71]:
X = data.drop('rest_ecg_normal', axis=1)
y = data['rest_ecg_normal']


In [72]:
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [73]:
X_train,X_test,y_train,y_test = train_test_split(X_scaled,y,test_size=.33,shuffle=True,random_state=22)

## **SVM**

In [74]:
# Define the parameter grid to search
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2,4],
    'C': [0.1, 1, 5],
    'gamma': ['scale', 'auto']
}

# Create a grid search object with cross-validation
grid_search = GridSearchCV(SVC(), param_grid, cv=5)

# Fit the grid search object to the data
svm = grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
# Print the best hyperparameters and corresponding accuracy
print("Best Hyperparameters: ", best_params)
print("Best Accuracy: ", grid_search.best_score_)

Best Hyperparameters:  {'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
Best Accuracy:  1.0


## ***Best SVM Model***

In [75]:
svm = SVC(C=best_params["C"], degree=best_params["degree"], gamma='scale', kernel=best_params['kernel'], probability=True)
# Train the SVM model on the training data
svm.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = svm.predict(X_test)

# Calculate the accuracy of the SVM model
accuracy = accuracy_score(y_test, y_pred)
print('SVM Accuracy:', accuracy)

SVM Accuracy: 1.0


## ***DecisionTreeClassifier***

In [76]:
# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 4, 5],
    'random_state': [22]
}

# Create the decision tree classifier
clf = DecisionTreeClassifier()

# Perform grid search
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Train the decision tree with the best parameters
DecisionTree = DecisionTreeClassifier(**best_params)
DecisionTree.fit(X_train, y_train)

In [77]:
predict = DecisionTree.predict(X_test)
accuracy = accuracy_score(y_test, predict)
print("Accuracy:", accuracy)
precision = precision_score(y_test, predict, average='weighted')
recall = recall_score(y_test, predict, average='weighted')
f1 = f1_score(y_test, predict, average='weighted')

Accuracy: 1.0


## **RandomForestClassifier**

In [78]:
# Define the parameter grid
param_grid = {
    'n_estimators': [_ for _ in range(100,500,150)],
    'max_depth': [3,5,10],
    'random_state': [22,15,33]
}

# Create the random forest classifier
clf = RandomForestClassifier()

# Perform grid search
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Train the random forest with the best parameters
RandomForest = RandomForestClassifier(**best_params)
RandomForest.fit(X_train, y_train)


In [81]:
# Make predictions on the test set using the trained classifier
y_pred = RandomForest.predict(X_test)

# Compute the confusion matrix
cm = confusion_matrix(y_test, y_pred)

# Compute precision, recall, f1-score, and support
classification_rep = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print("Accuracy:", accuracy)

Accuracy: 1.0


# ***LogisticRegression***

In [82]:
# Define the parameter grid
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100]
}

# Create the Logistic Regression model
logreg = LogisticRegression()

# Perform grid search
grid_search = GridSearchCV(logreg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Train the Logistic Regression model with the best parameters
logreg_best = LogisticRegression(**best_params)
logreg_best.fit(X_train, y_train)
# Make predictions on the testing set
y_pred = logreg_best.predict(X_test)
# Calculate the accuracy of the Logistic Regression model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Accuracy: 1.0
Best Parameters: {'C': 0.1, 'penalty': 'l2'}
Best Score: 1.0


## ***AdaBoostClassifier***

***find best_base_estimator***

In [83]:
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'learning_rate': [0.01, 0.1, 0.5, 1, 2,4]
}

# Create a grid search object with cross-validation
grid_search = GridSearchCV(AdaBoostClassifier(), param_grid, cv=5)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and corresponding accuracy
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Accuracy: ", grid_search.best_score_)

Best Hyperparameters:  {'learning_rate': 0.01, 'n_estimators': 50}
Best Accuracy:  1.0


In [84]:
results = []

# Loop over all hyperparameters combinations
for params in grid_search.cv_results_['params']:
    n_estimators = params['n_estimators']
    learning_rate = params['learning_rate']
    
    # Train an AdaBoost classifier with the current hyperparameters
    ada = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, random_state=42)
    ada.fit(X_train, y_train)
    
    # Predict the labels of the training data using the ensemble of weak learners
    y_pred_train = ada.predict(X_train)
    
    # Calculate the exponential loss of the training data
    exp_loss_train = log_loss(y_train, ada.decision_function(X_train), labels=ada.classes_)
    
    # Predict the labels of the testing data using the ensemble of weak learners
    y_pred_test = ada.predict(X_test)
    
    # Calculate the exponential loss of the testing data
    exp_loss_test = log_loss(y_test, ada.decision_function(X_test), labels=ada.classes_)
    
    results.append({
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        "exp_loss_train": exp_loss_train,
        "exp_loss_test": exp_loss_test
    })


In [85]:
sorted_results = sorted(results, key=lambda x: x['exp_loss_test'])[0]
sorted_results

{'n_estimators': 50,
 'learning_rate': 0.01,
 'exp_loss_train': 2.2204460492503136e-16,
 'exp_loss_test': 2.2204460492503136e-16}

In [86]:
# Define a list of base estimators to try
base_estimators = [
    logreg_best,
    svm,
    DecisionTree,
    RandomForest
]

best_accuracy = 0.0
best_base_estimator = None

# Iterate over the base estimators
for base_estimator in base_estimators:
    # Create the AdaBoost classifier with the current base estimator
    ada = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=sorted_results["n_estimators"],learning_rate =sorted_results["learning_rate"], algorithm='SAMME', random_state=1)
    
    # Perform cross-validation to evaluate the performance
    scores = cross_val_score(ada, X_train, y_train, cv=5)
    avg_accuracy = scores.mean()
    
    # Check if the current base estimator gives a better accuracy
    if avg_accuracy > best_accuracy:
        best_accuracy = avg_accuracy
        best_base_estimator = base_estimator

# Train the AdaBoost classifier with the best base estimator on the training set
ada_best = AdaBoostClassifier(base_estimator=best_base_estimator, n_estimators=100, algorithm='SAMME', random_state=1)
ada_best.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = ada_best.predict(X_test)

# Calculate the accuracy of the AdaBoost classifier with the best base estimator
accuracy = accuracy_score(y_test, y_pred)
print('Best Base Estimator:', best_base_estimator)
print('Accuracy:', accuracy)


Best Base Estimator: DecisionTreeClassifier(max_depth=3, random_state=22)
Accuracy: 1.0


In [87]:
ada = AdaBoostClassifier(base_estimator=best_base_estimator, n_estimators=sorted_results["n_estimators"], learning_rate=sorted_results["learning_rate"], random_state=1)

# Train the AdaBoost classifier on the training set
ada.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = ada.predict(X_test) 

# Calculate the accuracy of the AdaBoost classifier
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Accuracy: 1.0


In [88]:
from sklearn.ensemble import VotingClassifier

# Define the individual models
logreg = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'])
ada = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=sorted_results["n_estimators"], learning_rate=sorted_results["learning_rate"], random_state=1)
rf = RandomForestClassifier(max_depth=10, random_state=15)
dt = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=22)

# Create the Voting Classifier with the individual models
voting_classifier = VotingClassifier(
    estimators=[('logreg', logreg), ('ada', ada), ('rf', rf), ('dt', dt)],
    voting='hard'  # Use majority voting
)

# Train the Voting Classifier
voting_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = voting_classifier.predict(X_test)

# Calculate the accuracy of the Voting Classifier
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)


Accuracy: 1.0
