<a href="https://colab.research.google.com/github/KSharif/Deep_learning/blob/main/Heart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
# Step 1: Import necessary libraries
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from imblearn.over_sampling import SMOTE

# fetch dataset
heart_disease = fetch_ucirepo(id=45)

# data (as pandas dataframes)
X = heart_disease.data.features
y = heart_disease.data.targets

# metadata
print(heart_disease.metadata)

# variable information
print(heart_disease.variables)

{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. Sa

In [None]:
# Step 2: Data Exploration and Cleaning
# Combine X and y for exploration
df = pd.concat([X, y], axis=1)

In [None]:
# Display basic information about the dataset
print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        299 non-null    float64
 12  thal      301 non-null    float64
 13  num       303 non-null    int64  
dtypes: float64(3), int64(11)
memory usage: 33.3 KB
None
              age         sex          cp    trestbps        chol         fbs  \
count  303.000000  303.000000  303.000000  303.000000  303.000000  303.000000   
mean    54.438944    0.679868    3.158416  131.68976

In [None]:
# Check for missing values
print(df.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
num         0
dtype: int64


In [None]:
# Handle missing values (if any)
# Example: Filling missing values with mean (you can change as per dataset requirements)
df.fillna(df.mean(), inplace=True)

In [None]:
# Check for missing values
print(df.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


In [None]:
# Separate features and target after cleaning
X_clean = df.drop(columns=['num'])  # Corrected to use 'num' as the target column
y_clean = df['num']

In [None]:
# Step 3: Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)

In [None]:
# Split the resampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [None]:
# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_clean)

In [None]:
# Random Forest Tuning
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
rf = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_tuned = RandomizedSearchCV(rf, rf_params, n_iter=20, cv=3, scoring='accuracy', random_state=42)
rf_tuned.fit(X_train, y_train)

In [None]:
# SVM Tuning
svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}

svm = SVC(class_weight='balanced', probability=True, random_state=42)
svm_tuned = RandomizedSearchCV(svm, svm_params, n_iter=20, cv=3, scoring='accuracy', random_state=42)
svm_tuned.fit(X_train, y_train)

In [None]:
# XGBoost Tuning
xgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Removing the use_label_encoder parameter
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_tuned = RandomizedSearchCV(xgb, xgb_params, n_iter=20, cv=3, scoring='accuracy', random_state=42)
xgb_tuned.fit(X_train, y_train)

In [None]:
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
lr.fit(X_train, y_train)

In [None]:
# Random Forest Evaluation
rf_best = rf_tuned.best_estimator_
y_pred_rf = rf_best.predict(X_test)
print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# SVM Evaluation
svm_best = svm_tuned.best_estimator_
y_pred_svm = svm_best.predict(X_test)
print("SVM - Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# XGBoost Evaluation
xgb_best = xgb_tuned.best_estimator_
y_pred_xgb = xgb_best.predict(X_test)
print("XGBoost - Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))

y_pred_lr = lr.predict(X_test)
print("Logistic Regression - Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Random Forest - Accuracy: 0.8841463414634146
              precision    recall  f1-score   support

           0       0.96      0.82      0.88        28
           1       0.85      0.83      0.84        35
           2       0.86      0.89      0.87        35
           3       0.83      0.91      0.87        33
           4       0.94      0.97      0.96        33

    accuracy                           0.88       164
   macro avg       0.89      0.88      0.88       164
weighted avg       0.89      0.88      0.88       164

SVM - Accuracy: 0.8658536585365854
              precision    recall  f1-score   support

           0       0.60      1.00      0.75        28
           1       1.00      0.80      0.89        35
           2       1.00      0.83      0.91        35
           3       0.90      0.82      0.86        33
           4       1.00      0.91      0.95        33

    accuracy                           0.87       164
   macro avg       0.90      0.87      0.87       1

In [None]:
# Step 1: Import necessary libraries
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

# Step 2: Fetch and prepare the dataset
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features
y = heart_disease.data.targets

# Combine X and y for exploration
df = pd.concat([X, y], axis=1)

# Handle missing values
df.fillna(df.mean(), inplace=True)

# Separate features and target
X_clean = df.drop(columns=['num'])  # Assuming 'num' is the target column
y_clean = df['num']

In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

# Step 3: Feature Engineering
# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)

# Polynomial Features for interactions
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Feature Selection using Random Forest
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
X_selected = selector.fit_transform(X_poly, y_clean)

# Step 4: Handle Imbalance using ADASYN
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_selected, y_clean)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 5: Define and Tune Models

# Random Forest Tuning
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_tuned_model = RandomizedSearchCV(rf_model, rf_params, n_iter=20, cv=3, scoring='accuracy', random_state=42)
rf_tuned_model.fit(X_train, y_train)

# Evaluate Random Forest
rf_best_model = rf_tuned_model.best_estimator_
y_pred_rf = rf_best_model.predict(X_test)
print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

# SVM Tuning
svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}
svm_model = SVC(class_weight='balanced', probability=True, random_state=42)
svm_tuned_model = RandomizedSearchCV(svm_model, svm_params, n_iter=20, cv=3, scoring='accuracy', random_state=42)
svm_tuned_model.fit(X_train, y_train)

# Evaluate SVM
svm_best_model = svm_tuned_model.best_estimator_
y_pred_svm = svm_best_model.predict(X_test)
print("SVM - Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm))

# XGBoost Tuning
xgb_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_tuned_model = RandomizedSearchCV(xgb_model, xgb_params, n_iter=20, cv=3, scoring='accuracy', random_state=42)
xgb_tuned_model.fit(X_train, y_train)

# Evaluate XGBoost
xgb_best_model = xgb_tuned_model.best_estimator_
y_pred_xgb = xgb_best_model.predict(X_test)
print("XGBoost")
print(classification_report(y_test, y_pred_xgb))

# Logistic Regression
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000)
lr_model.fit(X_train, y_train)

# Evaluate Logistic Regression
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression")
print(classification_report(y_test, y_pred_lr))

Random Forest - Accuracy: 0.8827160493827161
              precision    recall  f1-score   support

           0       0.86      0.86      0.86        29
           1       0.87      0.79      0.82        42
           2       0.85      0.93      0.89        30
           3       0.86      0.86      0.86        29
           4       0.97      1.00      0.98        32

    accuracy                           0.88       162
   macro avg       0.88      0.89      0.88       162
weighted avg       0.88      0.88      0.88       162

SVM - Accuracy: 0.9382716049382716
              precision    recall  f1-score   support

           0       1.00      0.72      0.84        29
           1       0.85      0.95      0.90        42
           2       0.97      1.00      0.98        30
           3       0.94      1.00      0.97        29
           4       1.00      1.00      1.00        32

    accuracy                           0.94       162
   macro avg       0.95      0.94      0.94       1

In [None]:
# Step 1: Import necessary libraries
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, accuracy_score
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel

# Step 2: Fetch and prepare the dataset
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features
y = heart_disease.data.targets

# Combine X and y for exploration
df = pd.concat([X, y], axis=1)

# Handle missing values
df.fillna(df.mean(), inplace=True)

# Separate features and target
X_clean = df.drop(columns=['num'])  # Assuming 'num' is the target column
y_clean = df['num']

# Step 3: Feature Engineering
# Scaling the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_clean)

# Polynomial Features for interactions
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X_scaled)

# Feature Selection using Random Forest
selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
X_selected = selector.fit_transform(X_poly, y_clean)

# Step 4: Handle Imbalance using SMOTEENN
smoteenn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smoteenn.fit_resample(X_selected, y_clean)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Step 5: Define and Tune Models

# Random Forest Tuning
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)
rf_tuned_model = RandomizedSearchCV(rf_model, rf_params, n_iter=30, cv=5, scoring='accuracy', random_state=42)
rf_tuned_model.fit(X_train, y_train)

# Evaluate Random Forest
rf_best_model = rf_tuned_model.best_estimator_
y_pred_rf = rf_best_model.predict(X_test)
print("Random Forest")
print(classification_report(y_test, y_pred_rf))

# SVM Tuning
svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf']
}
svm_model = SVC(class_weight='balanced', probability=True, random_state=42)
svm_tuned_model = RandomizedSearchCV(svm_model, svm_params, n_iter=30, cv=5, scoring='accuracy', random_state=42)
svm_tuned_model.fit(X_train, y_train)

# Evaluate SVM
svm_best_model = svm_tuned_model.best_estimator_
y_pred_svm = svm_best_model.predict(X_test)
print("SVM")
print(classification_report(y_test, y_pred_svm))

# XGBoost Tuning
xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_tuned_model = RandomizedSearchCV(xgb_model, xgb_params, n_iter=30, cv=5, scoring='accuracy', random_state=42)
xgb_tuned_model.fit(X_train, y_train)

# Evaluate XGBoost
xgb_best_model = xgb_tuned_model.best_estimator_
y_pred_xgb = xgb_best_model.predict(X_test)
print("XGBoost")
print(classification_report(y_test, y_pred_xgb))

# Logistic Regression
lr_model = LogisticRegression(class_weight='balanced', max_iter=1000)
lr_model.fit(X_train, y_train)

# Evaluate Logistic Regression
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression")
print(classification_report(y_test, y_pred_lr))

Random Forest
              precision    recall  f1-score   support

           0       0.94      0.89      0.91        18
           1       0.92      0.89      0.91        27
           2       0.96      1.00      0.98        26
           3       0.96      1.00      0.98        27
           4       1.00      1.00      1.00        32

    accuracy                           0.96       130
   macro avg       0.96      0.96      0.96       130
weighted avg       0.96      0.96      0.96       130

SVM
              precision    recall  f1-score   support

           0       0.89      0.94      0.92        18
           1       0.96      0.93      0.94        27
           2       1.00      1.00      1.00        26
           3       1.00      1.00      1.00        27
           4       1.00      1.00      1.00        32

    accuracy                           0.98       130
   macro avg       0.97      0.97      0.97       130
weighted avg       0.98      0.98      0.98       130

XGBo