In [None]:
# -*- coding: utf-8 -*-
"""Heart Disease Label Noise Analysis (with Random Forest Included)
"""

!pip install ucimlrepo
!pip install imbalanced-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

###############################################################################
# 1) Load and Preprocess the Heart Disease Data
###############################################################################

# Base seed for the train/test split and the seeded estimators defined below.
RANDOM_STATE = 42

print("Fetching Heart Disease dataset from UCIML...")
heart_disease = fetch_ucirepo(id=45)
X = heart_disease.data.features
y = heart_disease.data.targets

# Combine features and target into a single DataFrame
data = pd.concat([X, y], axis=1)

# Convert every column to numeric; entries that cannot be parsed become NaN
# (errors='coerce') so they are removed by the dropna() below.
data = data.apply(pd.to_numeric, errors='coerce')

# Drop rows with missing values. The explicit copy makes `data` an independent
# frame, so the column assignment further down never targets a derived view.
data = data.dropna().copy()

print("First 5 rows of the dataset after numeric conversion and dropping NAs:")
print(data.head())

print("\nDataset Info:")
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
data.info()

# Binarise the target: any value greater than 0 becomes 1, everything else 0.
data['num'] = (data['num'] > 0).astype(int)

target = 'num'
features = data.columns.drop(target)

# Columns treated as categorical and one-hot encoded. drop_first=True drops
# the first level of each variable; names missing from `data` are skipped.
categorical_vars = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal', 'ca']
data_encoded = pd.get_dummies(
    data,
    columns=[c for c in categorical_vars if c in data.columns],
    drop_first=True,
)

# Final design matrix and integer target used by the rest of the notebook.
X = data_encoded.drop(columns=[target], errors='ignore')
y = data_encoded[target].astype(int)

print("\nClass Distribution in Dataset (0 = no disease, 1 = disease):")
print(y.value_counts())

###############################################################################
# 2) Split, Scale, and Define Models
###############################################################################

# Fraction of rows held out for testing.
# NOTE(review): this section was previously labelled "70% train, 30% test",
# but the value actually passed is 0.5 (the repeated splits in section 3 use
# 0.5 as well). Behaviour is left unchanged; confirm which split is intended.
TEST_SIZE = 0.5

# Split data (50% train, 50% test), stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE
)

# Feature scaling: the scaler is fitted on the training split only and then
# applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Colour assigned to each model name (same keys as `models` below).
model_colors = {
    'Logistic Regression': 'blue',
    'Decision Tree': 'green',
    'Naive Bayes': 'purple',
    'K-Nearest Neighbors': 'brown',
    'Support Vector Machine': 'red',
    'Random Forest': 'orange'
}

# Classifiers under evaluation, keyed by display name. Every estimator that
# accepts a seed receives RANDOM_STATE.
models = {
    'Support Vector Machine': SVC(probability=True, random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
}

###############################################################################
# 3) Simulate Label Noise, Train, and Collect Metrics
###############################################################################

# Fractions of training labels to flip: 0.00, 0.05, ..., 0.40.
# These float values are also used as dictionary keys below, so always index
# the metric dictionaries with elements of this same array.
noise_levels = np.arange(0.00, 0.45, 0.05)

# Number of repeated random splits evaluated per (model, noise level) pair.
N_RUNS = 10

# Data structures to store metrics: metric[model_name][noise_level] is a list
# with one entry per run.
ACC_values = {m: {nl: [] for nl in noise_levels} for m in models}
TPR_values = {m: {nl: [] for nl in noise_levels} for m in models}
TNR_values = {m: {nl: [] for nl in noise_levels} for m in models}

print("\nStarting model training and evaluation with simulated label noise...\n")

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    for noise_level in noise_levels:
        conf_matrix_sum = np.zeros((2, 2), dtype=int)

        for iteration in range(N_RUNS):
            # One seed per run drives the split, the label flips and SMOTE,
            # so every model sees the same data for a given run index.
            iter_seed = RANDOM_STATE + iteration

            X_train_, X_test_, y_train_, y_test_ = train_test_split(
                X, y, test_size=0.5, stratify=y, random_state=iter_seed
            )

            # Scaler is fitted on this run's training half only.
            scaler_iter = StandardScaler()
            X_train_scaled_ = scaler_iter.fit_transform(X_train_)
            X_test_scaled_ = scaler_iter.transform(X_test_)

            # Introduce label noise in the *training* set only: flip a
            # floor(noise_level * n) sized random subset of binary labels.
            y_train_noisy = y_train_.copy()
            num_noisy = int(noise_level * len(y_train_noisy))
            # A local legacy RandomState draws the same sequence as seeding
            # the global NumPy RNG with iter_seed, but leaves global state
            # untouched for the rest of the notebook.
            rng = np.random.RandomState(iter_seed)
            noisy_indices = rng.choice(len(y_train_noisy), size=num_noisy, replace=False)
            y_train_noisy.iloc[noisy_indices] = 1 - y_train_noisy.iloc[noisy_indices]

            # SMOTE is applied after the flips, i.e. it resamples the noisy
            # training labels; the test half is never resampled.
            smote = SMOTE(random_state=iter_seed)
            X_train_res, y_train_res = smote.fit_resample(X_train_scaled_, y_train_noisy)

            model.fit(X_train_res, y_train_res)
            y_pred = model.predict(X_test_scaled_)

            # labels=[0, 1] pins the matrix to 2x2 even if a class is never
            # predicted, so ravel() always yields TN, FP, FN, TP.
            cm = confusion_matrix(y_test_, y_pred, labels=[0, 1])
            conf_matrix_sum += cm

            TN, FP, FN, TP = cm.ravel()
            acc = accuracy_score(y_test_, y_pred)
            # Guard against an empty class in the test half.
            tpr = TP / (TP + FN) if (TP + FN) > 0 else 0.0
            tnr = TN / (TN + FP) if (TN + FP) > 0 else 0.0

            ACC_values[model_name][noise_level].append(acc)
            TPR_values[model_name][noise_level].append(tpr)
            TNR_values[model_name][noise_level].append(tnr)

        # Print the aggregated confusion matrix for this noise level.
        # Round (not truncate) the percentage: arange values such as
        # 0.15000000000000002 are not exact decimal fractions.
        noise_pct = int(round(float(noise_level) * 100))
        print(f"  Noise Level: {noise_pct}%")
        print("  Confusion Matrix (sum over runs):\n", conf_matrix_sum)

    print(f"  -> Completed label noise evaluations for {model_name}\n")

print("All models evaluated under different noise levels.")

Fetching Heart Disease dataset from UCIML...
First 5 rows of the dataset after numeric conversion and dropping NAs:
   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   1       145   233    1        2      150      0      2.3      3   
1   67    1   4       160   286    0        2      108      1      1.5      2   
2   67    1   4       120   229    0        2      129      1      2.6      2   
3   37    1   3       130   250    0        0      187      0      3.5      3   
4   41    0   2       130   204    0        2      172      0      1.4      1   

    ca  thal  num  
0  0.0   6.0    0  
1  3.0   3.0    2  
2  2.0   7.0    1  
3  0.0   3.0    0  
4  0.0   3.0    0  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 297 entries, 0 to 301
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       297 non-null    int64  
 1   sex       297 non-null    int64  
 2   c