In [2]:
# -*- coding: utf-8 -*-
"""
Pima Indians Diabetes Label Noise Analysis
"""

!pip install imbalanced-learn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import warnings
warnings.filterwarnings('ignore')

###############################################################################
# 1) Load and Preprocess the Pima Indians Diabetes Dataset
###############################################################################
# Base seed: passed to the seeded estimators and offset per run for the splits.
RANDOM_STATE = 42

# Fixed colour assigned to each classifier, keyed by its display name.
model_colors = {
    'Logistic Regression': 'blue',
    'Decision Tree': 'green',
    'Random Forest': 'orange',
    'Naive Bayes': 'purple',
    'K-Nearest Neighbors': 'brown',
    'Support Vector Machine': 'red',
}

print("Loading Pima Indians Diabetes dataset...")

# The file is read with header=None, so the column names are supplied here.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
column_names = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

data = pd.read_csv(
    url,
    names=column_names,
    header=None,
)

# Quick look at the raw frame: first rows, then dtypes / non-null counts.
print("\nFirst 5 rows of the dataset:", data.head(), sep="\n")

print("\nDataset Info:")
data.info()

# A 0 in these columns is treated as a missing-value placeholder: convert it
# to NaN so it can be counted and imputed below.
missing_value_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[missing_value_columns] = data[missing_value_columns].replace(0, np.nan)

print("\nMissing Values in Each Column (Before Handling):")
print(data.isnull().sum())

# Impute missing values with the column median.
# The filled column is assigned back explicitly: calling
# data[column].fillna(..., inplace=True) operates on an intermediate object
# and is not guaranteed to update `data` (it is a no-op under pandas
# copy-on-write), which would silently leave the NaNs in place.
for column in missing_value_columns:
    median_val = data[column].median()
    data[column] = data[column].fillna(median_val)

print("\nMissing Values in Each Column (After Imputation):")
print(data.isnull().sum())

# Pull the integer target out of the frame; everything else is a feature.
y = data['Outcome'].astype(int)
X = data.drop(columns='Outcome')

# Scale the feature columns with StandardScaler, then re-wrap the resulting
# array in a DataFrame so the original column names are kept.
# NOTE(review): the scaler is fitted on every row before the train/test split
# that happens later in this script, so test rows contribute to the scaling
# statistics - confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

# Plain NumPy views of features and target for the evaluation loop.
X_np, y_np = X.to_numpy(), y.to_numpy()

print("\nClass Distribution in Original Dataset:", y.value_counts(), sep="\n")

###############################################################################
# 2) Define Noise Levels, Models, and Data Structures for Metric Storage
###############################################################################

# Fraction of training labels to flip: 0%, 5%, 10%, ..., 40%.
# Built from an integer percent grid and divided once, because np.arange with
# a fractional step accumulates rounding error (e.g. 3 * 0.05 evaluates to
# 0.15000000000000002) and its length can depend on that error. These values
# are used as dictionary keys below, so clean values matter.
noise_levels = np.arange(0, 45, 5) / 100

# Classifiers under test, keyed by display name. Estimators that accept a
# random_state are seeded with RANDOM_STATE.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(probability=True, random_state=RANDOM_STATE)
}

# Per-metric storage: metric[model_name][noise_level] -> list with one value
# appended per run.
ACC_values = {m: {nl: [] for nl in noise_levels} for m in models}
TPR_values = {m: {nl: [] for nl in noise_levels} for m in models}
TNR_values = {m: {nl: [] for nl in noise_levels} for m in models}
SENS_values = {m: {nl: [] for nl in noise_levels} for m in models}
SPEC_values = {m: {nl: [] for nl in noise_levels} for m in models}


# confusion_matrix_sums[model_name][noise_level] -> 2x2 integer matrix,
# initialised to zeros.
confusion_matrix_sums = {
    m: {nl: np.zeros((2, 2), dtype=int) for nl in noise_levels} for m in models
}

###############################################################################
# 3) Train/Test Split, Introduce Label Noise, SMOTE, and Collect Metrics
###############################################################################

print("\nStarting model training and evaluation with simulated label noise...\n")

# We'll do multiple runs at each noise level to reduce randomness
NUM_RUNS = 10

for model_name, model in models.items():
    print(f"Evaluating {model_name}...\n")

    for noise_level in noise_levels:
        # Running total of the per-run confusion matrices for this setting.
        conf_matrix_sum = np.zeros((2, 2), dtype=int)

        for run_idx in range(NUM_RUNS):
            # The seed depends only on the run index, so a given run uses the
            # same split for every model and noise level.
            random_seed = RANDOM_STATE + run_idx

            # Split data fresh each run (50/50, stratified on the labels).
            X_train, X_test, y_train, y_test = train_test_split(
                X_np, y_np, test_size=0.5, stratify=y_np, random_state=random_seed
            )

            # Flip the labels (0 <-> 1) of a random subset of the training
            # set; the subset size is the noise fraction, truncated.
            y_train_noisy = y_train.copy()
            num_noisy = int(noise_level * len(y_train_noisy))
            # A local RandomState produces the same draws as
            # np.random.seed(random_seed) followed by np.random.choice, but
            # does not reset the process-wide NumPy RNG as a side effect.
            rng = np.random.RandomState(random_seed)
            noisy_indices = rng.choice(len(y_train_noisy), size=num_noisy, replace=False)
            y_train_noisy[noisy_indices] = 1 - y_train_noisy[noisy_indices]

            # Oversample with SMOTE on the (noisy) training labels only.
            smote = SMOTE(random_state=random_seed)
            X_train_res, y_train_res = smote.fit_resample(X_train, y_train_noisy)

            # Train the model
            model.fit(X_train_res, y_train_res)

            # Predict on the test set (which is always *noise-free*)
            y_pred = model.predict(X_test)

            # Confusion matrix with a fixed label order so that ravel()
            # always unpacks as TN, FP, FN, TP.
            cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
            conf_matrix_sum += cm

            # Calculate metrics
            TN, FP, FN, TP = cm.ravel()
            acc = accuracy_score(y_test, y_pred)

            # TPR (Recall for class=1); 0.0 if the class is absent.
            tpr = TP / (TP + FN) if (TP + FN) > 0 else 0.0
            # TNR (Recall for class=0); 0.0 if the class is absent.
            tnr = TN / (TN + FP) if (TN + FP) > 0 else 0.0

            ACC_values[model_name][noise_level].append(acc)
            TPR_values[model_name][noise_level].append(tpr)
            TNR_values[model_name][noise_level].append(tnr)
            SENS_values[model_name][noise_level].append(tpr)  # Sensitivity = TPR
            SPEC_values[model_name][noise_level].append(tnr)  # Specificity = TNR

        confusion_matrix_sums[model_name][noise_level] = conf_matrix_sum

    print(f" -> Completed label noise evaluations for {model_name}\n")

print("All models evaluated under different noise levels.")

###############################################################################
# 4) Print Out Confusion Matrices for Each Model and Noise Level
###############################################################################
print("\n=== Confusion Matrices (Summed Over Runs) for Each Model & Noise Level ===")
for model_name in models:
    print(f"\nModel: {model_name}")
    for noise_level in noise_levels:
        cm_sum = confusion_matrix_sums[model_name][noise_level]
        # Format as a rounded percentage. Noise levels are floats, and
        # int(noise_level * 100) truncates, so a product that lands just
        # below an integer would be printed one percent too low.
        print(f"  Noise Level: {noise_level:.0%}")
        print(cm_sum)


Loading Pima Indians Diabetes dataset...

First 5 rows of the dataset:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0 