In [1]:
from google.colab import files
uploaded = files.upload()

Saving pima.csv to pima.csv


In [2]:
from google.colab import files
uploaded1 = files.upload()

Saving diabetes2019.csv to diabetes2019.csv


In [3]:
!pip install pandas scikit-learn imbalanced-learn




In [4]:
pip install pandas numpy scikit-learn imbalanced-learn xgboost



In [5]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


Paper's models implementation

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load datasets (replace with actual file paths)
pima_data = pd.read_csv('pima.csv')  # Example dataset path
diabetes_data = pd.read_csv('diabetes2019.csv')  # Example dataset path

# Data Preprocessing Function
def preprocess_data(data):
    # Remove null values and create a copy to avoid warnings
    data = data.dropna().copy()

    # Encode categorical variables if any (example: 'gender' column)
    label_encoder = LabelEncoder()
    for col in data.select_dtypes(include=['object']).columns:
        data.loc[:, col] = label_encoder.fit_transform(data[col])  # Use .loc to avoid SettingWithCopyWarning

    # Check if the target column is "Outcome" or "Diabetic"
    target_column = "Outcome" if "Outcome" in data.columns else "Diabetic"

    # Convert target column to int type before scaling
    data[target_column] = data[target_column].astype(int)

    # Scale features using StandardScaler
    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(data.drop(target_column, axis=1))

    return pd.DataFrame(scaled_features, columns=data.columns.drop(target_column)), data[target_column]

# Preprocess both datasets
X_pima, y_pima = preprocess_data(pima_data)
X_diabetes, y_diabetes = preprocess_data(diabetes_data)

# Balancing the dataset using SMOTE
def balance_data(X, y):
    # Get the minimum number of samples in any class
    min_samples = y.value_counts().min()

    # Ensure k_neighbors is less than the smallest class size
    k_neighbors_value = min(5, min_samples - 1)

    # If min_samples is 1 or less, SMOTE can't be applied
    if min_samples <= 1:
        print("Warning: A class has only one sample. Returning original data without balancing.")
        return X, y

    smote = SMOTE(k_neighbors=k_neighbors_value)
    return smote.fit_resample(X, y)

X_balanced_pima, y_balanced_pima = balance_data(X_pima, y_pima)
X_balanced_diabetes, y_balanced_diabetes = balance_data(X_diabetes, y_diabetes)

# Splitting the datasets into training and testing sets (80% train, 20% test)
X_train_pima, X_test_pima, y_train_pima, y_test_pima = train_test_split(X_balanced_pima, y_balanced_pima, test_size=0.2, random_state=42)
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(X_balanced_diabetes, y_balanced_diabetes, test_size=0.2, random_state=42)

# Model Training and Evaluation Function
def train_and_evaluate_model(model_name, X_train, X_test, y_train, y_test):
    if model_name == 'Logistic Regression':
        model = LogisticRegression(max_iter=1000)  # Increased max_iter to ensure convergence
    elif model_name == 'Decision Tree':
        model = DecisionTreeClassifier()
    elif model_name == 'Random Forest':
        model = RandomForestClassifier()
    elif model_name == 'SVM':
        model = SVC()
    elif model_name == 'KNN':
        model = KNeighborsClassifier()
    elif model_name == 'Naive Bayes':
        model = GaussianNB()

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Evaluate performance
    accuracy = accuracy_score(y_test, predictions)
    confusion_mat = confusion_matrix(y_test, predictions)

    print(f"{model_name} Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(confusion_mat)

    # Use zero_division=1 to avoid UndefinedMetricWarning for precision/recall calculations.
    print("Classification Report:")
    print(classification_report(y_test, predictions, zero_division=1))

# Training and evaluating all models on PIMA dataset
models = ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVM', 'KNN', 'Naive Bayes']
for model in models:
    print(f"Evaluating {model} on PIMA Dataset")
    train_and_evaluate_model(model, X_train_pima, X_test_pima, y_train_pima, y_test_pima)

# Training and evaluating all models on Diabetes dataset 2019
for model in models:
    print(f"Evaluating {model} on Diabetes Dataset 2019")
    train_and_evaluate_model(model, X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes)

Evaluating Logistic Regression on PIMA Dataset
Logistic Regression Accuracy: 0.76
Confusion Matrix:
[[73 26]
 [23 78]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.74      0.75        99
           1       0.75      0.77      0.76       101

    accuracy                           0.76       200
   macro avg       0.76      0.75      0.75       200
weighted avg       0.76      0.76      0.75       200

Evaluating Decision Tree on PIMA Dataset
Decision Tree Accuracy: 0.71
Confusion Matrix:
[[72 27]
 [31 70]]
Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.73      0.71        99
           1       0.72      0.69      0.71       101

    accuracy                           0.71       200
   macro avg       0.71      0.71      0.71       200
weighted avg       0.71      0.71      0.71       200

Evaluating Random Forest on PIMA Dataset
Random Forest Accuracy: 0.79
Conf


Additional models

all models on imbalance dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer

# Preprocessing Function
def preprocess_data(data, target_column):
    """
    Preprocess the dataset:
    - Handle missing values
    - Normalize numeric features
    - Encode categorical features
    """
    # Drop rows with missing target values
    data = data.dropna(subset=[target_column]).reset_index(drop=True)

    # Separate features and target
    features = data.drop(columns=[target_column])
    target = data[target_column]

    # Encode the target if it's categorical
    if target.dtype == 'object':
        target = LabelEncoder().fit_transform(target)

    # Impute missing numeric values
    numeric_features = features.select_dtypes(include=np.number)
    imputer = SimpleImputer(strategy='mean')
    features_scaled = imputer.fit_transform(numeric_features)

    # Normalize numeric features
    scaler = MinMaxScaler()
    features_scaled = scaler.fit_transform(features_scaled)
    features_scaled_df = pd.DataFrame(features_scaled, columns=numeric_features.columns)

    # Handle categorical features
    categorical_features = features.select_dtypes(include=['object'])
    if not categorical_features.empty:
        features_encoded = pd.get_dummies(categorical_features, drop_first=True)
        features = pd.concat([features_scaled_df, features_encoded], axis=1).reset_index(drop=True)
    else:
        features = features_scaled_df

    return features, pd.Series(target)

# Model Evaluation
def evaluate_models(X_train, X_test, y_train, y_test):
    """
    Train and evaluate multiple machine learning models.
    """
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Linear Regression": LinearRegression(),  # For comparison (requires thresholding)
        "SVM": SVC(),
        "Random Forest": RandomForestClassifier(),
        "Decision Tree": DecisionTreeClassifier(),
        "KNN": KNeighborsClassifier(),
        "Naive Bayes": GaussianNB(),
        "Gradient Boosting": GradientBoostingClassifier(),
        "XGBoost": xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False),
        "LightGBM": LGBMClassifier(),
        "CatBoost": CatBoostClassifier(verbose=0)
    }

    results = {}

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Convert continuous predictions to binary for Linear Regression
        if model_name == "Linear Regression":
            y_pred = (y_pred > 0.5).astype(int)

        # Determine if the problem is binary or multiclass
        average_type = 'binary' if len(np.unique(y_train)) == 2 else 'macro'

        # Evaluation Metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, zero_division=0, average=average_type)
        recall = recall_score(y_test, y_pred, zero_division=0, average=average_type)
        f1 = f1_score(y_test, y_pred, zero_division=0, average=average_type)
        cm = confusion_matrix(y_test, y_pred)
        false_negatives = cm[1][0] if len(cm) > 1 else 0  # FN = (row 1, col 0)

        results[model_name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-score": f1,
            "False Negatives": false_negatives,
            "Confusion Matrix": cm
        }

    return results

# Main Script
if __name__ == "__main__":
    # Load Dataset
    # Replace these with actual dataset paths
    pima_data = pd.read_csv("pima.csv")
    diabetes_data = pd.read_csv("diabetes2019.csv")

    # Preprocess Data
    pima_features, pima_target = preprocess_data(pima_data, "Outcome")
    diabetes_features, diabetes_target = preprocess_data(diabetes_data, "Diabetic")

    # Align features and target sizes (if there are discrepancies)
    diabetes_features = diabetes_features.iloc[:len(diabetes_target)].reset_index(drop=True)

    # Split Datasets
    pima_X_train, pima_X_test, pima_y_train, pima_y_test = train_test_split(
        pima_features, pima_target, test_size=0.3, random_state=42
    )
    diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test = train_test_split(
        diabetes_features, diabetes_target, test_size=0.3, random_state=42
    )

    # Evaluate Models on PIMA Dataset
    print("PIMA Dataset Results:")
    pima_results = evaluate_models(pima_X_train, pima_X_test, pima_y_train, pima_y_test)
    for model, metrics in pima_results.items():
        print(f"{model}: Accuracy={metrics['Accuracy']:.4f}, Precision={metrics['Precision']:.4f}, "
              f"Recall={metrics['Recall']:.4f}, F1-score={metrics['F1-score']:.4f}, False Negatives={metrics['False Negatives']}")

    # Evaluate Models on Diabetes 2019 Dataset
    print("\nDiabetes 2019 Dataset Results:")
    diabetes_results = evaluate_models(diabetes_X_train, diabetes_X_test, diabetes_y_train, diabetes_y_test)
    for model, metrics in diabetes_results.items():
        print(f"{model}: Accuracy={metrics['Accuracy']:.4f}, Precision={metrics['Precision']:.4f}, "
              f"Recall={metrics['Recall']:.4f}, F1-score={metrics['F1-score']:.4f}, False Negatives={metrics['False Negatives']}")


PIMA Dataset Results:


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 188, number of negative: 349
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 604
[LightGBM] [Info] Number of data points in the train set: 537, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.350093 -> initscore=-0.618630
[LightGBM] [Info] Start training from score -0.618630
Logistic Regression: Accuracy=0.7489, Precision=0.6618, Recall=0.5625, F1-score=0.6081, False Negatives=35
Linear Regression: Accuracy=0.7316, Precision=0.6154, Recall=0.6000, F1-score=0.6076, False Negatives=32
SVM: Accuracy=0.7532, Precision=0.6575, Recall=0.6000, F1-score=0.6275, False Negatives=32
Random Forest: Accuracy=0.7403, Precision=0.6351, Recall=0.5875, F1-score=0.6104, False Negatives=33
Decision Tree: Accuracy=0.6883, Precision=0.5408, Recall=0.6625, F1-score=0.5955, False Negatives=27
KNN: Accu

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000403 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 665, number of used features: 25
[LightGBM] [Info] Start training from score -6.499787
[LightGBM] [Info] Start training from score -0.321843
[LightGBM] [Info] Start training from score -1.295780
Logistic Regression: Accuracy=0.9021, Precision=0.8914, Recall=0.8681, F1-score=0.8785, False Negatives=18
Linear Regression: Accuracy=0.7063, Precision=0.3531, Recall=0.5000, F1-score=0.4139, False Negatives=84
SVM: Accuracy=0.9441, Precision=0.9382, Recall=0.9256, F1-score=0.9316, False Negatives=10
Random Forest: Accuracy=0.9720, Precision=0.9663, Recall=0.9663, F1-score=0.9663, False Negatives=4
Decision Tree: Accuracy=0.9476, Precision=0.6276, Recall=0.6303, F1-score=0.6288, Fals

another code with all the models without using PCA

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load datasets
pima_data = pd.read_csv('pima.csv')  # Example dataset path
diabetes_data = pd.read_csv('diabetes2019.csv')  # Example dataset path

# Data Preprocessing Function
def preprocess_data(data):
    """
    Preprocesses the input dataset:
    - Encodes categorical columns
    - Scales numeric columns
    - Separates target variable from features
    """
    data = data.dropna().copy()
    label_encoder = LabelEncoder()

    for col in data.select_dtypes(include=['object']).columns:
        data[col] = label_encoder.fit_transform(data[col])

    target_column = "Outcome" if "Outcome" in data.columns else "Diabetic"
    data[target_column] = data[target_column].astype(int)

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(data.drop(target_column, axis=1))

    return pd.DataFrame(scaled_features, columns=data.columns.drop(target_column)), data[target_column]

# Preprocess both datasets
X_pima, y_pima = preprocess_data(pima_data)
X_diabetes, y_diabetes = preprocess_data(diabetes_data)

# Balance the dataset using SMOTE
def balance_data(X, y):
    """
    Balances the dataset using SMOTE.
    """
    smote = SMOTE(random_state=42)
    try:
        X_resampled, y_resampled = smote.fit_resample(X, y)
        return X_resampled, y_resampled
    except ValueError:
        print("Warning: SMOTE could not be applied due to insufficient samples.")
        return X, y

X_balanced_pima, y_balanced_pima = balance_data(X_pima, y_pima)
X_balanced_diabetes, y_balanced_diabetes = balance_data(X_diabetes, y_diabetes)

# Splitting the datasets into training and testing sets
X_train_pima, X_test_pima, y_train_pima, y_test_pima = train_test_split(X_balanced_pima, y_balanced_pima, test_size=0.2, random_state=42)
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(X_balanced_diabetes, y_balanced_diabetes, test_size=0.2, random_state=42)

# Model Training and Evaluation Function
def train_and_evaluate_model(model_name, model_class, X_train, X_test, y_train, y_test):
    """
    Trains and evaluates a model on the given dataset.
    Outputs accuracy, confusion matrix, and classification report.
    """
    model = model_class()
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    confusion_mat = confusion_matrix(y_test, predictions)
    false_positives = confusion_mat[0][1]

    print(f"{model_name} Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(confusion_mat)
    print(f"False Positives: {false_positives}")
    print("Classification Report:")
    print(classification_report(y_test, predictions, zero_division=0))
    print()

    return accuracy, false_positives

# List of models to evaluate
models = [
    ('Logistic Regression', LogisticRegression),
    ('Decision Tree', DecisionTreeClassifier),
    ('Random Forest', RandomForestClassifier),
    ('SVM', SVC),
    ('KNN', KNeighborsClassifier),
    ('Naive Bayes', GaussianNB),
    ('XGBoost', XGBClassifier),
    ('Gradient Boosting', GradientBoostingClassifier),
    ('LightGBM', lgb.LGBMClassifier),
    ('CatBoost', CatBoostClassifier)
]

# Store results for comparison
results = {"PIMA": [], "Diabetes2019": []}

# Evaluate models on PIMA dataset
print("Evaluating models on PIMA Dataset:")
for model_name, model_class in models:
    accuracy, false_positives = train_and_evaluate_model(model_name, model_class, X_train_pima, X_test_pima, y_train_pima, y_test_pima)
    results["PIMA"].append((model_name, accuracy, false_positives))

# Evaluate models on Diabetes 2019 dataset
print("Evaluating models on Diabetes Dataset 2019:")
for model_name, model_class in models:
    accuracy, false_positives = train_and_evaluate_model(model_name, model_class, X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes)
    results["Diabetes2019"].append((model_name, accuracy, false_positives))

# Summary of results
print("\nSummary of Results:")
for dataset, dataset_results in results.items():
    print(f"\n{dataset} Dataset:")
    for model_name, accuracy, false_positives in dataset_results:
        print(f"{model_name}: Accuracy={accuracy:.4f}, False Positives={false_positives}")


Evaluating models on PIMA Dataset:
Logistic Regression Accuracy: 0.75
Confusion Matrix:
[[72 27]
 [23 78]]
False Positives: 27
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.73      0.74        99
           1       0.74      0.77      0.76       101

    accuracy                           0.75       200
   macro avg       0.75      0.75      0.75       200
weighted avg       0.75      0.75      0.75       200


Decision Tree Accuracy: 0.72
Confusion Matrix:
[[73 26]
 [29 72]]
False Positives: 26
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.74      0.73        99
           1       0.73      0.71      0.72       101

    accuracy                           0.72       200
   macro avg       0.73      0.73      0.72       200
weighted avg       0.73      0.72      0.72       200


Random Forest Accuracy: 0.79
Confusion Matrix:
[[73 26]
 [17 84]]
False Positives: 26

**Final** **Code**

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

# Function to preprocess data with PCA and SMOTE
def preprocess_data(X, y):
    """
    Preprocess the dataset:
    - Impute missing values
    - Scale numeric data
    - Apply PCA for dimensionality reduction
    - Balance data using SMOTE
    """
    # Separate numeric and categorical features
    numeric_features = X.select_dtypes(include=np.number).columns
    categorical_features = X.select_dtypes(exclude=np.number).columns

    # Impute missing values for numeric features
    if len(numeric_features) > 0:
        imputer_numeric = SimpleImputer(strategy='mean')
        X[numeric_features] = imputer_numeric.fit_transform(X[numeric_features])

    # Impute missing values for categorical features
    if len(categorical_features) > 0:
        imputer_categorical = SimpleImputer(strategy='most_frequent')
        X[categorical_features] = imputer_categorical.fit_transform(X[categorical_features])

    # Scale numeric data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X[numeric_features])
    X_scaled_df = pd.DataFrame(X_scaled, columns=numeric_features, index=X.index)
    X_processed = pd.concat([X_scaled_df, X[categorical_features]], axis=1)

    # Apply PCA (only to numeric features after scaling)
    pca = PCA(n_components=0.95)
    X_pca = pca.fit_transform(X_scaled)

    # Balance data using SMOTE
    smote = SMOTE(random_state=42)
    try:
        X_resampled, y_resampled = smote.fit_resample(X_pca, y)
    except ValueError:
        print("Skipping SMOTE due to insufficient samples in the minority class.")
        X_resampled, y_resampled = X_pca, y

    return X_resampled, y_resampled

# Load datasets
X_pima = pd.read_csv("pima.csv").drop(columns="Outcome")
y_pima = pd.read_csv("pima.csv")["Outcome"]
X_diabetes = pd.read_csv("diabetes2019.csv").drop(columns="Diabetic")
y_diabetes = pd.read_csv("diabetes2019.csv")["Diabetic"]

# Encode target variable
y_pima = LabelEncoder().fit_transform(y_pima)
y_diabetes = LabelEncoder().fit_transform(y_diabetes)

# Preprocess datasets
X_pima_resampled, y_pima_resampled = preprocess_data(X_pima, y_pima)
X_diabetes_resampled, y_diabetes_resampled = preprocess_data(X_diabetes, y_diabetes)

# Split datasets into training and testing sets
X_train_pima, X_test_pima, y_train_pima, y_test_pima = train_test_split(X_pima_resampled, y_pima_resampled, test_size=0.2, random_state=42)
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(X_diabetes_resampled, y_diabetes_resampled, test_size=0.2, random_state=42)

# Models to evaluate
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Linear Regression": LinearRegression(),  # Regression model
    "SVM": SVC(),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0)
}

# Train and evaluate each model
results = {"PIMA": [], "Diabetes2019": []}

for model_name, model in models.items():
    print(f"Evaluating {model_name}")

    # Train on PIMA dataset
    model.fit(X_train_pima, y_train_pima)
    y_pred_pima = model.predict(X_test_pima)

    # Convert continuous predictions to binary for Linear Regression
    if model_name == "Linear Regression":
        y_pred_pima = (y_pred_pima > 0.5).astype(int)

    acc_pima = accuracy_score(y_test_pima, y_pred_pima)
    cm_pima = confusion_matrix(y_test_pima, y_pred_pima)
    fp_pima = cm_pima[0, 1]  # False Positives
    print(f"PIMA Indian Dataset Accuracy: {acc_pima:.2f}")
    print(f"PIMA Indian Dataset False Positives: {fp_pima}")
    print(f"Confusion Matrix for PIMA Indian Dataset:\n{cm_pima}")
    results["PIMA"].append((model_name, acc_pima, fp_pima))

    # Train on Diabetes 2019 dataset
    model.fit(X_train_diabetes, y_train_diabetes)
    y_pred_diabetes = model.predict(X_test_diabetes)

    # Convert continuous predictions to binary for Linear Regression
    if model_name == "Linear Regression":
        y_pred_diabetes = (y_pred_diabetes > 0.5).astype(int)

    acc_diabetes = accuracy_score(y_test_diabetes, y_pred_diabetes)
    cm_diabetes = confusion_matrix(y_test_diabetes, y_pred_diabetes)
    fp_diabetes = cm_diabetes[0, 1]  # False Positives
    print(f"Diabetes 2019 Dataset Accuracy: {acc_diabetes:.2f}")
    print(f"Diabetes 2019 Dataset False Positives: {fp_diabetes}")
    print(f"Confusion Matrix for Diabetes 2019 Dataset:\n{cm_diabetes}")
    results["Diabetes2019"].append((model_name, acc_diabetes, fp_diabetes))
    print()

# Summary of results
print("Summary of Results:")
for dataset, dataset_results in results.items():
    print(f"\n{dataset} Dataset:")
    for model_name, accuracy, false_positives in sorted(dataset_results, key=lambda x: (-x[1], x[2])):  # Sort by accuracy (descending) and then false positives
        print(f"{model_name}: Accuracy = {accuracy:.2f}, False Negative = {false_positives}")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Skipping SMOTE due to insufficient samples in the minority class.
Evaluating Logistic Regression
PIMA Indian Dataset Accuracy: 0.75
PIMA Indian Dataset False Positives: 27
Confusion Matrix for PIMA Indian Dataset:
[[72 27]
 [23 78]]
Diabetes 2019 Dataset Accuracy: 0.68
Diabetes 2019 Dataset False Positives: 6
Confusion Matrix for Diabetes 2019 Dataset:
[[118   6]
 [ 55  12]]

Evaluating Linear Regression
PIMA Indian Dataset Accuracy: 0.76
PIMA Indian Dataset False Positives: 25
Confusion Matrix for PIMA Indian Dataset:
[[74 25]
 [24 77]]
Diabetes 2019 Dataset Accuracy: 0.65
Diabetes 2019 Dataset False Positives: 0
Confusion Matrix for Diabetes 2019 Dataset:
[[124   0]
 [ 67   0]]

Evaluating SVM
PIMA Indian Dataset Accuracy: 0.80
PIMA Indian Dataset False Positives: 28
Confusion Matrix for PIMA Indian Dataset:
[[71 28]
 [12 89]]
Diabetes 2019 Dataset Accuracy: 0.72
Diabetes 2019 Dataset False Positives: 3
Confusion Matrix for Diabetes 2019 Dataset:
[[121   3]
 [ 50  17]]

Evaluating Na

Parameters: { "use_label_encoder" } are not used.



PIMA Indian Dataset Accuracy: 0.79
PIMA Indian Dataset False Positives: 26
Confusion Matrix for PIMA Indian Dataset:
[[73 26]
 [17 84]]


Parameters: { "use_label_encoder" } are not used.



Diabetes 2019 Dataset Accuracy: 0.85
Diabetes 2019 Dataset False Positives: 7
Confusion Matrix for Diabetes 2019 Dataset:
[[117   7]
 [ 22  45]]

Evaluating LightGBM
[LightGBM] [Info] Number of positive: 399, number of negative: 401
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000353 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2040
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498750 -> initscore=-0.005000
[LightGBM] [Info] Start training from score -0.005000
PIMA Indian Dataset Accuracy: 0.81
PIMA Indian Dataset False Positives: 24
Confusion Matrix for PIMA Indian Dataset:
[[75 24]
 [15 86]]
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000039 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 652
[LightGBM] [Info