In [4]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from statsmodels.stats.proportion import proportions_ztest
from statsmodels.stats.weightstats import ztest

# Load the raw dataset (absolute path; adjust it to wherever the CSV is stored).
df = pd.read_csv("/content/diabetes_data_upload.csv")

# Integer-encode every non-numeric column in place, the target column included
# if it is stored as text. Checking "not numeric" instead of `dtype == 'object'`
# also covers text columns that pandas infers as a dedicated string or
# categorical dtype, which an object-only check would leave unencoded.
# Each fitted encoder is kept under its column name so the integer codes can
# later be mapped back to the original labels via `inverse_transform`.
label_encoders = {}
for column in df.columns:
    if not pd.api.types.is_numeric_dtype(df[column]):
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Define features (everything except the target) and the target column.
X = df.drop(columns=["class"])
y = df["class"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers. Both tree-based models get a fixed seed so that
# re-running the notebook reproduces the same metrics (an unseeded decision
# tree can break ties between equally good splits differently on each run).
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}

# Fit each model on the training split and score it on the held-out split.
# `results` maps model name -> dict of metric name -> value.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Confusion matrix to compute Type I & Type II errors.
    # Pinning the label order guarantees a 2x2 matrix (and therefore exactly
    # four values to unpack) even if one class is absent from both the test
    # labels and the predictions.
    # NOTE(review): assumes the target is encoded as 0 = negative, 1 = positive.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
    actual_negatives = fp + tn
    actual_positives = fn + tp
    # A rate is undefined when its denominator class has no test samples;
    # report NaN explicitly instead of dividing by zero.
    type_i_error = fp / actual_negatives if actual_negatives else float("nan")  # False Positive Rate
    type_ii_error = fn / actual_positives if actual_positives else float("nan")  # False Negative Rate

    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "False Positive Rate (Type I Error)": type_i_error,
        "False Negative Rate (Type II Error)": type_ii_error
    }

# Two-sample z-test: does the mean age differ between the test cases that
# logistic regression classified correctly and those it misclassified?
# Every test case is included, regardless of its true class.
#
# Reuse the logistic regression that the evaluation loop already fitted on
# (X_train, y_train); training a second copy with the same hyperparameters on
# the same data only repeats the work.
log_reg = models["Logistic Regression"]
y_pred_log = log_reg.predict(X_test)

# Boolean Series indexed like y_test (and hence like X_test): True where the
# prediction matches the true label.
is_correct = y_test == y_pred_log

# Extract the ages of correctly and incorrectly classified patients
correct_classified = X_test.loc[is_correct, 'Age']
misclassified = X_test.loc[~is_correct, 'Age']

# Perform Z-Test (statsmodels default: two-sided, null hypothesis of equal means)
z_stat, p_value = ztest(correct_classified, misclassified)

# Print results
print("Model Evaluation Results:")
print(pd.DataFrame(results).T)
print("\nZ-Test Results for Mean Age of Correctly vs. Misclassified Cases:")
print(f"Z-Statistic: {z_stat}, P-Value: {p_value}")

# Analyze the false positive rate (FPR) of the Random Forest and, when it
# exceeds the threshold, run a one-sample z-test of the FPR against it.
fpr_threshold = 0.2

# Reuse the Random Forest already fitted in the evaluation loop; it has the same
# hyperparameters, seed and training data as a freshly trained copy would.
rf_model = models["Random Forest"]
y_pred_rf = rf_model.predict(X_test)

# Pin the label order so the matrix is always 2x2 and unpacks into four counts.
# NOTE(review): assumes the target is encoded as 0 = negative, 1 = positive.
tn_rf, fp_rf, fn_rf, tp_rf = confusion_matrix(y_test, y_pred_rf, labels=[0, 1]).ravel()
n_negatives_rf = fp_rf + tn_rf
# The FPR is undefined without any actual negatives; NaN compares False below,
# so the test is skipped in that case.
false_positive_rate_rf = fp_rf / n_negatives_rf if n_negatives_rf else float("nan")

if false_positive_rate_rf > fpr_threshold:
    # A z-test on the single number [FPR] has one observation, so its sample
    # standard deviation is undefined and the statistic comes out as NaN.
    # The FPR is a proportion (false positives out of all actual negatives), so
    # test those counts against the threshold with a one-proportion z-test.
    # `prop_var` makes the standard error use the hypothesised proportion.
    # The alternative stays at the default (two-sided), as in the original call.
    z_stat_rf, p_value_rf = proportions_ztest(
        count=fp_rf,
        nobs=n_negatives_rf,
        value=fpr_threshold,
        prop_var=fpr_threshold,
    )
    print(f"\nOne-Sample Z-Test for False Positive Rate of Random Forest:")
    print(f"Z-Statistic: {z_stat_rf}, P-Value: {p_value_rf}")


Model Evaluation Results:
                     Accuracy  Precision    Recall  \
Logistic Regression  0.923077   0.931507  0.957746   
Decision Tree        0.932692   0.984848  0.915493   
Random Forest        0.990385   1.000000  0.985915   

                     False Positive Rate (Type I Error)  \
Logistic Regression                            0.151515   
Decision Tree                                  0.030303   
Random Forest                                  0.000000   

                     False Negative Rate (Type II Error)  
Logistic Regression                             0.042254  
Decision Tree                                   0.084507  
Random Forest                                   0.014085  

Z-Test Results for Mean Age of Correctly vs. Misclassified Cases:
Z-Statistic: 1.7686101020183942, P-Value: 0.07695896375275615


In [3]:
# Re-read the CSV from disk (this rebinds the notebook-level `df`) and list its
# column labels.
df = pd.read_csv(filepath_or_buffer="/content/diabetes_data_upload.csv")
print(df.columns)


Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')
