In [74]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("ishandutta/early-stage-diabetes-risk-prediction-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/ishandutta/early-stage-diabetes-risk-prediction-dataset/versions/1


In [106]:
import pandas as pd
import numpy as np
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from statsmodels.stats.proportion import proportions_ztest

In [76]:
df = pd.read_csv(f"{path}/diabetes_data_upload.csv")

In [77]:
df.head()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive


In [78]:
df.columns

Index(['Age', 'Gender', 'Polyuria', 'Polydipsia', 'sudden weight loss',
       'weakness', 'Polyphagia', 'Genital thrush', 'visual blurring',
       'Itching', 'Irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'Alopecia', 'Obesity', 'class'],
      dtype='object')

In [79]:
le = LabelEncoder()
for col in df.columns:
    df[col] = le.fit_transform(df[col])

In [80]:
X = df.drop(columns=["class"])
y = df["class"]

In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [108]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier()
}

In [109]:
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    results[name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'False Negative Rate (Type II Error)': fn / (fn + tp)
    }

In [110]:
for model, metrics in results.items():
    print(f"{model}: {metrics}")

Logistic Regression: {'Accuracy': 0.9230769230769231, 'Precision': 0.9315068493150684, 'Recall': 0.9577464788732394, 'False Negative Rate (Type II Error)': np.float64(0.04225352112676056)}
Decision Tree: {'Accuracy': 0.9423076923076923, 'Precision': 1.0, 'Recall': 0.9154929577464789, 'False Negative Rate (Type II Error)': np.float64(0.08450704225352113)}
Random Forest: {'Accuracy': 0.9903846153846154, 'Precision': 1.0, 'Recall': 0.9859154929577465, 'False Negative Rate (Type II Error)': np.float64(0.014084507042253521)}


In [111]:
best_model = min(results, key=lambda x: results[x]['False Negative Rate (Type II Error)'])
print(f'Best model for diabetes detection (low Type II error): {best_model}\n', results)

Best model for diabetes detection (low Type II error): Random Forest
 {'Logistic Regression': {'Accuracy': 0.9230769230769231, 'Precision': 0.9315068493150684, 'Recall': 0.9577464788732394, 'False Negative Rate (Type II Error)': np.float64(0.04225352112676056)}, 'Decision Tree': {'Accuracy': 0.9423076923076923, 'Precision': 1.0, 'Recall': 0.9154929577464789, 'False Negative Rate (Type II Error)': np.float64(0.08450704225352113)}, 'Random Forest': {'Accuracy': 0.9903846153846154, 'Precision': 1.0, 'Recall': 0.9859154929577465, 'False Negative Rate (Type II Error)': np.float64(0.014084507042253521)}}


Train a Logistic Regression model and perform a Z-Test on the mean age of correctly classified vs. misclassified diabetic patients.
Is there a significant difference in mean age between these two groups? What does this imply about age as a feature?
Should a different model be considered for better classification?


In [100]:
from statsmodels.stats.weightstats import ztest

In [96]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

# Identify correctly and misclassified diabetic patients
correct_indices = (y_test == y_pred)
misclassified_indices = (y_test != y_pred)

correct_ages = X_test.loc[correct_indices, "Age"]
misclassified_ages = X_test.loc[misclassified_indices, "Age"]

# Perform Z-Test
z_stat, p_value = ztest(correct_ages, misclassified_ages)

# Display Z-Test results
print(f"Z-Test Statistic: {z_stat}, P-Value: {p_value}")

# Determine significance
if p_value < 0.05:
    print("Significant difference in mean age between correctly classified and misclassified diabetic patients.")
    print("This suggests age is an influential factor in classification.")
else:
    print("No significant difference in mean age between correctly classified and misclassified diabetic patients.")
    print("Age may not be a strong distinguishing factor.")


Z-Test Statistic: 1.8121654030023497, P-Value: 0.06996064306949588
No significant difference in mean age between correctly classified and misclassified diabetic patients.
Age may not be a strong distinguishing factor.


In [112]:
y_pred_logistic = LogisticRegression().fit(X_train, y_train).predict(X_test)
correct_indices = np.where(y_pred_logistic == y_test)[0]
incorrect_indices = np.where(y_pred_logistic != y_test)[0]

test_stat, p_value = stats.ttest_ind(X_test[correct_indices, 0], X_test[incorrect_indices, 0], equal_var=False)
print(f'Z-Test on mean age: p-value = {p_value}')
if p_value < 0.05:
    print("Significant difference in mean age of correctly vs. misclassified cases.")


Z-Test on mean age: p-value = 0.00023728463474212143
Significant difference in mean age of correctly vs. misclassified cases.


Train a Random Forest model and analyze its false positive rate (Type I error).
If the false positive rate is higher than 20%, perform a One-Sample Z-Test to check if this rate is significantly different from 20%.
What changes in feature selection or threshold tuning can reduce Type I errors?
Compare the results with a Gradient Boosting Model. Which model performs better in reducing Type I errors?

In [113]:
y_pred_rf = RandomForestClassifier().fit(X_train, y_train).predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_rf).ravel()
false_positive_rate = fp / (fp + tn)
if false_positive_rate > 0.2:
    z_stat, p_val = stats.norm.cdf(false_positive_rate, loc=0.2, scale=np.std(false_positive_rate))
    print(f'One-Sample Z-Test for FPR > 20%: p-value = {p_val}')

Compare the false negative rates (Type II errors) of SVM, KNN, and Logistic Regression models.
Perform a Z-Test to determine if the false negative rates of any two models are significantly different.
Which model minimizes the risk of undiagnosed diabetes cases?
Based on the test results, which model should be recommended for real-world deployment?

In [104]:
from statsmodels.stats.proportion import proportions_ztest

In [114]:
models_2 = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression()
}

fn_rates = {}
for name, model in models_2.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    fn_rates[name] = fn / (fn + tp)

# Get false negatives and total positives for SVM and Logistic Regression
fn_svm = fn_rates['SVM'] * (y_test == 1).sum()
fn_lr = fn_rates['Logistic Regression'] * (y_test == 1).sum()
total_pos = (y_test == 1).sum()

# Perform Z-Test for proportions
count = [fn_svm, fn_lr]
nobs = [total_pos, total_pos]  # Total number of actual positive cases

z_stat, p_value = proportions_ztest(count, nobs, alternative='two-sided')

print(f'Z-Test on Type II Error Rates: Z-Statistic = {z_stat:.4f}, p-value = {p_value:.4f}')


Z-Test on Type II Error Rates: Z-Statistic = -1.7506, p-value = 0.0800


Train a Gradient Boosting Model and examine the misclassification of diabetic patients.
Perform a Z-Test to determine if the mean age of misclassified cases is significantly different from correctly classified cases.
If the test is significant, how can you adjust the model to reduce Type II errors?
Compare it with a Random Forest model. Which one has fewer Type II errors, and which should be chosen for medical use?
Train three different models (e.g., Logistic Regression, SVM, Random Forest) and compare their Type I and Type II error

In [115]:
y_pred_gbm = GradientBoostingClassifier().fit(X_train, y_train).predict(X_test)
correct_indices_gbm = np.where(y_pred_gbm == y_test)[0]
incorrect_indices_gbm = np.where(y_pred_gbm != y_test)[0]

test_stat_gbm, p_value_gbm = stats.ttest_ind(X_test[correct_indices_gbm, 0], X_test[incorrect_indices_gbm, 0], equal_var=False)
print(f'Gradient Boosting Z-Test on mean age: p-value = {p_value_gbm}')

Gradient Boosting Z-Test on mean age: p-value = 0.1998955572525326


Train three different models (e.g., Logistic Regression, SVM, Random Forest) and compare their Type I and Type II error rates.
Perform a Z-Test to determine if there is a statistically significant difference between their error rates.
Based on the results, which model should be selected for diagnosing diabetes in a medical setting?

In [118]:
from statsmodels.stats.proportion import proportions_ztest

In [119]:
final_models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier()
}

errors = {}

for name, model in final_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    errors[name] = {
        'Type I Error': fp / (fp + tn),
        'Type II Error': fn / (fn + tp),
        'False Positives': fp,
        'False Negatives': fn,
        'Total Negatives': (y_test == 0).sum(),
        'Total Positives': (y_test == 1).sum()
    }

In [120]:
count_type1 = [errors['Logistic Regression']['False Positives'], errors['Random Forest']['False Positives']]
nobs_type1 = [errors['Logistic Regression']['Total Negatives'], errors['Random Forest']['Total Negatives']]

z_stat_type1, p_value_type1 = proportions_ztest(count_type1, nobs_type1, alternative='two-sided')
print(f'Z-Test on Type I Errors: Z-Statistic = {z_stat_type1:.4f}, p-value = {p_value_type1:.4f}')

# Z-Test for Type II Errors (False Negatives)
count_type2 = [errors['SVM']['False Negatives'], errors['Random Forest']['False Negatives']]
nobs_type2 = [errors['SVM']['Total Positives'], errors['Random Forest']['Total Positives']]

z_stat_type2, p_value_type2 = proportions_ztest(count_type2, nobs_type2, alternative='two-sided')
print(f'Z-Test on Type II Errors: Z-Statistic = {z_stat_type2:.4f}, p-value = {p_value_type2:.4f}')

Z-Test on Type I Errors: Z-Statistic = 2.3259, p-value = 0.0200
Z-Test on Type II Errors: Z-Statistic = -1.0035, p-value = 0.3156
