In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" theme to the figures drawn in this script.
sns.set_theme(style="whitegrid")

# Step 1: read the dataset from a CSV file in the current working directory,
# then print its row/column counts and first rows as a quick sanity check.
print("--- 1. ‡∏Å‡∏≤‡∏£‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏°‡∏ï‡πà‡∏≠ (Database Connection) ---")
df = pd.read_csv('diabetes.csv')
print(f"‡πÇ‡∏´‡∏•‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à. ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡πÅ‡∏ñ‡∏ß: {df.shape[0]}, ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå: {df.shape[1]}")
print(df.head())

print("\n--- 2.1 ‡∏Å‡∏≤‡∏£‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•: Target Distribution (‡∏™‡∏±‡∏î‡∏™‡πà‡∏ß‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•) ---")
# Class balance of the target column, drawn as a pie chart.
# value_counts() orders its result by frequency, while the labels, colours and
# explode offsets below are written in class order (0 first, then 1). Sorting
# the counts by class value keeps every wedge paired with its own label no
# matter which class happens to be the majority.
target_counts = df['Outcome'].value_counts().sort_index()
labels = ['No Diabetes (0)', 'Diabetes (1)']
colors = ['#1f77b4', '#ff7f0e']
explode = [0.05, 0]  # pull the first wedge slightly out of the pie

plt.figure(figsize=(7, 7))
plt.pie(target_counts,
        labels=labels,
        autopct='%1.1f%%',
        startangle=90,
        colors=colors,
        explode=explode,
        wedgeprops={'edgecolor': 'black', 'linewidth': 1})
plt.title('Target Distribution (Outcome)', fontsize=16)
plt.show()

print("\n--- 2.2 ‡∏Å‡∏≤‡∏£‡∏ß‡∏¥‡πÄ‡∏Ñ‡∏£‡∏≤‡∏∞‡∏´‡πå‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•: ‡πÅ‡∏ú‡∏ô‡∏†‡∏≤‡∏û‡∏Ñ‡∏ß‡∏≤‡∏°‡∏™‡∏±‡∏°‡∏û‡∏±‡∏ô‡∏ò‡πå (Correlation Heatmap) ---")
plt.figure(figsize=(10, 8))
# Pairwise correlations between the columns of df, annotated cell by cell.
corr_matrix = df.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='RdBu_r',
    linewidths=0.5,
    cbar_kws={'label': 'Correlation Coefficient'},
)
plt.title('Correlation Heatmap Before Pre-processing', fontsize=16)
plt.show()

# Columns in which a literal 0 is treated as a missing reading.
cols_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print("\n--- 2.3. ‡∏Å‡∏≤‡∏£‡∏ï‡∏£‡∏ß‡∏à‡∏™‡∏≠‡∏ö‡∏Ñ‡πà‡∏≤ '0' (Missing Value Check) ---")
print("‡∏´‡∏°‡∏≤‡∏¢‡πÄ‡∏´‡∏ï‡∏∏: ‡∏ä‡∏∏‡∏î‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ô‡∏µ‡πâ‡∏Ñ‡πà‡∏≤ '0' ‡πÉ‡∏ô‡∏Ñ‡∏≠‡∏•‡∏±‡∏°‡∏ô‡πå‡πÄ‡∏´‡∏•‡πà‡∏≤‡∏ô‡∏µ‡πâ‡∏ñ‡∏∑‡∏≠‡πÄ‡∏õ‡πá‡∏ô Missing Value")

# Genuine NaN entries, counted per column.
print("\nStandard NaN Check (‡∏Ñ‡πà‡∏≤‡∏ó‡∏µ‡πà‡∏´‡∏≤‡∏¢‡πÑ‡∏õ‡∏ï‡∏≤‡∏°‡∏õ‡∏Å‡∏ï‡∏¥):")
print(df.isna().sum())

# Zero entries in the listed columns, as absolute counts ...
print("\nDetailed '0' Value Check (‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ñ‡πà‡∏≤ '0' ‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç):")
missing_zeros = (df[cols_to_replace] == 0).sum()
print(missing_zeros)

# ... and as a percentage of all rows.
data_size = df.shape[0]
missing_percentage = missing_zeros / data_size * 100
print("\nPercentage of '0' values (‡πÄ‡∏õ‡∏≠‡∏£‡πå‡πÄ‡∏ã‡πá‡∏ô‡∏ï‡πå‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡πÅ‡∏Å‡πâ‡πÑ‡∏Ç):")
print(missing_percentage.round(2).astype(str) + '%')

print("\n--- 3. ‡∏Å‡∏≤‡∏£‡πÄ‡∏ï‡∏£‡∏µ‡∏¢‡∏°‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÅ‡∏•‡∏∞‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏±‡∏ö‡∏°‡∏≤‡∏ï‡∏£‡∏≤‡∏™‡πà‡∏ß‡∏ô (Pre-Processing & Scaling) ---")
# Replace the 0 placeholders in each listed column with that column's median.
# NOTE(review): the median is taken over the whole dataset, before the
# train/test split performed further down, so held-out rows influence the
# imputed value. Consider imputing after the split using training rows only.
for col in cols_to_replace:
    # Take the median of the non-zero readings only: the zeros are the
    # placeholders being replaced, and counting them drags the estimate
    # towards 0.
    fill_value = df.loc[df[col] != 0, col].median()
    if pd.isna(fill_value):
        # No usable reading in this column, so there is nothing to impute from.
        continue
    # Assign the result back rather than calling replace(..., inplace=True)
    # on the df[col] selection: that form is chained assignment, which emits
    # a FutureWarning in pandas 2.2 and leaves df unchanged under
    # Copy-on-Write.
    df[col] = df[col].replace(0, fill_value)
print("\n3.1 ‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£ Missing Values (‡πÅ‡∏ó‡∏ô 0 ‡∏î‡πâ‡∏ß‡∏¢ Median) ‡πÄ‡∏£‡∏µ‡∏¢‡∏ö‡∏£‡πâ‡∏≠‡∏¢‡πÅ‡∏•‡πâ‡∏ß---------------------------------------------------------------------------------------------------")

# Separate the target column from the predictor columns.
y = df['Outcome']
X = df.drop(columns='Outcome')
print('------------------------------------------------------------------------------------------------------------------------------------------------------------')

# Hold out 20% of the rows for testing, stratified on the target, with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(f"3.2 ‡πÅ‡∏ö‡πà‡∏á‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡πÄ‡∏õ‡πá‡∏ô Train (80%): {X_train.shape} ‡πÅ‡∏•‡∏∞ Test (20%): {X_test.shape}")
print('-------------------------------------------------------------------------------------------------------------------------------------------------------------')

# The scaler is fitted on the training rows only and then applied to both
# partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("3.3 ‡∏õ‡∏£‡∏±‡∏ö‡∏°‡∏≤‡∏ï‡∏£‡∏≤‡∏™‡πà‡∏ß‡∏ô‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏î‡πâ‡∏ß‡∏¢ StandardScaler ‡πÄ‡∏™‡∏£‡πá‡∏à‡∏™‡∏°‡∏ö‡∏π‡∏£‡∏ì‡πå-------------------------------------------------------------------------------------------------------------")

print("\n--- 4. Future Engineering: ‡∏Å‡∏≤‡∏£‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏ï‡∏±‡∏ß‡πÅ‡∏õ‡∏£‡∏î‡∏±‡∏ä‡∏ô‡∏µ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÄ‡∏™‡∏µ‡πà‡∏¢‡∏á (Risk Index) ---")
# Re-wrap the scaled arrays as DataFrames so features can be addressed by
# name. Both frames get a fresh 0..n-1 index, NOT the row labels carried by
# X_train / X_test.
X_train_fe = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_fe = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Weighted sum of the standardised Glucose, BMI and Age columns.
X_train_fe['High_Risk_Index'] = (X_train_fe['Glucose'] * 0.4) + (X_train_fe['BMI'] * 0.3) + (X_train_fe['Age'] * 0.3)
X_test_fe['High_Risk_Index'] = (X_test_fe['Glucose'] * 0.4) + (X_test_fe['BMI'] * 0.3) + (X_test_fe['Age'] * 0.3)

# Square of the standardised BMI.
X_train_fe['BMI2'] = X_train_fe['BMI'] ** 2
X_test_fe['BMI2'] = X_test_fe['BMI'] ** 2


def _glucose_bmi_ratio(raw_frame):
    """Return Glucose / BMI computed on the unscaled columns of *raw_frame*.

    Rows whose BMI is not strictly positive yield NaN instead of inf.
    """
    positive_bmi = raw_frame['BMI'].where(raw_frame['BMI'] > 0)
    return raw_frame['Glucose'] / positive_bmi


# The ratio is taken on the raw values and standardised afterwards. Dividing
# one standardised column by another is not meaningful: the denominator is
# centred on 0, so the quotient blows up or flips sign for rows near the mean
# BMI and becomes inf for a row exactly at the mean.
train_ratio = _glucose_bmi_ratio(X_train)
test_ratio = _glucose_bmi_ratio(X_test)
# Training-set statistics only, so the test rows do not leak into the scale.
ratio_mean = train_ratio.mean()
ratio_std = train_ratio.std(ddof=0)
if not ratio_std > 0:
    # Constant or all-NaN training ratio: skip the rescaling.
    ratio_std = 1.0
# .to_numpy() assigns by position; the *_fe frames do not share row labels
# with X_train / X_test. Undefined ratios are filled with 0.0, which is the
# training mean on the standardised scale.
X_train_fe['Glucose_BMI_Ratio'] = ((train_ratio - ratio_mean) / ratio_std).fillna(0.0).to_numpy()
X_test_fe['Glucose_BMI_Ratio'] = ((test_ratio - ratio_mean) / ratio_std).fillna(0.0).to_numpy()

# Interaction of the standardised Age and BMI columns.
X_train_fe['Age_BMI'] = X_train_fe['Age'] * X_train_fe['BMI']
X_test_fe['Age_BMI'] = X_test_fe['Age'] * X_test_fe['BMI']

# Plain arrays (original columns followed by the four engineered ones) are
# what the models below are fitted on.
X_train_final = X_train_fe.values
X_test_final = X_test_fe.values
print("---------------------------------------------------------------------------------------------------------------------------------------------------------------")

print(f"‡∏™‡∏£‡πâ‡∏≤‡∏á‡∏Ñ‡∏∏‡∏ì‡∏•‡∏±‡∏Å‡∏©‡∏ì‡∏∞ 'High_Risk_Index' ‡πÅ‡∏•‡∏∞‡∏≠‡∏µ‡∏Å 3 ‡∏ï‡∏±‡∏ß‡πÅ‡∏õ‡∏£‡πÄ‡∏û‡∏¥‡πà‡∏°‡∏™‡∏≥‡πÄ‡∏£‡πá‡∏à. ‡∏à‡∏≥‡∏ô‡∏ß‡∏ô‡∏Ñ‡∏∏‡∏ì‡∏•‡∏±‡∏Å‡∏©‡∏ì‡∏∞‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î: {X_train_final.shape[1]}")

print("\n--- 5. ‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏∞‡πÄ‡∏°‡∏¥‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•: ‡πÄ‡∏õ‡∏£‡∏µ‡∏¢‡∏ö‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö 3 ‡πÇ‡∏°‡πÄ‡∏î‡∏• ---")
# Candidate classifiers, keyed by the display name used in all reports below.
models = {
    'Logistic Regression (LR)': LogisticRegression(random_state=42),
    'Decision Tree (DT)': DecisionTreeClassifier(random_state=42),
    'K-Nearest Neighbors (KNN)': KNeighborsClassifier(n_neighbors=9),
}

results = {}
best_model_name, best_auc = '', 0

for name, model in models.items():
    # Fit on the training matrix, then score on the held-out matrix.
    model.fit(X_train_final, y_train)
    y_pred = model.predict(X_test_final)
    # Column 1 of predict_proba is the score handed to roc_auc_score.
    y_proba = model.predict_proba(X_test_final)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    auc_score = roc_auc_score(y_test, y_proba)
    results[name] = dict(Accuracy=acc, AUC=auc_score)

    print(f"\n--- Model: {name} ---")
    print(f"Accuracy: {acc:.4f} | ROC AUC: {auc_score:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Strict comparison: on a tie the earlier model keeps the lead.
    if auc_score > best_auc:
        best_auc, best_model_name = auc_score, name

print(f"\n**‡∏™‡∏£‡∏∏‡∏õ: ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏µ‡πà‡∏°‡∏µ‡∏õ‡∏£‡∏∞‡∏™‡∏¥‡∏ó‡∏ò‡∏¥‡∏†‡∏≤‡∏û‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (‡∏ß‡∏±‡∏î‡∏à‡∏≤‡∏Å AUC) ‡∏Ñ‡∏∑‡∏≠: {best_model_name}**")
best_model = models[best_model_name]

print("\n--- 6. ‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏±‡∏ö‡∏à‡∏π‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏•‡πÅ‡∏•‡∏∞‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢ (Fine-Tune Parameters & Final Result) ---")
print(f"6.1 ‡πÄ‡∏£‡∏¥‡πà‡∏°‡∏Å‡∏≤‡∏£‡∏õ‡∏£‡∏±‡∏ö‡∏à‡∏π‡∏ô‡∏ö‡∏ô‡πÇ‡∏°‡πÄ‡∏î‡∏• {best_model_name}...")

# Estimator factory and search grid for each model name. Any name that is not
# listed here falls back to the KNN setup.
tuning_setups = {
    'Logistic Regression (LR)': (
        lambda: LogisticRegression(random_state=42),
        {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']},
    ),
    'Decision Tree (DT)': (
        lambda: DecisionTreeClassifier(random_state=42),
        {'max_depth': [3, 5, 7], 'min_samples_split': [2, 5, 10]},
    ),
}
knn_setup = (
    KNeighborsClassifier,
    {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']},
)
make_estimator, param_grid = tuning_setups.get(best_model_name, knn_setup)
model_to_tune = make_estimator()

# 5-fold cross-validated grid search on the training data, ranked by ROC AUC.
grid_search = GridSearchCV(
    estimator=model_to_tune,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train_final, y_train)

# Score the refitted best estimator on the held-out data.
final_model = grid_search.best_estimator_
y_proba_final = final_model.predict_proba(X_test_final)[:, 1]
y_pred_final = final_model.predict(X_test_final)
final_auc = roc_auc_score(y_test, y_proba_final)

print(f"\n- ‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î (Best Parameters): {grid_search.best_params_}")
print(f"- ‡∏Ñ‡πà‡∏≤ AUC ‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢: {final_auc:.4f} (‡πÄ‡∏ó‡∏µ‡∏¢‡∏ö‡∏Å‡∏±‡∏ö AUC ‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î‡πÄ‡∏î‡∏¥‡∏°: {best_auc:.4f})")
print(f"- ‡∏Ñ‡∏ß‡∏≤‡∏°‡πÅ‡∏°‡πà‡∏ô‡∏¢‡∏≥‡∏™‡∏∏‡∏î‡∏ó‡πâ‡∏≤‡∏¢ (Final Accuracy): {accuracy_score(y_test, y_pred_final):.4f}")

# Confusion matrix of the tuned model, normalised over the actual labels
# (normalize='true'), drawn as an annotated heatmap.
cm_final = confusion_matrix(y_test, y_pred_final, normalize='true')
outcome_ticks = ['No Diabetes (0)', 'Diabetes (1)']
plt.figure(figsize=(6, 5))
sns.heatmap(
    cm_final,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=outcome_ticks,
    yticklabels=outcome_ticks,
)
plt.title(f'Normalized Confusion Matrix - Tuned {best_model_name}', fontsize=14)
plt.xlabel('Predicted Label')
plt.ylabel('Actual Label')
plt.show()

# ROC curve of the tuned model; the area is recomputed from the curve points.
fpr, tpr, thresholds = roc_curve(y_test, y_proba_final)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(7, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
# Dashed diagonal for visual reference.
plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve - Tuned {best_model_name}')
plt.legend(loc="lower right")
plt.show()

# ============================================================
# Before/after fine-tuning comparison for every candidate model
# ============================================================

# Held-out Accuracy/AUC per model name, before and after the grid search.
results_before = {}
results_after = {}

# Search grid for each model, keyed by the same names as `models`.
param_grids = {
    'Logistic Regression (LR)': {'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear', 'lbfgs']},
    'Decision Tree (DT)': {'max_depth': [3, 5, 7], 'min_samples_split': [2, 5, 10]},
    'K-Nearest Neighbors (KNN)': {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance']}
}

for name, model in models.items():
    # --- Before fine-tuning: the model with its constructor settings ---
    model.fit(X_train_final, y_train)
    y_pred = model.predict(X_test_final)
    y_proba = model.predict_proba(X_test_final)[:, 1]
    results_before[name] = {'Accuracy': accuracy_score(y_test, y_pred),
                            'AUC': roc_auc_score(y_test, y_proba)}

    # --- Fine-tuning: 5-fold grid search ranked by ROC AUC ---
    # Loop-local names are used on purpose. Rebinding `grid_search` and
    # `best_model` here would overwrite the objects created for the selected
    # model earlier in the script, leaving them pointing at whichever model
    # is iterated last.
    model_search = GridSearchCV(model, param_grids[name], cv=5, scoring='roc_auc', n_jobs=-1)
    model_search.fit(X_train_final, y_train)
    tuned_model = model_search.best_estimator_
    y_pred_f = tuned_model.predict(X_test_final)
    y_proba_f = tuned_model.predict_proba(X_test_final)[:, 1]
    results_after[name] = {'Accuracy': accuracy_score(y_test, y_pred_f),
                           'AUC': roc_auc_score(y_test, y_proba_f),
                           'Best_Params': model_search.best_params_}

# Build the comparison table: one row per model, before/after side by side.
comparison_rows = []
for m in models:
    before, after = results_before[m], results_after[m]
    comparison_rows.append({
        'Model': m,
        'Accuracy_Before': before['Accuracy'],
        'AUC_Before': before['AUC'],
        'Accuracy_After': after['Accuracy'],
        'AUC_After': after['AUC'],
        'Best_Params': after['Best_Params'],
    })
df_compare = pd.DataFrame(comparison_rows)

# Relative change of each metric in percent, rounded to two decimals.
for metric in ('Accuracy', 'AUC'):
    before_col = df_compare[f'{metric}_Before']
    after_col = df_compare[f'{metric}_After']
    df_compare[f'{metric}_Change(%)'] = ((after_col - before_col) / before_col * 100).round(2)
print(df_compare)

x = np.arange(len(df_compare['Model']))
width = 0.35

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# One panel per metric: (axes, metric prefix, colour before, colour after).
panels = (
    (axes[0], 'Accuracy', 'skyblue', 'dodgerblue'),
    (axes[1], 'AUC', 'lightcoral', 'tomato'),
)

for ax, metric, before_color, after_color in panels:
    before_vals = df_compare[f'{metric}_Before']
    after_vals = df_compare[f'{metric}_After']

    # Paired bars: "before" to the left of each tick, "after" to the right.
    ax.bar(x - width/2, before_vals, width, label='Before Fine-tune', color=before_color)
    ax.bar(x + width/2, after_vals, width, label='After Fine-tune', color=after_color)
    ax.set_title(f'{metric} Comparison (Before vs After)')
    ax.set_xticks(x)
    ax.set_xticklabels(df_compare['Model'], rotation=15)
    ax.legend()

    # Write the percentage change just above each "after" bar.
    for pos, after_val, change in zip(x, after_vals, df_compare[f'{metric}_Change(%)']):
        ax.text(pos + width/2, after_val + 0.01, f"{change:+.2f}%", ha='center', fontsize=9)

plt.tight_layout()
plt.show()