In [None]:
# ================================================================
#   ADVANCED DATABASE PROJECT – DIABETES INDICATOR ANALYSIS
#   CSV vs SQL PERFORMANCE + BASIC ML CLASSIFICATION
# ================================================================

import sqlite3
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed

# ================================================================
#  1. LOAD DATA (CSV)
# ================================================================

# Path of the dataset file; change it if your Kaggle download uses another name.
CSV_PATH = "diabetes.csv"

print("Loading CSV...")
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

# Bare final expression so the notebook renders the first five rows.
df.head(n=5)


In [None]:
# ================================================================
# 2. BASIC STATISTICAL EXPLORATION
# ================================================================

print("\n=== BASIC STATS ===")
print(df.describe().T)

print("\n=== CLASS DISTRIBUTION ===")
print(df["Diabetes_binary"].value_counts(normalize=True))

# Pairwise correlation of every numeric/boolean column. Restricting the
# dtypes first keeps this working if the CSV ever carries a text column,
# which recent pandas versions refuse to correlate.
corr = df.select_dtypes(include=["number", "bool"]).corr()
feature_names = list(corr.columns)
tick_positions = range(len(feature_names))

fig, ax = plt.subplots(figsize=(10, 8))

# Pin the colour scale to [-1, 1] so the neutral midpoint of the diverging
# 'coolwarm' map always means "no correlation".
im = ax.imshow(corr.values, cmap='coolwarm', interpolation='nearest',
               vmin=-1, vmax=1)
fig.colorbar(im, ax=ax)

# Label both axes with the column names; without them the cells of the
# matrix cannot be mapped back to features.
ax.set_xticks(tick_positions)
ax.set_xticklabels(feature_names, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(feature_names)

ax.set_title("Correlation Heatmap")
fig.tight_layout()
plt.show()


In [None]:
# ================================================================
# 3. CSV QUERY TIMING
# ================================================================

def time_csv_query(query_func, name):
    """Call ``query_func`` once, print its wall-clock duration, return its result.

    Parameters
    ----------
    query_func : callable
        Zero-argument callable that performs the query.
    name : str
        Label printed in front of the elapsed time.

    Returns
    -------
    object
        Whatever ``query_func`` returned.
    """
    t0 = time.perf_counter()
    outcome = query_func()
    elapsed = time.perf_counter() - t0
    print(f"{name} → {elapsed:.6f} sec")
    return outcome

print("\n=== CSV QUERY PERFORMANCE ===")

# Each entry pairs a zero-argument query over the in-memory DataFrame with
# the label printed next to its timing. The work stays inside the lambda so
# it is executed (and timed) by time_csv_query, not while building the list.
csv_benchmarks = [
    (lambda: df[df["BMI"] > 30], "People with BMI > 30"),
    (lambda: df.groupby("Age").size(), "Group by Age"),
    (lambda: df[(df["BMI"] > 25) & (df["HighBP"] == 1)], "Filter BMI & HighBP"),
]

# Run in order and keep the three results under their original names.
q1, q2, q3 = [time_csv_query(run, label) for run, label in csv_benchmarks]


In [None]:
# ================================================================
# 4. LOAD CSV INTO SQLITE
# ================================================================

SQLITE_DB = "diabetes.db"

# Connect to the database file, then write the whole DataFrame into the
# "diabetes" table, replacing the table if it is already there. The
# DataFrame index is not stored.
conn = sqlite3.connect(SQLITE_DB)
df.to_sql(name="diabetes", con=conn, index=False, if_exists="replace")

print(f"\nDatabase created: {SQLITE_DB}")


In [None]:
# ================================================================
# 5. SQL QUERY TIMING (NO INDEX)
# ================================================================

def sql_query(query, name, connection=None):
    """Run a SQL statement, print how long it took, and return the rows.

    Parameters
    ----------
    query : str
        SQL text handed to ``pd.read_sql_query``.
    name : str
        Label printed in front of the elapsed time.
    connection : optional
        Database connection to run the statement against. When omitted, the
        notebook-level ``conn`` is used, so existing two-argument calls keep
        working unchanged.

    Returns
    -------
    pandas.DataFrame
        The result set produced by ``pd.read_sql_query``.

    Notes
    -----
    The measured interval covers the entire ``read_sql_query`` call, so it
    includes building the DataFrame as well as executing the statement.
    """
    # Resolve the global lazily: it is only looked up when no explicit
    # connection was supplied.
    db = conn if connection is None else connection

    start = time.perf_counter()
    result = pd.read_sql_query(query, db)
    end = time.perf_counter()
    print(f"{name} → {end-start:.6f} sec")
    return result

print("\n=== SQL QUERY PERFORMANCE (NO INDEX) ===")

# (statement, label) pairs, timed one after another in this order.
no_index_benchmarks = (
    ("SELECT * FROM diabetes WHERE BMI > 30;", "SQL BMI > 30"),
    ("SELECT Age, COUNT(*) FROM diabetes GROUP BY Age;", "SQL group by Age"),
    ("SELECT * FROM diabetes WHERE BMI > 25 AND HighBP = 1;", "SQL BMI + HighBP"),
)
no_index_results = [sql_query(statement, label) for statement, label in no_index_benchmarks]

# The final query's result stays the cell's last expression, so the notebook
# still renders it as the cell output.
no_index_results[-1]


In [None]:
# ================================================================
# 6. CREATE INDEXES + RE-TIME QUERIES
# ================================================================

print("\n=== CREATING INDEXES ===")

# Two single-column indexes and one composite index; IF NOT EXISTS makes the
# statements safe to execute again.
index_statements = (
    "CREATE INDEX IF NOT EXISTS idx_bmi ON diabetes(BMI);",
    "CREATE INDEX IF NOT EXISTS idx_age ON diabetes(Age);",
    "CREATE INDEX IF NOT EXISTS idx_bmi_bp ON diabetes(BMI, HighBP);",
)
for ddl in index_statements:
    conn.execute(ddl)

conn.commit()

print("\n=== SQL QUERY PERFORMANCE (WITH INDEX) ===")

# Same three statements as the no-index run, with labels marking them as indexed.
indexed_benchmarks = (
    ("SELECT * FROM diabetes WHERE BMI > 30;", "Indexed SQL BMI > 30"),
    ("SELECT Age, COUNT(*) FROM diabetes GROUP BY Age;", "Indexed group by Age"),
    ("SELECT * FROM diabetes WHERE BMI > 25 AND HighBP = 1;", "Indexed SQL BMI + HighBP"),
)
indexed_results = [sql_query(statement, label) for statement, label in indexed_benchmarks]

# Keep the last result as the cell's final expression so it is still displayed.
indexed_results[-1]


In [None]:
# ================================================================
# 7. VIEW QUERY PLANS
# ================================================================

print("\n=== QUERY PLANS ===")

# The three benchmarked statements, each prefixed with EXPLAIN QUERY PLAN.
plans = [
    f"EXPLAIN QUERY PLAN {statement}"
    for statement in (
        "SELECT * FROM diabetes WHERE BMI > 30;",
        "SELECT Age, COUNT(*) FROM diabetes GROUP BY Age;",
        "SELECT * FROM diabetes WHERE BMI > 25 AND HighBP = 1;",
    )
]

for p in plans:
    # Echo the statement first, then the plan rows returned for it.
    print(f"\n {p}")
    plan_frame = pd.read_sql_query(p, conn)
    print(plan_frame)


In [None]:
# ================================================================
# 8. BASIC CLASSIFICATION (LOGISTIC REGRESSION)
# ================================================================

print("\n=== LOGISTIC REGRESSION MODEL ===")

target = "Diabetes_binary"
X = df.drop(columns=[target])
y = df[target]

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# held-out rows influence the mean/std applied to the training data (data
# leakage) and make the test metrics optimistic.
# stratify=y keeps the class proportions identical in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the train-only statistics. The name is kept
# so any cell that refers to X_scaled continues to work.
X_scaled = scaler.transform(X)

model_lr = LogisticRegression(max_iter=200)
model_lr.fit(X_train, y_train)

preds = model_lr.predict(X_test)

print("Accuracy:", accuracy_score(y_test, preds))
print("Precision:", precision_score(y_test, preds))
print("Recall:", recall_score(y_test, preds))


In [None]:
# ================================================================
# 9. SIMPLE NEURAL NETWORK CLASSIFIER
# ================================================================

print("\n=== NEURAL NETWORK CLASSIFIER ===")

# Seed the random generators so weight initialisation, dropout masks and
# shuffling are repeatable across runs, matching the fixed random_state used
# for the train/test split.
# NOTE(review): some GPU kernels may still introduce small run-to-run
# differences — confirm if bit-exact reproducibility is required.
set_random_seed(42)

# Small fully connected binary classifier. The input width is declared with
# an explicit Input layer instead of the `input_shape=` argument on the first
# Dense layer.
nn = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

nn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# validation_split holds out 20% of the training data for per-epoch validation.
nn.fit(X_train, y_train, epochs=10, validation_split=0.2, verbose=1)

# evaluate returns the loss followed by the compiled metrics (accuracy here).
loss, acc = nn.evaluate(X_test, y_test)
print(f"NN Accuracy: {acc:.4f}")


In [None]:
# ================================================================
# 10. FINAL NOTEBOOK SUMMARY
# ================================================================

# Recap text, emitted with one print; joining on newlines yields exactly the
# same output as printing each entry separately.
summary_lines = (
    "\n=== PROJECT SUMMARY ===",
    "1. Loaded CSV (no indexing) and ran queries.",
    "2. Loaded data into SQLite.",
    "3. Timed SQL queries with and without indexes.",
    "4. Compared EXPLAIN QUERY PLAN results.",
    "5. Performed basic statistical analysis.",
    "6. Trained Logistic Regression classifier.",
    "7. Trained small Neural Network classifier.",
    "\nNotebook complete!",
)
print("\n".join(summary_lines))