In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
%pip install xgboost lightgbm

Note: you may need to restart the kernel to use updated packages.


In [10]:
import sqlite3
import pandas as pd
conn = sqlite3.connect(f"{base_folder}/data/housing.db")
housing = pd.read_sql_query(
    """
    SELECT
        b.block_id,
        b.longitude,
        b.latitude,
        s.housing_median_age,
        s.total_rooms,
        s.total_bedrooms,
        s.population,
        s.households,
        s.median_income,
        s.median_house_value,
        op.name AS ocean_proximity
    FROM block AS b
    JOIN block_housing_stats AS s
        ON s.block_id = b.block_id
    JOIN ocean_proximity AS op
        ON op.ocean_proximity_id = b.ocean_proximity_id
    ORDER BY b.block_id
    """,
    conn,
)
conn.close()

housing.head()

NameError: name 'base_folder' is not defined

In [6]:
%pip install -r /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/api/requirements.txt

Collecting fastapi (from -r /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/api/requirements.txt (line 1))
  Using cached fastapi-0.125.0-py3-none-any.whl.metadata (30 kB)
Collecting pandas==2.2.2 (from -r /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/api/requirements.txt (line 3))
  Using cached pandas-2.2.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (19 kB)
Collecting scikit-learn==1.6.1 (from -r /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/api/requirements.txt (line 4))
  Using cached scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting joblib==1.5.2 (from -r /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/api/requirements.txt (line 5))
  Using cached joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting xgboost==3.1.2 (from -r /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/api/requirements.txt (line 6))
  Using cached xgboost-3.1.2-py3-none-macosx_12_0_arm64.w

In [5]:
import os
import sqlite3
import pandas as pd

# -----------------------------------------------------------------------------
# ROBUST PATH FINDING (Fixes "unable to open database file")
# -----------------------------------------------------------------------------

# Get the directory where this notebook is running
current_dir = os.getcwd()
print(f"Current working directory: {current_dir}")

# Attempt 1: Check if we are in the 'notebooks' folder (standard behavior)
# Path would be ../data/churn.db
path_attempt_1 = os.path.abspath(os.path.join(current_dir, '..', 'data', 'churn.db'))

# Attempt 2: Check if we are in the project root (VS Code behavior)
# Path would be ./data/churn.db
path_attempt_2 = os.path.abspath(os.path.join(current_dir, 'data', 'churn.db'))

if os.path.exists(path_attempt_1):
    db_path = path_attempt_1
    print(f"✓ Found database at: {db_path}")
elif os.path.exists(path_attempt_2):
    db_path = path_attempt_2
    print(f"✓ Found database at: {db_path}")
else:
    # Stop execution if we can't find it
    raise FileNotFoundError(f"❌ Could not find churn.db in {path_attempt_1} OR {path_attempt_2}")

# -----------------------------------------------------------------------------
# CONNECT AND LOAD
# -----------------------------------------------------------------------------
conn = sqlite3.connect(db_path)

query = """
SELECT c.CreditScore, g.Name AS Geography, gen.Name AS Gender, c.Age, c.Tenure, c.Balance, 
       c.NumOfProducts, c.HasCrCard, c.IsActiveMember, c.EstimatedSalary, c.Exited
FROM customer c 
JOIN geography g ON c.GeographyID = g.GeographyID 
JOIN gender gen ON c.GenderID = gen.GenderID
"""

try:
    df = pd.read_sql_query(query, conn)
    print(f"✓ Successfully loaded {len(df)} rows.")
except Exception as e:
    print(f"❌ Database query failed: {e}")
finally:
    conn.close()

# Prepare X and y
X = df.drop("Exited", axis=1)
y = df["Exited"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("✓ Data split into Train/Test sets.")

Current working directory: /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25
✓ Found database at: /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/data/churn.db
✓ Successfully loaded 100 rows.
✓ Data split into Train/Test sets.


In [6]:
# -----------------------------------------------------------------------------
# 2. RUN 8 EXPERIMENTS (Untuned)
# -----------------------------------------------------------------------------
mlflow.set_tracking_uri("sqlite:///mlflow.db") 
mlflow.set_experiment("churn_experiments")

algorithms = ["logistic", "random_forest", "xgboost", "lightgbm"]
preprocessing = build_preprocessing()
results = []

print(f"{'Run Name':<40} {'F1 Score':<10}")
print("-" * 55)

for algo in algorithms:
    for use_pca in [False, True]:
        run_name = f"{algo}_{'pca' if use_pca else 'no_pca'}_untuned"
        
        # Build Pipeline
        steps = [preprocessing]
        if use_pca:
            steps.append(PCA(n_components=0.95)) # Keep 95% variance
        steps.append(make_estimator_for_name(algo))
        
        pipeline = make_pipeline(*steps)

        # Train
        pipeline.fit(X_train, y_train)
        
        # Evaluate
        y_pred = pipeline.predict(X_test)
        test_f1 = f1_score(y_test, y_pred)
        
        print(f"{run_name:<40} {test_f1:.4f}")

        # Log to MLflow
        with mlflow.start_run(run_name=run_name):
            mlflow.log_params({"algo": algo, "pca": use_pca, "tuned": False})
            mlflow.log_metric("f1", test_f1)
            mlflow.sklearn.log_model(pipeline, "model")
        
        results.append({"name": run_name, "f1": test_f1, "model": pipeline})

# -----------------------------------------------------------------------------
# 3. SAVE THE BEST MODEL
# -----------------------------------------------------------------------------
best_run = max(results, key=lambda x: x['f1'])
models_dir = Path("../models")

# Robust path handling for models directory
if not models_dir.exists():
    # If running from root, try ./models
    if Path("models").exists():
        models_dir = Path("models")
    else:
        models_dir.mkdir(parents=True, exist_ok=True)

save_path = models_dir / "best_untuned_model.pkl"
joblib.dump(best_run['model'], save_path)

print("\n" + "="*55)
print(f"BEST MODEL: {best_run['name']} (F1: {best_run['f1']:.4f})")
print(f"Saved to: {save_path}")
print("="*55)

2025/12/18 14:23:22 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/12/18 14:23:22 INFO mlflow.store.db.utils: Updating database tables
2025/12/18 14:23:22 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2025/12/18 14:23:22 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2025/12/18 14:23:22 INFO alembic.runtime.migration: Running upgrade  -> 451aebb31d03, add metric step
2025/12/18 14:23:23 INFO alembic.runtime.migration: Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025/12/18 14:23:23 INFO alembic.runtime.migration: Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025/12/18 14:23:23 INFO alembic.runtime.migration: Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025/12/18 14:23:23 INFO alembic.runtime.migration: Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025/12/18 14:23:23 INFO alembic.runtime.migration: Running 

Run Name                                 F1 Score  
-------------------------------------------------------
logistic_no_pca_untuned                  0.4000




logistic_pca_untuned                     0.4444




random_forest_no_pca_untuned             0.5714




random_forest_pca_untuned                0.3750


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgboost_no_pca_untuned                   0.6250


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


xgboost_pca_untuned                      0.5000




lightgbm_no_pca_untuned                  0.5882




lightgbm_pca_untuned                     0.5000

BEST MODEL: xgboost_no_pca_untuned (F1: 0.6250)
Saved to: models/best_untuned_model.pkl


In [1]:
# notebooks/02_train_model_without_optuna.ipynb
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent)) # Add project root to path

import sqlite3
import pandas as pd
import mlflow
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
from housing_pipeline import build_preprocessing, make_estimator_for_name

# 1. LOAD DATA
conn = sqlite3.connect("../data/churn.db")
query = """
SELECT c.CreditScore, g.Name AS Geography, gen.Name AS Gender, c.Age, c.Tenure, c.Balance, 
       c.NumOfProducts, c.HasCrCard, c.IsActiveMember, c.EstimatedSalary, c.Exited
FROM customer c JOIN geography g ON c.GeographyID = g.GeographyID JOIN gender gen ON c.GenderID = gen.GenderID
"""
df = pd.read_sql_query(query, conn)
conn.close()

X = df.drop("Exited", axis=1)
y = df["Exited"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 2. RUN 8 EXPERIMENTS (Untuned)
mlflow.set_tracking_uri("sqlite:///mlflow.db") # Or your Dagshub URI
mlflow.set_experiment("churn_experiments")

algorithms = ["logistic", "random_forest", "xgboost", "lightgbm"]
preprocessing = build_preprocessing()
results = []

for algo in algorithms:
    for use_pca in [False, True]:
        run_name = f"{algo}_{'pca' if use_pca else 'no_pca'}_untuned"
        print(f"Running: {run_name}")
        
        steps = [preprocessing]
        if use_pca: steps.append(PCA(n_components=0.95))
        steps.append(make_estimator_for_name(algo))
        
        pipeline = make_pipeline(*steps)
        pipeline.fit(X_train, y_train)
        
        f1 = f1_score(y_test, pipeline.predict(X_test))
        print(f"  -> F1: {f1:.4f}")
        
        with mlflow.start_run(run_name=run_name):
            mlflow.log_params({"algo": algo, "pca": use_pca, "tuned": False})
            mlflow.log_metric("f1", f1)
            mlflow.sklearn.log_model(pipeline, "model")
            
        results.append({"model": pipeline, "f1": f1})

# Save Best Untuned Model
best = max(results, key=lambda x: x['f1'])
Path("../models").mkdir(exist_ok=True)
joblib.dump(best['model'], "../models/best_untuned_model.pkl")

OperationalError: unable to open database file