In [6]:
import sys
import os

# 1. Print where we are currently running
current_dir = os.getcwd()
print(f"Current Directory: {current_dir}")

# 2. Check if the file is in the parent directory (one folder up)
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
file_path = os.path.join(parent_dir, 'housing_pipeline.py')

if os.path.exists(file_path):
    print(f"✓ Found housing_pipeline.py at: {file_path}")
    # Add the parent directory to Python's path
    if parent_dir not in sys.path:
        sys.path.insert(0, parent_dir)
        print("✓ Added parent directory to path.")
else:
    # 3. Fallback: Check if we are already in the root folder
    local_path = os.path.join(current_dir, 'housing_pipeline.py')
    if os.path.exists(local_path):
        print(f"✓ Found housing_pipeline.py in current folder.")
        if current_dir not in sys.path:
            sys.path.insert(0, current_dir)
    else:
        print("❌ CRITICAL ERROR: Could not find 'housing_pipeline.py'.")
        print("Please make sure the file exists in the project root folder.")

# 4. Now try to import
try:
    import housing_pipeline
    from housing_pipeline import build_preprocessing, make_estimator_for_name
    print("SUCCESS: Module imported correctly!")
except ImportError as e:
    print(f"Import failed: {e}")

Current Directory: /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/notebooks
✓ Found housing_pipeline.py at: /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/housing_pipeline.py
✓ Added parent directory to path.
SUCCESS: Module imported correctly!


In [1]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
base_folder = "/content/drive/MyDrive/Colab Notebooks/housing_fall2025"
%cd "{base_folder}"

/content/gdrive/MyDrive/Colab Notebooks/housing_fall2025


In [3]:
# notebooks/01_create_database.ipynb
import sys
from pathlib import Path
import sqlite3
import pandas as pd
import numpy as np
import os

# Define paths

DATA_DIR = Path("../data")
DB_PATH = DATA_DIR / "churn.db"
CSV_PATH = DATA_DIR / "Churn_Modelling.csv"
DATA_DIR.mkdir(parents=True, exist_ok=True)

# 1. GET DATA (If you don't have the CSV, this creates dummy data to ensure it runs)
if not CSV_PATH.exists():
    print("Creating dummy Churn dataset...")
    df = pd.DataFrame({
        'CustomerId': range(100),
        'CreditScore': np.random.randint(350, 850, 100),
        'Geography': np.random.choice(['France', 'Spain', 'Germany'], 100),
        'Gender': np.random.choice(['Female', 'Male'], 100),
        'Age': np.random.randint(18, 90, 100),
        'Tenure': np.random.randint(0, 10, 100),
        'Balance': np.random.uniform(0, 200000, 100),
        'NumOfProducts': np.random.choice([1, 2, 3, 4], 100),
        'HasCrCard': np.random.choice([0, 1], 100),
        'IsActiveMember': np.random.choice([0, 1], 100),
        'EstimatedSalary': np.random.uniform(20000, 150000, 100),
        'Exited': np.random.choice([0, 1], 100)
    })
    df.to_csv(CSV_PATH, index=False)
else:
    df = pd.read_csv(CSV_PATH)

# 2. NORMALIZE & SAVE TO DB
print("Building Normalized Database...")
if DB_PATH.exists(): os.remove(DB_PATH)
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()

# Create Tables
cur.executescript("""
    CREATE TABLE geography (GeographyID INTEGER PRIMARY KEY, Name TEXT UNIQUE);
    CREATE TABLE gender (GenderID INTEGER PRIMARY KEY, Name TEXT UNIQUE);
    CREATE TABLE customer (
        CustomerId INTEGER PRIMARY KEY, CreditScore INTEGER, GeographyID INTEGER, GenderID INTEGER,
        Age INTEGER, Tenure INTEGER, Balance REAL, NumOfProducts INTEGER, HasCrCard INTEGER,
        IsActiveMember INTEGER, EstimatedSalary REAL, Exited INTEGER,
        FOREIGN KEY(GeographyID) REFERENCES geography(GeographyID),
        FOREIGN KEY(GenderID) REFERENCES gender(GenderID)
    );
""")

# Insert Data
geo_dim = df[['Geography']].drop_duplicates().reset_index(drop=True)
geo_dim['ID'] = geo_dim.index + 1
gen_dim = df[['Gender']].drop_duplicates().reset_index(drop=True)
gen_dim['ID'] = gen_dim.index + 1

cur.executemany("INSERT INTO geography VALUES (?, ?)", list(zip(geo_dim['ID'], geo_dim['Geography'])))
cur.executemany("INSERT INTO gender VALUES (?, ?)", list(zip(gen_dim['ID'], gen_dim['Gender'])))

df = df.merge(geo_dim, on='Geography').merge(gen_dim, on='Gender')
cols = ['CustomerId', 'CreditScore', 'ID_x', 'ID_y', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']
cur.executemany(f"INSERT INTO customer VALUES ({','.join(['?']*12)})", df[cols].values.tolist())

conn.commit()
conn.close()
print(f"✓ Database created at {DB_PATH}")

Building Normalized Database...
✓ Database created at ../data/churn.db


In [7]:
import housing_pipeline
import inspect
import os

print("=== DEBUGGING INFO ===")
print(f"1. File Location: {housing_pipeline.__file__}")
print(f"2. Last Modified: {os.path.getmtime(housing_pipeline.__file__)}")

print("\n3. Checking code inside 'make_estimator_for_name':")
try:
    print(inspect.getsource(housing_pipeline.make_estimator_for_name))
except Exception as e:
    print(f"Could not read source: {e}")

=== DEBUGGING INFO ===
1. File Location: /Users/nayanpaliwal/Desktop/Eas_final_project/housing_app_fall25/housing_pipeline.py
2. Last Modified: 1766080779.7209566

3. Checking code inside 'make_estimator_for_name':
def make_estimator_for_name(name: str):
    """
    Given a model name, return an unconfigured estimator instance.
    Used in PCA variants and (optionally) elsewhere.
    """
    if name == "ridge":
        return Ridge()
    elif name == "histgradientboosting":
        return HistGradientBoostingRegressor(random_state=42)
    elif name == "xgboost":
        return XGBRegressor(
            objective="reg:squarederror",
            random_state=42,
            n_estimators=300,
            learning_rate=0.1,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            tree_method="hist",
            n_jobs=-1,
        )
    elif name == "lightgbm":
        return LGBMRegressor(
            random_state=42,
            n_estimators=300,
 