# #  ML Project Template
# 
# Reusable workflow:
# 1. Imports & config
# 2. Load data
# 3. Inspect & clean
# 4. EDA (visualization)
# 5. Preprocessing & feature engineering
# 6. Train/test split
# 7. Model training
# 8. Evaluation & residual analysis
# 9. Cross-validation
# 10. Iteration hooks



In [None]:
# 1. IMPORTS & CONFIG
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error
)

In [None]:
# Plot style (optional)
plt.style.use("default")
sns.set_theme()

RANDOM_STATE = 42

# ## 2. Load Data
# - Adjust file path and loader as needed.
# - This cell is the only one that should be very dataset-specific.

# 2. LOAD DATA

# TODO: Change this to your actual data file
DATA_PATH = "AmesHousing.csv"

df = pd.read_csv(DATA_PATH)

print("Data shape:", df.shape)
df.head()



In [None]:
# ## 3. Basic Inspection & Sanity Checks
# - `info`, `describe`, missing values
# - Categorical vs numeric overview

# 3. INSPECT DATA

print("=== INFO ===")
df.info()

print("\n=== DESCRIBE (NUMERIC) ===")
display(df.describe())

print("\n=== MISSING VALUES PER COLUMN ===")
print(df.isna().sum())

print("\n=== UNIQUE VALUES PER COLUMN ===")
print(df.nunique())



In [None]:
# ## 4. Quick EDA: Distributions & Outliers
# - Histograms
# - Boxplots
# - Basic skewness check
#
# NOTE: Adjust `numeric_cols` list to your dataset.
# 4. EDA - DISTRIBUTIONS

numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric columns:", numeric_cols)

# Histograms
df[numeric_cols].hist(bins=30, figsize=(15, 10))
plt.tight_layout()
plt.show()

# Boxplots for numeric columns
plt.figure(figsize=(12, 6))
df[numeric_cols].boxplot()
plt.xticks(rotation=45)
plt.title("Boxplots of Numeric Features")
plt.tight_layout()
plt.show()

# Skewness
print("\n=== SKEWNESS ===")
print(df[numeric_cols].skew())



In [None]:
# ## 5. Relationships: Scatter, Correlation, Pairplot
# - Use this cell when dataset is not huge.
# - Adjust `key_features` to a smaller subset if needed.
# 5. EDA - RELATIONSHIPS

# TODO: Set your target column
TARGET_COL = "SalePrice" #This will always be Y, or desired prediction

# Choose a few key features for scatter / pairplot
key_features = [col for col in numeric_cols if col != TARGET_COL][:4]
print("Key features for pairplot:", key_features)

if TARGET_COL in df.columns:
    # Simple scatter with target if numeric
    if np.issubdtype(df[TARGET_COL].dtype, np.number):
        for col in key_features:
            plt.figure()
            plt.scatter(df[col], df[TARGET_COL], alpha=0.5)
            plt.xlabel(col)
            plt.ylabel(TARGET_COL)
            plt.title(f"{col} vs {TARGET_COL}")
            plt.show()

# Correlation matrix
corr = df[numeric_cols].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=False, cmap="coolwarm", center=0)
plt.title("Correlation Matrix")
plt.tight_layout()
plt.show()

# Pairplot (small feature subset)
if len(key_features) > 1:
    sns.pairplot(df[key_features + [TARGET_COL]].dropna(), corner=True)
    plt.show()


In [25]:
# ## 6. Handle Missing Data & Obvious Cleaning
# - Light-touch template. Customize as needed.
# - Options: drop rows, impute, custom rules.

# 6. CLEANING

# Standardize column names to snake_case
import re

def to_snake(col):
    # Replace spaces and hyphens with underscore
    col = re.sub(r"[ -]+", "_", col)

    # Insert underscore before capital letters (except first)
    # col = re.sub(r"(?<!^)([A-Z])", r"_\1", col)

    # Convert to lowercase
    # col = col.lower()

    return col

df.columns = [to_snake(col) for col in df.columns]

# print("Updated column names:")
# print(df.columns.tolist())

# Identify missing data
missing_summary = df.isna().sum().sort_values(ascending=False)
print("Missing values per column:")
print(missing_summary.head(20))


Missing values per column:
Mas_Vnr_Area     23
BsmtFin_SF_2      1
Bsmt_Unf_SF       1
Total_Bsmt_SF     1
BsmtFin_SF_1      1
Garage_Cars       1
Garage_Area       1
Electrical        1
Lot_Area          0
PID               0
MS_SubClass       0
MS_Zoning         0
Order             0
Land_Slope        0
Neighborhood      0
Condition_2       0
Condition_1       0
House_Style       0
Overall_Qual      0
Overall_Cond      0
dtype: int64


6.1 DROP ROWS ONLY WHEN NECESSARY

Only drop:

rows missing the target

rows that are mostly empty

rows containing invalid data (e.g., negative square footage)

6.2 IMPUTE NUMERIC COLUMNS

Why median?

Median is robust to outliers
Mean gets distorted badly.

6.3 IMPUTE CATEGORICAL COLUMNS

Most common strategies:

Mode (most frequent value)

Works well when missing data is small.

Binary flag existence, set missing to None.  Use smart logic to determine if data was missing or just entered incorrectly.
This is unique to each Category.  Use the information you have.

A missing value indicator can be extremely predictive.

Example:

A missing basement quality often means “no basement”

A missing fireplace quality often means “no fireplace”

for col in df.columns:
    df[col + "_was_missing"] = df[col].isna().astype(int)


This adds search-light columns that say:

1 = was missing

0 = was present


Will oneHotOff or ordinal later.  This is all data cleaning.

6.5 CLEAN OBVIOUS BAD VALUES

Examples:

Square footage cannot be negative.

Lot area cannot be zero.

Quality ratings must be one of the known categories.

df['Gr_Liv_Area'] = df['Gr_Liv_Area'].clip(lower=0)

6.6 NORMALIZE TEXT DATA (easy mistakes)

Common cleaning:

df[col] = df[col].str.strip().str.lower()
df[col] = df[col].str.replace('-', '_')


Cleaning avoids:

duplicates (“Ex”, “ex”, “ EX”)

weird Unicode

inconsistent punctuation

6.8 Final Check: Data Must Be Fully Clean

After imputation:

print(df.isna().sum().sum())   # should be 0

If NOT zero → model will crash.


In [None]:
# Lets see if there is a positive correlation between Lot_Area and Lot_Frontage

sns.scatterplot(x=df["Lot_Area"], y=df["Lot_Frontage"])
plt.xscale("log")  # because Lot_Area spans huge ranges
plt.show()

df[["Lot_Area", "Lot_Frontage"]].corr()

# There is, so lets impute missing Lot_Frontage values based on Lot_Area



In [None]:
# ========================================
# IMPUTE LOT_FRONTAGE USING LOT_AREA RATIO
# ========================================

print("Missing Lot_Frontage BEFORE:", df["Lot_Frontage"].isna().sum())

# 1. Create missingness flag (always useful for models)
df["Lot_Frontage_missing"] = df["Lot_Frontage"].isna().astype(int)

# 2. Compute typical depth = Lot_Area / Lot_Frontage
#    (only using rows where both values exist)
valid = df.dropna(subset=["Lot_Frontage", "Lot_Area"])

# Avoid divide-by-zero rows
valid = valid[valid["Lot_Frontage"] > 0]

median_depth = (valid["Lot_Area"] / valid["Lot_Frontage"]).median()
print("Median lot depth (Lot_Area / Lot_Frontage):", median_depth)

# 3. Impute missing frontage as Lot_Area / median_depth
mask_missing = df["Lot_Frontage"].isna()

df.loc[mask_missing, "Lot_Frontage"] = (
    df.loc[mask_missing, "Lot_Area"] / median_depth
)

print("Missing Lot_Frontage AFTER:", df["Lot_Frontage"].isna().sum())
print("Imputed Lot_Frontage for", mask_missing.sum(), "rows")


In [None]:
# Handle pool weirdness

print(df["Pool_QC"].value_counts(dropna=False))
print(df["Pool_Area"].describe())
# Impute Pool_QC based on Pool_Area
df["Has_Pool"] = (
    (df["Pool_Area"] > 0) | df["Pool_QC"].notna()
).astype(int)
# For rows where we believe there is NO pool, set quality to "None"
no_pool_mask = (df["Has_Pool"] == 0) & (df["Pool_QC"].isna())
df.loc[no_pool_mask, "Pool_QC"] = "None"
print("After imputation of Pool_QC based on Pool_Area:")
print(df["Pool_QC"].value_counts(dropna=False))

In [None]:
#Handle Hassing Misc_Feature, add Boolean flag column
# Replace NA with None, strip whitespace

print(df["Misc_Feature"].value_counts(dropna=False))
df["Has_Misc_Feature"] = df["Misc_Feature"].notna().astype(int)
df["Misc_Feature"] = df["Misc_Feature"].fillna("None")
df["Misc_Feature"] = df["Misc_Feature"].str.strip()
df["Misc_Feature"].value_counts()

In [None]:
#Handle Alley
print(df["Alley"].value_counts(dropna=False))
# ============================
# CLEANING: Alley
# ============================

print("Before:")
print(df["Alley"].value_counts(dropna=False))

# 1. Boolean flag—does the house have alley access?
df["Has_Alley"] = df["Alley"].notna().astype(int)

# 2. Replace NA with explicit category "None"
df["Alley"] = df["Alley"].fillna("None")

# 3. Clean whitespace just in case
df["Alley"] = df["Alley"].str.strip()

print("\nAfter:")
print(df["Alley"].value_counts())
print(df["Has_Alley"].value_counts())


In [None]:
# ============================
# CLEANING: Fence
# ============================

print("Before:")
print(df["Fence"].value_counts(dropna=False))

# 1. Feature flag — does the house have ANY fenced enclosure?
df["Has_Fence"] = df["Fence"].notna().astype(int)

# 2. Replace NaN with explicit category "None"
df["Fence"] = df["Fence"].fillna("None")

# 3. Clean whitespace (always safe)
df["Fence"] = df["Fence"].str.strip()

print("\nAfter:")
print(df["Fence"].value_counts())
print(df["Has_Fence"].value_counts())


In [None]:
# ============================
# CLEANING: Mas_Vnr_Type
# ============================

# Ai defiiniely help with this one.  Didn't really think to include Masonry Veneer
# area as a qualifier for what the data meant.  If 0, NA means None.  If >0, NA means Unknown.

print("Before:")
print(df["Mas_Vnr_Type"].value_counts(dropna=False))

# 1. Feature flag — does the house have a masonry veneer?
df["Has_Masonry_Veneer"] = (df["Mas_Vnr_Area"] > 0).astype(int)

# 2. Case 1: Area == 0 → definitely None
none_mask = (df["Mas_Vnr_Area"] == 0) & (df["Mas_Vnr_Type"].isna())
df.loc[none_mask, "Mas_Vnr_Type"] = "None"

# 3. Case 2: Area > 0 but type is missing → ambiguous → "Unknown"
unknown_mask = (df["Mas_Vnr_Area"] > 0) & (df["Mas_Vnr_Type"].isna())
df.loc[unknown_mask, "Mas_Vnr_Type"] = "Unknown"

# 4. Clean whitespace
df["Mas_Vnr_Type"] = df["Mas_Vnr_Type"].str.strip()

print("\nAfter:")
print(df["Mas_Vnr_Type"].value_counts())
print(df["Has_Masonry_Veneer"].value_counts())


In [None]:
# ============================
# CLEANING: Fireplace_Qu
# ============================

print("Before:")
print(df["Fireplace_Qu"].value_counts(dropna=False))

# 1. Boolean flag — does the house have a fireplace?
df["Has_Fireplace"] = df["Fireplace_Qu"].notna().astype(int)

# 2. Replace NA with explicit category "None"
df["Fireplace_Qu"] = df["Fireplace_Qu"].fillna("None")

# 3. Clean whitespace
df["Fireplace_Qu"] = df["Fireplace_Qu"].str.strip()

print("\nAfter:")
print(df["Fireplace_Qu"].value_counts())
print(df["Has_Fireplace"].value_counts())


In [None]:
# Handle Garage
#1.  Intelligently determine the boolean flag for Has_Garage based on available data
df["Has_Garage"] = (
    (df["Garage_Cars"] > 0) | 
    (df["Garage_Area"] > 0) | 
    (df["Garage_Type"].notna())
).astype(int)
#2 Use the boolean flag to impute missing Garage_Type values
# If no garage: type is None
df.loc[(df["Has_Garage"] == 0) & (df["Garage_Type"].isna()), "Garage_Type"] = "None"

# If garage exists but type missing: mark Unknown
df.loc[(df["Has_Garage"] == 1) & (df["Garage_Type"].isna()), "Garage_Type"] = "Unknown"
print("After handling Garage_Type:")
print(df["Garage_Type"].value_counts(dropna=False))

#3 Use the boolean flag to impute other Garage-related categorical columns
garage_cols = ["Garage_Finish", "Garage_Qual", "Garage_Cond"]

for col in garage_cols:
    # No garage → None
    df.loc[(df["Has_Garage"] == 0) & (df[col].isna()), col] = "None"

    # Garage exists but value missing → Unknown
    df.loc[(df["Has_Garage"] == 1) & (df[col].isna()), col] = "Unknown"

#4 Deal with Garage_Yr_Blt.  Assume same year if possible.

df.loc[df["Has_Garage"] == 0, "Garage_Yr_Blt"] = 0

df.loc[(df["Has_Garage"] == 1) & (df["Garage_Yr_Blt"].isna()), "Garage_Yr_Blt"] = \
    df.loc[(df["Has_Garage"] == 1), "Year_Built"]




In [26]:
# ==================================
# CLEANING: Basement block
# ==================================

# 1. Does the house have a basement?
df["Has_Basement"] = (
    (df["Total_Bsmt_SF"] > 0) |
    (df["BsmtFin_SF_1"] > 0) |
    (df["BsmtFin_SF_2"] > 0) |
    (df["Bsmt_Unf_SF"] > 0)
).astype(int)

# 2. Finished vs Unfinished booleans
df["Has_Finished_Basement"] = (
    (df["BsmtFin_SF_1"] > 0) |
    (df["BsmtFin_SF_2"] > 0)
).astype(int)

df["Has_Unfinished_Basement"] = (df["Bsmt_Unf_SF"] > 0).astype(int)

# 3. Clean the categorical basement columns
basement_cols = [
    "Bsmt_Qual", "Bsmt_Cond", "BsmtFin_Type_1", "BsmtFin_Type_2"
]

# 3a. If no basement → fill with "None"
for col in basement_cols:
    df.loc[(df["Has_Basement"] == 0) & (df[col].isna()), col] = "None"

# 3b. If basement exists but NA → "Unknown"
for col in basement_cols:
    df.loc[(df["Has_Basement"] == 1) & (df[col].isna()), col] = "Unknown"

# 4. Special cleaning for Bsmt_Exposure
# NA means "No exposure" IF the house has a basement
df.loc[(df["Has_Basement"] == 0), "Bsmt_Exposure"] = "None"
df.loc[(df["Has_Basement"] == 1) & (df["Bsmt_Exposure"].isna()), "Bsmt_Exposure"] = "No"

# 5. Strip whitespace (safety)
for col in ["Bsmt_Qual", "Bsmt_Cond", "BsmtFin_Type_1", "BsmtFin_Type_2", "Bsmt_Exposure"]:
    df[col] = df[col].astype(str).str.strip()

# 6. Impute missing numeric basement columns with 0 (no basement area)
# 6. Numeric basement fields: set to 0 when no basement, and clean rare NaNs

numeric_basement_cols = [
    "Total_Bsmt_SF",
    "BsmtFin_SF_1",
    "BsmtFin_SF_2",
    "Bsmt_Unf_SF",
    "Bsmt_Full_Bath",
    "Bsmt_Half_Bath",
]

# Case 1: No basement → all numeric basement fields must be 0
for col in numeric_basement_cols:
    df.loc[df["Has_Basement"] == 0, col] = 0

# Case 2: Basement exists but value is NaN → assume 0 (no area / no bath recorded)
for col in numeric_basement_cols:
    df.loc[(df["Has_Basement"] == 1) & (df[col].isna()), col] = 0

# (Optional) cast to int where appropriate
int_basement_cols = ["Bsmt_Full_Bath", "Bsmt_Half_Bath"]
for col in int_basement_cols:
    df[col] = df[col].astype(int)



In [None]:
# ============================
# CLEANING: Mas_Vnr_Type + Mas_Vnr_Area
# ============================

print("Before (type):")
print(df["Mas_Vnr_Type"].value_counts(dropna=False))
print("Before (area NaNs):", df["Mas_Vnr_Area"].isna().sum())

# 1. If area > 0 AND type is NaN → there IS veneer, but type unknown
unknown_mask = (df["Mas_Vnr_Area"] > 0) & (df["Mas_Vnr_Type"].isna())
df.loc[unknown_mask, "Mas_Vnr_Type"] = "Unknown"

# 2. All remaining NaN types → treat as "None" (no veneer)
df.loc[df["Mas_Vnr_Type"].isna(), "Mas_Vnr_Type"] = "None"

# 3. Clean whitespace
df["Mas_Vnr_Type"] = df["Mas_Vnr_Type"].astype(str).str.strip()

# 4. Boolean: has masonry veneer? (now based on cleaned type)
df["Has_Masonry_Veneer"] = (df["Mas_Vnr_Type"] != "None").astype(int)

# 5. Fix Mas_Vnr_Area:
#    Case A: no veneer → area must be 0
none_area_mask = (df["Has_Masonry_Veneer"] == 0) & (df["Mas_Vnr_Area"].isna())
df.loc[none_area_mask, "Mas_Vnr_Area"] = 0

#    Case B: veneer present but area NaN → fill with median for that type
for t in df["Mas_Vnr_Type"].unique():
    type_mask = (df["Mas_Vnr_Type"] == t) & (df["Mas_Vnr_Area"].isna())
    if type_mask.any():
        median_area = df.loc[df["Mas_Vnr_Type"] == t, "Mas_Vnr_Area"].median()
        df.loc[type_mask, "Mas_Vnr_Area"] = median_area

print("\nAfter (type):")
print(df["Mas_Vnr_Type"].value_counts())
print("After (area NaNs):", df["Mas_Vnr_Area"].isna().sum())
print("Has_Masonry_Veneer value counts:")
print(df["Has_Masonry_Veneer"].value_counts())


Before (type):
Mas_Vnr_Type
None       1745
BrkFace     880
Stone       249
BrkCmn       25
nan          23
Unknown       7
CBlock        1
Name: count, dtype: int64

Before (area NaNs): 23

After (type):
Mas_Vnr_Type
None       1745
BrkFace     880
Stone       249
BrkCmn       25
nan          23
Unknown       7
CBlock        1
Name: count, dtype: int64
After (area NaNs): 23
Has_Masonry_Veneer value counts:
Has_Masonry_Veneer
0    1771
1    1159
Name: count, dtype: int64


In [None]:
# ============================
# CLEANING: Bsmt_Full_Bath & Bsmt_Half_Bath
# ============================

print("Before:")
print(df["Bsmt_Full_Bath"].value_counts(dropna=False))
print(df["Bsmt_Half_Bath"].value_counts(dropna=False))

# Full bath:
df.loc[df["Bsmt_Full_Bath"].isna(), "Bsmt_Full_Bath"] = 0

# Half bath:
df.loc[df["Bsmt_Half_Bath"].isna(), "Bsmt_Half_Bath"] = 0

# (Optional sanity: cast to integer)
df["Bsmt_Full_Bath"] = df["Bsmt_Full_Bath"].astype(int)
df["Bsmt_Half_Bath"] = df["Bsmt_Half_Bath"].astype(int)

print("\nAfter:")
print(df["Bsmt_Full_Bath"].value_counts())
print(df["Bsmt_Half_Bath"].value_counts())

df["Bsmt_Total_Baths"] = df["Bsmt_Full_Bath"] + 0.5 * df["Bsmt_Half_Bath"]
print("Created Bsmt_Total_Baths feature.")


In [37]:
# ==========================================
# SHOW ROW INDICES FOR RARE MISSING VALUES
# (columns with fewer than 5 missing entries)
# ==========================================

for col in df.columns:
    missing_idx = df[df[col].isna()].index
    
    if 0 < len(missing_idx) < 24:
        print(f"\nColumn: {col}")
        print(f"Missing count: {len(missing_idx)}")
        print(f"Row indices: {list(missing_idx)}")

missing_summary = df.isna().sum().sort_values(ascending=False)
print("Missing values per column:")
print(missing_summary.head(20))
# ==========================================



Column: Mas_Vnr_Area
Missing count: 23
Row indices: [55, 484, 517, 538, 867, 1095, 1119, 1122, 1127, 1184, 1454, 1727, 1751, 1783, 1799, 1839, 1840, 2229, 2260, 2382, 2392, 2455, 2823]

Column: Garage_Cars
Missing count: 1
Row indices: [2236]

Column: Garage_Area
Missing count: 1
Row indices: [2236]
Missing values per column:
Mas_Vnr_Area    23
Garage_Cars      1
Garage_Area      1
PID              0
MS_SubClass      0
Lot_Area         0
Street           0
Alley            0
Lot_Shape        0
Land_Contour     0
Utilities        0
MS_Zoning        0
Order            0
Land_Slope       0
Neighborhood     0
Condition_2      0
Condition_1      0
House_Style      0
Overall_Qual     0
Overall_Cond     0
dtype: int64


In [28]:
#Filling row 1577 with missing electrical as standard breaker, because home was built in 2008 and Iowa building code likely required it
df.loc[df["Electrical"].isna(), "Electrical"] = "SBrkr"
print("After handling Electrical:")
print(df["Electrical"].value_counts(dropna=False))

After handling Electrical:
Electrical
SBrkr    2683
FuseA     188
FuseF      50
FuseP       8
Mix         1
Name: count, dtype: int64


In [None]:

# ## 7. Define Features (X) and Target (y)
# - Select feature columns explicitly.
# - Split numeric vs categorical.
# - This is where you decide what the model can "see."

# 7. FEATURE / TARGET SETUP

# TODO: decide which columns are features
feature_cols = [c for c in df.columns if c != TARGET_COL]

X = df[feature_cols]
y = df[TARGET_COL]

numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(exclude=[np.number]).columns.tolist()

print("Feature columns:", feature_cols)
print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)



In [None]:
# ## 8. Preprocessing: Encoding & Scaling
# - Numeric: StandardScaler
# - Categorical: OneHotEncoder
# - All wrapped in a ColumnTransformer + Pipeline

# 8. PREPROCESSOR

numeric_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Example model: Linear Regression
regressor = LinearRegression()

# Full pipeline: preprocessing + model
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", regressor)
])

model



In [None]:
# ## 9. Train/Test Split
# - Typical split: 80/20 or 70/30.
# - Randomized with fixed `random_state` for reproducibility.

# 9. TRAIN / TEST SPLIT

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_STATE
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)




In [None]:
# 10. Train the Model
# - Fit the pipeline on training data.
# 10. TRAIN MODEL

model.fit(X_train, y_train)

print("Model trained.")

In [None]:
# ## 11. Evaluation on Train and Test Sets
# - R², MAE, RMSE
# - Compare train vs test for over/underfitting.

# 11. EVALUATION

def regression_metrics(y_true, y_pred, label=""):
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print(f"--- {label} ---")
    print(f"R²:   {r2:.4f}")
    print(f"MAE:  {mae:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print()

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

regression_metrics(y_train, y_pred_train, label="Train")
regression_metrics(y_test, y_pred_test, label="Test")

In [None]:
# ## 12. Residual Analysis (Train Set)
# - Check assumptions: linearity, homoscedasticity, normality.
# %%
# 12. RESIDUALS

train_residuals = y_train - y_pred_train

# Residuals vs predicted
plt.figure(figsize=(6, 4))
plt.scatter(y_pred_train, train_residuals, alpha=0.5)
plt.axhline(0, color="red", linestyle="--")
plt.xlabel("Predicted (Train)")
plt.ylabel("Residuals")
plt.title("Residuals vs Predicted (Train)")
plt.tight_layout()
plt.show()

# Histogram of residuals
plt.figure(figsize=(6, 4))
sns.histplot(train_residuals, kde=True)
plt.title("Residuals Distribution (Train)")
plt.tight_layout()
plt.show()

In [None]:
# ## 13. Cross Validation (Optional but Recommended)
# - K-Fold CV for more robust estimation.
# - Uses the full pipeline (preprocess + model) inside CV.

# 13. CROSS VALIDATION

kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

cv_scores = cross_val_score(model, X, y, cv=kf, scoring="r2")

print("CV R² scores:", cv_scores)
print("CV R² mean:", cv_scores.mean())
print("CV R² std:", cv_scores.std())



In [None]:
# %% [markdown]
# ## 14. TODO: Iteration Hooks
# - If metrics are poor:
#   - Revisit EDA, transformations, feature engineering.
#   - Consider:
#       - log/box-cox transforms on skewed features
#       - polynomial features or interaction terms
#       - different model (e.g., tree-based)
#   - Use cross-validation + grid search for tuning.
#
# This template gives you:
# - A consistent workflow
# - A place to drop in dataset-specific logic
# - A structure that matches your mental model:
#   import → inspect → visualize → clean → encode/scale → split → train → evaluate → refine.

