# House Price ML — Starter Notebook

This notebook trains and compares two regression models on a housing dataset.
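- Preferred input: `data/house_prices.csv` (your custom dataset; the target is taken from a `price`, `SalePrice`, or `target` column, otherwise the last column is used).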
- Fallback: use **sklearn California Housing** dataset (auto-load).

**Pipeline**
1. Load data (local CSV or sklearn fallback)
2. Quick EDA (head/info/describe + missing values)
3. Train/validation split
4. Preprocessing (numeric impute + scaling)
5. Models: Linear Regression, Random Forest
6. Metrics: MAE, RMSE, R² (on validation set)
7. Save plots to `plots/` and model to `model.pkl`

In [None]:
# 0) Imports
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# 1) Load data
# A local CSV at data/house_prices.csv (with a 'price' target column) takes priority.
# When that file is absent, the sklearn California Housing dataset is used instead.
csv_path = 'data/house_prices.csv'

if not os.path.exists(csv_path):
    # Imported lazily: only needed on the fallback path.
    from sklearn.datasets import fetch_california_housing
    cali = fetch_california_housing(as_frame=True)
    # rename() returns a new frame, so the bundled frame itself is left untouched;
    # the target column is renamed to 'price', all other columns stay as features.
    df = cali.frame.rename(columns={'MedHouseVal': 'price'})
    print('Loaded sklearn California Housing dataset as fallback.')
else:
    df = pd.read_csv(csv_path)
    print(f'Loaded local CSV: {csv_path}')

print('Shape:', df.shape)
df.head()

In [None]:
# 2) Quick EDA
# First rows, then summary statistics transposed so each column becomes a row.
display(df.head(), df.describe().T)

print('\nData types:')
print(df.dtypes)

# Per-column count of missing values, largest first
missing = df.isna().sum().sort_values(ascending=False)
print('\nMissing values per column:')
if missing.sum() > 0:
    # Only list the columns that actually have gaps
    print(missing[missing > 0])
else:
    print('No missing values detected.')

In [None]:
# 3) Define target and features
# Try to detect a reasonable target column. Default to 'price'.
candidate_targets = ['price', 'SalePrice', 'target']
# The first candidate present in the frame wins; None when no candidate matches.
target_col = next((c for c in candidate_targets if c in df.columns), None)

if target_col is None:
    # If none found, assume the last column is the target (not ideal, but makes notebook runnable).
    target_col = df.columns[-1]
    print(f'No standard target found; using last column as target: {target_col}')

# Rows without a target value cannot be used for fitting or scoring, and the
# regressors trained later reject NaN targets, so they are dropped up front.
# Missing *feature* values are left in place; the imputer handles those.
has_target = df[target_col].notna()
n_missing_target = int((~has_target).sum())
if n_missing_target:
    print(f'Dropping {n_missing_target} rows with a missing target value ({target_col}).')
labeled = df.loc[has_target]

y = labeled[target_col]
X = labeled.drop(columns=[target_col])

# Keep only numeric features for this simple baseline
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
if not num_cols:
    # Fail here with a clear message instead of an obscure error during fitting.
    raise ValueError('No numeric feature columns found; this baseline needs at least one.')
X = X[num_cols]
print('Using numeric features:', num_cols[:10], '...' if len(num_cols) > 10 else '')

In [None]:
# 4) Train/validation split
# 20% of the rows are held out for validation; the fixed seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)
(X_train.shape, X_valid.shape)

In [None]:
# 5) Preprocessing pipeline for numeric columns
# Step order matters: fill gaps with the column median first, then scale.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Every training column goes through the numeric branch.
preprocess = ColumnTransformer(transformers=[
    ('num', numeric_transformer, X_train.columns),
])

In [None]:
# 6) Define models
# Both pipelines are handed the same `preprocess` object, followed by their own estimator.
forest = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

linreg = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', LinearRegression()),
])

rf = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', forest),
])

In [None]:
# 7) Train
# Fit order is unchanged: linear regression first, then the random forest.
for estimator in (linreg, rf):
    estimator.fit(X_train, y_train)
print('Training completed.')

In [None]:
# 8) Evaluate
def eval_regression(model, Xv, yv, name='model'):
    """Score a fitted regressor on a hold-out set and print a one-line summary.

    Parameters
    ----------
    model : object with a ``predict`` method
        Already-fitted estimator or pipeline.
    Xv : features passed to ``model.predict``
    yv : true target values aligned with ``Xv``
    name : str, default 'model'
        Label used in the printed line and in the returned record.

    Returns
    -------
    dict
        ``{'name', 'MAE', 'RMSE', 'R2'}`` for the given model.
    """
    preds = model.predict(Xv)
    mae = mean_absolute_error(yv, preds)
    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where
    # it raises TypeError. For a single target column the value is the same.
    rmse = np.sqrt(mean_squared_error(yv, preds))
    r2 = r2_score(yv, preds)
    print(f'[{name}] MAE={mae:.4f}  RMSE={rmse:.4f}  R2={r2:.4f}')
    return {'name': name, 'MAE': mae, 'RMSE': rmse, 'R2': r2}

# Score both fitted pipelines on the validation split.
res_lin = eval_regression(linreg, X_valid, y_valid, name='LinearRegression')
res_rf = eval_regression(rf, X_valid, y_valid, name='RandomForest')

# One row per model, built from the metric dicts returned above.
score_rows = [res_lin, res_rf]
results = pd.DataFrame(score_rows)
results

In [None]:
# 9) Plot: Predicted vs Actual (for the best model)
# The model with the strictly lower validation RMSE wins; a tie goes to linear regression.
best = res_rf if res_rf['RMSE'] < res_lin['RMSE'] else res_lin
best_name = best['name']
best_model = rf if best_name == 'RandomForest' else linreg

preds = best_model.predict(X_valid)

plot_path = 'plots/pred_vs_actual.png'
# savefig does not create missing directories, so make sure plots/ exists;
# otherwise a fresh checkout fails here with FileNotFoundError.
os.makedirs(os.path.dirname(plot_path), exist_ok=True)

plt.figure(figsize=(6,6))
plt.scatter(y_valid, preds, alpha=0.4)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.title(f'Predicted vs Actual — {best_name}')
# Save before show(): the figure is written while it is still the current one.
plt.savefig(plot_path, bbox_inches='tight', dpi=150)
plt.show()

print(f'Plot saved to {plot_path}')

In [None]:
# 10) (Optional) Feature importance for RandomForest
if best_name == 'RandomForest':
    # Pull the fitted forest out of the pipeline, plus the column list that was
    # handed to the first (and only) ColumnTransformer entry.
    rf_model = best_model.named_steps['model']
    feature_names = best_model.named_steps['preprocess'].transformers_[0][2]
    importances = pd.Series(rf_model.feature_importances_, index=feature_names).sort_values(ascending=False)
    display(importances.head(15))

    plot_path2 = 'plots/feature_importance.png'
    # savefig does not create missing directories, so make sure plots/ exists;
    # otherwise a fresh checkout fails here with FileNotFoundError.
    os.makedirs(os.path.dirname(plot_path2), exist_ok=True)

    plt.figure(figsize=(8,5))
    importances.head(15).plot(kind='bar')
    plt.title('Top 15 Feature Importances (RandomForest)')
    plt.tight_layout()
    plt.savefig(plot_path2, bbox_inches='tight', dpi=150)
    plt.show()
    print(f'Plot saved to {plot_path2}')
else:
    print('Feature importances shown only for RandomForest.')

In [None]:
# 11) Save best model
# Persist the whole fitted pipeline (preprocessing + estimator) to one file.
model_file = 'model.pkl'
joblib.dump(best_model, model_file)
print('Saved model to model.pkl')