# LazyPredict Comparison

Este notebook carrega os artefatos gerados por `00_data_prep.ipynb` (`rio_iqr.parquet`, `scalers.pt`, `feature_cols.json`, `train_idx.npy`, `val_idx.npy`), executa o LazyPredict (`LazyRegressor`) e grava as previsões e o leaderboard em arquivos, para comparação com o modelo PyTorch.

In [1]:
!pip install lazyPredict



Inicializando as variáveis a partir dos arquivos gerados pelo airbnb_predictor.

In [2]:
# ---------------------------------------------------------------------
# 1) Extract features (X) and target (y) from the DataFrame
# ---------------------------------------------------------------------
from pathlib import Path
import json, os
import numpy as np
import pandas as pd
import torch
# Artifact directory, resolved relative to the notebook's working directory.
ROOT = Path('.')
OUT = ROOT / 'outputs'

# Fail fast: collect every absent artifact so the error names all of them at once.
expected = [
    'rio_iqr.parquet',
    'feature_cols.json',
    'scalers.pt',
    'train_idx.npy',
    'val_idx.npy',
]
missing = [name for name in expected if not (OUT / name).exists()]
if len(missing) > 0:
    raise FileNotFoundError(f'Missing required artifacts in outputs/: {missing}. Run 00_data_prep.ipynb or lesson04_luciana.ipynb save cells first.')

# Read the preprocessed table and the list of feature column names.
rio = pd.read_parquet(OUT / 'rio_iqr.parquet')
with (OUT / 'feature_cols.json').open('r', encoding='utf-8') as f:
    feature_cols = json.load(f)
# Loader for torch.save artifacts that copes with the PyTorch 2.6+ change of the
# `weights_only` default, trying the safest loading mode first.
def safe_torch_load(path):
    """Load a trusted ``torch.save`` artifact onto the CPU.

    Attempts, in order:
      1. ``torch.load`` with the installed version's defaults.
      2. ``torch.load(weights_only=True)`` after allowlisting NumPy's array
         reconstruction objects (only if ``torch.serialization.add_safe_globals``
         exists). Note that the allowlist is process-wide.
      3. ``torch.load(weights_only=False)``: full unpickling, which can execute
         arbitrary code. Only acceptable for files produced locally.

    Args:
        path: ``str`` or path-like location of the file.

    Returns:
        The object stored in the file.

    Raises:
        RuntimeError: if every attempt fails; the message lists each error.
    """
    path = str(path)
    errors = []

    # 1) Plain load with the installed PyTorch defaults.
    try:
        return torch.load(path, map_location='cpu')
    except Exception as exc:
        errors.append(exc)

    # 2) Safe load with NumPy globals allowlisted. add_safe_globals needs the
    #    actual objects; a dotted-name string does not allowlist anything.
    add_safe_globals = getattr(torch.serialization, 'add_safe_globals', None)
    if add_safe_globals is not None:
        try:
            try:
                from numpy._core.multiarray import _reconstruct  # NumPy >= 2
            except ImportError:
                from numpy.core.multiarray import _reconstruct  # NumPy < 2
            add_safe_globals([_reconstruct, np.ndarray, np.dtype])
            return torch.load(path, map_location='cpu', weights_only=True)
        except Exception as exc:
            errors.append(exc)

    # 3) Last resort for a trusted file: full (unsafe) unpickling.
    try:
        return torch.load(path, map_location='cpu', weights_only=False)
    except Exception as exc:
        errors.append(exc)

    details = ' | '.join(str(err) for err in errors)
    raise RuntimeError(f'Failed to load {path}: {details}') from errors[-1]
# Scalers and split indices saved next to the table. `scalers` is loaded here but
# not used in the rest of this cell.
scalers = safe_torch_load(OUT / 'scalers.pt')
train_idx = np.load(OUT / 'train_idx.npy', allow_pickle=False)
val_idx = np.load(OUT / 'val_idx.npy', allow_pickle=False)

# Every declared feature must be a column of the table; report all absent ones at once.
missing_features = [c for c in feature_cols if c not in rio.columns]
if missing_features:
    raise KeyError(f"Features {missing_features} declared in feature_cols.json not found in rio_iqr.parquet")

X = rio[feature_cols].to_numpy(dtype=np.float32)
y = rio['price'].to_numpy(dtype=np.float32).ravel()

# The split must not leak rows between train and validation. Resolving the saved
# indices against row positions works for integer indices and boolean masks alike,
# and raises IndexError if an index is out of range.
row_positions = np.arange(len(rio))
shared_rows = np.intersect1d(row_positions[train_idx], row_positions[val_idx])
if shared_rows.size:
    raise ValueError(f'train_idx and val_idx share {shared_rows.size} row(s); the saved split leaks data between train and validation.')

# Use saved train/val indices to create the same split
X_train, X_val = X[train_idx], X[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

print('Loaded artifacts from outputs/. Shapes:')
print('  full X:', X.shape, 'y:', y.shape)
print('  X_train:', X_train.shape, 'X_val:', X_val.shape)
print('  y_train:', y_train.shape, 'y_val:', y_val.shape)

# Quick sanity checks: no missing values may reach the regressors.
assert not np.isnan(X_train).any(), 'Found NaNs in X_train. Clean/impute before running LazyPredict.'
assert not np.isnan(X_val).any(), 'Found NaNs in X_val. Clean/impute before running LazyPredict.'
assert not np.isnan(y_train).any(), 'Found NaNs in y_train.'
assert not np.isnan(y_val).any(), 'Found NaNs in y_val.'

Loaded artifacts from outputs/. Shapes:
  full X: (20923, 12) y: (20923,)
  X_train: (16738, 12) X_val: (4185, 12)
  y_train: (16738,) y_val: (4185,)


Com as variáveis lidas dos arquivos, vamos usar o LazyPredict.

In [3]:
# LazyPredict must already be installed in the kernel's environment.
from lazypredict.Supervised import LazyRegressor
from pathlib import Path
ROOT = Path('.')
OUT = ROOT / 'outputs'
# Re-read the artifacts from outputs/ so this cell always uses the table and split saved on disk.
rio = pd.read_parquet(OUT / 'rio_iqr.parquet')
with open(OUT / 'feature_cols.json', 'r', encoding='utf-8') as f:
    feature_cols = json.load(f)
X = rio[feature_cols].to_numpy(dtype=np.float32)
y = rio['price'].to_numpy(dtype=np.float32).ravel()
train_idx = np.load(OUT / 'train_idx.npy', allow_pickle=False)
val_idx = np.load(OUT / 'val_idx.npy', allow_pickle=False)
X_train, X_val = X[train_idx], X[val_idx]
y_train, y_val = y[train_idx], y[val_idx]
# predictions=True is required to get the per-sample predictions of every model as
# the second return value of fit(). With the default (False), fit() returns the
# score table twice and no predictions would be saved.
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None, predictions=True)
models, predictions = reg.fit(X_train, X_val, y_train, y_val)
# models: one row of scores per regressor; predictions: one column of validation predictions per regressor.
models.to_csv(OUT / 'lazypredict_models_df.csv')
predictions.to_pickle(OUT / 'lazypredict_predictions.pkl')
# Build a per-model leaderboard (RMSE / MAE / R2), sorted by RMSE, and save it.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def _metric_column(columns, needles):
    """Return the first column whose normalised name contains one of `needles`.

    Names are lower-cased and stripped of non-alphanumeric characters, so
    'R-Squared' is matched by 'rsquared'. Columns containing 'adjusted' are
    skipped so that 'Adjusted R-Squared' is never taken for plain R2.
    Returns None when nothing matches.
    """
    for col in columns:
        key = ''.join(ch for ch in str(col).lower() if ch.isalnum())
        if 'adjusted' in key:
            continue
        if any(needle in key for needle in needles):
            return col
    return None


rows = []
# Case A: predictions holds one row per validation sample and one column per model,
# so every metric can be recomputed against y_val.
if isinstance(predictions, pd.DataFrame) and predictions.shape[0] == len(y_val):
    for name in predictions.columns:
        y_pred = predictions[name].to_numpy().ravel()
        rows.append({
            'Model': name,
            'RMSE': float(np.sqrt(mean_squared_error(y_val, y_pred))),
            'MAE': float(mean_absolute_error(y_val, y_pred)),
            'R2': float(r2_score(y_val, y_pred)),
        })
# Case B: no per-sample predictions are available; take whatever metrics the
# models summary table provides and leave the rest as NaN.
else:
    rmse_col = _metric_column(models.columns, ('rmse',))
    mae_col = _metric_column(models.columns, ('mae',))
    r2_col = _metric_column(models.columns, ('r2', 'rsquared'))
    for idx, row in models.iterrows():
        rows.append({
            'Model': idx,
            'RMSE': float(row[rmse_col]) if rmse_col is not None else np.nan,
            'MAE': float(row[mae_col]) if mae_col is not None else np.nan,
            'R2': float(row[r2_col]) if r2_col is not None else np.nan,
        })
# Explicit columns keep sort_values working even if no model produced a row.
leader = pd.DataFrame(rows, columns=['Model', 'RMSE', 'MAE', 'R2']).sort_values('RMSE').reset_index(drop=True)
leader.to_csv(OUT / 'leaderboard_lazypredict.csv', index=False)
print('LazyPredict finished — artifacts saved: outputs/lazypredict_models_df.csv, outputs/lazypredict_predictions.pkl, outputs/leaderboard_lazypredict.csv')

  0%|          | 0/42 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 211
[LightGBM] [Info] Number of data points in the train set: 16738, number of used features: 10
[LightGBM] [Info] Start training from score 377.048990
LazyPredict finished — artifacts saved: outputs/lazypredict_models_df.csv, outputs/lazypredict_predictions.pkl, outputs/leaderboard_lazypredict.csv
