In [1]:
# Question 3: Advanced Model Evaluation with Feature Selection for House Prices

# Step 1: Load a house prices dataset from CSV (assumes a house_prices.csv file is available).
# Step 2: Apply feature selection and create a train-test split.
# Step 3: Train a Lasso Regression model.
# Step 4: Perform model evaluation and hyperparameter tuning using GridSearchCV.

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def load_preprocess_train_evaluate_house_prices(file_path='house_prices.csv', test_size=0.2, random_state=42, cv=5):
    """
    Load a house prices CSV, then tune, train and evaluate a Lasso pipeline.

    The last column of the CSV is treated as the target. Categorical feature
    columns are one-hot encoded with ``pd.get_dummies`` before the train/test
    split. Scaling, Lasso-based feature selection and the final Lasso
    regressor are chained in a single ``Pipeline`` so that ``GridSearchCV``
    re-fits the scaler and the selector inside every cross-validation fold
    (no information from a validation fold leaks into preprocessing), and so
    that the returned model can be applied directly to unscaled features.

    Args:
        file_path (str): Path to the CSV file containing the house prices dataset.
        test_size (float): Proportion of the data to use for testing.
        random_state (int): Random seed for reproducibility.
        cv (int): Number of cross-validation folds.

    Returns:
        tuple: A tuple containing:
            - mse (float): Mean squared error on the test set.
            - r2 (float): R-squared score on the test set.
            - best_model (Pipeline): The best trained pipeline
              (scaler -> selector -> lasso). It expects features that were
              one-hot encoded into the same columns as the training data.

        If any error occurs (missing/empty/unparseable file, invalid data,
        or any other exception), a message is printed and
        ``(None, None, None)`` is returned instead of raising.
    """
    try:
        # Step 1: Load the house prices dataset
        print(f"Loading house prices dataset from: {file_path}")
        data = pd.read_csv(file_path)
        print("Dataset loaded successfully.")

        # Basic Exploration
        print("\n--- Dataset Information ---")
        data.info()
        print("\n--- First 5 rows of the dataset ---")
        print(data.head())

        # Identify target variable (assuming it's the last column)
        target_column = data.columns[-1]
        print(f"\nAssuming '{target_column}' is the target variable.")

        # Separate features and target
        X = data.drop(target_column, axis=1)
        y = data[target_column]

        if X.empty or y.empty:
            raise ValueError("Features (X) or target (y) are empty after separation.")

        # Step 2: Apply feature selection and create a train-test split
        print("\n--- Feature Selection and Train-Test Split ---")

        # Handle categorical features (simple one-hot encoding for demonstration)
        X = pd.get_dummies(X, drop_first=True)

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
        print(f"Training set size: {len(X_train)}")
        print(f"Testing set size: {len(X_test)}")
        print(f"Number of features before selection: {X_train.shape[1]}")

        # Step 3 & 4: Train Lasso Regression with GridSearchCV for hyperparameter tuning and evaluation
        print("\n--- Training Lasso Regression with GridSearchCV ---")

        # Scaling and selection live inside the pipeline on purpose: fitting
        # them once on the full training set before cross-validation would
        # let every validation fold influence the preprocessing, which
        # biases the CV scores used to pick alpha.
        pipeline = Pipeline([
            # Feature scaling (important for Lasso)
            ('scaler', StandardScaler()),
            # Lasso (L1 regularization) as the feature selector; features
            # whose coefficient is shrunk to (effectively) zero are dropped.
            # The selector's alpha is fixed here; you might need to tune it.
            ('selector', SelectFromModel(Lasso(alpha=0.1, random_state=random_state))),
            ('lasso', Lasso(random_state=random_state)),
        ])

        # Define the parameter grid for alpha (regularization strength)
        param_grid = {
            'lasso__alpha': np.logspace(-4, 2, 7)  # Explore a range of alpha values
        }

        # Instantiate GridSearchCV
        grid_search = GridSearchCV(pipeline, param_grid, cv=cv, scoring='neg_mean_squared_error', verbose=1, error_score='raise')

        # Train on the unscaled training features; the pipeline handles the rest
        grid_search.fit(X_train, y_train)

        # Get the best model (refit on the whole training set by GridSearchCV)
        best_model = grid_search.best_estimator_
        print("\nBest parameters found:", grid_search.best_params_)

        selected_count = int(best_model.named_steps['selector'].get_support().sum())
        print(f"Number of features after Lasso-based selection: {selected_count}")

        # Evaluate the best model on the test set
        y_pred = best_model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print("\n--- Model Evaluation on Test Set ---")
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"R-squared Score: {r2:.4f}")

        return mse, r2, best_model

    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Please check the file path.")
        return None, None, None
    except pd.errors.EmptyDataError:
        print(f"Error: The file '{file_path}' is empty.")
        return None, None, None
    except pd.errors.ParserError:
        print(f"Error: Failed to parse the file '{file_path}'. Ensure it's a valid CSV format.")
        return None, None, None
    except ValueError as ve:
        print(f"ValueError: {ve}")
        return None, None, None
    except Exception as e:
        # Top-level boundary for this script-style entry point: report and
        # signal failure through the (None, None, None) return value.
        print(f"An unexpected error occurred: {e}")
        return None, None, None

if __name__ == "__main__":
    # Run the full workflow against the default dataset location.
    mse, r2, best_model = load_preprocess_train_evaluate_house_prices(
        file_path='house_prices.csv',
    )

    # A None MSE signals that the workflow reported an error and produced
    # no results, so the summary is only shown on success.
    if mse is not None:
        print(
            "\n--- Summary ---\n"
            f"Final Mean Squared Error on Test Set: {mse:.4f}\n"
            f"Final R-squared Score on Test Set: {r2:.4f}\n"
            f"\nBest Lasso Model: {best_model}"
        )

Loading house prices dataset from: house_prices.csv
Error: The file 'house_prices.csv' was not found. Please check the file path.
