# In [2]:
# Question 3: Advanced Model Evaluation with Feature Selection for House Prices

# Step 1: Load a house prices dataset from CSV (assumes a house_prices.csv file is available).
# Step 2: Apply feature selection and create a train-test split.
# Step 3: Train a Lasso Regression model.
# Step 4: Perform model evaluation and hyperparameter tuning using GridSearchCV.

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import logging
import time
import yaml  # For configuration
from typing import Tuple

# Configure logging: INFO level and above, with each record rendered as
# "timestamp - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger named after this module; the functions in this file log through it.
logger = logging.getLogger(__name__)  # Get a specific logger

# Custom Exception for Data Validation
class DataValidationError(ValueError):
    """Signal that the input dataset failed a validation check.

    Derives from ``ValueError``, so handlers written for value errors also
    catch validation failures.
    """

def load_config(config_file='config.yaml'):
    """Load configuration parameters from a YAML file.

    Args:
        config_file (str): Path to the YAML configuration file.

    Returns:
        dict: Dictionary containing the configuration parameters.

    Raises:
        FileNotFoundError: If ``config_file`` does not exist.
        yaml.YAMLError: If the file cannot be parsed as YAML.
        ValueError: If the file parses but its top level is not a mapping
            (for example an empty file, which YAML parses as ``None``).
    """
    try:
        # Explicit encoding so parsing does not depend on the platform locale.
        with open(config_file, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        logger.error(f"Configuration file not found: {config_file}")
        raise
    except yaml.YAMLError as e:
        logger.error(f"Error parsing YAML file: {e}")
        raise

    # Callers index the result by key, so reject anything that is not a
    # mapping here with a clear message instead of failing later on
    # ``config['...']`` with an unrelated-looking TypeError.
    if not isinstance(config, dict):
        message = (
            f"Configuration file {config_file} must contain a YAML mapping "
            f"at the top level, got {type(config).__name__}"
        )
        logger.error(message)
        raise ValueError(message)

    logger.info(f"Configuration loaded from {config_file}")
    return config

def load_data(file_path: str) -> pd.DataFrame:
    """Read the house prices dataset from a CSV file.

    Any failure is logged with a message specific to its cause and then
    re-raised unchanged.

    Args:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: The loaded dataset.
    """
    try:
        logger.info(f"Loading dataset from: {file_path}")
        frame = pd.read_csv(file_path)
        logger.info("Dataset loaded successfully.")
        return frame
    except Exception as exc:
        # Pick the message by failure type, checked in the same order of
        # precedence: missing file, empty file, malformed CSV, anything else.
        if isinstance(exc, FileNotFoundError):
            problem = f"Error: The file '{file_path}' was not found. Please check the file path."
        elif isinstance(exc, pd.errors.EmptyDataError):
            problem = f"Error: The file '{file_path}' is empty."
        elif isinstance(exc, pd.errors.ParserError):
            problem = f"Error: Failed to parse the file '{file_path}'. Ensure it's a valid CSV format."
        else:
            problem = f"An unexpected error occurred: {exc}"
        logger.error(problem)
        raise

def explore_data(data: pd.DataFrame, target_column: str) -> None:
    """Log an overview of the dataset and validate the target column.

    Logs the frame summary, the first rows, descriptive statistics, the
    per-column missing-value counts and the number of duplicate rows.
    Missing values and duplicates only produce warnings; problems with the
    target column raise.

    Args:
        data (pd.DataFrame): The input dataset.
        target_column (str): The name of the target column.

    Raises:
        DataValidationError: If ``target_column`` is not a column of ``data``
            or its dtype is not numeric.
    """
    import io  # local import: only needed to capture DataFrame.info() output

    logger.info("\n--- Dataset Exploration and Validation ---")

    # DataFrame.info() prints to stdout by default; capture it in a buffer so
    # the summary goes through the logger like the rest of the exploration.
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    logger.info("Dataset Information:\n%s", info_buffer.getvalue())

    logger.info("\nFirst 5 rows of the dataset:")
    logger.info(data.head())
    logger.info("\nBasic statistics of the dataset:")
    logger.info(data.describe())

    # Check for missing values
    missing_values = data.isnull().sum()
    logger.info("\nMissing values per column:\n%s", missing_values)
    if missing_values.any():
        logger.warning("Missing values found in the dataset.")

    # Check for duplicate rows
    num_duplicates = data.duplicated().sum()
    logger.info(f"\nNumber of duplicate rows: {num_duplicates}")
    if num_duplicates > 0:
        logger.warning("Duplicate rows found in the dataset.")

    # Check if the target column exists
    if target_column not in data.columns:
        raise DataValidationError(f"Target column '{target_column}' not found in the dataset.")

    # The regression target must be numeric
    if not pd.api.types.is_numeric_dtype(data[target_column]):
        raise DataValidationError(f"Target column '{target_column}' should be numeric.")

def preprocess_data(data: pd.DataFrame, target_column: str, test_size: float, random_state: int) -> Tuple[np.ndarray, np.ndarray, pd.Series, pd.Series, StandardScaler]:
    """
    Prepares the data for modeling by separating features and target,
    one-hot encoding categorical features, splitting into training and
    testing sets, and standardizing the features.

    Args:
        data (pd.DataFrame): The input dataset.
        target_column (str): The name of the target column.
        test_size (float): Proportion of the data to use for testing.
        random_state (int): Random seed for reproducibility.

    Returns:
        Tuple[np.ndarray, np.ndarray, pd.Series, pd.Series, StandardScaler]:
            Scaled training features, scaled testing features, training
            target, testing target, and the fitted StandardScaler. The
            features are the scaler's output; the targets are returned
            exactly as produced by the split of ``data[target_column]``
            (pandas objects, not arrays).
    """
    logger.info("\n--- Data Preprocessing ---")

    # Separate features and target
    X = data.drop(target_column, axis=1)
    y = data[target_column]

    # Handle categorical features using one-hot encoding; drop_first removes
    # one level per encoded feature.
    X = pd.get_dummies(X, drop_first=True)
    logger.info("Categorical features encoded using one-hot encoding.")

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    logger.info(f"Training set size: {len(X_train)}, Testing set size: {len(X_test)}")

    # Feature scaling: fit on the training split only, then apply the same
    # transformation to the test split so no test statistics leak into it.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    logger.info("Features scaled using StandardScaler.")

    return X_train_scaled, X_test_scaled, y_train, y_test, scaler

def feature_selection(X_train: np.ndarray, y_train: pd.Series, alpha: float, random_state: int, early_stopping=False, cv=5) -> Tuple[np.ndarray, np.ndarray]:
    """
    Selects features using Lasso Regression with optional early stopping.

    A feature is kept when its fitted Lasso coefficient is non-zero.

    Without early stopping, the Lasso is fitted once on the full training
    data. With early stopping, it is fitted on successive K-fold training
    splits and scored by MSE on the matching validation split; iteration
    stops at the first fold that does not improve on the best MSE so far,
    and the mask comes from the best-scoring fold's fit.

    Args:
        X_train (np.ndarray): Training features.
        y_train (pd.Series): Training target (any 1-D array-like works).
        alpha (float): Regularization parameter for Lasso.
        random_state (int): Random seed.
        early_stopping (bool, optional): Whether to use early stopping. Defaults to False.
        cv (int): Number of cross-validation folds for early stopping.

    Returns:
        Tuple[np.ndarray, np.ndarray]: The training features restricted to
            the selected columns, and the boolean mask over the columns of
            ``X_train`` that produced them (apply the same mask to any
            other feature matrix, e.g. the test set).
    """
    logger.info("\n--- Feature Selection ---")
    # warm_start=True makes each fit() start from the coefficients of the
    # previous fit on this estimator (only matters across the folds below).
    lasso_selector = Lasso(alpha=alpha, random_state=random_state, warm_start=True)

    if early_stopping:
        logger.info("Using early stopping for feature selection.")
        # Index the target positionally through a plain array so this branch
        # accepts both Series and ndarray targets, like the branch below.
        y_values = np.asarray(y_train)
        kf = KFold(n_splits=cv, shuffle=True, random_state=random_state)
        best_mse = float('inf')
        best_features = None
        for train_index, val_index in kf.split(X_train):
            lasso_selector.fit(X_train[train_index], y_values[train_index])  # Fit on the fold
            y_pred_val = lasso_selector.predict(X_train[val_index])
            mse = mean_squared_error(y_values[val_index], y_pred_val)

            if mse < best_mse:
                best_mse = mse
                best_features = lasso_selector.coef_ != 0
                logger.info(f"Improved MSE: {best_mse:.4f}")
            else:
                logger.info(f"MSE did not improve: {mse:.4f}")
                break  # Stop if not improving

        if best_features is not None:
            selected_features_mask = best_features
        else:
            # No fold recorded an improvement; fall back to the last fit.
            selected_features_mask = lasso_selector.coef_ != 0
    else:
        lasso_selector.fit(X_train, y_train)
        selected_features_mask = lasso_selector.coef_ != 0

    X_train_selected = X_train[:, selected_features_mask]
    logger.info(f"Number of features before selection: {X_train.shape[1]}")
    logger.info(f"Number of features after Lasso-based selection: {X_train_selected.shape[1]}")
    if X_train_selected.shape[1] == 0:
        # A strong enough penalty zeroes every coefficient. The empty result
        # is still returned unchanged, but say so here because fitting a
        # model on zero features fails with a far less obvious error.
        logger.warning(
            "Lasso with alpha=%s eliminated every feature; the selected feature "
            "matrix is empty. Consider a smaller alpha.",
            alpha,
        )
    return X_train_selected, selected_features_mask

def train_model(X_train: np.ndarray, y_train: pd.Series, param_grid: dict, cv: int) -> GridSearchCV:
    """Fit a Lasso pipeline over a hyperparameter grid with cross-validation.

    The pipeline has a single step named ``lasso``, so grid keys take the
    form ``lasso__<parameter>``. Candidates are scored by negative mean
    squared error, and a failing fit raises instead of being scored.

    Args:
        X_train (np.ndarray): Training features.
        y_train (pd.Series): Training target.
        param_grid (dict): Dictionary of hyperparameters to tune.
        cv (int): Number of cross-validation folds.

    Returns:
        GridSearchCV: The fitted search object.
    """
    logger.info("\n--- Model Training ---")
    lasso_step = ('lasso', Lasso(random_state=42))
    search = GridSearchCV(
        estimator=Pipeline(steps=[lasso_step]),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=1,
        error_score='raise',
    )
    search.fit(X_train, y_train)
    logger.info("Model training completed.")
    return search

def evaluate_model(grid_search: GridSearchCV, X_test: np.ndarray, y_test: pd.Series) -> Tuple[float, float]:
    """Score the best estimator found by the grid search on held-out data.

    Logs the winning hyperparameters and both metrics.

    Args:
        grid_search (GridSearchCV): Trained GridSearchCV model.
        X_test (np.ndarray): Testing features.
        y_test (pd.Series): Testing target.

    Returns:
        Tuple[float, float]: Mean squared error and R-squared score on the test set.
    """
    logger.info("\n--- Model Evaluation ---")
    tuned_estimator = grid_search.best_estimator_
    logger.info(f"Best parameters found: {grid_search.best_params_}")

    predictions = tuned_estimator.predict(X_test)
    mse, r2 = mean_squared_error(y_test, predictions), r2_score(y_test, predictions)
    report_lines = (
        f"Mean Squared Error on Test Set: {mse:.4f}",
        f"R-squared Score on Test Set: {r2:.4f}",
    )
    for line in report_lines:
        logger.info(line)
    return mse, r2

def main():
    """Run the full house-price pipeline: load, explore, preprocess, select
    features, tune a Lasso model, and evaluate it.

    Settings come from the configuration returned by ``load_config()``. The
    keys read are ``file_path``, ``test_size``, ``random_state``, ``cv``,
    ``target_column``, ``lasso_alpha``, ``lasso_alpha_start``,
    ``lasso_alpha_end``, ``lasso_alpha_num`` and, optionally,
    ``use_early_stopping`` (default ``False``).

    Errors are logged rather than propagated, and the total execution time is
    logged whether or not the run succeeded.
    """
    start_time = time.time()
    try:
        config = load_config()  # Load config
        file_path = config['file_path']
        test_size = config['test_size']
        random_state = config['random_state']
        cv = config['cv']
        target_column = config['target_column']
        lasso_alpha = config['lasso_alpha']
        # Candidate alphas are spaced evenly on a log scale; start/end are
        # base-10 exponents as taken by np.logspace.
        param_grid = {
            'lasso__alpha': np.logspace(config['lasso_alpha_start'], config['lasso_alpha_end'], config['lasso_alpha_num'])
        }
        use_early_stopping = config.get('use_early_stopping', False)

        data = load_data(file_path)
        explore_data(data, target_column)
        # The fitted scaler is returned as well but is not needed here.
        X_train_scaled, X_test_scaled, y_train, y_test, _scaler = preprocess_data(data, target_column, test_size, random_state)
        X_train_selected, selected_features_mask = feature_selection(X_train_scaled, y_train, lasso_alpha, random_state, early_stopping=use_early_stopping, cv=cv)
        # Apply the same column mask to the test features
        X_test_selected = X_test_scaled[:, selected_features_mask]

        grid_search = train_model(X_train_selected, y_train, param_grid, cv)
        mse, r2 = evaluate_model(grid_search, X_test_selected, y_test)

        print("\n--- Summary ---")
        print(f"Final Mean Squared Error on Test Set: {mse:.4f}")
        print(f"Final R-squared Score on Test Set: {r2:.4f}")

    except DataValidationError as dve:
        logger.error(f"Data Validation Error: {dve}")
    except Exception:
        # logger.exception attaches the traceback, so the log really does
        # contain the details this message points to (e.g. which config key
        # was missing).
        logger.exception("An error occurred during the process. Please check the logs for details.")
    finally:
        end_time = time.time()
        logger.info(f"Total execution time: {end_time - start_time:.2f} seconds")

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()




# Recorded output of the run above:
# 2025-05-07 04:51:04,063 - ERROR - Configuration file not found: config.yaml
# 2025-05-07 04:51:04,063 - ERROR - An error occurred during the process. Please check the logs for details.
# 2025-05-07 04:51:04,065 - INFO - Total execution time: 0.00 seconds
