In [2]:
# Question 3: Advanced Model Evaluation with Feature Selection for House Prices

# Step 1: Load a house prices dataset from CSV (Assume you have a house_prices.csv ).
# Step 2: Apply feature selection and create a train-test split.
# Step 3: Train a Lasso Regression model.
# Step 4: Perform model evaluation and hyperparameter tuning using GridSearchCV.

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Step 1: Load the house prices dataset
try:
    df = pd.read_csv('house_prices.csv')
except FileNotFoundError:
    # Create a small synthetic dataset for demonstration
    df = pd.DataFrame({
        'LotArea': [8450, 9600, 11250, 9550, 14260, 10000, 8500, 12000, 13000, 9000],
        'OverallQual': [7, 6, 7, 7, 8, 5, 6, 8, 7, 6],
        'YearBuilt': [2003, 1976, 2001, 1915, 2000, 1995, 1980, 2005, 2010, 1970],
        'TotalBsmtSF': [856, 1262, 920, 756, 1145, 800, 900, 1100, 1200, 950],
        'GrLivArea': [1710, 1262, 1786, 1717, 2198, 1500, 1400, 2000, 2100, 1600],
        'SalePrice': [208500, 181500, 223500, 140000, 250000, 175000, 160000, 240000, 260000, 180000]
    })

# Step 2: Feature selection and train-test split
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']

# Use SelectKBest for feature selection (select top 3 features)
feature_selector = SelectKBest(score_func=f_regression, k=3)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3 & 4: Pipeline with Lasso Regression and GridSearchCV for hyperparameter tuning
pipeline = Pipeline([
    ('select', feature_selector),
    ('lasso', Lasso(max_iter=10000, random_state=42))
])

param_grid = {
    'lasso__alpha': [0.01, 0.1, 1, 10]
}

# Use cv=2 to avoid R^2 warnings with very small test sets
grid_search = GridSearchCV(pipeline, param_grid, cv=2, error_score='raise')
grid_search.fit(X_train, y_train)

print("Best parameters:", grid_search.best_params_)

# Model evaluation
y_pred = grid_search.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", mse)

Best parameters: {'lasso__alpha': 10}
Test MSE: 415242159.2409008
