# 1. Answer: Feature Selection

Here is one way to approach the Feature Selection exercise.

In [5]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np
import warnings

# Silence only FutureWarning (the category scikit-learn uses for its
# deprecation notices, such as the one emitted by load_boston) instead of
# every warning, so unrelated problems are still reported.
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the Boston Housing dataset.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs scikit-learn < 1.2 as written. Consider switching to
# another regression dataset (e.g. fetch_california_housing).
boston = load_boston()
X = pd.DataFrame(boston['data'], columns=boston['feature_names'])
y = pd.Series(boston['target'])

# Split the dataset into training and validation sets (80% / 20%).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate the feature-feature correlation matrix on the training data only,
# so that no information from the validation set influences the selection.
corr_matrix = X_train.corr()

# Keep only the strict upper triangle so each feature pair is examined once
# and the diagonal (self-correlation of 1.0) is ignored. The builtin `bool`
# is used because the `np.bool` alias was removed in NumPy 1.24.
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_tri = corr_matrix.where(upper_mask)

# For every highly correlated pair (|r| > 0.8), drop the later column.
to_drop = [column for column in upper_tri.columns if (upper_tri[column].abs() > 0.8).any()]
X_train = X_train.drop(columns=to_drop)
X_val = X_val.drop(columns=to_drop)

# Perform filter-based feature selection using the univariate linear
# regression F-test (f_regression). Cap k at the number of remaining columns
# so SelectKBest does not raise if the correlation filter left fewer than 5.
n_selected = min(5, X_train.shape[1])
selector = SelectKBest(score_func=f_regression, k=n_selected)
X_train_selected = selector.fit_transform(X_train, y_train)
X_val_selected = selector.transform(X_val)

# Map the selector's column indices back to feature names.
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = X_train.columns[selected_feature_indices]

# Train a random forest regressor on the selected features. The seed is fixed
# (matching the split above) so the reported RMSE is reproducible.
model = RandomForestRegressor(random_state=42)
model.fit(X_train_selected, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val_selected)

# Calculate the root mean squared error
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Validation RMSE:", rmse)

Validation RMSE: 3.6465074023363946
