In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold, RFE, SelectFromModel
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

In [24]:
melbourne_data = pd.read_csv('melbourne_housing_raw.csv')



In [25]:
missing_percentage = melbourne_data.isnull().mean() * 100
columns_to_remove = [col for col in missing_percentage.index if missing_percentage[col] > 20 and col != 'Price']
reduced_data = melbourne_data.drop(columns=columns_to_remove)

In [26]:
reduced_data = reduced_data.dropna(subset=['Price'])
X = reduced_data.drop(columns=['Price', 'Date', 'Suburb', 'Type', 'Method', 'SellerG', 'CouncilArea', 'Regionname'])
y = reduced_data['Price']

In [27]:
X.fillna(X.mean(), inplace=True)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [28]:
corr_matrix = X_train.corr().abs()

# Identify pairs of highly correlated features
high_corr_pairs = np.where(corr_matrix > 0.85)
high_corr_features = set([X_train.columns[i] for i in high_corr_pairs[0] if i != high_corr_pairs[1][i]])

In [29]:
X_train_corr_filtered = X_train.drop(columns=high_corr_features)
X_test_corr_filtered = X_test.drop(columns=high_corr_features)

In [30]:
variance_filter = VarianceThreshold(threshold=0.01)  # Example threshold for low variance
X_train_low_var = variance_filter.fit_transform(X_train_corr_filtered)
X_test_low_var = variance_filter.transform(X_test_corr_filtered)



In [31]:
linear_model = LinearRegression()
rfe_selector = RFE(estimator=linear_model, n_features_to_select=5, step=1)
rfe_selector.fit(X_train_low_var, y_train)
X_train_forward = rfe_selector.transform(X_train_low_var)
X_test_forward = rfe_selector.transform(X_test_low_var)





In [32]:
random_forest_model = RandomForestRegressor(random_state=42)
rfe_backward = RFE(estimator=random_forest_model, n_features_to_select=5, step=1)
rfe_backward.fit(X_train_low_var, y_train)
X_train_backward = rfe_backward.transform(X_train_low_var)
X_test_backward = rfe_backward.transform(X_test_low_var)





In [33]:
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)
important_features = SelectFromModel(rf_model, threshold="mean", prefit=True)
X_train_rf_selected = important_features.transform(X_train)
X_test_rf_selected = important_features.transform(X_test)



In [34]:
def evaluate_model(X_train, X_test, y_train, y_test):
    model = RandomForestRegressor(random_state=42)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    return mse

In [35]:
results = {
    "Baseline (No Feature Selection)": evaluate_model(X_train, X_test, y_train, y_test),
    "High Correlation Filter": evaluate_model(X_train_corr_filtered, X_test_corr_filtered, y_train, y_test),
    "Low Variance Filter": evaluate_model(X_train_low_var, X_test_low_var, y_train, y_test),
    "Forward Selection": evaluate_model(X_train_forward, X_test_forward, y_train, y_test),
    "Backward Elimination": evaluate_model(X_train_backward, X_test_backward, y_train, y_test),
    "Random Forest Selection": evaluate_model(X_train_rf_selected, X_test_rf_selected, y_train, y_test)
}

results

{'Baseline (No Feature Selection)': 143875373039.6255,
 'High Correlation Filter': 143875373039.6255,
 'Low Variance Filter': 143875373039.6255,
 'Forward Selection': 143875373039.6255,
 'Backward Elimination': 143875373039.6255,
 'Random Forest Selection': 147811757361.13766}