In [1]:
%pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import LocalOutlierFactor
import pandas as pd

# Load the data
data = pd.read_csv('imputed_data_7.csv')
X = data.drop(columns=['Price']).values
y = data['Price'].values

# Fit the LOF model to detect outliers
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = lof.fit_predict(X)

# Identify outliers
outliers = y_pred == -1

# Remove outliers from the data
X_cleaned = X[~outliers]
y_cleaned = y[~outliers]

# Split the cleaned data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=13)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the parameter grid for SVR
param_grid_svr = {
    'svr__kernel': ['linear', 'rbf'],
    'svr__C': [0.1, 1, 10, 100],
    'svr__gamma': [0.1, 1, 10],
}

# Create the SVR model within a pipeline
svr_pipeline = Pipeline([('scaler', StandardScaler()), ('svr', SVR())])

# Perform GridSearchCV to find the best hyperparameters for SVR
grid_search_svr = GridSearchCV(estimator=svr_pipeline, param_grid=param_grid_svr, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search_svr.fit(X_train, y_train)

# Get the best hyperparameters for SVR
best_params_svr = grid_search_svr.best_params_
print("Best parameters for SVR:", best_params_svr)
svr_best = SVR(kernel=best_params_svr['svr__kernel'], C=best_params_svr['svr__C'], gamma=best_params_svr['svr__gamma'])

# Define the parameter grid for Random Forest
param_grid_rf = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": [None, "sqrt", "log2"]
}

# Perform GridSearchCV to find the best hyperparameters for Random Forest
grid_search_rf = GridSearchCV(estimator=RandomForestRegressor(random_state=13), param_grid=param_grid_rf, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search_rf.fit(X_train, y_train)

# Get the best hyperparameters for Random Forest
best_params_rf = grid_search_rf.best_params_
print("Best parameters for Random Forest:", best_params_rf)
rf_best = RandomForestRegressor(**best_params_rf, random_state=13)

# Create the ensemble model using VotingRegressor
voting_regressor = VotingRegressor(estimators=[
    ('svr', svr_best),
    ('random_forest', rf_best)
], weights=[2, 1])  # Adjust weights based on performance of individual models

# Train the ensemble model
voting_regressor.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = voting_regressor.predict(X_test_scaled)

# Evaluate the ensemble model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print the results
print("Root Mean Squared Error (RMSE) on test set:", rmse)
print("R-squared on test set:", r_squared)
print("Mean Absolute Error (MAE) on test set:", mae)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for SVR: {'svr__C': 100, 'svr__gamma': 0.1, 'svr__kernel': 'linear'}
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
Best parameters for Random Forest: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Root Mean Squared Error (RMSE) on test set: 147.91881633792644
R-squared on test set: 0.24320107355762277
Mean Absolute Error (MAE) on test set: 58.240695540301104


In [4]:
# Obtained the most correlated features from using RFECV
features = ['Bedrooms', 'Bathrooms', 'Location', 'PoolQuality', 'HasPhotovoltaics',
       'HasFiberglass', 'IsFurnished', 'HouseColor', 'HasFireplace',
       'KitchensQuality', 'BathroomsQuality', 'BedroomsQuality',
       'LivingRoomsQuality', 'SquareFootageGarden', 'PreviousOwnerRating',
       'HeatingCosts', 'WindowModelNames']
target = 'Price'

In [5]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import LocalOutlierFactor
import pandas as pd

# Load the data
data = pd.read_csv('imputed_data_7.csv')
X = data[features]
y = data[target]

# Fit the LOF model to detect outliers
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = lof.fit_predict(X)

# Identify outliers
outliers = y_pred == -1

# Remove outliers from the data
X_cleaned = X[~outliers]
y_cleaned = y[~outliers]

# Split the cleaned data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_cleaned, y_cleaned, test_size=0.2, random_state=13)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Use the provided best parameters for SVR
svr_best = SVR(kernel='linear', C=100, gamma=0.1)

# Use the provided best parameters for Random Forest
rf_best = RandomForestRegressor(
    n_estimators=100, 
    max_depth=None, 
    max_features='sqrt', 
    min_samples_leaf=2, 
    min_samples_split=5, 
    random_state=13
)

# Create the ensemble model using VotingRegressor
voting_regressor = VotingRegressor(estimators=[
    ('svr', svr_best),
    ('random_forest', rf_best)
], weights=[2, 1])  # Adjust weights based on performance of individual models

# Train the ensemble model
voting_regressor.fit(X_train_scaled, y_train)

# Predict on the test set
y_pred = voting_regressor.predict(X_test_scaled)

# Evaluate the ensemble model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# Print the results
print("Root Mean Squared Error (RMSE) on test set:", rmse)
print("R-squared on test set:", r_squared)
print("Mean Absolute Error (MAE) on test set:", mae)

Root Mean Squared Error (RMSE) on test set: 63.380839187962565
R-squared on test set: 0.5637655445174128
Mean Absolute Error (MAE) on test set: 48.75822068267942


In [18]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.neighbors import LocalOutlierFactor
import pandas as pd
import numpy as np

# Load the data
data = pd.read_csv('imputed_data_7.csv')
X = data[features]
y = data[target]

# Fit the LOF model to detect outliers
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = lof.fit_predict(X)

# Identify outliers
outliers = y_pred == -1

# Remove outliers from the data
X_cleaned = X[~outliers]
y_cleaned = y[~outliers]

# Standardize the data
scaler = StandardScaler()
X_cleaned_scaled = scaler.fit_transform(X_cleaned)

# Use the provided best parameters for SVR
svr_best = SVR(kernel='linear', C=100, gamma=0.1)

# Use the provided best parameters for Random Forest
rf_best = RandomForestRegressor(
    n_estimators=100, 
    max_depth=None, 
    max_features='sqrt', 
    min_samples_leaf=2, 
    min_samples_split=5, 
    random_state=13
)

# Create the ensemble model using VotingRegressor
voting_regressor = VotingRegressor(estimators=[
    ('svr', svr_best),
    ('random_forest', rf_best)
], weights=[5,1]) 

def rmse_scorer(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Training the ensemble model on the training set
voting_regressor.fit(X_cleaned_scaled, y_cleaned)
X_train, X_test, y_train, y_test = train_test_split(X_cleaned_scaled, y_cleaned, test_size=0.2, random_state=13)
voting_regressor.fit(X_train, y_train)
y_pred = voting_regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\nEvaluation on Test Set:")
print("Root Mean Squared Error (RMSE) on test set:", rmse)
print("R-squared on test set:", r_squared)
print("Mean Absolute Error (MAE) on test set:", mae)

# Evaluate the ensemble model using cross-validation
scoring = {
    'mse': make_scorer(mean_squared_error),
    'mae': make_scorer(mean_absolute_error),
    'r2': make_scorer(r2_score),
    'rmse': make_scorer(rmse_scorer)
}
cv_results = cross_validate(voting_regressor, X_cleaned_scaled, y_cleaned, cv=5, scoring=scoring, return_train_score=True)

print("\nCross-Validation Results:")
print(f"Mean Squared Error (MSE) on test set: {np.mean(cv_results['test_mse'])}")
print(f"Mean Absolute Error (MAE) on test set: {np.mean(cv_results['test_mae'])}")
print(f"R-squared on test set: {np.mean(cv_results['test_r2'])}")
print(f"Root Mean Squared Error (RMSE) on test set: {np.mean(cv_results['test_rmse'])}")


Final Evaluation on Test Set:
Root Mean Squared Error (RMSE) on test set: 51.84816037724185
R-squared on test set: 0.7080753970299145
Mean Absolute Error (MAE) on test set: 39.67402065908165

Cross-Validation Results:
Mean Squared Error (MSE) on test set: 172961.36934662558
Mean Absolute Error (MAE) on test set: 86.36790467500896
R-squared on test set: 0.07826636695894071
Root Mean Squared Error (RMSE) on test set: 376.8815888040527
