In [1]:
%pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.svm import SVR
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
import numpy as np

# Load the data
data = pd.read_csv('imputed_data_7.csv')
X = data.drop(columns=['Price']).values
y = data['Price'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

# Define the best SVR model
best_svr = SVR(kernel='linear', C=100, gamma=0.1)

# Initialize RFECV with the best SVR as the estimator and cross-validation
rfecv = RFECV(estimator=best_svr, cv=5, scoring='neg_mean_squared_error')
rfecv.fit(X_train, y_train)

# Select the features based on RFECV
X_train_selected = rfecv.transform(X_train)
X_test_selected = rfecv.transform(X_test)

# Get selected feature indices
selected_feature_indices = rfecv.support_

# Get column names of the selected features
selected_columns = data.drop(columns=['Price']).columns[selected_feature_indices]

# Define the pipeline with StandardScaler, PCA, and SVR
svr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=0.95)),  # Adjust n_components based on your requirement
    ('svr', best_svr)
])

# Define custom RMSE scorer
def rmse_scorer(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Define scoring metrics
scoring = {
    'mse': make_scorer(mean_squared_error),
    'mae': make_scorer(mean_absolute_error),
    'r2': make_scorer(r2_score),
    'rmse': make_scorer(rmse_scorer)
}

# Cross-validate the pipeline
cv_results = cross_validate(svr_pipeline, X_train_selected, y_train, cv=5, scoring=scoring, return_train_score=True)

# Calculate and print cross-validation results
rmse_cv = np.mean(cv_results['test_rmse'])
r2_cv = np.mean(cv_results['test_r2'])
mae_cv = np.mean(cv_results['test_mae'])
mse_cv = np.mean(cv_results['test_mse'])

# Fit the pipeline on the entire training set and evaluate on the test set
svr_pipeline.fit(X_train_selected, y_train)
y_pred = svr_pipeline.predict(X_test_selected)

# Calculate evaluation metrics on the test set
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r_squared = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("\nEvaluation on Test Set:")
print("Root Mean Squared Error (RMSE) on test set:", rmse)
print("R-squared on test set:", r_squared)
print("Mean Absolute Error (MAE) on test set:", mae)
print("Selected feature columns:")
print(selected_columns)
print("\nCross-validation results:")
print(f"Average RMSE across folds: {rmse_cv}")
print(f"Average R-squared across folds: {r2_cv}")
print(f"Average MAE across folds: {mae_cv}")
print(f"Average MSE across folds: {mse_cv}")


Evaluation on Test Set:
Root Mean Squared Error (RMSE) on test set: 323.42755827943165
R-squared on test set: 0.04560998056668031
Mean Absolute Error (MAE) on test set: 69.97350794958435
Selected feature columns:
Index(['Bedrooms', 'Bathrooms', 'SquareFootageHouse', 'Location',
       'PoolQuality', 'HasPhotovoltaics', 'HeatingType', 'HasFiberglass',
       'IsFurnished', 'HouseColor', 'HasFireplace', 'KitchensQuality',
       'BathroomsQuality', 'BedroomsQuality', 'LivingRoomsQuality',
       'SquareFootageGarden', 'PreviousOwnerRating', 'HeatingCosts',
       'WindowModelNames'],
      dtype='object')

Cross-validation results:
Average RMSE across folds: 381.51865672268576
Average R-squared across folds: 0.12328462395029804
Average MAE across folds: 90.65099181337605
Average MSE across folds: 186311.66316780265
