In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Read the pre-split train and test CSVs (paths are relative to this notebook).
train_data_path = '../data/train_data.csv'
test_data_path = '../data/test_data.csv'

# Both files go through the same reader, train first and then test.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [3]:
# Columns fed to the model and the column it learns to predict.
# NOTE(review): 'zip_code' is passed through without a _std/_log suffix like
# the other inputs -- confirm that using it as-is is intended.
feature_input_select = [
    'sqft_log',
    'beds_std',
    'full_baths_std',
    'total_rooms_std',
    'zip_code',
]
target_variable = 'sold_price_log'

# Slice each split into its feature matrix (X_*) and target vector (y_*).
X_train, y_train = train_data[feature_input_select], train_data[target_variable]
X_test, y_test = test_data[feature_input_select], test_data[target_variable]

In [4]:
# Fit a random forest regressor on the training split.
# Only the seed is set explicitly; every other hyperparameter is left at the
# library default.
rf_seed = 42
rf_model = RandomForestRegressor(random_state=rf_seed)
rf_model.fit(X_train, y_train)

In [5]:
# Run the fitted forest over the held-out test features; the predictions are
# compared against y_test in the evaluation step.
y_pred = rf_model.predict(X_test)

In [6]:
# Score the test-set predictions against the true targets.
# MSE is kept only as the intermediate for RMSE and is not reported itself.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics, one per line, in a fixed order.
metric_report = {
    "Root Mean Squared Error (RMSE)": rmse,
    "Mean Absolute Error (MAE)": mae,
    "R-squared (R²)": r2,
}
print("\n".join(f"{label}: {score}" for label, score in metric_report.items()))

Root Mean Squared Error (RMSE): 0.3066301171011942
Mean Absolute Error (MAE): 0.22020031207258586
R-squared (R²): 0.6813472789295896


In [7]:
# Persist the fitted model to disk with joblib so it can be reloaded later
# without retraining.
model_file_path = '../models/random_forest_model.pkl'

# joblib.dump opens the target file directly and does not create missing
# parent folders, so make sure the output directory exists first. Otherwise a
# fresh checkout without ../models fails with FileNotFoundError.
Path(model_file_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(rf_model, model_file_path)
print(f"Random Forest model saved to {model_file_path}")

Random Forest model saved to ../models/random_forest_model.pkl
