In [None]:
# Random Forest Regressor for Property Price Prediction
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from joblib import dump
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import os
from datetime import datetime

# Load the preprocessed data
file_path = r'...\data\standardized_data.csv'
df = pd.read_csv(file_path)

# Separate the feature matrix (X) from the target column (y)
X = df.drop(columns=['price'])
y = df['price']

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameter grid explored by the grid search (3 * 4 * 3 * 3 = 108 combinations)
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Base estimator; the fixed seed makes every fit of a given configuration deterministic
model = RandomForestRegressor(random_state=42)

# 5-fold grid search over param_grid, ranked by negated MSE (higher is better),
# using all available CPU cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Run the search on the training split only, so the test split stays unseen
grid_search.fit(X_train, y_train)

# best_estimator_ is the model GridSearchCV refits with the winning parameters
# (refit=True is the default)
best_model = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)


def _regression_metrics(y_true, y_pred):
    """Return the (MSE, MAE, R2) triple for the given true and predicted targets."""
    return (
        mean_squared_error(y_true, y_pred),
        mean_absolute_error(y_true, y_pred),
        r2_score(y_true, y_pred),
    )


# Evaluate the best model on the held-out test set
y_pred_test = best_model.predict(X_test)
mse_test, mae_test, r2_test = _regression_metrics(y_test, y_pred_test)

print("\nTest Set Performance:")
print(f'MSE: {mse_test}, MAE: {mae_test}, R2 Score: {r2_test}')

# Evaluate the best model on the data it was fitted on; comparing these numbers
# with the test-set numbers shows how much the model overfits
y_pred_train = best_model.predict(X_train)
mse_train, mae_train, r2_train = _regression_metrics(y_train, y_pred_train)

print("\nTrain Set Performance:")
print(f'MSE: {mse_train}, MAE: {mae_train}, R2 Score: {r2_train}')

# 5-fold cross-validation of the best configuration. cross_val_score fits a fresh
# clone per fold, so best_model itself is not modified. n_jobs=-1 runs the folds
# in parallel (matching the grid search) without changing the scores.
# NOTE(review): this cross-validates on the full dataset (X, y), which includes the
# rows held out as the test set above -- confirm that is intended.
cv_scores = cross_val_score(
    best_model, X, y, cv=5, scoring='neg_mean_squared_error', n_jobs=-1
)
cv_mse = -cv_scores.mean()  # scores are negated MSE; flip the sign back
print(f"\nCross-Validation MSE: {cv_mse}")

# Output directory for the model and the metrics; created if it does not exist
results_path = r'...\results'
os.makedirs(results_path, exist_ok=True)

# A timestamp (naive local time) in the file names keeps successive runs from
# overwriting each other
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_file = os.path.join(results_path, f'random_forest_model_{timestamp}.joblib')
dump(best_model, model_file)

# Write all metrics as a single-row CSV next to the model file
results_file = os.path.join(results_path, f'random_forest_results_{timestamp}.csv')
results_df = pd.DataFrame([{
    'Model': 'Random Forest',
    'Test MSE': mse_test,
    'Test MAE': mae_test,
    'Test R2': r2_test,
    'Train MSE': mse_train,
    'Train MAE': mae_train,
    'Train R2': r2_train,
    'Cross-Validation MSE': cv_mse,
}])
results_df.to_csv(results_file, index=False)

print(f"\nResults saved to {results_file}.")

Best parameters found:  {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}

Test Set Performance:
MSE: 0.2669311523863909, MAE: 0.31676074653433395, R2 Score: 0.7379452070456518

Train Set Performance:
MSE: 0.04098514008993178, MAE: 0.12143922683722588, R2 Score: 0.9588209568177347

Cross-Validation MSE: 0.2639351326195748

Results saved to C:\Users\izama\Desktop\machine learning\results\random_forest_results_20241104_131844.csv.
