In [2]:
# Install the third-party packages this notebook imports (pandas, scikit-learn)
# using the %pip magic.
# NOTE(review): versions are unpinned — consider pinning them for reproducibility.
%pip install pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset; 'Price' is the regression target and every other column
# is used as a feature.
data = pd.read_csv('imputed_data_7.csv')

X = data.drop(columns=['Price']).values
y = data['Price'].values

# Hold out 20% of the rows for the final evaluation; the seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Hyperparameter grid: 5 * 4 * 3 * 3 * 3 = 540 candidates, each scored with
# 5-fold cross-validation on the training split (2,700 forest fits in total).
params = {
    "n_estimators": [100, 200, 300, 400, 500],
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": [None, "sqrt", "log2"]
}

rf = RandomForestRegressor(random_state=13)

# n_jobs=-1 spreads the cross-validation fits over all available CPU cores.
# Each forest is still seeded with random_state=13, so the individual fits
# stay deterministic.
grid_search = GridSearchCV(rf, params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

# With the default refit=True, GridSearchCV has already re-trained the best
# candidate on the whole training split, so reuse that fitted model instead of
# constructing and fitting an identical forest a second time.
rf_best = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split.
y_pred = rf_best.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

mae = mean_absolute_error(y_test, y_pred)

r_squared = r2_score(y_test, y_pred)

print("Best hyperparameters:", best_params)
print("Root Mean Squared Error (RMSE) on test set:", rmse)
print("Mean Absolute Error (MAE) on test set:", mae)
print("R-squared on test set:", r_squared)

Best hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Root Mean Squared Error (RMSE) on test set: 389.05565648468223
Mean Absolute Error (MAE) on test set: 118.28465790753629
R-squared on test set: 0.04140864182935944


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('new_data.csv')

# Split features and target ('Price' is the regression target)
X = data.drop(columns=['Price']).values
y = data['Price'].values

# Split the raw features first so that no preprocessing step ever sees the
# held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows. Fitting it on the full dataset would leak test-set statistics
# into the preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define the parameters
best_params = {
    'max_depth': None,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 10,
    'n_estimators': 100
}

# Initialize the RandomForestRegressor with the specified parameters
rf_best = RandomForestRegressor(**best_params, random_state=13)

# Cross-validate a scaler + forest pipeline on the raw features so the scaler
# is re-fitted inside every training fold. cross_validate clones the pipeline
# for each fold, so rf_best itself is left unfitted here. One call with three
# scorers replaces three separate cross_val_score runs over the same folds.
cv_model = make_pipeline(StandardScaler(), rf_best)
cv_results = cross_validate(
    cv_model,
    X,
    y,
    cv=5,
    scoring={
        'mse': 'neg_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
        'r2': 'r2',
    },
)

# The scorers for MSE and MAE return negated errors, hence the sign flips.
cv_scores = cv_results['test_mse']
cv_rmse = (-cv_scores) ** 0.5

cv_scores_r2 = cv_results['test_r2']
cv_r2 = cv_scores_r2.mean()

cv_scores_mae = cv_results['test_mae']
cv_mae = -cv_scores_mae.mean()

# Fit the model on the training set
rf_best.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_best.predict(X_test)

# Calculate performance metrics on the test set
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)

# Print results
print("Best hyperparameters:", best_params)
print("Cross-validated RMSE:", cv_rmse.mean())
print("Cross-validated RMSE (standard deviation):", cv_rmse.std())
print("Mean Cross-validated MAE:", cv_mae)
print("Mean R-squared from cross-validation:", cv_r2)

# Report the hold-out metrics as well; they were previously computed but
# never shown.
print("Root Mean Squared Error (RMSE) on test set:", rmse)
print("Mean Absolute Error (MAE) on test set:", mae)
print("R-squared on test set:", r_squared)

Best hyperparameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
Cross-validated RMSE: 51.83027362154903
Cross-validated RMSE (standard deviation): 3.5668355014894955
Mean Cross-validated MAE: 38.62373119519769
Mean R-squared from cross-validation: 0.7188047081281176
