In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Tunable settings for this cell, named once so the split and the model
# visibly share the same seed.
TEST_SIZE = 0.3      # fraction of rows held out for evaluation
RANDOM_STATE = 42    # seed used for both the split and the forest

# Load the data and clean it in a single chain (no in-place mutation):
#   1. coerce 'Price' to numeric -- values that cannot be parsed become NaN;
#   2. drop every row that has a NaN in ANY column. This already covers the
#      rows whose 'Price' was coerced to NaN, so a separate
#      dropna(subset=['Price']) pass is unnecessary.
# The original row labels are kept (no reset_index), so the index printed in
# the Actual/Predicted table still refers to rows of the CSV.
df = (
    pd.read_csv('preprocessed_data.csv')
    .assign(Price=lambda frame: pd.to_numeric(frame['Price'], errors='coerce'))
    .dropna()
)

# Fail early with a readable message instead of an opaque error at split time.
if df.empty:
    raise ValueError(
        "No rows left in 'preprocessed_data.csv' after dropping rows with "
        "missing values; cannot train the model."
    )

# Features (X) are every column except the target; the target (y) is 'Price'.
X = df.drop(columns=['Price'])
y = df['Price']

# One-hot encode the categorical feature columns. drop_first=True omits the
# first level of each categorical so the dummy columns are not redundant.
X = pd.get_dummies(X, drop_first=True)

# Hold out TEST_SIZE of the rows for evaluation; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Fit a 100-tree random forest on the training split.
rf_model = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
rf_model.fit(X_train, y_train)

# Predict on the held-out split and score the predictions.
y_pred = rf_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse**0.5  # RMSE is in the same units as 'Price'
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

# Side-by-side table of true vs. predicted prices, indexed like y_test.
actual_vs_predicted = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print(actual_vs_predicted)

Mean Squared Error (MSE): 14673086.59613625
Root Mean Squared Error (RMSE): 3830.5465140285464
R-squared (R2): 0.9483449727913066
       Actual  Predicted
4357   2982.0    3308.90
3630  28152.0   27547.28
1208  10893.0   10640.21
5411   6548.0    6750.93
3183   2655.0    2735.03
...       ...        ...
3247  10284.0   10725.28
2487    778.0     813.35
5863   2510.0    2521.90
3974  17600.0    9782.08
4740  10053.0    8107.80

[1136 rows x 2 columns]


In [None]:
# 1) L1 regularization (Lasso)
# 2) One-hot encoding, or any other categorical encoding
# 3) L2 regularization (Ridge)
# 4) Elastic Net (combined L1 and L2 penalties)
# 5) Variation and correlation among the features in the data
# 6) Using the range values instead of the average for temperature, humidity, etc.