In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor

In [39]:
# Load the already-cleaned Toronto dataset from the local data/ directory
df = pd.read_csv(filepath_or_buffer="data/cleaned_toronto_dataset.csv")

In [None]:
# Linear Regression inputs: "price" is the target, every other column is a feature
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 25% of the rows for testing (75% for training); the fixed seed
# makes the split reproducible
X_train_LR, X_test_LR, y_train_LR, y_test_LR = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Train a baseline Linear Regression model and evaluate it on the held-out
# test split.

# Create a Linear Regression Model
Lin_Reg_model = LinearRegression()

# Fill in missing values with each column's median. The imputer is fitted on
# the training split only and then applied to the test split, so no test-set
# statistics leak into training.
imputer = SimpleImputer(strategy='median')
X_train_LR = imputer.fit_transform(X_train_LR)
X_test_LR = imputer.transform(X_test_LR)

# Fit the model to the training set
Lin_Reg_model.fit(X_train_LR, y_train_LR)

# Predict on the test set
y_pred = Lin_Reg_model.predict(X_test_LR)

# Calculate evaluation metrics
mse = mean_squared_error(y_test_LR, y_pred)
# RMSE is derived from the MSE instead of passing `squared=False`: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises a TypeError. For a single target the two are equivalent.
rmse = mse ** 0.5
r2 = r2_score(y_test_LR, y_pred)
mae = mean_absolute_error(y_test_LR, y_pred)
print("MSE:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)
print("MAE:", mae)

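The MSE on the test set is extremely large, which means that the predictions made by this model are, on average, very far from the true values. Because MSE is expressed in squared price units, the RMSE and MAE printed above give a more directly interpretable measure of the typical error.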

In [40]:
# Random Forest inputs: "price" is the target, every other column is a feature
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 25% of the rows for testing (75% for training); the fixed seed
# makes the split reproducible
X_train_RF, X_test_RF, y_train_RF, y_test_RF = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [41]:

# Median-impute missing feature values: the statistics are learned on the
# training split and reused unchanged on the test split
imputer = SimpleImputer(strategy='median')
X_train_RF = imputer.fit_transform(X_train_RF)
X_test_RF = imputer.transform(X_test_RF)

# Build a 100-tree Random Forest (seeded for reproducibility) and train it
# on the imputed training data; fit() returns the fitted estimator itself
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_RF, y_train_RF
)

# Generate predictions for the held-out test rows
y_pred = rf.predict(X_test_RF)

# Report the mean squared error on the test set
mse = mean_squared_error(y_true=y_test_RF, y_pred=y_pred)
print("Mean Squared Error (MSE) on test set: {:.2f}".format(mse))

Mean Squared Error (MSE) on test set: 5244.08


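The Random Forest's test-set MSE is much lower than that of the Linear Regression model. Since both models are evaluated on the same train/test split, this indicates that the Random Forest model performs better on this dataset.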