In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

In [2]:
# Load the previously cleaned dataset from the local data directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="data/cleaned_toronto_dataset.csv")

In [3]:
# Linear Regression inputs: "price" is the target, every other column is a feature
y = df["price"]
X = df.drop(columns="price")

# Hold out 25% of the rows for testing (75% for training); the fixed
# random_state makes the shuffle, and therefore the split, reproducible.
X_train_LR, X_test_LR, y_train_LR, y_test_LR = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [4]:
# Create a Linear Regression model
Lin_Reg_model = LinearRegression()

# Fill in missing values with the per-column median. The imputer is fitted on
# the training split only and then re-used on the test split, so the medians
# are never computed from test rows.
# Note: fit_transform/transform return arrays, so X_train_LR / X_test_LR are
# rebound to the imputed versions from here on.
imputer = SimpleImputer(strategy='median')
X_train_LR = imputer.fit_transform(X_train_LR)
X_test_LR = imputer.transform(X_test_LR)

# Fit the model to the training set
Lin_Reg_model.fit(X_train_LR, y_train_LR)

# Predict on the held-out test set
y_pred = Lin_Reg_model.predict(X_test_LR)

# Calculate evaluation metrics
mse = mean_squared_error(y_test_LR, y_pred)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse = mse ** 0.5
r2 = r2_score(y_test_LR, y_pred)
mae = mean_absolute_error(y_test_LR, y_pred)

# Print evaluation metrics
print("MSE:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)
print("MAE:", mae)

MSE: 6351.199420227355
RMSE: 79.69441272904491
R-squared: 0.4756241888901267
MAE: 53.853046260018075


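The errors on the test set are large: the MSE is about 6,351 (an RMSE of about 79.7 in the units of price), and the R-squared of about 0.48 means the model explains less than half of the variance in price. This means that the predictions made by this model are often far from the true values.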

In [5]:
# Random Forest inputs: target column "price" versus all remaining feature columns
y = df["price"]
X = df.drop(columns="price")

# 75% / 25% train/test partition, seeded so the same rows are selected on every run
X_train_RF, X_test_RF, y_train_RF, y_test_RF = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [6]:

# Create a Random Forest Regressor model (100 trees, seeded for reproducibility)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Fill in missing values with the per-column median. The imputer is fitted on
# the training split only and then re-used on the test split, so the medians
# are never computed from test rows.
# Note: fit_transform/transform return arrays, so X_train_RF / X_test_RF are
# rebound to the imputed versions from here on.
imputer = SimpleImputer(strategy='median')
X_train_RF = imputer.fit_transform(X_train_RF)
X_test_RF = imputer.transform(X_test_RF)

# Fit the model to the training data
rf.fit(X_train_RF, y_train_RF)

# Use the model to make predictions on the held-out test set
y_pred = rf.predict(X_test_RF)

# Calculate evaluation metrics
mse = mean_squared_error(y_test_RF, y_pred)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse = mse ** 0.5
r2 = r2_score(y_test_RF, y_pred)
mae = mean_absolute_error(y_test_RF, y_pred)

# Print evaluation metrics
print("MSE:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)
print("MAE:", mae)

MSE: 5351.40750345769
RMSE: 73.15331505446414
R-squared: 0.5581702817788965
MAE: 47.55902115821962


The MSE (about 5,351) is noticeably lower than that of the linear regression model (about 6,351), and the R-squared improves from about 0.48 to about 0.56. Therefore, the Random Forest model is performing better.

In [7]:
# XGBoost inputs: predict "price" from every other column of the dataset
y = df["price"]
X = df.drop(columns="price")

# Reserve a quarter of the rows as the test set; the seed pins the random split
X_train_XG, X_test_XG, y_train_XG, y_test_XG = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [8]:
# Create the XGBoost regressor: 100 boosting rounds of depth-3 trees with a
# 0.1 learning rate, optimising squared error, seeded for reproducibility
model = xgb.XGBRegressor(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    verbosity=0,
    objective="reg:squarederror",
    random_state=42,
)

# Fill in missing values with the per-column median. The imputer is fitted on
# the training split only and then re-used on the test split, so the medians
# are never computed from test rows.
# Note: fit_transform/transform return arrays, so X_train_XG / X_test_XG are
# rebound to the imputed versions from here on.
imputer = SimpleImputer(strategy='median')
X_train_XG = imputer.fit_transform(X_train_XG)
X_test_XG = imputer.transform(X_test_XG)

# Train the model
model.fit(X_train_XG, y_train_XG)

# Make predictions on the held-out test set
y_pred = model.predict(X_test_XG)

# Calculate evaluation metrics
mse = mean_squared_error(y_test_XG, y_pred)
# RMSE is derived from the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` argument was deprecated
# in scikit-learn 1.4 and removed in 1.6, where it raises a TypeError.
rmse = mse ** 0.5
r2 = r2_score(y_test_XG, y_pred)
mae = mean_absolute_error(y_test_XG, y_pred)

# Print evaluation metrics
print("MSE:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)
print("MAE:", mae)

MSE: 5504.8105961118545
RMSE: 74.19441081450714
R-squared: 0.5455048203731183
MAE: 47.758891127030516


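XGBoost also clearly outperforms linear regression, but its scores are very similar to, and in fact slightly worse than, those of the random forest (MSE of about 5,505 vs. 5,351; R-squared of about 0.545 vs. 0.558).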