In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
from tensorflow.keras.models import Sequential

: 

: 

In [None]:
# Load the cleaned Toronto dataset from disk into a DataFrame
cleaned_csv_path = "data/cleaned_toronto_dataset.csv"
df = pd.read_csv(cleaned_csv_path)

In [None]:
# Linear Regression inputs: the target is "price", the features are every other column
y = df["price"]
X = df.drop(columns=["price"])

# Hold out 25% of the rows for testing (75% train); the fixed seed keeps the split repeatable
X_train_LR, X_test_LR, y_train_LR, y_test_LR = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Create a Linear Regression Model
Lin_Reg_model = LinearRegression()

# Fill in missing values with each column's median. The imputer is fitted on
# the training split only and then applied to the test split, so no test-set
# statistics are used during training.
imputer = SimpleImputer(strategy='median')
X_train_LR = imputer.fit_transform(X_train_LR)
X_test_LR = imputer.transform(X_test_LR)

# Fit the model to the training set
Lin_Reg_model.fit(X_train_LR, y_train_LR)

# Predict on the test set
y_pred = Lin_Reg_model.predict(X_test_LR)

# Calculate evaluation metrics.
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the old call raises a TypeError on
# current releases. The square root works on every version.
mse = mean_squared_error(y_test_LR, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test_LR, y_pred)
mae = mean_absolute_error(y_test_LR, y_pred)

# Print evaluation metrics
print("MSE:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)
print("MAE:", mae)

The MSE on the test set is fairly large.

In [None]:
# Random Forest inputs: predict "price" from all remaining columns
y = df["price"]
X = df.drop(columns=["price"])

# 75% of the rows go to training and 25% to testing; seed fixed for a repeatable split
X_train_RF, X_test_RF, y_train_RF, y_test_RF = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:

# Create a Random Forest Regressor model (100 trees, seeded for repeatable results)
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Fill in missing values with each column's median. The imputer is fitted on
# the training split only and then applied to the test split, so no test-set
# statistics are used during training.
imputer = SimpleImputer(strategy='median')
X_train_RF = imputer.fit_transform(X_train_RF)
X_test_RF = imputer.transform(X_test_RF)

# Fit the model to the training data
rf.fit(X_train_RF, y_train_RF)

# Use the model to make predictions on the test set
y_pred = rf.predict(X_test_RF)

# Calculate evaluation metrics.
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the old call raises a TypeError on
# current releases. The square root works on every version.
mse = mean_squared_error(y_test_RF, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test_RF, y_pred)
mae = mean_absolute_error(y_test_RF, y_pred)

# Print evaluation metrics
print("MSE:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)
print("MAE:", mae)

The MSE is slightly lower than that of the linear regression model. Therefore, the Random Forest model is performing better.

In [None]:
# XGBoost inputs: "price" is the target, everything else is a feature
y = df["price"]
X = df.drop(columns=["price"])

# Split into training and test sets: 75% for training and 25% for testing (seeded)
X_train_XG, X_test_XG, y_train_XG, y_test_XG = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Create XGBoost Model: 100 shallow (depth-3) boosted trees trained on squared error
model = xgb.XGBRegressor(
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
    verbosity=0,
    objective="reg:squarederror",
    random_state=42,
)

# Fill in missing values with each column's median. The imputer is fitted on
# the training split only and then applied to the test split, so no test-set
# statistics are used during training.
imputer = SimpleImputer(strategy='median')
X_train_XG = imputer.fit_transform(X_train_XG)
X_test_XG = imputer.transform(X_test_XG)

# Train the model
model.fit(X_train_XG, y_train_XG)

# Make predictions on the test set
y_pred = model.predict(X_test_XG)

# Calculate evaluation metrics.
# RMSE is taken as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the old call raises a TypeError on
# current releases. The square root works on every version.
mse = mean_squared_error(y_test_XG, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test_XG, y_pred)
mae = mean_absolute_error(y_test_XG, y_pred)

# Print evaluation metrics
print("MSE:", mse)
print("RMSE:", rmse)
print("R-squared:", r2)
print("MAE:", mae)

The score for XGBoost is slightly worse but still very similar to the random forest.

In [None]:
# Prepare the data for Neural Network: plain NumPy arrays, with the target
# reshaped into a single column so it matches the network's one-unit output
y = df["price"].values.reshape(-1, 1)
X = df.drop(columns=["price"]).values

# Split the data into training and testing sets (25% held out, fixed seed)
X_train_NN, X_test_NN, y_train_NN, y_test_NN = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Fill in missing values before scaling. StandardScaler disregards NaNs while
# fitting but leaves them in place on transform, and a single NaN in the input
# tensors turns the network's loss (and every gradient) into NaN. Median
# imputation is the same treatment the other models in this notebook receive.
# NOTE(review): SimpleImputer drops columns that are entirely NaN in the
# training split; confirm the feature count still matches the network input.
imputer = SimpleImputer(strategy='median')
X_train_NN = imputer.fit_transform(X_train_NN)
X_test_NN = imputer.transform(X_test_NN)

# Normalize the data: mean/std are learned from the training split only and
# then reused for the test split
scaler = StandardScaler()
X_train_NN = scaler.fit_transform(X_train_NN)
X_test_NN = scaler.transform(X_test_NN)

In [None]:
# Convert all four NumPy splits into float32 PyTorch tensors in one pass.
# The tuple of source arrays is built before any name is rebound, so each
# tensor is created from the original array.
X_train_NN, y_train_NN, X_test_NN, y_test_NN = (
    torch.tensor(split_array, dtype=torch.float32)
    for split_array in (X_train_NN, y_train_NN, X_test_NN, y_test_NN)
)

In [None]:
# Define the neural network
class Net(nn.Module):
    """Fully connected regression network: in_features -> 128 -> 64 -> 32 -> 1.

    ReLU follows each of the first three linear layers; the last layer is left
    linear so the output is an unbounded real value.

    Args:
        in_features: Size of the last dimension of the input. Defaults to 36,
            the width that used to be hard-coded, so ``Net()`` builds exactly
            the same architecture as before.
    """

    def __init__(self, in_features=36):
        super().__init__()
        self.fc1 = nn.Linear(in_features, 128)  # input layer
        self.fc2 = nn.Linear(128, 64)  # hidden layer
        self.fc3 = nn.Linear(64, 32)  # hidden layer
        self.fc4 = nn.Linear(32, 1)  # output layer

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Float tensor whose last dimension equals ``in_features``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of 1.
        """
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        hidden = torch.relu(self.fc3(hidden))
        # No activation on the output layer: this is a regression head
        return self.fc4(hidden)

In [None]:
# Seed PyTorch's random number generator before the network is built so the
# initial weights, and therefore the reported metrics, are repeatable across
# kernel restarts. 42 matches the random_state used elsewhere in this notebook.
torch.manual_seed(42)

# Instantiate the neural network
net = Net()

# Define the loss function (mean squared error)
criterion = nn.MSELoss()

# Define the optimizer: Adam over all of the network's parameters
optimizer = optim.Adam(net.parameters(), lr=0.01)

In [None]:
# Train the neural network with full-batch updates: each epoch performs one
# gradient step computed on the entire training set
for epoch in range(200):
    # Forward pass and training loss
    outputs = net(X_train_NN)
    loss = criterion(outputs, y_train_NN)

    # Clear the previous step's gradients, backpropagate, then update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Only report on every 10th epoch
    if (epoch + 1) % 10 != 0:
        continue
    print(f"Epoch {epoch+1}/200, Loss: {loss.item()}")

In [None]:
# Evaluate the neural network on the testing set
with torch.no_grad():  # inference only, so gradient tracking is switched off
    predictions = net(X_test_NN)
    mse = criterion(predictions, y_test_NN)
    rmse = torch.sqrt(mse)
    # R^2 = 1 - MSE / Var(y) only holds when the variance divides by n, as the
    # MSE does. torch.var defaults to the sample variance (divide by n - 1),
    # which inflates R^2 and makes it disagree with sklearn's r2_score used for
    # the other models, so the population variance is requested explicitly.
    r2 = 1 - mse / torch.var(y_test_NN, unbiased=False)
    # MAE is reported as well so all four models can be compared on the same metrics
    mae = torch.mean(torch.abs(predictions - y_test_NN))
    # .item() unwraps the 0-d tensors so plain numbers are printed rather than
    # "tensor(...)" reprs
    print("MSE:", mse.item())
    print("RMSE:", rmse.item())
    print("R2:", r2.item())
    print("MAE:", mae.item())

The neural network performs worse than the previous models.