In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

First, we load the data and separate the features from the value we aim to predict (`price`).

In [16]:
# Read the housing data from disk.
data = pd.read_csv("data/housing.csv")

# Target: the column we want to predict.
y = data["price"]
# Features: every remaining column.
x = data.drop(columns="price")

# Show the features and the target, separated by a blank line.
print("Data", x, "", "Price", y, sep="\n")

Data
    area  bedrooms  bathrooms
0   1200         2          2
1   1500         3          2
2    800         1          1
3   2000         4          3
4   2400         7          1
5  80000        10         16

Price
0      300000
1      400000
2      200000
3      600000
4      500000
5    75000000
Name: price, dtype: int64


Next, we train the models on the data we have.

**Train-test split**: randomly splits the dataset into a training set and a test set, so that the models can be trained on one part and evaluated on data they have not seen.

In [33]:
# Train-test split: hold out 20% of the rows as the test set and train on the
# remaining 80%. random_state pins the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Linear model.
# Fit on the lowercase x_train / y_train created by the split above; names are
# case-sensitive and X_train is not defined by any cell shown here, so using
# it raises NameError on a fresh kernel.
model_L = LinearRegression()
model_L.fit(x_train, y_train)

# Random forest model.
# random_state is fixed so that re-running the notebook builds the same forest
# and therefore reports the same metrics.
model_R = RandomForestRegressor(random_state=42)
model_R.fit(x_train, y_train)


Now we use both trained models to make predictions on the held-out test set and evaluate how well they perform.

In [34]:
# Evaluate both fitted models on the held-out test split.
# The split variables are lowercase (x_test, y_test); X_test is not defined by
# any cell shown here, so using it raises NameError on a fresh kernel.
fitted_models = {"Linear": model_L, "Random Forest": model_R}

for position, (label, fitted) in enumerate(fitted_models.items()):
    if position:
        # Blank line between the reports of consecutive models.
        print()

    # Predictions for this model.
    y_pred = fitted.predict(x_test)

    # Evaluation: mean squared error and coefficient of determination.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(label)
    print("Mean Squared Error:", mse)
    print("R² Score:", r2)

# After the loop, y_pred / mse / r2 hold the Random Forest values.

Linear
Mean Squared Error: 75327045.9439779
R² Score: 0.9698691816224089

Random Forest
Mean Squared Error: 1318500000.0
R² Score: 0.4726
