In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Train two regressors on the processed data set, compare them by mean
# squared error on a held-out split, and persist the better one to disk.

# Load the processed data
data = pd.read_csv('data/processed/processed_data.csv')

# Split data into features (every column except 'price') and the target
X = data.drop('price', axis=1)
y = data['price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

# Train a random forest model. It is seeded with the same value as the split
# so that its MSE -- and therefore which model gets selected below -- is
# reproducible across runs.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate both models on the held-out rows
y_pred_linear = linear_model.predict(X_test)
y_pred_rf = rf_model.predict(X_test)

mse_linear = mean_squared_error(y_test, y_pred_linear)
mse_rf = mean_squared_error(y_test, y_pred_rf)

print("Linear Regression MSE:", mse_linear)
print("Random Forest MSE:", mse_rf)

# Choose the model with the lower test MSE; a tie goes to the random forest.
# NOTE(review): selection uses the same split the MSEs are reported on, so
# the winner's printed MSE is likely optimistic -- consider a separate
# validation split or cross-validation.
if mse_linear < mse_rf:
    best_model = linear_model
else:
    best_model = rf_model

# Save the best model. The output directory is created first so a fresh
# checkout without a 'models/' folder does not fail with FileNotFoundError.
model_path = Path('models/final_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(best_model, f)