In [16]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score , mean_absolute_error
from sklearn.model_selection import train_test_split

# Train a single-feature and a five-feature linear regression on the housing
# data, persist both fitted models, and print test-set error metrics for each.


def _report_metrics(title, y_true, y_pred):
    """Print MAE, MSE, RMSE and R^2 for one model's predictions.

    Args:
        title: Heading line printed before the metrics.
        y_true: Ground-truth target values.
        y_pred: Model predictions aligned with ``y_true``.
    """
    # Compute MSE once and derive RMSE from it instead of re-scoring.
    mse = mean_squared_error(y_true, y_pred)
    print(title)
    print("Mean Absolute Error:", mean_absolute_error(y_true, y_pred))
    print("Mean Squared Error:", mse)
    print("Root Mean Squared Error:", np.sqrt(mse))
    print("R^2 Score:", r2_score(y_true, y_pred))


# Load the dataset
data = pd.read_csv('../data/data_file.csv')

# Fill missing values in 'total_bedrooms' with the column mean.
# NOTE: the mean is taken over the full dataset, i.e. before the train/test
# split. Neither model below uses 'total_bedrooms' (or 'bedrooms_per_room'),
# so the reported metrics are unaffected; compute the mean on the training
# rows only if these columns are ever added to the feature lists.
data['total_bedrooms'] = data['total_bedrooms'].fillna(data['total_bedrooms'].mean())

# One-hot encode 'ocean_proximity' column (first category dropped as baseline)
data_encoded = pd.get_dummies(data, columns=['ocean_proximity'], drop_first=True)

# Feature engineering: per-household and per-room ratios
data_encoded['rooms_per_household'] = data_encoded['total_rooms'] / data_encoded['households']
data_encoded['bedrooms_per_room'] = data_encoded['total_bedrooms'] / data_encoded['total_rooms']
data_encoded['population_per_household'] = data_encoded['population'] / data_encoded['households']

# Define target and features
target = 'median_house_value'
X = data_encoded.drop(columns=[target])
y = data_encoded[target]

# Split the data into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train simple linear regression model (using 'median_income' as a single feature)
X_train_simple = X_train[['median_income']]
X_test_simple = X_test[['median_income']]
model_simple = LinearRegression()
model_simple.fit(X_train_simple, y_train)

# Train multiple linear regression model on five features.
# NOTE(review): the list is described as the "top 5 features from feature
# importance"; that analysis is not part of this cell -- confirm its source.
features = ['median_income', 'latitude', 'total_rooms', 'housing_median_age', 'ocean_proximity_INLAND']
X_train_multiple = X_train[features]
X_test_multiple = X_test[features]
model_multiple = LinearRegression()
model_multiple.fit(X_train_multiple, y_train)

# Save the models. Create the output directory first so a fresh checkout does
# not fail with FileNotFoundError after the models have already been trained.
os.makedirs('models', exist_ok=True)
joblib.dump(model_simple, 'models/simple_linear_regression_model.pkl')
joblib.dump(model_multiple, 'models/multiple_linear_regression_model.pkl')

# Evaluate both models on the held-out test set
y_pred_simple = model_simple.predict(X_test_simple)
y_pred_multiple = model_multiple.predict(X_test_multiple)

# Evaluation metrics for simple linear regression model
_report_metrics("Simple Linear Regression Model:", y_test, y_pred_simple)
print("\n")

# Evaluation metrics for multiple linear regression model
_report_metrics("Multiple Linear Regression Model:", y_test, y_pred_multiple)

Simple Linear Regression Model:
Mean Absolute Error: 62990.86530093761
Mean Squared Error: 7091157771.76555
Root Mean Squared Error: 84209.01241414454
R^2 Score: 0.45885918903846656


Multiple Linear Regression Model:
Mean Absolute Error: 53668.351217420015
Mean Squared Error: 5474322614.966027
Root Mean Squared Error: 73988.6654492837
R^2 Score: 0.5822431999577118


The metrics show that the Multiple Linear Regression model performs better than the Simple Linear Regression model. Here’s how we can interpret the results and consider steps to improve the model further:

Interpretation:

Mean Absolute Error (MAE): The multiple linear regression model has a lower MAE (53,668 vs. 62,991), indicating it generally makes smaller errors in prediction.

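Mean Squared Error (MSE) and RMSE: The multiple linear regression model also has a lower MSE and RMSE (RMSE of 73,989 vs. 84,209). Because these metrics penalize large errors more heavily than MAE does, this suggests it also makes fewer or smaller large errors.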

R² Score: The multiple linear regression model’s R² score (0.582) indicates it explains about 58.2% of the variance in house values on the test set, compared with about 45.9% (R² = 0.459) for the simple model.