In [17]:
"""

Recall AveBedrms is most negatively correlated to the home price. 
We can add the feature and build a multivariate linear regression model where the home price depends on both MedInc and AveBedrms linearly:

MEDV = b0 + b1 * MedInc + b2 * AveBedrms
To find the intercept b0 and the coefficients b1 and b2, all steps are the same except for the data preparation, 
since we are now dealing with two features:
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import mean_squared_error




In [18]:
# Load the California housing data and wrap the feature matrix in a DataFrame.
california_housing_dataset = fetch_california_housing()
california_housing = pd.DataFrame(
    data=california_housing_dataset.data,
    columns=california_housing_dataset.feature_names,
)

# Append the regression target as the MEDV column.
california_housing['MEDV'] = california_housing_dataset.target

# Univariate setup: one predictor column (kept as a 2-D frame) and the target.
X = california_housing.loc[:, ['MedInc']]
Y = california_housing.loc[:, 'MEDV']

# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=101
)

# fit() returns the estimator itself, so `model` is the fitted regressor.
model = LinearRegression().fit(X_train, Y_train)

# Predictions on the held-out rows, reused later for the MSE comparison.
y_test_predicted = model.predict(X_test)


In [19]:
## data preparation
## data preparation: two predictor columns this time, same target
X2 = california_housing.loc[:, ['MedInc', 'AveBedrms']]
Y = california_housing.loc[:, 'MEDV']

## train/test split -- the same random_state as the univariate split
## ensures the same rows land in train and test
X2_train, X2_test, Y_train, Y_test = train_test_split(
    X2, Y, test_size=0.3, random_state=101
)

## fit() returns the estimator, so printing model2 shows the same repr
model2 = LinearRegression().fit(X2_train, Y_train)
print(model2)

LinearRegression()


In [20]:
# Fitted intercept on the first line, coefficient vector on the second,
# both rounded to three decimals.
print(model2.intercept_.round(3), model2.coef_.round(3), sep="\n")

#Note the coefficients are stored in a 1-D array of shape (2,). The second model then is

#MEDV = 0.475 + 0.420 * MedInc + (-0.033) * AveBedrms.


0.475
[ 0.42  -0.033]


In [21]:
# Predict the target for the held-out rows with the two-feature model;
# the result is reused later for the MSE comparison.
y_test_predicted2 = model2.predict(X2_test)
# Print the predictions (the captured output is abbreviated with "...").
print(y_test_predicted2)

[2.80380927 5.34550567 1.46535295 ... 1.62220717 1.00133736 1.81346447]


In [22]:
# Compare the score() of both fitted models on the held-out data.
# The built-in round() is used instead of the .round() method: depending on
# the scikit-learn version, score() may return a plain Python float, which has
# no .round() method, whereas round() accepts both float and NumPy scalars.
print(
    round(model.score(X_test, Y_test), 3),    # univariate
    round(model2.score(X2_test, Y_test), 3),  # multivariate
)


"""
The extension from univariate to multivariate linear regression is straightforward in scikit-learn. 

The model instantiation, fitting, and predictions are identical, the only difference being the data preparation.
"""

0.468 0.467


'\nThe extension from univariate to multivariate linear regression is straightforward in scikit-learn. \n\nThe model instantiation, fitting, and predictions are identical, the only difference being the data preparation.\n'

In [24]:
"""
Which model is better? 
An easy metric for linear regression is the mean squared error (MSE) on the testing data. 
Better models have lower MSEs.
"""

# Test-set mean squared error: univariate model first, then the two-feature model.
# The built-in round() is used instead of the .round() method: depending on
# the scikit-learn version, mean_squared_error() may return a plain Python
# float, which has no .round() method; round() handles both float and NumPy scalars.
print(round(mean_squared_error(Y_test, y_test_predicted), 3))
print(round(mean_squared_error(Y_test, y_test_predicted2), 3))

0.72
0.721


In [None]:
"""
The first (univariate) model has a slightly lower MSE (0.720 vs. 0.721), 
thus it does a marginally better job predicting the median home values than the multivariate model.

In general, the more features the model includes the lower the MSE on the training data would be; the MSE on the testing data, as seen here, does not necessarily improve.

Yet be careful about including too many features. Some features could be random noise, 
thus hurting the interpretability of the model.
"""