In [6]:
"""
In scikit-learn, every type of model is represented by a Python class.
A model class is not the same as an instance of that class.
Recall that an instance is an individual object of a certain class.

Thus, we first import the LinearRegression class and then instantiate the model,
that is, we create an instance of the class LinearRegression:
"""

from sklearn.linear_model import LinearRegression

# Create an instance of LinearRegression; no arguments are passed to the constructor.
model = LinearRegression()
# Print the text representation of the (not yet fitted) estimator.
print(model)

LinearRegression()


In [7]:
from sklearn.model_selection import train_test_split
import pandas as pd
try:
    # load_boston was removed in scikit-learn 1.2, where this import raises
    # ImportError. It is not needed for the California housing data used below,
    # so the import is guarded to keep the cell runnable on current releases.
    from sklearn.datasets import load_boston
except ImportError:
    pass
from sklearn.datasets import fetch_california_housing
import numpy as np
import matplotlib.pyplot as plt

# Fetch the California housing dataset; the returned object exposes the
# feature matrix (.data), the column names (.feature_names) and the target (.target).
california_housing_dataset = fetch_california_housing()

# Build a DataFrame with one named column per feature.
california_housing = pd.DataFrame(
    california_housing_dataset.data,
    columns=california_housing_dataset.feature_names,
)

# Add the target to the DataFrame. The column is called 'MEDV' because the
# rest of the notebook refers to the target by that name.
california_housing['MEDV'] = california_housing_dataset.target

# Double brackets select a one-column DataFrame, so X stays 2-dimensional.
X = california_housing[['MedInc']]
print(X.shape)

# Single brackets select a Series, so y is 1-dimensional.
y = california_housing['MEDV']
print(y.shape)



(20640, 1)
(20640,)


In [8]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set. A fixed random_state makes the split
# reproducible. The data is split exactly once, so the shapes printed below
# describe the same split that the following cells train and evaluate on.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.3, random_state=101
)

print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)


(14448, 1)
(14448,)
(6192, 1)
(6192,)


In [9]:
"""
In short, fitting means training.
Calling fit() fits the model to the training data and finds the coefficients specified in the linear regression model, i.e.,
the intercept and the slope. Once the model is trained, it can be used to make predictions.
"""

# Train the model on the training split. The fit() call triggers the
# computations, and the results are stored in the model object.
model.fit(X_train, Y_train)

# Note: "fit" is also used to describe how well a machine learning model
# matches the data on which it was trained.

LinearRegression()

In [12]:
"""
The linear regression model has been fitted,
which means that both parameters, the intercept and the slope, have been learned.

What are they? In scikit-learn, by convention, all attributes learned during fitting have trailing underscores.
For example, we can access the estimated intercept and slope from the model, rounded to the 2nd decimal place for better display:
"""

# Show the learned intercept followed by the learned slope, each rounded to two decimals.
print(np.round(model.intercept_, 2), np.round(model.coef_, 2))

0.44 [0.42]


In [14]:
# The two learned parameters are the intercept and the slope of the line fit to the data.
# With the rounded values printed above, the fitted model is MEDV = 0.44 + 0.42 * MedInc.

# predict() needs 2-dimensional input. The model was fitted on a DataFrame, so
# we pass a one-row DataFrame with the same column name(s) as X. A bare 2-D
# NumPy array gives the same prediction, but scikit-learn >= 1.0 then emits a
# "X does not have valid feature names" UserWarning.
medinc_value = 5
new_MedInc = pd.DataFrame([[medinc_value]], columns=X.columns)

print(model.predict(new_MedInc))

# The prediction equals b + m*x, where b is the estimated intercept and m is
# the estimated slope, evaluated at the same input value.
print(model.intercept_ + model.coef_ * medinc_value)

[2.53984577]
[2.53984577]




In [16]:
# predict() estimates the median home value for each test row by computing
# model.intercept_ + model.coef_ * MedInc.
y_test_predicted = model.predict(X_test)

# Print the prediction shape, the true-target shape, and the prediction type, one per line.
print(y_test_predicted.shape, Y_test.shape, type(y_test_predicted), sep="\n")


(6192,)
(6192,)
<class 'numpy.ndarray'>
