In [33]:
"""
In scikit-learn, every class of model is represented by a Python class.
A class of model is not the same as an instance of a model.
Recall that an instance is an individual object of a certain class.

Thus, we first import the linear regression class and then instantiate the model,
that is, we create an instance of the class LinearRegression:
"""

from sklearn.linear_model import LinearRegression

# Instantiate the estimator with no arguments. Nothing has been learned yet;
# the parameters are only estimated once fit() is called on training data.
model = LinearRegression()
# Printing the instance shows the estimator's text representation.
print(model)

LinearRegression()


In [34]:
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.datasets import fetch_california_housing
import numpy as np
import matplotlib.pyplot as plt

# Fetch the California housing dataset through scikit-learn.
california_housing_dataset = fetch_california_housing()

# Assemble one DataFrame: the feature matrix under its feature names,
# plus the regression target stored in an extra 'MEDV' column.
california_housing = pd.DataFrame(
    california_housing_dataset.data,
    columns=california_housing_dataset.feature_names,
).assign(MEDV=california_housing_dataset.target)

# A list of column labels keeps X two-dimensional (n_samples, 1);
# a single label yields the one-dimensional target y.
X = california_housing.loc[:, ['MedInc']]
y = california_housing.loc[:, 'MEDV']

# Show both shapes, one per line.
print(X.shape, y.shape, sep="\n")

(20640, 1)
(20640,)


In [35]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as a test set. random_state pins the shuffle so the
# split (and therefore the fitted coefficients) is reproducible across runs.
# The data is split exactly once: splitting a second time with a different seed
# would silently replace the train/test sets whose shapes are reported below.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=101,
)

# Sanity check: the features stay 2-D, the targets stay 1-D, and the train and
# test row counts add up to the full dataset.
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(14448, 1)
(14448,)
(6192, 1)
(6192,)


In [36]:
"""
In short, fitting is equal to training. 
It fits the model to the training data and finds the coefficients specified in the linear regression model, i.e., 
intercept and slope. After it is trained, the model can be used to make predictions.
"""

# Train on the training split only; the test split is kept aside for prediction later.
# fit() returns the estimator itself, which is why the cell displays LinearRegression().
model.fit(X_train, Y_train)

# The fit() call triggers the computation; the learned results are stored on the model object.

# Note: "fit" is also used to describe how closely a trained model matches the data it was trained on.

LinearRegression()

In [37]:
"""
The linear regression model has now been fitted, which means that both
parameters, the intercept and the slope, have been learned.

What are they? By scikit-learn convention, all model parameters carry a trailing
underscore. Below we access the estimated intercept and slope from the model,
each rounded to the 2nd decimal place for better display:
"""

# Round the intercept and the coefficient array, then print them on one line.
print(*(np.round(param, 2) for param in (model.intercept_, model.coef_)))

0.44 [0.42]


In [38]:
# The two parameters are the intercept and the slope of the line fitted to the data.
# Our fitted model is MEDV = 0.44  + 0.42 * MedInc.

# predict() expects 2-dimensional input (rows = samples, columns = features);
# either a 2-D array or a DataFrame works here.
new_MedInc = np.array([[5]])  # one sample with one feature -> shape (1, 1)
print(new_MedInc.shape)

# Prediction produced by the model ...
print(model.predict(new_MedInc))

# ... and the same value computed by hand from the line b + m*x, where b is the
# estimated intercept and m is the estimated slope.
print(model.intercept_ + model.coef_ * new_MedInc[0, 0])

(1, 1)
[2.53984577]
[2.53984577]




In [39]:
# Predict the target for every row of the held-out test set.
# predict() estimates each value as model.intercept_ + model.coef_ * MedInc.
y_test_predicted = model.predict(X_test)

# The predictions should line up one-to-one with Y_test; they come back as a NumPy array.
print(y_test_predicted.shape, Y_test.shape, type(y_test_predicted), sep="\n")


(6192,)
(6192,)
<class 'numpy.ndarray'>


In [40]:
# NOTE(review): load_boston() is deprecated -- the warning printed by this cell says the
# scikit-learn maintainers strongly discourage the dataset because of an ethical problem.
# Per the scikit-learn release notes it is presumably removed in 1.2+ -- confirm against
# the installed version. In this notebook the object is only read for `feature_names`
# when the Boston DataFrame is built.
boston_dataset = load_boston()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [41]:
# Now let's repeat the same procedure with the Boston dataset.

# Column names of the 13 features, listed explicitly so that building the DataFrame
# does not depend on the deprecated sklearn.datasets.load_boston() helper.
BOSTON_FEATURE_NAMES = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

data_url = "http://lib.stat.cmu.edu/datasets/boston"
# Raw string for the regex separator: "\s" in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
# skiprows=22 skips the leading non-data lines of the file.
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Rows are consumed in pairs: every even-indexed row contributes all of its columns and
# the following odd-indexed row contributes its first two columns (the remaining
# features) plus its third column (the target).
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]
boston = pd.DataFrame(data, columns=BOSTON_FEATURE_NAMES)
boston['MEDV'] = target
# Last expression of the cell: summary statistics rounded to two decimals.
boston.describe(include = 'all').round(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.61,11.36,11.14,0.07,0.55,6.28,68.57,3.8,9.55,408.24,18.46,356.67,12.65,22.53
std,8.6,23.32,6.86,0.25,0.12,0.7,28.15,2.11,8.71,168.54,2.16,91.29,7.14,9.2
min,0.01,0.0,0.46,0.0,0.38,3.56,2.9,1.13,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.08,0.0,5.19,0.0,0.45,5.89,45.02,2.1,4.0,279.0,17.4,375.38,6.95,17.02
50%,0.26,0.0,9.69,0.0,0.54,6.21,77.5,3.21,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.68,12.5,18.1,0.0,0.62,6.62,94.07,5.19,24.0,666.0,20.2,396.22,16.96,25.0
max,88.98,100.0,27.74,1.0,0.87,8.78,100.0,12.13,24.0,711.0,22.0,396.9,37.97,50.0


In [42]:
# Single-feature regression on the Boston data: RM was chosen over LSTAT as the predictor.
# NOTE: this rebinds X, y and the train/test variables that previously held the
# California housing data.
X = boston.loc[:, ['RM']]
y = boston.loc[:, 'MEDV']
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=101,
)

# fit() returns the estimator itself, so instantiation and training can be chained.
model2 = LinearRegression().fit(X_train, Y_train)

# Learned intercept and slope, each rounded to two decimals, printed on one line.
print(*(np.round(param, 2) for param in (model2.intercept_, model2.coef_)))

-33.05 [8.8]


In [43]:
# One sample with one feature (RM = 5); predict() needs 2-D input, hence shape (1, 1).
new_RM = np.array([[5]])
print(new_RM.shape)

# Prediction from the fitted model ...
print(model2.predict(new_RM))

# ... and the same value computed by hand as intercept + slope * RM.
print(model2.intercept_ + model2.coef_ * new_RM[0, 0])

(1, 1)
[10.93625588]
[10.93625588]


