# Simple Linear Regression

<hr>

Sergei Yu. Papulin (papulin.study@yandex.ru)

<a name="0"></a>
<div><span style="font-size:16pt; font-weight:bold">Contents</span>
    <ol>
        <li><a href="#1">Loading Initial Data</a></li>
        <li><a href="#2">Defining Linear Regression Task</a></li>
        <li><a href="#3">Brute-Force Search</a></li>
        <li><a href="#4">Ordinary Least Squares</a></li>
        <li><a href="#5">Gradient Descent</a></li>
        <li><a href="#6">Stochastic Gradient Descent</a></li>
        <li><a href="#7">Linear Regression in Sklearn</a></li>
        <li><a href="#8">References</a></li>
    </ol>
</div>

Import modules and functions that will be used later

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
%matplotlib inline

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">1. Loading Initial Data</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

Create a dataframe from the csv-file of student grades and show the first 5 records/samples:

In [None]:
FILE_PATH = "../data/SAT_GPA.csv"

In [None]:
df1 = pd.read_csv(FILE_PATH, sep=" ")
df1.head(5)

Display a `high_GPA`-`univ_GPA` scatter plot for the loaded data:

In [None]:
# Scatter plot

x_high = df1["high_GPA"].values
y_univ = df1["univ_GPA"].values

plt.figure("1", figsize=[10, 6])

plt.subplot(1,1,1)

plt.scatter(x_high, y_univ, color="slategrey")
plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")
plt.axis([2, 4, 2, 4])
plt.grid(True)


# Single Point

x_A = df1.loc[36, "high_GPA"]
y_A = df1.loc[36, "univ_GPA"]

xy_A = "$("+str(x_A)+","+str(y_A)+")$"
plt.annotate(xy_A, xy=(x_A, y_A), xytext=(50, -100), xycoords="data", textcoords="offset points", 
             arrowprops=dict(arrowstyle="->", connectionstyle="arc,angleA=0,armA=0,angleB=-90,armB=15,rad=7"),)
plt.show()

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">2. Defining Linear Regression Task</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

In [None]:
import ipywidgets as widgets

In [None]:
x_axis = np.linspace(2, 4, num=100)

w0 = widgets.FloatSlider(min=-1.5, max=1.2, step=0.05, value=0.75)
w1 = widgets.FloatSlider(min=0.4, max=2.0, step=0.05, value=0.75)

def update(w0=0, w1=0):
    """
    Redraw the samples together with the line w1 * x + w0.

    Used as the callback of the interactive sliders: w0 is the intercept
    and w1 is the slope of the candidate regression line.
    """
    # Values of the candidate line on the fixed x grid
    y_line = w1 * x_axis + w0

    plt.figure("2", figsize=[10, 6])

    # Samples first, then the line (same drawing order as the legend entries)
    plt.scatter(x_high, y_univ, color="slategrey", label="Samples")
    plt.xlabel("High_GPA")
    plt.ylabel("Univ_GPA")
    plt.plot(x_axis, y_line, color="darkorange", linewidth=2, label="Regression Line")

    # Fixed viewport so the picture does not jump while the sliders move
    plt.axis([2, 4, 2, 4])
    plt.grid(True)
    plt.ylim(2, 4)

    plt.legend()
    plt.show()

widgets.interact(update, w0=w0, w1=w1);

Let's consider three linear functions and determine which of them gives the best prediction:

1. $h_1\left( x \right) = -3 + 2x$

2. $h_2\left( x \right) = 1.1 + 0.7x$

3. $h_3\left( x \right) = 2 + 0.4x$

Initialize regression parameters of the functions:

In [None]:
# Parameters of the candidate lines h_k(x) = intercept_k + slope_k * x
intercept_1, slope_1 = -3.0, 2.0
intercept_2, slope_2 = 1.1, 0.7
intercept_3, slope_3 = 2.0, 0.4

Create python functions to compute predictions:   

In [None]:
def h_pred_1(x):
    """Prediction of the first candidate line: slope_1 * x + intercept_1."""
    return slope_1 * x + intercept_1


def h_pred_2(x):
    """Prediction of the second candidate line: slope_2 * x + intercept_2."""
    return slope_2 * x + intercept_2


def h_pred_3(x):
    """Prediction of the third candidate line: slope_3 * x + intercept_3."""
    return slope_3 * x + intercept_3

Show the functions on the plot:

In [None]:
plt.figure("2", figsize=[10, 6])

ax = plt.subplot(1,1,1)


# Samples

plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")
plt.scatter(x_high, y_univ, color="slategrey", label="Samples")


# Linear Functions
# The labels are raw strings so that "\c" in "\cdot" is not parsed as a string
# escape, and they state the coefficients that h_pred_1..h_pred_3 really use.

x_line = np.array([1.5, 4.5])
plt.plot(x_line, h_pred_1(x_line), "-", label=r"$h_1(x)=2.0\cdot x-3.0$")
plt.plot(x_line, h_pred_2(x_line), "-", label=r"$h_2(x)=0.7\cdot x+1.1$")
plt.plot(x_line, h_pred_3(x_line), "-", label=r"$h_3(x)=0.4\cdot x+2.0$")


plt.axis([2, 4, 2, 4])
plt.grid(True)
# Keep the grid lines behind the plotted points and lines
ax.set_axisbelow(True)

plt.legend()

plt.show()

Which of the functions gives the best prediction?

In [None]:
y_pred_1 = h_pred_1(x_high)
y_pred_2 = h_pred_2(x_high)
y_pred_3 = h_pred_3(x_high)

In [None]:
err_1 = ((y_univ - y_pred_1)**2).sum()
err_2 = ((y_univ - y_pred_2)**2).sum()
err_3 = ((y_univ - y_pred_3)**2).sum()

err_1, err_2, err_3

$$\hat \theta_0, \hat \theta_1 =\operatorname*{arg\,min}_{\theta_0, \theta_1} 
\displaystyle\sum_{i=1}^{N} ( y_i - (\theta_0+\theta_1 x_i))^2$$

<a name="3"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">3. Brute-Force Search</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

In [None]:
# Assumptions about ranges of parameter values.
# Actually we can be far away from real values
coord_w0 = np.arange(-4, 5, 0.1)
coord_w1 = np.arange(-4, 5, 0.1)

W0, W1 = np.meshgrid(coord_w0, coord_w1)
W0.shape

Option 1

In [None]:
def _sum_squared_error(w0, w1):
    # Sum of squared residuals of the line w0 + w1 * x over all samples
    residuals = y_univ - (w0 + w1*x_high)
    return np.sum(residuals**2)


# Element-wise version: evaluates the error for every (w0, w1) pair of the grid
calculate_err = np.vectorize(_sum_squared_error)

In [None]:
errs = calculate_err(W0, W1)
errs.shape

In [None]:
errs[:5, :5]

Option 2

In [None]:
# Pairs of parameters [(w0, w1),...]
W = np.dstack([W0, W1]).reshape(-1, 2)
W[:5]

In [None]:
errs_alt = np.apply_along_axis(
    func1d=lambda w: np.sum((y_univ - (w[0] + w[1]*x_high))**2),
    axis=1, 
    arr=W
)
errs_alt[:5]

Brute-force error

In [None]:
err_bf = errs.min()

err_1, err_2, err_3, err_bf

Prediction function

In [None]:
w_bf = W[errs.argmin()]
w_bf

In [None]:
h_pred_bf = lambda x: w_bf[1] * x + w_bf[0]

Plots

In [None]:
plt.figure("3", figsize=[10, 6])

ax = plt.subplot(1,1,1)


# Samples

plt.scatter(x_high, y_univ, color="slategrey", label="Samples")
plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")


# Regression Lines
# The labels are raw strings so that "\c" in "\cdot" is not parsed as a string
# escape, and they state the coefficients that h_pred_1..h_pred_3 really use.

x_line = np.array([1.5, 4.5])
plt.plot(x_line, h_pred_1(x_line), "-", label=r"$f_1(x)=2.0\cdot x-3.0$")
plt.plot(x_line, h_pred_2(x_line), "-", label=r"$f_2(x)=0.7\cdot x+1.1$")
plt.plot(x_line, h_pred_3(x_line), "-", label=r"$f_3(x)=0.4\cdot x+2.0$")
plt.plot(x_line, h_pred_bf(x_line), "-", label="$f_{BF}(x)$")

plt.axis([2, 4, 2, 4])
plt.grid(True)
plt.legend()

plt.show()

<a name="4"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">4. Ordinary Least Squares</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

Estimates of parameters:

$$\hat{\theta}=\begin{pmatrix}\hat{\theta}_0\\\hat{\theta}_1\\\vdots\\\hat{\theta}_p\end{pmatrix}=\left(X^TX\right)^{-1}X^T\mathrm{y}$$

Create $X$ and $\mathrm{y}$ matrices:

In [None]:
X = np.c_[np.ones(x_high.size), x_high]
X[:5,:2]

In [None]:
y = y_univ.reshape(-1, 1)
y[:5]

Now apply the formula for calculating regression parameters $\theta$:

In [None]:
# %%timeit -r 7
w = np.linalg.inv(X.T @ X) @ X.T @ y
w

Alternatively, the `linalg` module provides a built-in function for finding the least-squares solution. Use it to get the parameters:

In [None]:
# %%timeit -r 7
w, residuals, rank, s = np.linalg.lstsq(X, y, rcond=None)
w

In [None]:
w0, w1 = w[0,0], w[1,0]
w0, w1

Define the prediction function:

In [None]:
h_pred_ols = lambda x: w1 * x + w0

Plot the graph of the regression line:

In [None]:
plt.figure("3", figsize=[10, 6])

ax = plt.subplot(1,1,1)


# Samples

plt.scatter(x_high, y_univ, color="slategrey", label="Samples")
plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")


# Regression Lines
# The labels are raw strings so that "\c" in "\cdot" is not parsed as a string
# escape, and they state the coefficients that h_pred_1..h_pred_3 really use.

x_line = np.array([1.5, 4.5])
plt.plot(x_line, h_pred_1(x_line), "-", label=r"$f_1(x)=2.0\cdot x-3.0$")
plt.plot(x_line, h_pred_2(x_line), "-", label=r"$f_2(x)=0.7\cdot x+1.1$")
plt.plot(x_line, h_pred_3(x_line), "-", label=r"$f_3(x)=0.4\cdot x+2.0$")
plt.plot(x_line, h_pred_bf(x_line), "-", label="$f_{BF}(x)$")
plt.plot(x_line, h_pred_ols(x_line), "-", linewidth=4, 
         label="$f_{OLS}(x)$")


plt.axis([2, 4, 2, 4])
plt.grid(True)
plt.legend()

plt.show()

In [None]:
err_ols = ((h_pred_ols(x_high) - y_univ)**2).sum()
err_ols

In [None]:
err_1, err_2, err_3, err_bf, err_ols

<a name="5"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">5. Gradient Descent</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

Define the loss/cost function and its partial derivatives:

In [None]:
# The number of sample elements
n = len(x_high)

# Loss function 
loss = lambda x, y, w0, w1: 1 / n * sum([(y[i] - w1 * x[i] - w0) ** 2 for i in range(n)])

# Partial derivative of the loss over w0 and w1
derivative_w0 = lambda x, y, w0, w1: 2 / n * sum([-1 * (y[i] - w1 * x[i] - w0) for i in range(n)])
derivative_w1 = lambda x, y, w0, w1: 2 / n * sum([-x[i] * ( y[i] - w1 * x[i] - w0) for i in range(n)])

Write a function implementing the gradient descent method:

In [None]:
def compute_gradient_descent_(x, y, max_iter=200, min_err=0.0001, learning_rate=0.05):
    """
    Naive Gradient Descent (single feature, row computation)

    Starts from the point (w0, w1) = (4, 4) and repeatedly steps against the
    gradient supplied by derivative_w0 / derivative_w1 until either max_iter
    steps were made or the loss changed by no more than min_err in one step.

    Parameters
    ----------
    x: feature values
    y: target values
    max_iter: the maximum number of iterations
    min_err: a minimal change of the loss between two consecutive steps
    learning_rate: step size

    Returns
    -------
    w0: intercept
    w1: slope
    steps: the number of steps completed before the stop criterion fired
        (equals max_iter if it never fired)
    """
    steps = 0
    w0_new, w1_new = 0, 0  # returned unchanged only when max_iter <= 0
    w0_old, w1_old = 4, 4  # start point

    while steps < max_iter:
        # Both partial derivatives are evaluated at the previous point
        grad_w0 = derivative_w0(x, y, w0_old, w1_old)
        grad_w1 = derivative_w1(x, y, w0_old, w1_old)

        w0_new = w0_old - learning_rate * grad_w0
        w1_new = w1_old - learning_rate * grad_w1

        loss_change = abs(loss(x, y, w0_new, w1_new) - loss(x, y, w0_old, w1_old))
        if loss_change <= min_err:
            break

        w0_old, w1_old = w0_new, w1_new
        steps += 1

    return (w0_new, w1_new, steps)

In [None]:
def compute_gradient_descent(
    X, y, 
    max_iter=200, 
    min_err=0.0001, 
    learning_rate=0.05, 
    initial_w=None
):
    """
    Naive Gradient Descent (multiple features, matrix computation)

    Minimizes the mean squared error 1/n * sum((X @ w - y)**2).

    Parameters
    ----------
    X: feature matrix of shape (n, p)
    y: n target values, either 1-D or a column; reshaped to a column internally
    max_iter: the maximum number of iterations
    min_err: a minimal change of the loss between two consecutive iterations
    learning_rate: step size
    initial_w: p start weights (zeros if None); the passed object is not modified

    Returns
    -------
    A tuple (w_0, ..., w_{p-1}, n_iter) with the final weights followed by
    the number of performed iterations (0 if max_iter is 0).
    """
    X = np.asarray(X, dtype=float)
    # A column vector keeps `X @ w - y` from broadcasting to an (n, n) matrix
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    # Number of samples and features
    n, p = X.shape
    # Initial weights (parameters); a float copy, so the in-place update below
    # neither touches the caller's array nor fails for integer start values
    if initial_w is None:
        w = np.zeros((p, 1))
    else:
        w = np.array(initial_w, dtype=float).reshape(-1, 1)
    # Loss function
    loss = 1.0/n * np.sum((X @ w - y)**2)
    n_iter = 0
    for n_iter in range(1, max_iter + 1):
        # Compute gradient
        gradient = 2.0 / n * X.T @ (X @ w - y)
        # Update weights
        w -= learning_rate * gradient
        # Calculate loss
        loss_ = 1.0/n * np.sum((X @ w - y)**2)
        # Check stop criteria
        if abs(loss - loss_) <= min_err:
            break
        loss = loss_
    return (*w.reshape(-1).tolist(), n_iter)

Run parameter estimation using GD:

In [None]:
w0_gd, w1_gd, i = compute_gradient_descent_(x_high, y_univ)
w0_gd, w1_gd, i

In [None]:
compute_gradient_descent(X, y, initial_w=np.array([4., 4.]))

In [None]:
# Specify the prediction function with the estimated parameters:
h_pred_gd = lambda x: w1_gd * x + w0_gd

In [None]:
# Plot the prediction line

x_line = np.array([1.5, 4.5])

plt.figure("3", figsize=[10, 6])

ax = plt.subplot(1,1,1)
plt.scatter(x_high, y_univ, color="slategrey", label="Samples")
plt.plot(x_line, h_pred_2(x_line), "-", label="$f_2(x)$")
plt.plot(x_line, h_pred_ols(x_line), "-", linewidth=4, 
         label="$f_{OLS}(x)$")
plt.plot(x_line, h_pred_gd(x_line), "-", linewidth=2, color="black",
         label="$f_{GD}(x)$")

plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")

plt.axis([2, 4, 2, 4])

plt.grid(True)
ax.set_axisbelow(True) 

plt.legend()

plt.show()

Calculate the RSS (residual sum of squares) for the prediction function:

In [None]:
err_gd = ((y_univ - h_pred_gd(x_high))**2).sum()
err_gd

Compare errors:

In [None]:
err_1, err_2, err_3, err_bf, err_ols, err_gd

What's wrong? Plot the contour plot of the cost function:

In [None]:
coord_w0 = np.arange(-4, 5, 0.1)
coord_w1 = np.arange(-4, 5, 0.1)

W0, W1 = np.meshgrid(coord_w0, coord_w1)

In [None]:
W0

In [None]:
coord_w0_large = np.arange(-400, 500, 1)
coord_w1_large = np.arange(-400, 500, 1)

W0_large, W1_large = np.meshgrid(coord_w0_large, coord_w1_large)

In [None]:
plt.figure("12",figsize=[12, 4])

plt.subplot(1,2,1)
plt.contour(W0, W1, loss(x_high, y_univ, W0, W1), 20, cmap=cm.bwr, alpha=0.5)
plt.grid(True)
plt.colorbar()
plt.xlabel("$\\theta_0$")
plt.ylabel("$\\theta_1$")

plt.subplot(1,2,2)
plt.contour(W0_large, W1_large, loss(x_high, y_univ, W0_large, W1_large), 20, cmap=cm.bwr, alpha=0.5)
plt.grid(True)
plt.colorbar()
plt.xlabel("$\\theta_0$")
plt.ylabel("$\\theta_1$")

plt.show()

### Feature standardization

$$ x_s = \frac{x - \bar{x}}{s} $$

In [None]:
# Standardization for single feature
x_high__stand = (x_high - x_high.mean()) / x_high.std()
x_high__stand[:5]

In [None]:
# Standardization for feature matrix
X_ = x_high.reshape(-1, 1)
X_stand = (X_ - X_.mean(axis=0)) / X_.std(axis=0)
X_stand = np.c_[np.ones(X_stand.shape[0]), X_stand]
X_stand[:3]

In [None]:
x_line = np.array([1.5, 4.5])

plt.figure("3", figsize=[12, 4])

ax = plt.subplot(1,2,1)

plt.scatter(x_high, y_univ, color="slategrey", label="Samples")
plt.title("Initial")
plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")

plt.axis([2, 4, 2, 4])

plt.grid(True)
ax.set_axisbelow(True) 

plt.legend()

ax = plt.subplot(1,2,2)

plt.scatter(x_high__stand, y_univ, color="slategrey", label="Samples")
plt.title("Standardized")
plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")

plt.grid(True)
ax.set_axisbelow(True) 

plt.legend()


plt.show()

In [None]:
plt.figure("12", figsize=[12, 4])

plt.subplot(1,2,1)
plt.contour(W0, W1, loss(x_high, y_univ, W0, W1), 20, cmap=cm.bwr, alpha=0.5)
plt.title("Initial")
plt.xlabel("$\\theta_0$")
plt.ylabel("$\\theta_1$")
plt.grid(True)
plt.colorbar()

plt.subplot(1,2,2)
plt.contour(W0, W1, loss(x_high__stand, y_univ, W0, W1), 20, cmap=cm.bwr, alpha=0.5)
plt.title("Standardized")
plt.xlabel("$\\theta_0$")
plt.ylabel("$\\theta_1$")
plt.grid(True)
plt.colorbar()

plt.show()

Repeat training with the GD for the standardized feature:

In [None]:
# %%timeit
w0_gd_stand, w1_gd_stand, i = compute_gradient_descent_(x_high__stand, y_univ)
w0_gd_stand, w1_gd_stand, i

Specify the prediction function:

In [None]:
h_pred_gr_stand = lambda x: w1_gd_stand * x + w0_gd_stand

Display the scatter plot:

In [None]:
x_line_stand = np.array([-2, 2])

plt.figure("3", figsize=[10, 6])

ax = plt.subplot(1,1,1)

plt.scatter(x_high__stand, y_univ, color="slategrey", label="Samples")
plt.plot(x_line_stand, h_pred_gr_stand(x_line_stand), "-", linewidth=2, color="black",
         label="$f_{StndGD}(x)$")

plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")

plt.grid(True)
ax.set_axisbelow(True) 

plt.legend()

plt.show()

Recover the initial scale:

In [None]:
w1_gd_rec = w1_gd_stand / x_high.std()
w1_gd_rec

In [None]:
w0_gd_rec = w0_gd_stand - w1_gd_stand * x_high.mean() / x_high.std()
w0_gd_rec

Specify the prediction function:

In [None]:
h_pred_gr_stand_recover = lambda x: w0_gd_rec + w1_gd_rec * x

Plot the prediction line:

In [None]:
x_line = np.array([1.5, 4.5])

plt.figure("3", figsize=[10, 6])

ax = plt.subplot(1,1,1)

plt.scatter(x_high, y_univ, color="slategrey", label="Samples")
plt.plot(x_line, h_pred_2(x_line), "-", label="$f_2(x)$")
plt.plot(x_line, h_pred_ols(x_line), "-", linewidth=4, 
         label="$f_{OLS}(x)$")
plt.plot(x_line, h_pred_gd(x_line), "-", linewidth=2, color="black",
         label="$f_{GD}(x)$")
plt.plot(x_line, h_pred_gr_stand_recover(x_line), "-", linewidth=2, color="cyan",
         label="$f_{StdGD}(x)$")

plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")

plt.axis([2, 4, 2, 4])

plt.grid(True)
ax.set_axisbelow(True) 

plt.legend()

plt.show()

Calculate the RSS:

In [None]:
err_gd_stand = ((y_univ - h_pred_gr_stand_recover(x_high))**2).sum()
err_gd_stand

In [None]:
err_1, err_2, err_3, err_bf, err_ols, err_gd, err_gd_stand

<a name="6"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">6. Stochastic Gradient Descent</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

Define the loss/cost function and its partial derivatives:

In [None]:
# The number of sample elements
n = len(x_high)

# Loss function 
loss = lambda x, y, w0, w1: 1 / n * sum([(y[i] - w1*x[i] - w0) ** 2 for i in range(n)])

# Partial derivative of err over w0 and w1
derivative_w0_i = lambda x, y, w0, w1, i: -2.0 * (y[i] - w0 - w1*x[i])
derivative_w1_i = lambda x, y, w0, w1, i: -2.0 * x[i] * (y[i] - w0 - w1*x[i])

In [None]:
def compute_stochastic_gradient_descent_(x, y, min_err=0.000001, learning_rate=0.05):
    """
    Naive Stochastic Gradient Descent (single feature, row computation)

    Makes a single pass over the samples in their given order, starting from
    (w0, w1) = (4, 4) and updating both parameters after every sample. The
    pass stops early once the full-sample loss changes by no more than
    min_err between two consecutive updates.
    
    Parameters
    ----------
    x: feature values
    y: target values
    min_err: a minimal change of cost error
    learning_rate: step size
    
    Returns
    -------
    w0: intercept
    w1: slope
    i: the number of updates completed before the stop criterion fired
       (equals the number of samples if it never fired)
    
    """

    i = 0  # current iteration
    w0_prev = 4; w1_prev = 4  # start point
    w0 = w0_prev; w1 = w1_prev  # current parameters
    
    # Reference loss is taken at the start point, i.e. where the descent
    # actually begins, so the first comparison measures a real step
    err_prev = loss(x, y, w0_prev, w1_prev)

    # One update per sample of the data that was passed in
    for j in range(len(x)):
    
        w0 = w0_prev - learning_rate * derivative_w0_i(x, y, w0_prev, w1_prev, j)
        w1 = w1_prev - learning_rate * derivative_w1_i(x, y, w0_prev, w1_prev, j)
        
        err_cur = loss(x, y, w0, w1)

        if abs(err_cur - err_prev) <= min_err:
            break

        err_prev = err_cur    

        w0_prev = w0
        w1_prev = w1

        i += 1

    return (w0, w1, i)

In [None]:
def compute_stochastic_gradient_descent(
    X, y, 
    learning_rate=0.05, 
    min_err=0.000001,
    num_epochs=1, 
    batch_size=1,
    initial_w=None
):
    """
    Naive (mini-batch) Stochastic Gradient Descent (multiple features, matrix computation)

    Every epoch shuffles the samples with np.random.permutation and updates
    the weights once per mini-batch. After each epoch the full-sample mean
    squared error is compared with the previous one; training stops when the
    change is no more than min_err or after num_epochs epochs.

    Parameters
    ----------
    X: feature matrix of shape (n, p)
    y: n target values, either 1-D or a column; reshaped to a column internally
    learning_rate: step size
    min_err: a minimal change of the loss between two consecutive epochs
    num_epochs: the maximum number of passes over the data
    batch_size: the number of samples per update (the last batch may be smaller)
    initial_w: p start weights (zeros if None); the passed object is not modified

    Returns
    -------
    A tuple (w_0, ..., w_{p-1}, n_updates) with the final weights followed by
    the number of performed weight updates (0 if num_epochs is 0).
    """
    X = np.asarray(X, dtype=float)
    # A column vector keeps `X @ w - y` from broadcasting to an (n, n) matrix
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    # Number of samples and features
    n, p = X.shape
    # Initial weights (parameters); a float copy, so the in-place update below
    # neither touches the caller's array nor fails for integer start values
    if initial_w is None:
        w = np.zeros((p, 1))
    else:
        w = np.array(initial_w, dtype=float).reshape(-1, 1)
    # Loss function
    loss = 1.0/n * np.sum((X @ w - y)**2)
    n_updates = 0
    for epoch in range(num_epochs):
        # Shuffle data 
        shuffled_indices = np.random.permutation(n)        
        X_shuffled = X[shuffled_indices]
        y_shuffled = y[shuffled_indices]
        # Iterate over mini-batches
        for start in range(0, n, batch_size):
            # Compose mini-batch
            X_batch = X_shuffled[start:start + batch_size]
            y_batch = y_shuffled[start:start + batch_size]
            # Compute gradient; average over the samples really present in the
            # batch, because the last batch can be shorter than batch_size
            gradient = 2.0 / len(X_batch) * X_batch.T @ (X_batch @ w - y_batch)
            # Update weights
            w -= learning_rate * gradient
            n_updates += 1
        # Calculate loss
        loss_ = 1.0/n * np.sum((X @ w - y)**2)
        # Check stop criteria
        if abs(loss - loss_) <= min_err:
            break
        loss = loss_
    return (*w.reshape(-1).tolist(), n_updates)

Estimate regression parameters using the SGD:

In [None]:
# %%timeit
w0_sgd_stand, w1_sgd_stand, i = compute_stochastic_gradient_descent_(x_high__stand, y_univ)
w0_sgd_stand, w1_sgd_stand, i

In [None]:
w0_sgd_stand, w1_sgd_stand, i = compute_stochastic_gradient_descent(X_stand, y)
w0_sgd_stand, w1_sgd_stand, i

Specify the prediction function with the found parameters:

In [None]:
h_pred_sgd = lambda x: w1_sgd_stand * x + w0_sgd_stand

Plot the prediction line:

In [None]:
# Two end points are enough to draw a straight line over the standardized feature
x_line_stand = np.array([-2, 2])

plt.figure("3", figsize=[10, 6])

ax = plt.subplot(1,1,1)

plt.scatter(x_high__stand, y_univ, color="slategrey", label="Samples")
# The plotted line is the SGD estimate, so it is labelled as such
plt.plot(x_line_stand, h_pred_sgd(x_line_stand), "-", linewidth=2, color="black",
         label="$f_{StndSGD}(x)$")

plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")

plt.grid(True)
# Keep the grid lines behind the plotted points and lines
ax.set_axisbelow(True) 

plt.legend()

plt.show()

Recover the initial scale:

In [None]:
w1_sgd_rec = w1_sgd_stand / x_high.std()
w1_sgd_rec

In [None]:
w0_sgd_rec = w0_sgd_stand - w1_sgd_stand * x_high.mean() / x_high.std()
w0_sgd_rec

Specify the prediction function:

In [None]:
h_pred_sgd_stand_recover = lambda x: w0_sgd_rec + w1_sgd_rec * x

Plot the prediction line:

In [None]:
x_line = np.array([1.5, 4.5])

plt.figure("3", figsize=[10, 6])

ax = plt.subplot(1,1,1)

plt.scatter(x_high, y_univ, color="slategrey", label="Samples")
plt.plot(x_line, h_pred_ols(x_line), "-", linewidth=4, 
         label="$h_{OLS}(x)$")
plt.plot(x_line, h_pred_gr_stand_recover(x_line), "-", linewidth=1, color="cyan",
         label="$h_{StdGD}(x)$")
plt.plot(x_line, h_pred_sgd_stand_recover(x_line), "-", linewidth=4, color="darkmagenta",
         label="$h_{StdSGD}(x)$")

plt.xlabel("High_GPA")
plt.ylabel("Univ_GPA")

plt.axis([2, 4, 2, 4])

plt.grid(True)
ax.set_axisbelow(True) 

plt.legend()

plt.show()

Calculate the RSS:

In [None]:
err_sgd_stand = ((y_univ - h_pred_sgd_stand_recover(x_high))**2).sum()
err_sgd_stand

In [None]:
err_1, err_2, err_3, err_bf, err_ols, err_gd, err_gd_stand, err_sgd_stand

<a name="7"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">7. Linear Regression in Sklearn</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor

In [None]:
X = df1[["high_GPA"]]
y = df1["univ_GPA"]

# X = df1[["high_GPA"]].values
# y = df1["univ_GPA"].values

### Ordinary Least Squares

In [None]:
model = LinearRegression().fit(X, y)
model.intercept_, model.coef_[0]

In [None]:
err_sklearn__ols = np.sum((y - model.predict(X))**2)
err_sklearn__ols

### Stochastic Gradient Descent

In [None]:
mean = df1["high_GPA"].mean()
# pandas' std() defaults to the sample estimate (ddof=1); ddof=0 gives the
# population standard deviation, matching the NumPy-based standardization
# (x_high.std()) used for the hand-written GD/SGD above
std = df1["high_GPA"].std(ddof=0)

X_stand = (df1[["high_GPA"]] - mean) / std

In [None]:
model = SGDRegressor().fit(X_stand, y)

# Parameters for standardized X
model.intercept_, model.coef_[0]

In [None]:
err_sklearn__sgd = np.sum((y - model.predict(X_stand))**2)
err_sklearn__sgd

<a name="8"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">8. References</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To contents</a></div>
    </div>
</div>