# In [44]:
import copy
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import clone

# linear input: 100 samples of y = 4 + 3 * x plus noise, with x in [0, 2).
# NOTE: np.random.rand draws uniform noise on [0, 1), so the noise has mean
# 0.5 and the fitted intercept lands near 4.5 rather than 4.
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.rand(100, 1)

# Normal Equation: theta = (X_b^T X_b)^-1 X_b^T y
# X_b is X with a leading column of ones, so theta[0] is the intercept and
# theta[1] the slope.
X_b = np.c_[np.ones((len(X), 1)), X]
# Solve the linear system (X_b^T X_b) theta = X_b^T y directly instead of
# forming the explicit inverse: same solution, fewer operations, and less
# round-off error.
theta_best = np.linalg.solve(X_b.T.dot(X_b), X_b.T.dot(y))
# Predict at both ends of the sampled x range; two points are enough to draw
# the fitted line.
X_new = np.array([[0], [2]])
X_new_b = np.c_[np.ones((len(X_new), 1)), X_new]
y_predict = X_new_b.dot(theta_best)

# sklearn: fit a LinearRegression estimator on the same data and evaluate it
# at the X_new points.
lin_reg = LinearRegression().fit(X, y)  # fit() returns the estimator itself
y_predict_sklearn = lin_reg.predict(X_new)

# plot linear regression: data points plus the two fitted lines
# (normal equation and sklearn), saved as a PDF.
fig, ax = plt.subplots()
ax.plot(X_new, y_predict, "r-", linewidth=2, label="normal equation")
ax.plot(X, y, "b.", label="data")
# Thin dashed line so the sklearn fit stays visible on top of the red one.
ax.plot(X_new, y_predict_sklearn, "g--", linewidth=0.5, label="sklearn")
ax.set_xlabel("x_1")
ax.set_ylabel("y")
ax.set_xlim(0, 2)
ax.set_ylim(0, 15)
ax.legend()
fig.savefig("../plots/ex_4_01.pdf")

# batch gradient descent: every update uses the gradient of the mean squared
# error computed over the whole training set.
eta = 0.1            # learning rate
n_iterations = 1000
m = 100              # number of training samples
theta = np.random.randn(2, 1)   # random starting point
# Parameter trajectory, recorded after every update for the plot below.
theta0_batch, theta1_batch = [], []
for step in range(n_iterations):
    residuals = X_b.dot(theta) - y
    gradients = 2/m * X_b.T.dot(residuals)
    # Rebind (do not update in place): the recorded theta[0]/theta[1] entries
    # are views into the array they were taken from.
    theta = theta - eta * gradients
    theta0_batch.append(theta[0])
    theta1_batch.append(theta[1])

# stochastic gradient descent (SGD)
n_epochs = 50
# Learning-schedule hyperparameters: the step size starts at t0 / t1 and
# shrinks as the update counter grows.
t0 = 5
t1 = 50


def learning_schedule(t):
    """Return the learning rate ``t0 / (t + t1)`` for update step ``t``."""
    decay = t + t1
    return t0 / decay
theta = np.random.randn(2, 1)   # fresh random start for the stochastic run
# Parameter trajectory, recorded after every single-sample update.
theta0_stoch, theta1_stoch = [], []
for epoch in range(n_epochs):
    for i in range(m):
        # Pick one training sample at random (with replacement); slicing
        # rather than indexing keeps xi and yi two-dimensional.
        pick = np.random.randint(m)
        xi, yi = X_b[pick:pick + 1], y[pick:pick + 1]
        error = xi.dot(theta) - yi
        gradients = 2 * xi.T.dot(error)
        # The learning rate decays with the global update counter.
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta0_stoch.append(theta[0])
        theta1_stoch.append(theta[1])

# mini-batch gradient descent: every update uses a small contiguous slice of
# the training set instead of one sample (SGD) or all samples (batch).
minibatch_size = 3
theta = np.random.randn(2, 1)   # fresh random start for the mini-batch run
# Parameter trajectory, recorded after every mini-batch update.
theta0_mini_stoch = []
theta1_mini_stoch = []
for epoch in range(n_epochs):
    for i in range(m):
        # randint's upper bound is exclusive, so the largest start index is
        # m - minibatch_size; this lets the last training sample be drawn too.
        random_index = np.random.randint(m - minibatch_size + 1)
        xi = X_b[random_index:random_index + minibatch_size]
        yi = y[random_index:random_index + minibatch_size]
        # Average the gradient over the mini-batch, mirroring the 2/m scaling
        # of the batch update, so the effective step size does not grow with
        # the number of samples in the slice.
        gradients = 2.0 / minibatch_size * xi.T.dot(xi.dot(theta) - yi)
        # The learning rate decays with the global update counter.
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta0_mini_stoch.append(theta[0])
        theta1_mini_stoch.append(theta[1])
        
# plot gradient descent: the path each variant takes through
# (theta_0, theta_1) space, with a red dot on the final point of each run.
fig, ax = plt.subplots()
trajectories = (
    (theta0_stoch, theta1_stoch, "g.-", "stochastic"),
    (theta0_mini_stoch, theta1_mini_stoch, "y.-", "mini batch"),
    (theta0_batch, theta1_batch, "b.-", "batch"),
)
for path0, path1, line_format, run_name in trajectories:
    ax.plot(path0, path1, line_format, label=run_name)
    ax.plot(path0[-1], path1[-1], "r.")
ax.set_xlabel("theta_0")
ax.set_ylabel("theta_1")
ax.legend()
fig.savefig("../plots/ex_4_02.pdf")

# sklearn SGD: unregularized linear regression trained by stochastic gradient
# descent. The `n_iter` keyword was deprecated in scikit-learn 0.19 and later
# removed; max_iter=50 with tol=None (no convergence-based stopping) runs the
# same fixed 50 epochs.
sgd_reg = SGDRegressor(max_iter=50, tol=None, penalty=None, eta0=0.1)
# SGDRegressor expects a 1-D target, hence ravel().
sgd_reg.fit(X, y.ravel())

# polynomial input: m noisy samples of the quadratic y = 0.5 x^2 + x + 2,
# with x drawn uniformly from [-3, 3) and standard-normal noise.
m = 100
X = np.random.rand(m, 1) * 6 - 3
noise = np.random.randn(m, 1)
y = 0.5 * np.square(X) + X + 2 + noise

# sklearn polynomial: expand X with its degree-2 polynomial features (no
# constant column, since include_bias=False), then fit an ordinary linear
# model on the expanded features.
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)
lin_reg = LinearRegression().fit(X_poly, y)  # fit() returns the estimator
y_predict_poly = lin_reg.predict(X_poly)

# plot polynomial input: the noisy samples together with the predictions of
# the degree-2 model at the same x positions.
fig, ax = plt.subplots()
ax.plot(X, y, "b.", label="data")
ax.plot(X, y_predict_poly, "g.", label="poly")
# Disabled overlays, kept for reference:
#ax.plot(X, y_predict_ridge, "y.", label="Ridge reg.")
#ax.plot(X, y_predict_sgd, "r.", label="SGD reg.")
ax.set(xlabel="x_1", ylabel="y", xlim=(-3, 3), ylim=(0, 10))
ax.legend()
fig.savefig("../plots/ex_4_03.pdf")

# learning curves
def plot_learning_curves(model, X, y):
    """Plot training and validation RMSE of ``model`` against training-set size.

    The data is split 80/20 into a training and a validation part. The model
    is refit on the first ``m`` training samples for every ``m`` from 1 up to
    the full training set; the RMSE on those ``m`` samples and on the whole
    validation set is drawn on the current matplotlib axes.

    Args:
        model: estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            refit repeatedly, so it is left fitted on the full training part.
        X: feature array, one row per sample.
        y: target array, one entry per row of ``X``.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    # "+ 1" so the last point is the model trained on the whole training set.
    train_sizes = np.arange(1, len(X_train) + 1)
    train_errors, val_errors = [], []
    for m in train_sizes:
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        # scikit-learn's documented argument order is (y_true, y_pred).
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    # Plot against the actual sizes so the x axis matches its label (plotting
    # the bare lists would start the axis at 0 instead of 1).
    plt.plot(train_sizes, np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(train_sizes, np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.xlabel("Training set size")
    plt.ylabel("RMSE")
    plt.legend()
# Learning curves for a plain linear model and for a degree-10 polynomial
# model on the same data, one figure (and one PDF) each.
lin_reg = LinearRegression()
polynomial_regression = Pipeline([
    ("poly_feature", PolynomialFeatures(degree=10, include_bias=False)),
    ("lin_reg", LinearRegression()),
])
for curve_model, curve_title, curve_path in (
    (lin_reg, "DoF = 1", "../plots/ex_4_04.pdf"),
    (polynomial_regression, "DoF = 10", "../plots/ex_4_05.pdf"),
):
    plt.figure()
    plot_learning_curves(curve_model, X, y)
    plt.title(curve_title)
    plt.ylim([0, 5])   # same y range on both figures for easy comparison
    plt.savefig(curve_path)

# Ridge Regression (L2 penalty): direct solver first, then the SGD
# counterpart. SGDRegressor needs a 1-D target, hence ravel().
ridge_reg = Ridge(alpha=1, solver="cholesky").fit(X, y)
y_predict_ridge = ridge_reg.predict(X)
sgd_reg = SGDRegressor(penalty="l2").fit(X, y.ravel())
y_predict_sgd_l2 = sgd_reg.predict(X)

# Lasso Regression (L1 penalty): dedicated estimator, then the SGD
# counterpart.
lasso_reg = Lasso(alpha=0.1).fit(X, y)
y_predict_lasso = lasso_reg.predict(X)
sgd_reg = SGDRegressor(penalty="l1").fit(X, y.ravel())
y_predict_sgd_l1 = sgd_reg.predict(X)

# Elastic Net: mix of both penalties; l1_ratio=0.5 weights them equally.
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5).fit(X, y)
y_predict_elastic = elastic_net.predict(X)

# early stopping: train one epoch at a time, record the training and
# validation error after each epoch, and keep a copy of the model with the
# lowest validation error.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
scaler = StandardScaler()
# NOTE: despite the "_poly_" in these names, no polynomial expansion is
# applied here; the inputs are only standardized.
X_train_poly_scaled = scaler.fit_transform(X_train)
# Reuse the mean/variance learned from the training split. Refitting the
# scaler on the validation split would scale the two sets differently.
X_val_poly_scaled = scaler.transform(X_val)
# warm_start=True makes every fit() call continue from the previous weights,
# so with max_iter=1 each call is exactly one more epoch. (`n_iter` was
# removed from scikit-learn; tol=None disables convergence-based stopping.)
sgd_reg = SGDRegressor(max_iter=1, tol=None, warm_start=True, penalty=None,
                       learning_rate="constant", eta0=0.0005)
n_early_stopping_epochs = 1000
minimum_val_error = float("inf")
best_epoch = None
best_model = None
train_error = []   # per-epoch MSE on the training split
val_error = []     # per-epoch MSE on the validation split
for epoch in range(n_early_stopping_epochs):
    # SGDRegressor expects a 1-D target, hence ravel().
    sgd_reg.fit(X_train_poly_scaled, y_train.ravel())
    y_train_predict = sgd_reg.predict(X_train_poly_scaled)
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    train_error.append(mean_squared_error(y_train, y_train_predict))
    val_error.append(mean_squared_error(y_val, y_val_predict))
    # Compare this epoch's error (not the whole history list) with the best
    # value seen so far.
    if val_error[-1] < minimum_val_error:
        minimum_val_error = val_error[-1]
        best_epoch = epoch
        # deepcopy keeps the fitted weights; sklearn's clone() would return
        # an unfitted estimator with the same hyperparameters.
        best_model = copy.deepcopy(sgd_reg)
plt.figure()
# The recorded values are mean squared errors; take the square root so the
# curves match the "RMSE" axis label.
epochs = range(n_early_stopping_epochs)
plt.plot(epochs, np.sqrt(train_error), "b", label="train")
plt.plot(epochs, np.sqrt(val_error), "g--", label="val")
plt.xlabel("Epoch")
plt.ylabel("RMSE")
plt.xlim([0,500])
plt.ylim([1,4])
plt.legend()
plt.savefig("../plots/ex_4_06.pdf")
