In [1]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import time

In [2]:
boston = load_boston()
X = boston.data
y = boston.target

In [9]:
# functions in Problem 1
def MSE(theta_hat, A, y):
    return 1/len(y)*np.sum(np.power(y-np.dot(A, theta_hat),2))
def ridge_regression(A, y, lambd=0.2):
    I = np.eye(A.shape[1])
    I[0][0] = 0
    return np.dot(np.dot(np.linalg.inv(np.dot(A.T, A)+lambd*I),A.T),y)
def Ridge_find_best(X, y, kfold, lambdas, to_print = True):
    kf = KFold(n_splits = kfold, shuffle = True)
    best_lambda = 0
    best_MSE = 100
    for idx, lambda_value in enumerate(lambdas):
        sum = 0
        for train_index, holdout_index in kf.split(X):
            Xtrain, Xholdout = X[train_index], X[holdout_index]
            ytrain, yholdout = y[train_index], y[holdout_index]
            Atrain = np.insert(Xtrain, 0, values = np.array([[1]*len(Xtrain)]), axis = 1)
            Aholdout = np.insert(Xholdout, 0, values = np.array([[1]*len(Xholdout)]), axis = 1)
            theta_kf = ridge_regression(Atrain, ytrain, lambd = lambda_value)
            sum += MSE(theta_kf, Aholdout, yholdout)
        MSE_now = sum/kfold
        if MSE_now < best_MSE:
            print("Current MSE: ", MSE_now, "; Current lambda: ",lambda_value)
            best_MSE = MSE_now
            best_lambda = lambda_value
    if to_print:
        print("===========================")
        print("The best value of lambda is: ",best_lambda)
    return best_lambda

# Problem 2

In [21]:
np.random.seed(2017)
n = 100
xtrain = np.random.rand(n)
ytrain = 0.25 + 0.5*xtrain + np.sqrt(0.1)*np.random.randn(n)
idx = np.random.randint(0,100,10)
ytrain[idx] = ytrain[idx] + np.random.randn(10)

## Step 1

In [22]:
X_train = xtrain.reshape(-1,1)
lambda_value = Ridge_find_best(X_train, ytrain, 10, np.arange(0,1,0.0001), to_print = True)

Current MSE:  0.14905081014734065 ; Current lambda:  0.0
Current MSE:  0.14834804803182666 ; Current lambda:  0.0006000000000000001
Current MSE:  0.14828096083651005 ; Current lambda:  0.0011
Current MSE:  0.14807429742526484 ; Current lambda:  0.021
Current MSE:  0.14747540851291202 ; Current lambda:  0.025400000000000002
Current MSE:  0.14712390795883654 ; Current lambda:  0.0646
Current MSE:  0.14702922191346646 ; Current lambda:  0.2043
Current MSE:  0.14695466541301808 ; Current lambda:  0.6183000000000001
Current MSE:  0.14691690442356126 ; Current lambda:  0.6947
The best value of lambda is:  0.6947


In [11]:
Atrain = np.insert(X_train, 0, values = np.array([[1]*len(xtrain)]), axis = 1)
theta_r2 = ridge_regression(Atrain, ytrain, lambd=lambda_value)
print("Mean-Squared Error: ", MSE(theta_r2, Atrain, ytrain))
print("Slope: ", theta_r2[0])
print("Intercept: ", theta_r2[1])

Mean-Squared Error:  0.1456883625019879
Slope:  0.29663654687134855
Intercept:  0.2750776792779469


## Step 2

### a. Take example of $\epsilon = 1.35$ and $\alpha = 0.001$

In [12]:
reg = linear_model.HuberRegressor(epsilon = 1.35, alpha=0.001)
reg.fit(xtrain.reshape(-1, 1),ytrain)
ypred = reg.predict(xtrain.reshape(-1, 1))
print("Coefficients: ", reg.coef_)
print("intercept: ", reg.intercept_)
print("Mean-Squared Error: %.2f" % mean_squared_error(ytrain, ypred))

Coefficients:  [0.34358241]
intercept:  0.26703906607398287
Mean-Squared Error: 0.15


### b. Find the best $\epsilon$ and $\alpha$ value

In [13]:
def Huber_find_best(X, y, kfold, epsilons, alphas, to_print = True):
    kf = KFold(n_splits = kfold, shuffle = True)
    best_epsilon, best_alpha = 0, 0
    best_MSE = 100
    for idxe, epsilon_value in enumerate(epsilons):
        for idxa, alpha_value in enumerate(alphas):
            sum = 0
            for train_index, holdout_index in kf.split(X):
                Xtrain, Xholdout = X[train_index], X[holdout_index]
                ytrain, yholdout = y[train_index], y[holdout_index]
                reg = linear_model.HuberRegressor(epsilon = epsilon_value, alpha=alpha_value)
                reg.fit(Xtrain,ytrain)
                ypred = reg.predict(Xholdout)
                sum += mean_squared_error(yholdout, ypred)
            MSE_now = sum/kfold
            if MSE_now < best_MSE:
                print("Current MSE: ",MSE_now,"; Current epsilon: ",epsilon_value,"; Current alpha: ",alpha_value)
                best_MSE = MSE_now
                best_epsilon = epsilon_value
                best_alpha = alpha_value
    if to_print:
        print("===========================")
        print("The best value of epsilon is: ", best_epsilon)
        print("The best value of alpha is: ", best_alpha)
    return [best_epsilon, best_alpha]

In [14]:
# generate test data
np.random.seed(2020)
m = 100
xtest = np.random.rand(m)
ytest = 0.25 + 0.5*xtest + np.sqrt(0.1)*np.random.randn(m)
idx = np.random.randint(0,100,10)
ytest[idx] = ytest[idx] + np.random.randn(10)

In [20]:
# This cell takes 4 minutes to run.
start_time = time.time()
[best_epsilon, best_alpha] = Huber_find_best(X_train, ytrain, 3, np.arange(1.01,2,0.01), np.arange(0,5,0.01))
reg = linear_model.HuberRegressor(epsilon = best_epsilon, alpha=best_alpha)
reg.fit(xtest.reshape(-1, 1),ytest)
ypred = reg.predict(xtest.reshape(-1, 1))
print("===========================")
print("Coefficients: ", reg.coef_)
print("intercept: ", reg.intercept_)
print("Mean-Squared Error: %.2f" % mean_squared_error(ytest, ypred))
print("===========================")
print("This cell takes %s seconds to run." % (time.time() - start_time))

Current MSE:  0.15205083065806316 ; Current epsilon:  1.01 ; Current alpha:  0.0
Current MSE:  0.1462254497559515 ; Current epsilon:  1.01 ; Current alpha:  0.03
Current MSE:  0.14512746614333846 ; Current epsilon:  1.01 ; Current alpha:  0.11
Current MSE:  0.1444861398135839 ; Current epsilon:  1.01 ; Current alpha:  0.6
Current MSE:  0.14436405875075128 ; Current epsilon:  1.02 ; Current alpha:  2.86
Current MSE:  0.14389530733722422 ; Current epsilon:  1.03 ; Current alpha:  1.07
Current MSE:  0.14365433213589848 ; Current epsilon:  1.06 ; Current alpha:  0.14
Current MSE:  0.14355881757834996 ; Current epsilon:  1.1 ; Current alpha:  1.42
The best value of epsilon is:  1.1
The best value of alpha is:  1.42
Coefficients:  [0.43724164]
intercept:  0.27018732193237704
Mean-Squared Error: 0.18
This cell takes 242.35617780685425 seconds to run.
