In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import (
    Lasso,
    LassoCV,
    LassoLarsCV,
    LinearRegression,
    RidgeCV,
    SGDRegressor,
)
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

## Data Loading

In [None]:
    # Load the training table and split the target column off the features.
    data = pd.read_csv("train.csv")
    y = data.pop("y").to_numpy()
    data = data.drop(columns="Id")
    # Preview the first rows of the remaining columns.
    print(data.head())

    # Plain ndarray of the feature columns for the numpy / scikit-learn code below.
    X = data.to_numpy()

In [None]:
X[:,0].size

## RMSE function

In [None]:
def calculate_RMSE(w, X, y):
    """Compute the empirical RMSE of predicting y from X with the linear model X @ w.

    Parameters
    ----------
    w: array of floats: dim = (d,), weights of the linear model
    X: matrix of floats, dim = (n,d), inputs with d features
    y: array of floats, dim = (n,), target values

    Returns
    ----------
    RMSE: float: dim = 1, square root of the mean squared residual
    """
    # Flatten both sides so a column-vector y or w cannot silently broadcast
    # the residual into an (n, n) matrix.
    y_true = np.ravel(np.asarray(y, dtype=float))
    y_pred = np.ravel(np.matmul(X, w))
    # Computed with numpy directly: mean_squared_error(..., squared=False) is
    # deprecated since scikit-learn 1.4 and no longer accepted by later releases.
    RMSE = float(np.sqrt(np.mean(np.square(y_true - y_pred))))
    assert np.isscalar(RMSE)
    return RMSE

## Data transformation

### Test 

In [None]:
# One row per sample in X, 21 feature columns; the row count is read from X
# instead of being hard-coded so the cell survives a differently sized train.csv.
X_transformed = np.zeros((X.shape[0], 21))

In [None]:
X_transformed[:,0].shape

In [None]:
# Linear
X_transformed[:,0:5] = X

In [None]:
# Quadratic
X_transformed[:,5:10] = np.multiply(X, X)

In [None]:
# Exponential
X_transformed[:,10:15] = np.exp(X)

In [None]:
# Cosine
X_transformed[:,15:20] = np.cos(X)

In [None]:
# Constant
X_transformed[:,20] = 1

In [None]:
X[0,:]

In [None]:
X_transformed[0,:]

### Function implementation

In [None]:
## function
def transform_data(X):
    """
    This function transforms the 5 input features of matrix X (x_i denoting the i-th component of X) 
    into 21 new features phi(X) in the following manner:
    5 linear features: phi_1(X) = x_1, phi_2(X) = x_2, phi_3(X) = x_3, phi_4(X) = x_4, phi_5(X) = x_5
    5 quadratic features: phi_6(X) = x_1^2, phi_7(X) = x_2^2, phi_8(X) = x_3^2, phi_9(X) = x_4^2, phi_10(X) = x_5^2
    5 exponential features: phi_11(X) = exp(x_1), phi_12(X) = exp(x_2), phi_13(X) = exp(x_3), phi_14(X) = exp(x_4), phi_15(X) = exp(x_5)
    5 cosine features: phi_16(X) = cos(x_1), phi_17(X) = cos(x_2), phi_18(X) = cos(x_3), phi_19(X) = cos(x_4), phi_20(X) = cos(x_5)
    1 constant features: phi_21(X)=1

    Parameters
    ----------
    X: matrix of floats, dim = (n,5), inputs with 5 features (n = 700 for the training set)

    Returns
    ----------
    X_transformed: array of floats: dim = (n,21), transformed input with 21 features

    Raises
    ----------
    ValueError: if X is not a 2-D array with exactly 5 columns
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2 or X.shape[1] != 5:
        raise ValueError(f"X must have shape (n, 5), got {X.shape}")

    n_samples = X.shape[0]
    # Column order matters: linear, quadratic, exponential, cosine, constant.
    X_transformed = np.hstack(
        [
            X,
            np.square(X),
            np.exp(X),
            np.cos(X),
            np.ones((n_samples, 1)),
        ]
    )
    assert X_transformed.shape == (n_samples, 21)
    return X_transformed



##  Fit

### Test

In [None]:
weight = np.zeros((21,))

In [None]:
# Split into training and test sets. The fixed seed keeps the split, and every
# RMSE computed on xtest/ytest, reproducible across kernel restarts.
xtrain,xtest,ytrain,ytest = train_test_split(X_transformed, y, test_size=0.3, random_state=0)

In [None]:
# Candidate regularization strengths for the alpha search:
# 5000 values evenly spaced on a base-10 log scale, from 10**-10 up to 10**2.
alpha_range = np.logspace(start=-10, stop=2, num=5000, base=10)

In [None]:
alpha_range

### Lasso 

In [None]:
Lasso?

In [None]:
# The last column of X_transformed is the constant feature, and the model is
# scored through coef_ alone, so no separate intercept is fitted (it would be
# dropped by calculate_RMSE and would zero out the constant feature's weight).
lasso = Lasso(max_iter = 10000, alpha = 0.003, fit_intercept=False, random_state = 99).fit(xtrain,ytrain)

In [None]:
calculate_RMSE(lasso.coef_, xtest, ytest)

### LassoCV

In [None]:
# fit_intercept=False: the constant column of X_transformed carries the bias,
# and only coef_ is used when this model is scored with calculate_RMSE.
lasso_ = LassoCV(alphas=alpha_range, fit_intercept=False, random_state = 99, cv=5).fit(xtrain,ytrain)

In [None]:
# fit_intercept=False: the constant column of X_transformed carries the bias,
# and only coef_ is used when this model is scored with calculate_RMSE.
# NOTE(review): this model is fitted on all of X_transformed, which includes the
# rows of xtest, so its RMSE on xtest is not a held-out estimate.
lasso_2 = LassoCV(alphas=alpha_range, fit_intercept=False, random_state = 99, cv=5).fit(X_transformed, y)

In [None]:
# Inspect the best regularization coefficient selected by cross-validation
best_alpha = lasso_2.alpha_ 

In [None]:
best_alpha

In [None]:
calculate_RMSE(lasso_2.coef_, xtest, ytest)

In [None]:
calculate_RMSE(lasso_.coef_, xtest, ytest)

### Linear

In [None]:
Linear_ = LinearRegression(fit_intercept=False)

In [None]:
Linear_.fit(xtrain,ytrain)

In [None]:
calculate_RMSE(Linear_.coef_, xtest, ytest)

In [None]:
LinearRegression?

### Ridge 

In [None]:
# fit_intercept=False: the constant column of X_transformed carries the bias,
# and only coef_ is used when this model is scored with calculate_RMSE.
ridge_ = RidgeCV(alphas=alpha_range, fit_intercept=False, cv=5).fit(xtrain,ytrain)

In [None]:
# fit_intercept=False: the constant column of X_transformed carries the bias,
# and only coef_ is used when this model is scored with calculate_RMSE.
# NOTE(review): this model is fitted on all of X_transformed, which includes the
# rows of xtest, so its RMSE on xtest is not a held-out estimate.
ridge_2 = RidgeCV(alphas=alpha_range, fit_intercept=False, cv=5).fit(X_transformed, y)

In [None]:
ridge_.alpha_

In [None]:
ridge_.coef_

In [None]:
calculate_RMSE(ridge_.coef_, xtest, ytest)

In [None]:
ridge_2.alpha_

In [None]:
ridge_2.coef_

In [None]:
calculate_RMSE(ridge_2.coef_, xtest, ytest)

### SGDRegressor 

In [None]:
SGDRegressor?

In [None]:
SGD = SGDRegressor(loss='squared_error', penalty='l2', fit_intercept=False, random_state=0)

In [None]:
SGD.fit(X_transformed, y)

In [None]:
calculate_RMSE(SGD.coef_, xtest, ytest)

### SGDRegressor CV

In [None]:
loss_range = ['squared_error',  'epsilon_insensitive',  'squared_epsilon_insensitive', 'huber']
penalty_range = ['l1',  'l2']
n_folds = 10

In [None]:
loss_range[1]

In [None]:
# Try every (loss, penalty) combination: fit on the training split and print
# the loss name, the penalty name and the RMSE on the held-out split.
for loss in loss_range:
    for penalty in penalty_range:
        # Rebinds the notebook-level SGD each iteration; after the loop it
        # holds the model for the last (loss, penalty) pair.
        SGD = SGDRegressor(loss=loss, penalty=penalty, fit_intercept=False, random_state=0).fit(xtrain, ytrain)
        print(loss)
        print(penalty)
        print(calculate_RMSE(SGD.coef_, xtest, ytest))

In [None]:
def fit_SGD(X, y, loss, penalty):
    """Fit an intercept-free SGDRegressor on (X, y) and return its weight vector.

    Parameters
    ----------
    X: matrix of floats, dim = (n,d), training inputs
    y: array of floats, dim = (n,), training targets
    loss: str, loss name forwarded to SGDRegressor
    penalty: str, regularization name forwarded to SGDRegressor

    Returns
    ----------
    w: array of floats: dim = (d,), fitted coefficients (coef_ of the model)
    """
    # Train on the data passed in. Using the notebook-level xtrain/ytrain here
    # would give every cross-validation fold the same model and let validation
    # rows leak into training.
    SGD = SGDRegressor(loss=loss, penalty=penalty, fit_intercept=False, random_state=0).fit(X, y)
    w = SGD.coef_
    assert w.shape == (np.shape(X)[1],)
    return w

In [None]:
def select_model(X, y, loss_range, penalty_range, n_folds):
    """Pick the (loss, penalty) pair with the lowest K-fold cross-validated RMSE.

    Parameters
    ----------
    X: matrix of floats, dim = (n,d), inputs
    y: array of floats, dim = (n,), targets
    loss_range: list of str, candidate losses passed to fit_SGD
    penalty_range: list of str, candidate penalties passed to fit_SGD
    n_folds: int, number of KFold splits

    Returns
    ----------
    avg_RMSE: array of floats: dim = (len(loss_range), len(penalty_range)),
        RMSE averaged over the folds for every combination
    best_para: list [loss, penalty] of the combination with the smallest avg_RMSE
    """
    RMSE_mat = np.zeros((n_folds, len(loss_range), len(penalty_range)))
    kf = KFold(n_splits=n_folds)
    for index_loss, loss in enumerate(loss_range):
        for index_pen, penalty in enumerate(penalty_range):
            for index_fold, (train_index, valid_index) in enumerate(kf.split(X)):
                weight = fit_SGD(X[train_index], y[train_index], loss, penalty)
                RMSE_mat[index_fold, index_loss, index_pen] = calculate_RMSE(
                    weight, X[valid_index], y[valid_index]
                )

    avg_RMSE = np.mean(RMSE_mat, axis=0)
    # argmin works on the flattened grid; unravel_index maps it back to
    # (loss index, penalty index). Taking the flat index modulo a constant
    # selects the wrong loss whenever the grid is not square.
    best_loss, best_pen = np.unravel_index(np.argmin(avg_RMSE), avg_RMSE.shape)
    best_para = [loss_range[best_loss], penalty_range[best_pen]]
    return avg_RMSE, best_para

In [None]:
avg_RMSE,best_para = select_model(X_transformed, y, loss_range, penalty_range, n_folds)

In [None]:
avg_RMSE

In [None]:
best_para

In [None]:
w_SGD = SGDRegressor(loss = best_para[0], penalty = best_para[1], fit_intercept=False, random_state=0).fit(X_transformed, y)

In [None]:
w_SGD

### Final weights and export

In [None]:
# w_SGD is the fitted SGDRegressor itself; the weight vector to export is its
# coef_ array (np.savetxt cannot serialize the estimator object).
w = w_SGD.coef_

In [None]:
np.savetxt("./results.csv", w, fmt="%.12f")

In [None]:
w

[ 6.62159306 -5.21058439 -1.36003242  7.06248808 -0.52948709 -0.2185518
  2.13810967  3.76680927  2.71728928  7.5252329  -6.94040494  3.68717124
 -0.87612654 -7.12829856  2.42435479  2.32225476  0.15700074  0.05849815
 -0.40078346 -1.42872466  1.6447392 ]

In [None]:
w_r = np.array([ 6.62159306, -5.21058439, -1.36003242,  7.06248808, -0.52948709, -0.2185518,
  2.13810967,  3.76680927,  2.71728928,  7.5252329,  -6.94040494,  3.68717124,
 -0.87612654, -7.12829856,  2.42435479,  2.32225476,  0.15700074,  0.05849815,
 -0.40078346, -1.42872466,  1.6447392])

In [None]:
calculate_RMSE(w_r, xtest, ytest)