In [38]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.model_selection import KFold

In [39]:
# Load the training set from the working directory.
train_data = pd.read_csv('train.csv')

# Target vector: the column named 'y'.
y = train_data['y'].to_numpy()

# Feature matrix: every column except the label. Selecting by name instead of
# by position (iloc[:, 1:]) keeps the label out of X even if the column order
# of the CSV ever changes.
X = train_data.drop(columns='y').to_numpy()

In [40]:
# Experiment configuration
rand_seed = 10     # seed value reused wherever a random_state is required
number_folds = 10  # number of cross-validation folds
# Regularisation strengths (lambda) to be compared
lambda_vec = np.array((0.1, 1.0, 10.0, 100.0, 200.0))

In [41]:
# Ridge regression evaluated with K-fold cross-validation.
#
# For every regularisation strength in lambda_vec, a ridge model is fitted on
# the training part of each fold and scored on the held-out part with the
# RMSE; the mean RMSE over the folds is stored in RMSE_4_lambda.

# 1. Split the data into K = number_folds shuffled folds.
#    With shuffle=True and an integer random_state, split() yields the same
#    partition on every call, so the index pairs are generated once and
#    reused for every lambda.
kfolds_class = KFold(number_folds, shuffle=True, random_state=rand_seed)
fold_indices = list(kfolds_class.split(X))

# 2. Fit and score one ridge model per (lambda, fold) pair.
RMSE_4_lambda = []
for lbd in lambda_vec:
    # solver='svd' computes the ridge coefficients from a singular value
    # decomposition of X. No random_state is passed: scikit-learn only uses
    # it for the 'sag' and 'saga' solvers.
    regression_class = linear_model.Ridge(alpha=lbd, solver='svd')

    RMSE_list = []
    for train_index, test_index in fold_indices:
        X_train, X_validation = X[train_index], X[test_index]
        y_train, y_validation = y[train_index], y[test_index]

        # fit() re-estimates the weights from scratch, so the same estimator
        # object can safely be reused across folds.
        regression_class.fit(X_train, y_train)
        pred_label = regression_class.predict(X_validation)

        # mean_squared_error expects (y_true, y_pred), in that order.
        RMSE_list.append(mean_squared_error(y_validation, pred_label) ** 0.5)

    # Average validation RMSE over the folds for this lambda.
    RMSE_4_lambda.append(np.mean(RMSE_list))

RMSE_4_lambda = np.array(RMSE_4_lambda)
print(RMSE_4_lambda)

[5.32224088 5.32304588 5.24667796 5.36259547 5.45166629]


In [42]:
# Wrap the per-lambda mean RMSE values in a Series so pandas handles the CSV
# formatting.
rmse_series = pd.Series(RMSE_4_lambda)

# Write one value per line, with neither a header row nor an index column.
rmse_series.to_csv(
    'Submission.csv',
    index=False,
    header=False,
)

Report: the code is divided into five parts. I used a notebook because it makes the output of each section easy to inspect.
The first part contains the imports: I used sklearn, a standard library for ML, which provides exactly the classes and functions needed to perform what was requested by the task.
 For the second part I used pandas to import the train .csv file from the same folder and split labels and data into separate numpy arrays; the features are all columns except the first one, which is the label. The third section simply defines the configuration variables requested by the task. I have set a random seed for reproducibility purposes. Fourth section: I first used the sklearn class KFold; the idea is to use the split method of the KFold class to generate indices that split the data into training and validation sets. Shuffle allows a fold to be formed from non-consecutive elements. RMSE_4_lambda is used to store the average of the RMSE for each lambda. Then we iterate over each lambda. regression_class is the object used to store the model that performs ridge regression; there we use the current lambda and singular value decomposition as the solver (as it has given the lowest error so far). X_train and X_validation, y_train and y_validation are arrays; the training ones are fed to the .fit method, which calculates the weights, then we do the forward pass with the validation set and calculate the RMSE for the fold. For each lambda we average the RMSE over the K folds, then we write our results to the submission file in the fifth section.