In [3]:
import pandas as pd
import numpy as np
from scipy import stats
import random
import math

from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score

In [4]:
def NormalEquation(matX,vecY):
    """Solve the least-squares normal equations ``(X^T X) w = X^T y`` for ``w``.

    Parameters
    ----------
    matX : array-like
        Design matrix with one row per sample. Add a column of ones before
        calling if an intercept term is wanted.
    vecY : array-like
        Target values, one per row of ``matX``.

    Returns
    -------
    numpy.ndarray
        The weight vector ``w``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular.
    """
    X_transpose = np.transpose(matX)

    gram = np.dot(X_transpose, matX)
    moment = np.dot(X_transpose, vecY)

    # Solve the linear system directly instead of forming an explicit inverse:
    # it is cheaper and loses less precision on ill-conditioned design matrices.
    return np.linalg.solve(gram, moment)

def generate_feature_matrix(x, degree):
    """Build a matrix whose columns are successive powers of ``x``.

    The column layout depends on ``degree``:

    * ``degree == 0`` -> a single column of ones, shape ``(len(x), 1)``.
    * ``degree == 1`` -> two columns ``[1, x]``.
    * ``degree > 1``  -> ``degree`` columns ``[x**0, x**1, ..., x**(degree-1)]``.

    NOTE(review): for ``degree > 1`` the highest power is ``degree - 1``, so
    ``degree == 1`` and ``degree == 2`` both produce ``[1, x]``. Callers may be
    passing a coefficient count rather than a polynomial degree -- confirm
    before changing this.

    Parameters
    ----------
    x : sequence or numpy.ndarray
        Sample positions. The ``degree > 1`` branch applies ``**`` to ``x``,
        so it needs an object that supports element-wise power.
    degree : int
        Selects the layout described above.

    Returns
    -------
    numpy.ndarray
        The feature matrix, one row per sample.

    Raises
    ------
    ValueError
        If ``degree`` is not 0, not 1 and not greater than 1.
    """
    if degree > 1:
        power_columns = [x ** power for power in range(degree)]
        return np.column_stack(power_columns)
    if degree == 1:
        return np.column_stack((np.ones(len(x)), x))
    if degree == 0:
        return np.ones((len(x), 1))
    raise ValueError("Degree must be a non-negative integer.")

def CalculateMSE(x_positions, y_positions, w_list):
    """Mean squared error of a polynomial evaluated at ``x_positions``.

    Parameters
    ----------
    x_positions : array-like
        Sample positions (flattened to 1-D).
    y_positions : array-like
        Observed values, one per sample position.
    w_list : array-like
        Polynomial coefficients ordered from the highest power down to the
        constant term, i.e. the order returned by ``np.polyfit``.

    Returns
    -------
    numpy.float64
        ``mean((p(x) - y) ** 2)`` where ``p`` is the polynomial described by
        ``w_list``.

    Raises
    ------
    ValueError
        If there are no samples, or ``x_positions`` and ``y_positions`` differ
        in length.
    """
    x = np.asarray(x_positions, dtype=float).ravel()
    y = np.asarray(y_positions, dtype=float).ravel()
    if x.size == 0:
        raise ValueError("CalculateMSE requires at least one sample.")
    if x.size != y.size:
        raise ValueError("x_positions and y_positions must have the same length.")

    # np.polyval uses the same highest-power-first convention as np.polyfit,
    # so any coefficient count works, including a single constant term.
    residuals = np.polyval(w_list, x) - y
    return np.mean(residuals ** 2)

def testCalculateMSE(x_positions, y_positions, w_list):
    x = np.array(x_positions)
    n = len(x_positions)
    MSE = 0
    for i in range(0, n):
        wx, count = 0, 0
    for w in w_list:
        wx += w * x[i, count-1] if count > 0 else w
        count += 1
        MSE += (wx - y_positions[i]) ** 2
    MSE *= 1/(n)
    return MSE

In [5]:
# Load the sample data set from the relative data/ directory and print its
# target column "y".
df = pd.read_csv("data/sin_noiseless_10sample.csv")
print(df["y"])

0   -1.220000e-16
1   -5.877853e-01
2   -9.510565e-01
3   -9.510565e-01
4   -5.877853e-01
5    0.000000e+00
6    5.877853e-01
7    9.510565e-01
8    9.510565e-01
9    5.877853e-01
Name: y, dtype: float64


In [6]:
# Exploratory check: split the cubic feature columns into 5 folds and, for each
# fold, fit the normal equation on the training rows, printing every step.
k_folds = KFold(n_splits=5)
poly_features = df[["x", "x^2", "x^3"]]

for train_index, validate_index in k_folds.split(poly_features):
    print(train_index, validate_index)
    X_train , X_validate = poly_features.iloc[train_index], poly_features.iloc[validate_index]
    y_train , y_validate = df["y"].iloc[train_index] , df["y"].iloc[validate_index]

    # Bias row sized from the fold itself, so the cell keeps working when the
    # number of rows or the number of splits changes.
    new_X = [[1] * len(train_index)]
    for i in np.array(X_train).transpose():
        new_X.append(list(i))
    print(new_X)
    # Rows of new_X are features; transpose to get one row per sample.
    X_matrix = np.array(new_X).transpose()
    print(X_matrix)
    print(NormalEquation(X_matrix, y_train))
    print(poly_features.shape)

[2 3 4 5 6 7 8 9] [0 1]
[[1, 1, 1, 1, 1, 1, 1, 1], [-0.6, -0.4, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8], [0.36, 0.16, 0.04, 0.0, 0.04, 0.16, 0.36, 0.64], [-0.216, -0.064, -0.008, 0.0, 0.008, 0.064, 0.216, 0.512]]
[[ 1.    -0.6    0.36  -0.216]
 [ 1.    -0.4    0.16  -0.064]
 [ 1.    -0.2    0.04  -0.008]
 [ 1.     0.     0.     0.   ]
 [ 1.     0.2    0.04   0.008]
 [ 1.     0.4    0.16   0.064]
 [ 1.     0.6    0.36   0.216]
 [ 1.     0.8    0.64   0.512]]
[-0.01076204  2.92260765  0.08968369 -3.55868716]
(10, 3)
[0 1 4 5 6 7 8 9] [2 3]
[[1, 1, 1, 1, 1, 1, 1, 1], [-1.0, -0.8, -0.2, 0.0, 0.2, 0.4, 0.6, 0.8], [1.0, 0.64, 0.04, 0.0, 0.04, 0.16, 0.36, 0.64], [-1.0, -0.512, -0.008, 0.0, 0.008, 0.064, 0.216, 0.512]]
[[ 1.    -1.     1.    -1.   ]
 [ 1.    -0.8    0.64  -0.512]
 [ 1.    -0.2    0.04  -0.008]
 [ 1.     0.     0.     0.   ]
 [ 1.     0.2    0.04   0.008]
 [ 1.     0.4    0.16   0.064]
 [ 1.     0.6    0.36   0.216]
 [ 1.     0.8    0.64   0.512]]
[ 0.03330992  2.65935653 -0.14460659 -2

In [7]:
def old_cross_validation(X, y, folds):
    """Average validation RMSE of a normal-equation linear fit over k folds.

    For every fold produced by ``KFold(n_splits=folds)`` the model is fitted
    with ``NormalEquation`` on the training rows (with a leading column of
    ones for the intercept) and scored with ``testCalculateMSE`` on the
    validation rows.

    NOTE: the original author marked this function as "Not done".

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; rows are selected with ``.iloc``.
    y : pandas.Series
        Target values aligned with ``X``.
    folds : int
        Number of splits passed to ``KFold``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.
    """
    splitter = KFold(n_splits=folds)
    fold_rmse = []

    for train_idx, val_idx in splitter.split(X):
        train_features = np.array(X.iloc[train_idx])
        val_features = np.array(X.iloc[val_idx])
        train_targets = y.iloc[train_idx]
        val_targets = np.array(y.iloc[val_idx])

        # Prepend the bias column so the first weight acts as the intercept.
        design = np.insert(train_features, 0, 1, axis=1)
        weights = NormalEquation(design, train_targets)

        fold_mse = testCalculateMSE(val_features, val_targets, weights)
        fold_rmse.append(fold_mse ** (1/2))

    return np.mean(fold_rmse)

In [13]:
def cross_validation(X, y, folds, degree):
    """Average validation RMSE of a polynomial fit of the given degree.

    Each fold from ``KFold(n_splits=folds)`` fits ``np.polyfit`` on the
    flattened training values and scores the coefficients with
    ``CalculateMSE`` on the validation values.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Input values; the selected rows are flattened to 1-D before fitting.
    y : pandas.Series
        Target values aligned with ``X``.
    folds : int
        Number of splits passed to ``KFold``.
    degree : int
        Polynomial degree handed to ``np.polyfit``.

    Returns
    -------
    float
        Average of the per-fold RMSE values.
    """
    fold_rmse = []

    for train_idx, val_idx in KFold(n_splits=folds).split(X):
        x_fit = X.iloc[train_idx].values.flatten()
        x_val = X.iloc[val_idx].values.flatten()
        y_fit = np.array(y.iloc[train_idx])
        y_val = np.array(y.iloc[val_idx])

        coefficients = np.polyfit(x_fit, y_fit, degree)
        fold_mse = CalculateMSE(x_val, y_val, coefficients)
        fold_rmse.append(fold_mse ** (1/2))

    return np.average(fold_rmse)

def cross_validate_for_degree(X, y, folds, degree_list):
    """Pick the polynomial degree with the lowest cross-validated RMSE.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; the search itself only reads the ``"x"`` column.
    y : pandas.Series
        Target values aligned with ``X``.
    folds : int
        Number of folds handed to ``cross_validation``.
    degree_list : iterable of int
        Candidate degrees, tried in order. Must not be empty.

    Returns
    -------
    tuple
        ``(rmse_list, best_degree)`` where ``rmse_list[i]`` is the RMSE for
        the i-th candidate. When ``X`` has exactly one column no search is
        run: ``rmse_list`` is empty and ``best_degree`` is the first candidate.

    Raises
    ------
    IndexError
        If ``degree_list`` is empty.
    """
    # Materialize once so ranges, lists and one-shot iterables all work.
    degrees = list(degree_list)
    best_degree = degrees[0]

    if len(list(X)) == 1:
        # NOTE(review): a single-column frame skips the search entirely --
        # presumably intentional; confirm. The result is still a tuple so
        # callers can index it the same way on every path.
        return ([], best_degree)

    rmse_list = [cross_validation(X["x"], y, folds, degree) for degree in degrees]
    # On ties the earliest (first listed) degree wins.
    best_degree = degrees[rmse_list.index(min(rmse_list))]
    return (rmse_list, best_degree)

def nested_cross_validation(X, y, folds, degree_list):
    """Nested cross-validation of polynomial-degree selection.

    For each outer fold, ``cross_validate_for_degree`` chooses a degree using
    only the outer-training rows (with the same ``folds`` count for the inner
    search). A polynomial of that degree is then fitted to the ``"x"`` column
    of the outer-training rows and scored on the held-out rows.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing an ``"x"`` column.
    y : pandas.Series
        Target values aligned with ``X``.
    folds : int
        Number of splits for both the outer and the inner loop.
    degree_list : sequence of int
        Candidate polynomial degrees.

    Returns
    -------
    float
        Average of the outer-fold RMSE values.
    """
    outer_rmse = []

    for fit_idx, holdout_idx in KFold(n_splits=folds).split(X):
        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X.iloc[holdout_idx], y.iloc[holdout_idx]

        # Inner loop: element 1 of the result is the selected degree.
        selection = cross_validate_for_degree(X_fit, y_fit, folds, degree_list)
        chosen_degree = selection[1]

        coefficients = np.polyfit(
            X_fit["x"].values.flatten(), np.array(y_fit), chosen_degree
        )
        holdout_mse = CalculateMSE(
            X_holdout["x"].values.flatten(), np.array(y_holdout), coefficients
        )
        outer_rmse.append(holdout_mse ** (1/2))

    return np.average(outer_rmse)


In [16]:
# Cross-validated RMSE of a degree-8 polynomial fit on column "x", using 10 folds.
print(cross_validation(df[["x"]], df["y"], 10, 8))

0.00289954424127811


In [17]:
# Search degrees 1-8 with 5 folds; X is every column of df except the last two.
# The cell displays the returned (rmse_list, best_degree) tuple.
cross_validate_for_degree(df[list(df)[:-2]], df["y"], 5, range(1,9))

  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))


([0.6656330596585266,
  1.1910411800020615,
  0.29044687960240695,
  0.48626848538194256,
  0.08610047347755402,
  0.12060067686311929,
  0.016607980611663083,
  0.36285718455116284],
 7)

In [19]:
# Nested cross-validation with 2 folds over candidate degrees 1-8; X is every
# column of df except the last two. The cell displays the returned average RMSE.
nested_cross_validation(df[list(df)[:-2]], df["y"], 2, range(1,9))

  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, degree))
  rmse_list.append(cross_validation(X["x"], y, folds, d

3.6730202883043335