In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import random
import math

from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, cross_val_score

In [2]:
def NormalEquation(matX,vecY):
    X_transpose = np.transpose(matX)

    left_side = np.dot(X_transpose,matX)
    right_side = np.dot(X_transpose,vecY)

    ans = np.linalg.inv(left_side).dot(right_side)
    return ans

def generate_feature_matrix(x, degree):
    if degree == 0:
        X = np.ones((len(x), 1))
        return X
    elif degree == 1:
        X = np.column_stack((np.ones(len(x)), x))
        return X
    elif degree > 1:
        X_poly = np.column_stack([x ** d for d in range(1, degree + 1)])
        return X_poly
    else:
        raise ValueError("Degree must be a non-negative integer.")

def CalculateMSE(x_positions, y_positions, w_list):
    n = len(x_positions)
    x_positions = generate_feature_matrix(x_positions,len(w_list))
    MSE = 0
    for i in range(0, n):
        wx = np.dot(x_positions[i],np.flip(w_list))
        MSE += (wx - y_positions[i]) ** 2
        
    MSE *= 1/(n)
    # print("new",MSE)
    return MSE

def testCalculateMSE(x_positions, y_positions, w0, w1):
    n = len(x_positions)
    MSE = 0
    for i in range(0, n):
        MSE += (w0 + w1*x_positions[i] - y_positions[i]) ** 2
    MSE *= 1/(n)
    # print("old",MSE)
    return MSE

In [6]:
df = pd.read_csv("data/sin_noiseless_10sample.csv")
print(df["y"])

0   -1.220000e-16
1   -5.877853e-01
2   -9.510565e-01
3   -9.510565e-01
4   -5.877853e-01
5    0.000000e+00
6    5.877853e-01
7    9.510565e-01
8    9.510565e-01
9    5.877853e-01
Name: y, dtype: float64


In [None]:
k_folds = KFold(n_splits=5)

for train_index, validate_index in k_folds.split(df[["x", "x^2", "x^3"]]):
    print(train_index, validate_index)
    X_train , X_validate = df[["x", "x^2", "x^3"]].iloc[train_index],df[["x", "x^2", "x^3"]].iloc[validate_index]
    y_train , y_validate = df["y"].iloc[train_index] , df["y"].iloc[validate_index]

    new_X = [[1,1,1,1,1,1,1,1]]
    for i in np.array(X_train).transpose():
        new_X.append(list(i))
    print(new_X)
    X_matrix = np.array(new_X).transpose()
    print(X_matrix)
    print(NormalEquation(X_matrix, y_train))
    print(df[["x", "x^2", "x^3"]].shape)

In [None]:
def old_cross_validation(X, y, folds):
    #Not done
    k_folds = KFold(n_splits=folds)

    avg_rmse = []

    for train_index, validate_index in k_folds.split(X):
        #print(train_index, validate_index)
        X_train , X_validate = X.iloc[train_index], X.iloc[validate_index]
        y_train , y_validate = y.iloc[train_index] , y.iloc[validate_index]

        new_X = [[1 for i in range(X_train.shape[0])]]
        for i in np.array(X_train).transpose():
            new_X.append(list(i))
        #print(new_X)
        X_matrix = np.array(new_X).transpose()
        model = NormalEquation(X_matrix, y_train)
        #print(X_matrix)
        #print(model)
        avg_rmse.append(CalculateMSE(np.array(X_validate), np.array(y_validate), model)**(1/2))
        #print(testCalculateMSE(np.array(X_validate), np.array(y_validate), model[0], model[1]))
    print(avg_rmse)
    return np.mean(avg_rmse)

In [21]:
def cross_validation(X, y, folds, degree):
    #Not done
    k_folds = KFold(n_splits=folds)
    
    avg_rmse = []
    for train_index, validate_index in k_folds.split(X):
        # print(train_index,validate_index)
        X_train , X_validate = X.iloc[train_index], X.iloc[validate_index]
        y_train , y_validate = y.iloc[train_index], y.iloc[validate_index]

        result = np.polyfit(X_train.values.flatten(), np.array(y_train), degree)

        avg_rmse.append(CalculateMSE(X_validate.values.flatten(), np.array(y_validate), result))
    print(np.mean(avg_rmse))
    return np.mean(avg_rmse)

def cross_validate_for_degree(X, y, folds, degree_list):
    column_name = list(X)
    print(column_name)
    best_degree = degree_list[0]
    if len(column_name) == 1: return best_degree
    best_mse = cross_validation(X["x"], y, folds, degree_list[0])
    for degree in degree_list[1:]:
        mse = cross_validation(X["x"], y, folds, degree)
        if mse < best_mse:
            best_mse = mse
            best_degree = degree
    return best_degree

In [19]:
cross_validation(df[["x"]], df["y"], 2, 8)

  cross_validation(df[["x"]], df["y"], 2, 8)
  cross_validation(df[["x"]], df["y"], 2, 8)


2.7945397670282217

In [22]:
cross_validate_for_degree(df[list(df)[:-2]], df["y"], 10, range(1,9))

['x', 'x^2', 'x^3', 'x^4', 'x^5', 'x^6', 'x^7']
0.7044217065746845
0.8325715444244797
0.7084166683337539
0.7058682724654413
0.6415387676795928
0.6393181623426114
0.6411426584328244
0.6414514721904806


6