In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

# Highest polynomial degree tried; the experiment loops iterate over range(1, MAX_DEGREE+1).
MAX_DEGREE = 10
# Fold count handed to cross_validation(), which forwards it to KFold(n_splits=...).
FOLDS = 10

def CalculateMSE_poly(x_positions, y_positions, w_list):
    """Return the mean squared error of a polynomial model on a set of points.

    The model is ``p(x) = sum(w_list[j] * x**(len(w_list) - j - 1))``, i.e. the
    coefficients run from the highest power down to the constant term, which
    is the order ``np.polyfit`` returns them in.

    Parameters
    ----------
    x_positions : array-like
        Input values (list, ndarray or pandas Series).
    y_positions : array-like
        Target values, paired with ``x_positions`` by position.
    w_list : array-like
        Polynomial coefficients, highest power first.

    Returns
    -------
    float
        ``mean((p(x) - y) ** 2)`` over all points.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of values.
    """
    # Convert BOTH inputs to plain arrays so pairing is strictly positional.
    # ``series[i]`` on a pandas Series looks up the *label* i, which raises
    # KeyError (or picks the wrong row) on a filtered or re-indexed Series.
    x = np.asarray(x_positions, dtype=float).ravel()
    y = np.asarray(y_positions, dtype=float).ravel()

    if x.size == 0:
        raise ValueError("Cannot compute the MSE of an empty sample.")
    if x.size != y.size:
        raise ValueError(
            f"x_positions and y_positions must have the same length "
            f"(got {x.size} and {y.size})."
        )

    # np.polyval uses the same highest-power-first convention as np.polyfit.
    residuals = np.polyval(w_list, x) - y
    return np.mean(residuals ** 2)

def generate_feature_matrix(x, degree):
    """Build a matrix whose columns are powers of ``x``.

    Column layout by ``degree``:

    * ``0``   -> one column of ones, shape ``(len(x), 1)``.
    * ``1``   -> two columns: ones and ``x``.
    * ``> 1`` -> ``degree`` columns holding ``x**0 .. x**(degree - 1)``; in
      this branch ``degree`` is effectively the number of columns, not the
      highest power.

    Raises ``ValueError`` for any other value (for example a negative degree).
    """
    if degree == 0:
        return np.ones((len(x), 1))

    if degree == 1:
        bias_column = np.ones(len(x))
        return np.column_stack((bias_column, x))

    if degree > 1:
        # Powers 0 .. degree-1, one column per exponent.
        power_columns = [x ** exponent for exponent in range(0, degree)]
        return np.column_stack(power_columns)

    raise ValueError("Degree must be a non-negative integer.")
    
def CalculateMSE(x_positions, y_positions, w_list):
    """Return the mean squared error of a polynomial model on a set of points.

    Parameters
    ----------
    x_positions : array-like
        Input values (list, ndarray or pandas Series).
    y_positions : array-like
        Target values, paired with ``x_positions`` by position.
    w_list : array-like
        Polynomial coefficients ordered from the highest power down to the
        constant term (the order ``np.polyfit`` returns).

    Returns
    -------
    float
        ``mean((p(x) - y) ** 2)`` over all points.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of values.
    """
    # Plain arrays guarantee positional pairing and let list inputs work.
    x = np.asarray(x_positions, dtype=float).ravel()
    y = np.asarray(y_positions, dtype=float).ravel()

    if x.size == 0:
        raise ValueError("Cannot compute the MSE of an empty sample.")
    if x.size != y.size:
        raise ValueError(
            f"x_positions and y_positions must have the same length "
            f"(got {x.size} and {y.size})."
        )

    # Evaluate the polynomial directly instead of building a power matrix
    # sized by len(w_list): that approach produced a 2-column matrix for a
    # single-coefficient (constant) model and failed with a shape mismatch.
    predictions = np.polyval(w_list, x)
    return np.mean((predictions - y) ** 2)

def cross_validation(X, y, folds, degree, shuffle=False, random_state=None):
    """Estimate the validation RMSE of a polynomial fit with k-fold CV.

    For every fold a polynomial of the given degree is fitted with
    ``np.polyfit`` on the training part and scored with ``CalculateMSE`` on
    the held-out part; the square roots of those scores are averaged.

    Parameters
    ----------
    X : array-like
        Input values; a one-column DataFrame, a Series, an ndarray or a list.
        The values are flattened to one dimension.
    y : array-like
        Target values, paired with ``X`` by position.
    folds : int
        Number of folds (``KFold(n_splits=folds)``).
    degree : int
        Degree of the fitted polynomial.
    shuffle : bool, default False
        Forwarded to ``KFold``. With the default, folds are taken in row
        order; enable it when the rows are ordered (e.g. sorted by x).
    random_state : int or None, default None
        Forwarded to ``KFold``; only meaningful together with
        ``shuffle=True``.

    Returns
    -------
    float
        Mean of the per-fold RMSE values.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not hold the same number of values.
    """
    # Work on flat arrays so the function is not tied to pandas' ``.iloc``.
    x_values = np.asarray(X, dtype=float).ravel()
    y_values = np.asarray(y, dtype=float).ravel()
    if x_values.shape != y_values.shape:
        raise ValueError(
            f"X and y must hold the same number of values "
            f"(got {x_values.size} and {y_values.size})."
        )

    k_folds = KFold(n_splits=folds, shuffle=shuffle, random_state=random_state)

    fold_rmse = []
    for train_index, validate_index in k_folds.split(x_values):
        # Fit on the training rows only, then score on the held-out rows.
        coefficients = np.polyfit(x_values[train_index], y_values[train_index], degree)
        validation_mse = CalculateMSE(
            x_values[validate_index], y_values[validate_index], coefficients
        )
        fold_rmse.append(validation_mse ** 0.5)

    return np.mean(fold_rmse)

# Sin

## Training set

### Noiseless

In [None]:
# Training-set RMSE for sin_noiseless_10sample.csv: each polynomial is fitted
# and scored on the same rows, for degrees 1 through MAX_DEGREE.
print("sin noiseless 10 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_10sample.csv")
inputs, targets = df["x"], df["y"]

for degree in range(1, MAX_DEGREE + 1):
    w = np.polyfit(inputs, targets, degree)
    train_mse = CalculateMSE_poly(inputs, targets, w)
    RMSE = train_mse ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Training-set RMSE for sin_noiseless_20sample.csv: each polynomial is fitted
# and scored on the same rows, for degrees 1 through MAX_DEGREE.
print("sin noiseless 20 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_20sample.csv")
inputs, targets = df["x"], df["y"]

for degree in range(1, MAX_DEGREE + 1):
    w = np.polyfit(inputs, targets, degree)
    train_mse = CalculateMSE_poly(inputs, targets, w)
    RMSE = train_mse ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Training-set RMSE for sin_noiseless_40sample.csv: each polynomial is fitted
# and scored on the same rows, for degrees 1 through MAX_DEGREE.
print("sin noiseless 40 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_40sample.csv")
inputs, targets = df["x"], df["y"]

for degree in range(1, MAX_DEGREE + 1):
    w = np.polyfit(inputs, targets, degree)
    train_mse = CalculateMSE_poly(inputs, targets, w)
    RMSE = train_mse ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Training-set RMSE for sin_noiseless_80sample.csv: each polynomial is fitted
# and scored on the same rows, for degrees 1 through MAX_DEGREE.
print("sin noiseless 80 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_80sample.csv")
inputs, targets = df["x"], df["y"]

for degree in range(1, MAX_DEGREE + 1):
    w = np.polyfit(inputs, targets, degree)
    train_mse = CalculateMSE_poly(inputs, targets, w)
    RMSE = train_mse ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

### Noisy

In [None]:
# Training-set RMSE for sin_noisy_10sample.csv, using the noisy_y column as
# the target: each polynomial is fitted and scored on the same rows.
print("sin noisy 10 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_10sample.csv")
inputs, targets = df["x"], df["noisy_y"]

for degree in range(1, MAX_DEGREE + 1):
    w = np.polyfit(inputs, targets, degree)
    train_mse = CalculateMSE_poly(inputs, targets, w)
    RMSE = train_mse ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Training-set RMSE for sin_noisy_20sample.csv, using the noisy_y column as
# the target: each polynomial is fitted and scored on the same rows.
print("sin noisy 20 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_20sample.csv")
inputs, targets = df["x"], df["noisy_y"]

for degree in range(1, MAX_DEGREE + 1):
    w = np.polyfit(inputs, targets, degree)
    train_mse = CalculateMSE_poly(inputs, targets, w)
    RMSE = train_mse ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Training-set RMSE for sin_noisy_40sample.csv, using the noisy_y column as
# the target: each polynomial is fitted and scored on the same rows.
print("sin noisy 40 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_40sample.csv")
inputs, targets = df["x"], df["noisy_y"]

for degree in range(1, MAX_DEGREE + 1):
    w = np.polyfit(inputs, targets, degree)
    train_mse = CalculateMSE_poly(inputs, targets, w)
    RMSE = train_mse ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Training-set RMSE for sin_noisy_80sample.csv, using the noisy_y column as
# the target: each polynomial is fitted and scored on the same rows.
print("sin noisy 80 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_80sample.csv")
inputs, targets = df["x"], df["noisy_y"]

for degree in range(1, MAX_DEGREE + 1):
    w = np.polyfit(inputs, targets, degree)
    train_mse = CalculateMSE_poly(inputs, targets, w)
    RMSE = train_mse ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

## 10-fold cross-validation

### Noiseless

In [None]:
# Cross-validated RMSE for sin_noiseless_10sample.csv: cross_validation is
# run with FOLDS folds for every degree from 1 through MAX_DEGREE.
print("sin noiseless 10 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_10sample.csv")
features, targets = df[["x"]], df["y"]  # features stays a one-column DataFrame

for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(features, targets, FOLDS, degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validated RMSE for sin_noiseless_20sample.csv: cross_validation is
# run with FOLDS folds for every degree from 1 through MAX_DEGREE.
print("sin noiseless 20 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_20sample.csv")
features, targets = df[["x"]], df["y"]  # features stays a one-column DataFrame

for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(features, targets, FOLDS, degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validated RMSE for sin_noiseless_40sample.csv: cross_validation is
# run with FOLDS folds for every degree from 1 through MAX_DEGREE.
print("sin noiseless 40 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_40sample.csv")
features, targets = df[["x"]], df["y"]  # features stays a one-column DataFrame

for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(features, targets, FOLDS, degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validated RMSE for sin_noiseless_80sample.csv: cross_validation is
# run with FOLDS folds for every degree from 1 through MAX_DEGREE.
print("sin noiseless 80 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_80sample.csv")
features, targets = df[["x"]], df["y"]  # features stays a one-column DataFrame

for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(features, targets, FOLDS, degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

### Noisy

In [None]:
# Cross-validated RMSE for sin_noisy_10sample.csv (target column noisy_y):
# cross_validation is run with FOLDS folds for degrees 1 through MAX_DEGREE.
print("sin noisy 10 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_10sample.csv")
features, targets = df[["x"]], df["noisy_y"]  # features stays a one-column DataFrame

for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(features, targets, FOLDS, degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validated RMSE for sin_noisy_20sample.csv (target column noisy_y):
# cross_validation is run with FOLDS folds for degrees 1 through MAX_DEGREE.
print("sin noisy 20 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_20sample.csv")
features, targets = df[["x"]], df["noisy_y"]  # features stays a one-column DataFrame

for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(features, targets, FOLDS, degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validated RMSE for sin_noisy_40sample.csv (target column noisy_y):
# cross_validation is run with FOLDS folds for degrees 1 through MAX_DEGREE.
print("sin noisy 40 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_40sample.csv")
features, targets = df[["x"]], df["noisy_y"]  # features stays a one-column DataFrame

for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(features, targets, FOLDS, degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validated RMSE for sin_noisy_80sample.csv (target column noisy_y):
# cross_validation is run with FOLDS folds for degrees 1 through MAX_DEGREE.
print("sin noisy 80 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_80sample.csv")
features, targets = df[["x"]], df["noisy_y"]  # features stays a one-column DataFrame

for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(features, targets, FOLDS, degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

## Degree of polynomial

### Degree 8

In [None]:
import os

# CSV files whose fitted coefficients are inspected in the cells below,
# listed as noiseless/noisy pairs.
filenames = [
    'sin_noiseless_10sample.csv',
    'sin_noisy_10sample.csv',
    'sin_noiseless_80sample.csv',
    'sin_noisy_80sample.csv',
]
# Absolute form of the data directory, resolved against the current working directory.
data_path = os.path.abspath("data/sin experiment/")
print(filenames)

In [None]:
# Fit a polynomial of the chosen degree to each selected file and list its
# coefficients.
DEGREE = 8
for filename in filenames:
    df = pd.read_csv(os.path.join(data_path, filename))
    # Noiseless files are fitted against "y", all other files against "noisy_y".
    target_column = "y" if "noiseless" in filename else "noisy_y"
    x, y = df["x"], df[target_column]
    w_list = np.polyfit(x, y, DEGREE)
    print(filename)
    # np.polyfit returns the highest power first, so entry i is labelled
    # with the power len(w_list)-i-1.
    for i in range(len(w_list)):
        print(f'w{len(w_list)-i-1} = {w_list[i]}')

### Degree 3

In [None]:
# Fit a polynomial of the chosen degree to each selected file and list its
# coefficients.
DEGREE = 3
for filename in filenames:
    df = pd.read_csv(os.path.join(data_path, filename))
    # Noiseless files are fitted against "y", all other files against "noisy_y".
    target_column = "y" if "noiseless" in filename else "noisy_y"
    x, y = df["x"], df[target_column]
    w_list = np.polyfit(x, y, DEGREE)
    print(filename)
    # np.polyfit returns the highest power first, so entry i is labelled
    # with the power len(w_list)-i-1.
    for i in range(len(w_list)):
        print(f'w{len(w_list)-i-1} = {w_list[i]}')