In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import random
import math
import os
from sklearn.model_selection import KFold


import plotly.graph_objects as go
import warnings
# np.RankWarning was removed from NumPy's top-level namespace in NumPy 2.0 (it is
# exposed as numpy.exceptions.RankWarning there). Try the new location first and
# fall back to the old attribute so the notebook runs on both old and new NumPy.
try:
    from numpy.exceptions import RankWarning as _RankWarning
except ImportError:
    _RankWarning = np.RankWarning
# Silence the rank warning that np.polyfit may emit for poorly conditioned fits.
warnings.simplefilter('ignore', _RankWarning)
MAX_DEGREE = 10  # highest polynomial degree the experiment loops go up to
FOLDS = 10       # number of folds passed to cross_validation

def CalculateMSE_poly(x_positions, y_positions, w_list):
    """Return the mean squared error of a polynomial model on the given points.

    Parameters
    ----------
    x_positions : array-like
        Sample inputs.
    y_positions : array-like
        Target values, one per entry of ``x_positions``.
    w_list : array-like
        Polynomial coefficients ordered from the highest power down to the
        constant term (``w_list[j]`` multiplies ``x ** (len(w_list) - j - 1)``).

    Returns
    -------
    numpy.float64
        ``mean((p(x_i) - y_i) ** 2)`` over all samples.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # Convert both inputs to plain arrays so that the pairing is positional;
    # indexing a pandas Series with ``[i]`` is label-based and fails when the
    # Series does not carry the default 0..n-1 index.
    x = np.asarray(x_positions, dtype=float)
    y = np.asarray(y_positions, dtype=float)
    if x.size == 0:
        raise ValueError("Cannot compute the MSE of an empty sample.")
    if x.shape != y.shape:
        raise ValueError("x_positions and y_positions must have the same shape.")

    # np.polyval uses the same highest-power-first coefficient order.
    residuals = np.polyval(w_list, x) - y
    return np.mean(residuals ** 2)

def generate_feature_matrix(x, degree):
    """Build the polynomial design matrix ``[1, x, x**2, ..., x**(degree-1)]``.

    Note that, despite its name, ``degree`` is the number of columns to
    generate (i.e. the number of polynomial coefficients), not the highest
    power: ``degree == 1`` yields a single column of ones.

    Parameters
    ----------
    x : array-like
        Sample inputs (lists, arrays and pandas Series are accepted).
    degree : int
        Number of columns; must be at least 1.

    Returns
    -------
    numpy.ndarray
        Float array with one row per sample and ``degree`` columns, in
        increasing power order.

    Raises
    ------
    ValueError
        If ``degree`` is not an integer >= 1.
    """
    if not isinstance(degree, (int, np.integer)) or degree < 1:
        raise ValueError("Degree must be a positive integer.")
    # Converting up front lets plain Python lists be raised to a power, which
    # the previous general branch could not do.
    x = np.asarray(x, dtype=float)
    return np.column_stack([x ** power for power in range(degree)])
    
def CalculateMSE(x_positions, y_positions, w_list):
    """Return the mean squared error of a polynomial model via a design matrix.

    Parameters
    ----------
    x_positions : array-like
        Sample inputs.
    y_positions : array-like
        Target values, one per entry of ``x_positions``.
    w_list : array-like
        Polynomial coefficients, highest power first.

    Returns
    -------
    numpy.float64
        Mean of the squared differences between predictions and targets.

    Raises
    ------
    ValueError
        If the inputs are empty or their shapes differ.
    """
    # Plain arrays keep the x/y pairing positional (``series[i]`` would be a
    # label lookup on a pandas Series).
    x = np.asarray(x_positions, dtype=float)
    y = np.asarray(y_positions, dtype=float)
    if x.size == 0:
        raise ValueError("Cannot compute the MSE of an empty sample.")
    if x.shape != y.shape:
        raise ValueError("x_positions and y_positions must have the same shape.")

    # Each row of the design matrix is [1, x, x**2, ...] (increasing powers),
    # so the highest-power-first coefficients are flipped once before the product.
    features = generate_feature_matrix(x, len(w_list))
    predictions = features @ np.flip(w_list)
    return np.mean((predictions - y) ** 2)

def cross_validation(X, y, folds, degree):
    """Estimate the validation RMSE of a polynomial fit with k-fold splitting.

    For every split produced by ``KFold(n_splits=folds)`` a polynomial of the
    given ``degree`` is fitted with ``np.polyfit`` on the training rows and
    scored with ``CalculateMSE`` on the held-out rows.

    Parameters
    ----------
    X, y : pandas objects
        Inputs and targets; both are sliced with ``.iloc``.
    folds : int
        Number of splits handed to ``KFold``.
    degree : int
        Polynomial degree handed to ``np.polyfit``.

    Returns
    -------
    The mean of the per-fold RMSE values.
    """
    splitter = KFold(n_splits=folds)

    fold_rmse = []
    for train_idx, val_idx in splitter.split(X):
        # Flatten the inputs to 1-D arrays, which is what np.polyfit is given.
        x_fit = X.iloc[train_idx].values.flatten()
        y_fit = np.array(y.iloc[train_idx])
        x_val = X.iloc[val_idx].values.flatten()
        y_val = np.array(y.iloc[val_idx])

        coeffs = np.polyfit(x_fit, y_fit, degree)
        fold_mse = CalculateMSE(x_val, y_val, coeffs)
        fold_rmse.append(fold_mse ** 0.5)
    return np.mean(fold_rmse)

def polynomial_predictions(x, w):
    """Evaluate the polynomial with coefficients ``w`` at every point of ``x``.

    Parameters
    ----------
    x : array-like
        Points at which to evaluate.
    w : array-like
        Coefficients, highest power first (``w[i]`` multiplies
        ``x ** (len(w) - 1 - i)``).

    Returns
    -------
    numpy.ndarray
        Float array of predictions with the same shape as ``x``.
    """
    # Work on a float copy: accumulating float terms in place into an array
    # that inherited an integer dtype from ``x`` fails with a casting error.
    points = np.asarray(x, dtype=float)
    return np.polyval(w, points)

def model_plot(x, y, w_list):
    """Plot the data points together with one fitted curve per entry of ``w_list``.

    Parameters
    ----------
    x, y : array-like
        Sample points drawn as markers.
    w_list : sequence of array-like
        Polynomial coefficient vectors (highest power first); each one is
        drawn as a line over 100 evenly spaced points between ``min(x)`` and
        ``max(x)``.
    """
    fig = go.Figure()
    x_range = np.linspace(min(x), max(x), 100)

    for w in w_list:
        y_pred = polynomial_predictions(x_range, w)
        # Take the degree from the coefficient count rather than the list
        # position, so the legend stays right when w_list does not start at
        # degree 1 or skips degrees.
        label = f'Polynomial degree {len(w) - 1}'
        fig.add_trace(go.Scatter(x=x_range, y=y_pred, mode='lines', name=label))

    fig.add_trace(go.Scatter(x=x, y=y, mode='markers', name='Data Points', marker=dict(color='blue')))

    fig.update_layout(
        title=f'Data and Polynomial Regression {len(x)} Samples',
        xaxis_title='X',
        yaxis_title='Y',
        showlegend=True,
        width=1000,
        height=600
    )

    fig.show()

def model_plot2(x, y, w_list, line_label_list, y_lim=None):
    """Like ``model_plot`` but with caller-supplied legend labels and an optional y range.

    Parameters
    ----------
    x, y : array-like
        Sample points drawn as markers.
    w_list : sequence of array-like
        Polynomial coefficient vectors (highest power first), one line each.
    line_label_list : sequence of str
        Legend label for each entry of ``w_list`` (same order).
    y_lim : sequence of two numbers, optional
        Fixed ``[low, high]`` range for the y axis; left automatic when None.

    Raises
    ------
    ValueError
        If fewer labels than coefficient vectors are supplied.
    """
    if len(line_label_list) < len(w_list):
        raise ValueError("line_label_list must provide a label for every entry of w_list.")

    fig = go.Figure()

    # Identity check: ``!= None`` is evaluated element-wise for array-like
    # limits and then cannot be used as a plain truth value.
    if y_lim is not None:
        fig.update_layout(yaxis_range=y_lim)

    x_range = np.linspace(min(x), max(x), 100)

    for w, line_label in zip(w_list, line_label_list):
        y_pred = polynomial_predictions(x_range, w)
        fig.add_trace(go.Scatter(x=x_range, y=y_pred, mode='lines', name=line_label))

    fig.add_trace(go.Scatter(x=x, y=y, mode='markers', name='Data Points', marker=dict(color='blue')))

    fig.update_layout(
        title=f'Data and Polynomial Regression {len(x)} Samples',
        xaxis_title='X',
        yaxis_title='Y',
        showlegend=True,
        width=1000,
        height=600
    )

    fig.show()

# Sin

## Training set

### Noiseless

In [None]:
# Training-set fit: noiseless sine data, 10 samples.
print("sin noiseless 10 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_10sample.csv")

# Fit one polynomial per degree (1..MAX_DEGREE), then report each in-sample RMSE.
w_list = [np.polyfit(df["x"], df["y"], degree) for degree in range(1, MAX_DEGREE + 1)]
for degree, w in enumerate(w_list, start=1):
    RMSE = CalculateMSE_poly(df["x"], df["y"], w) ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

# Overlay every fitted curve on the sample points.
model_plot(df["x"], df["y"], w_list)

In [None]:
# Training-set fit: noiseless sine data, 20 samples.
print("sin noiseless 20 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_20sample.csv")

# Fit one polynomial per degree (1..MAX_DEGREE), then report each in-sample RMSE.
w_list = [np.polyfit(df["x"], df["y"], degree) for degree in range(1, MAX_DEGREE + 1)]
for degree, w in enumerate(w_list, start=1):
    RMSE = CalculateMSE_poly(df["x"], df["y"], w) ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

# Overlay every fitted curve on the sample points.
model_plot(df["x"], df["y"], w_list)

In [None]:
# Training-set fit: noiseless sine data, 40 samples.
print("sin noiseless 40 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_40sample.csv")

# Fit one polynomial per degree (1..MAX_DEGREE), then report each in-sample RMSE.
w_list = [np.polyfit(df["x"], df["y"], degree) for degree in range(1, MAX_DEGREE + 1)]
for degree, w in enumerate(w_list, start=1):
    RMSE = CalculateMSE_poly(df["x"], df["y"], w) ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

# Overlay every fitted curve on the sample points.
model_plot(df["x"], df["y"], w_list)

In [None]:
# Training-set fit: noiseless sine data, 80 samples.
print("sin noiseless 80 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_80sample.csv")

# Fit one polynomial per degree (1..MAX_DEGREE), then report each in-sample RMSE.
w_list = [np.polyfit(df["x"], df["y"], degree) for degree in range(1, MAX_DEGREE + 1)]
for degree, w in enumerate(w_list, start=1):
    RMSE = CalculateMSE_poly(df["x"], df["y"], w) ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

# Overlay every fitted curve on the sample points.
model_plot(df["x"], df["y"], w_list)

### Noisy

In [92]:
# Training-set fit: noisy sine data, 10 samples (targets are in "noisy_y").
print("sin noisy 10 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_10sample.csv")

# Fit one polynomial per degree (1..MAX_DEGREE), then report each in-sample RMSE.
w_list = [np.polyfit(df["x"], df["noisy_y"], degree) for degree in range(1, MAX_DEGREE + 1)]
for degree, w in enumerate(w_list, start=1):
    RMSE = CalculateMSE_poly(df["x"], df["noisy_y"], w) ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

# Overlay every fitted curve on the sample points.
model_plot(df["x"], df["noisy_y"], w_list)

sin noisy 10 sample
Degree = 1 | RMSE = 0.5728
Degree = 2 | RMSE = 0.4298
Degree = 3 | RMSE = 0.2265
Degree = 4 | RMSE = 0.2246
Degree = 5 | RMSE = 0.2081
Degree = 6 | RMSE = 0.1063
Degree = 7 | RMSE = 0.0865
Degree = 8 | RMSE = 0.0607
Degree = 9 | RMSE = 0.0000
Degree = 10 | RMSE = 0.0000


In [None]:
# Training-set fit: noisy sine data, 20 samples (targets are in "noisy_y").
print("sin noisy 20 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_20sample.csv")

# Fit one polynomial per degree (1..MAX_DEGREE), then report each in-sample RMSE.
w_list = [np.polyfit(df["x"], df["noisy_y"], degree) for degree in range(1, MAX_DEGREE + 1)]
for degree, w in enumerate(w_list, start=1):
    RMSE = CalculateMSE_poly(df["x"], df["noisy_y"], w) ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

# Overlay every fitted curve on the sample points.
model_plot(df["x"], df["noisy_y"], w_list)

In [None]:
# Training-set fit: noisy sine data, 40 samples (targets are in "noisy_y").
print("sin noisy 40 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_40sample.csv")

# Fit one polynomial per degree (1..MAX_DEGREE), then report each in-sample RMSE.
w_list = [np.polyfit(df["x"], df["noisy_y"], degree) for degree in range(1, MAX_DEGREE + 1)]
for degree, w in enumerate(w_list, start=1):
    RMSE = CalculateMSE_poly(df["x"], df["noisy_y"], w) ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

# Overlay every fitted curve on the sample points.
model_plot(df["x"], df["noisy_y"], w_list)

In [None]:
# Training-set fit: noisy sine data, 80 samples (targets are in "noisy_y").
print("sin noisy 80 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_80sample.csv")

# Fit one polynomial per degree (1..MAX_DEGREE), then report each in-sample RMSE.
w_list = [np.polyfit(df["x"], df["noisy_y"], degree) for degree in range(1, MAX_DEGREE + 1)]
for degree, w in enumerate(w_list, start=1):
    RMSE = CalculateMSE_poly(df["x"], df["noisy_y"], w) ** 0.5
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

# Overlay every fitted curve on the sample points.
model_plot(df["x"], df["noisy_y"], w_list)

## Cross-validation 10-folds

### Noiseless

In [40]:
# Cross-validation: noiseless sine data, 10 samples.
print("sin noiseless 10 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_10sample.csv")

# Degree 0 (constant model) up to MAX_DEGREE; print the mean validation RMSE of each.
for degree in range(MAX_DEGREE + 1):
    RMSE = cross_validation(X=df[["x"]], y=df["y"], folds=FOLDS, degree=degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

sin noiseless 10 sample
Degree = 0 | RMSE = 0.6839
Degree = 1 | RMSE = 0.5104
Degree = 2 | RMSE = 0.6102
Degree = 3 | RMSE = 0.1366
Degree = 4 | RMSE = 0.1645
Degree = 5 | RMSE = 0.0228
Degree = 6 | RMSE = 0.0258
Degree = 7 | RMSE = 0.0029
Degree = 8 | RMSE = 0.0029
Degree = 9 | RMSE = 0.0525
Degree = 10 | RMSE = 0.0493


In [39]:
# Cross-validation: noiseless sine data, 20 samples.
print("sin noiseless 20 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_20sample.csv")

# Degree 0 (constant model) up to MAX_DEGREE; print the mean validation RMSE of each.
for degree in range(MAX_DEGREE + 1):
    RMSE = cross_validation(X=df[["x"]], y=df["y"], folds=FOLDS, degree=degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

sin noiseless 20 sample
Degree = 0 | RMSE = 0.7194
Degree = 1 | RMSE = 0.5126
Degree = 2 | RMSE = 0.6437
Degree = 3 | RMSE = 0.1051
Degree = 4 | RMSE = 0.1744
Degree = 5 | RMSE = 0.0121
Degree = 6 | RMSE = 0.0226
Degree = 7 | RMSE = 0.0012
Degree = 8 | RMSE = 0.0021
Degree = 9 | RMSE = 0.0001
Degree = 10 | RMSE = 0.0001


In [7]:
# Cross-validation: noiseless sine data, 40 samples.
print("sin noiseless 40 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_40sample.csv")

# Degrees 1..MAX_DEGREE; print the mean validation RMSE of each.
for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(X=df[["x"]], y=df["y"], folds=FOLDS, degree=degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validation: noiseless sine data, 80 samples.
print("sin noiseless 80 sample")
df = pd.read_csv("data/sin experiment/sin_noiseless_80sample.csv")

# Degrees 1..MAX_DEGREE; print the mean validation RMSE of each.
for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(X=df[["x"]], y=df["y"], folds=FOLDS, degree=degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

### Noisy

In [None]:
# Cross-validation: noisy sine data, 10 samples (targets are in "noisy_y").
print("sin noisy 10 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_10sample.csv")

# Degrees 1..MAX_DEGREE; print the mean validation RMSE of each.
for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(X=df[["x"]], y=df["noisy_y"], folds=FOLDS, degree=degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validation: noisy sine data, 20 samples (targets are in "noisy_y").
print("sin noisy 20 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_20sample.csv")

# Degrees 1..MAX_DEGREE; print the mean validation RMSE of each.
for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(X=df[["x"]], y=df["noisy_y"], folds=FOLDS, degree=degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validation: noisy sine data, 40 samples (targets are in "noisy_y").
print("sin noisy 40 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_40sample.csv")

# Degrees 1..MAX_DEGREE; print the mean validation RMSE of each.
for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(X=df[["x"]], y=df["noisy_y"], folds=FOLDS, degree=degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

In [None]:
# Cross-validation: noisy sine data, 80 samples (targets are in "noisy_y").
print("sin noisy 80 sample")
df = pd.read_csv("data/sin experiment/sin_noisy_80sample.csv")

# Degrees 1..MAX_DEGREE; print the mean validation RMSE of each.
for degree in range(1, MAX_DEGREE + 1):
    RMSE = cross_validation(X=df[["x"]], y=df["noisy_y"], folds=FOLDS, degree=degree)
    print(f'Degree = {degree} | RMSE = {RMSE:.4f}')

## Degree of polynomial

### Degree 8

In [None]:
import os

# Absolute location of the sine-experiment CSV folder.
data_path = os.path.abspath("data/sin experiment/")

# The four data sets whose fitted coefficients are inspected in the cells below:
# smallest and largest sample size, each without and with noise.
filenames = [
    'sin_noiseless_10sample.csv',
    'sin_noisy_10sample.csv',
    'sin_noiseless_80sample.csv',
    'sin_noisy_80sample.csv',
]
print(filenames)

In [62]:
# Experiment-wide settings, restated here for the E_in / E_out cells.
MAX_DEGREE = 10  # highest polynomial degree evaluated
FOLDS = 10       # number of cross-validation folds

def E_LineScatterPlot(E_in, E_out, X, name):
    """Build a figure comparing two error curves over the same x values.

    Parameters
    ----------
    E_in, E_out : sequence
        y values of the "E_in" and "E_out" traces.
    X : sequence
        Shared x values (shown on the axis titled 'Degree').
    name : str
        Figure title.

    Returns
    -------
    The plotly figure; it is returned rather than shown.
    """
    fig = go.Figure()

    # E_out is added first so the trace/legend order matches the original.
    for trace_name, errors in (("E_out", E_out), ("E_in", E_in)):
        fig.add_trace(go.Scatter(x=X, y=errors, name=trace_name))

    layout_options = dict(
        title=name,
        xaxis_title='Degree',
        yaxis_title='RMSE',
        showlegend=True,
        width=1000,
        height=600,
    )
    fig.update_layout(**layout_options)
    return fig


In [106]:

# E_in (training RMSE) versus E_out (cross-validated RMSE) per polynomial degree
# for the noiseless 80-sample sine data.
# The file name is kept in its own singular variable: rebinding `filenames` to a
# string here would clobber the list of CSV names defined earlier, and the later
# `for filename in filenames` cells would then iterate over single characters.
filename = "sin_noiseless_80sample.csv"
df = pd.read_csv("data/sin experiment/" + filename)

E = {"E_out": [], "E_in": []}
for degree in range(0, MAX_DEGREE+1):
    # Out-of-sample estimate: mean validation RMSE across the folds.
    E["E_out"].append(cross_validation(df[["x"]], df["y"], FOLDS, degree))
    # In-sample error: fit on every row and score on the same rows.
    w = np.polyfit(df[["x"]].values.flatten(), np.array(df["y"]), degree)
    E["E_in"].append(CalculateMSE(df[["x"]].values.flatten(), np.array(df["y"]), w)**(1/2))

# Last expression of the cell, so the returned figure is displayed; the title is
# the file name without its ".csv" suffix.
E_LineScatterPlot(E["E_in"], E["E_out"], list(range(0, MAX_DEGREE+1)), filename[:-4])



In [102]:
# E_in (training RMSE) versus E_out (cross-validated RMSE) per polynomial degree
# for the noisy 80-sample sine data (targets are in "noisy_y").
# The file name is kept in its own singular variable: rebinding `filenames` to a
# string here would clobber the list of CSV names defined earlier, and the later
# `for filename in filenames` cells would then iterate over single characters.
filename = "sin_noisy_80sample.csv"
df = pd.read_csv("data/sin experiment/" + filename)

E = {"E_out": [], "E_in": []}
for degree in range(0, MAX_DEGREE+1):
    # Out-of-sample estimate: mean validation RMSE across the folds.
    E["E_out"].append(cross_validation(df[["x"]], df["noisy_y"], FOLDS, degree))
    # In-sample error: fit on every row and score on the same rows.
    w = np.polyfit(df[["x"]].values.flatten(), np.array(df["noisy_y"]), degree)
    E["E_in"].append(CalculateMSE(df[["x"]].values.flatten(), np.array(df["noisy_y"]), w)**(1/2))

# Last expression of the cell, so the returned figure is displayed; the title is
# the file name without its ".csv" suffix.
E_LineScatterPlot(E["E_in"], E["E_out"], list(range(0, MAX_DEGREE+1)), filename[:-4])

In [69]:
# Print the fitted coefficients of a degree-8 polynomial for every listed data set.
DEGREE = 8
for filename in filenames:
    csv_path = os.path.join(data_path, filename)
    df = pd.read_csv(csv_path)
    x = df["x"]
    # Noiseless files keep the target in "y"; the others use "noisy_y".
    y = df["y"] if "noiseless" in filename else df["noisy_y"]
    w_list = np.polyfit(x, y, DEGREE)
    print(filename)
    # Coefficients come highest power first, so position i is labelled w_{len-1-i}.
    for i in range(len(w_list)):
        print(f'w{len(w_list)-i-1} = {w_list[i]}')

### Degree 3

In [None]:
# Print the fitted coefficients of a degree-3 polynomial for every listed data set.
DEGREE = 3
for filename in filenames:
    csv_path = os.path.join(data_path, filename)
    df = pd.read_csv(csv_path)
    x = df["x"]
    # Noiseless files keep the target in "y"; the others use "noisy_y".
    y = df["y"] if "noiseless" in filename else df["noisy_y"]
    w_list = np.polyfit(x, y, DEGREE)
    print(filename)
    # Coefficients come highest power first, so position i is labelled w_{len-1-i}.
    for i in range(len(w_list)):
        print(f'w{len(w_list)-i-1} = {w_list[i]}')

# More complex target function

In [None]:
# Build a high-degree polynomial "target function", cache it and a 15-point
# sample of it as CSV files, and plot both.
# The order of the random.* calls below matters: every draw comes from the same
# seeded stream, so reordering them would change the generated data.

# Set a fixed seed value (seeds Python's `random` module only)
seed_value = 30
random.seed(seed_value)

# Generate random y for a 50-degree polynomial
degree = 50
y_values = [random.uniform(-10, 10) for _ in range(degree)]
print(y_values)

# Generate x values for polyfit (one x per y value, evenly spaced on [-1.5, 1.5])
x_values = np.linspace(-1.5, 1.5, len(y_values))

# NOTE(review): y_values holds `degree` (= 50) points while a degree-50 fit has
# 51 coefficients, so the fit is under-determined; presumably this relies on the
# RankWarning filter installed at the top of the notebook - confirm whether
# `degree + 1` points (or a fit of degree - 1) was intended. The literal 50
# duplicates `degree`.
target_function = np.polyfit(x_values, y_values, 50)

# Generate x values for plotting
x_range = np.linspace(-1, 1, 1000)  # Adjust the range as needed
y_range = polynomial_predictions(x_range, target_function)

# The curve is written only when the file is missing; an existing file from an
# earlier run is reused as-is even if the code above has changed.
if not os.path.exists("data/50_degree_target_function.csv"):
    target_df = pd.DataFrame({'x':x_range, 'y':y_range})
    target_df.to_csv("data/50_degree_target_function.csv", index=False)

# From here on x_range / y_range are the columns read back from the CSV file.
df = pd.read_csv("data/50_degree_target_function.csv")
x_range = df["x"]
y_range = df["y"]

# Sample point: 15 x values drawn uniformly from [-1, 1], evaluated on the target
x_15_sample = np.array([random.uniform(-1, 1) for _ in range(15)])
y_15_sample = polynomial_predictions(x_15_sample, target_function)

# Same write-if-missing caching as above. Note that the scatter plot below uses
# the freshly computed samples, not the contents of this file.
if not os.path.exists("data/50_degree_target_function_15sample.csv"):
    sample_df = pd.DataFrame({'x':x_15_sample, 'y':y_15_sample})
    sample_df.to_csv("data/50_degree_target_function_15sample.csv", index=False)

# Plot the graph
plt.figure(figsize=(10, 6))
plt.plot(x_range, y_range, label='Target function', color='blue')
plt.scatter(x_15_sample, y_15_sample, label='Sample points', color='red')
plt.xlabel('x')
plt.ylabel('P(x)')
plt.title('50-Degree Polynomial Graph')
plt.legend()
plt.grid()
plt.show()

In [None]:
# Compare a 2nd-order and a 10th-order fit on the 15 cached sample points.
df = pd.read_csv("data/50_degree_target_function_15sample.csv")

degree_list = [2, 10]
# Fit every degree first, then report the cross-validated and training RMSE of each.
w_list = [np.polyfit(df["x"], df["y"], degree) for degree in degree_list]
for degree, w in zip(degree_list, w_list):
    RMSE = cross_validation(X=df["x"], y=df["y"], folds=FOLDS, degree=degree)
    print(f'Degree = {degree} | RMSE_out = {RMSE:.4f}')
    RMSE_in = CalculateMSE_poly(df["x"], df["y"], w) ** 0.5
    print(f'Degree = {degree} | RMSE_in = {RMSE_in:.4f}')

# Legend labels in the same order as degree_list; the y axis is fixed to [-15, 15].
label = ["2nd Order Fit", "10th Order Fit"]
model_plot2(df["x"], df["y"], w_list, label, y_lim=[-15, 15])