# import packages

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import os

# Load the data and split it into training, validation, and test sets

In [6]:
# Read the tab-separated data file located in the current working directory.
data_path = os.path.join(os.getcwd(), 'hk_var_present.txt')
data = pd.read_csv(data_path, sep='\t')

# Window of interest, given as day-of-year numbers: March 15 (day 74) through
# August 15 (day 227), both ends included.
# NOTE(review): the selection below is positional (`iloc`, zero-based) and no
# year filter is applied -- confirm that rows 74..227 of the file really are
# the intended days of the intended year (the original note mentions 2015).
start_day = 74
end_day = 227
day_window = slice(start_day, end_day + 1)
temperature = data['Temperature (K)'].iloc[day_window]

# Model input is a single column holding the day number; the target is the
# temperature reading taken from the same window.
X = np.arange(start_day, end_day + 1)[:, np.newaxis]
y = temperature.values

# Two-stage split with a fixed seed: hold out 25% as the test set first, then
# carve one third of the remainder off for validation (roughly 50/25/25).
outer_split = train_test_split(X, y, test_size=0.25, random_state=42)
X_train_val, X_test, y_train_val, y_test = outer_split
inner_split = train_test_split(X_train_val, y_train_val, test_size=1/3, random_state=42)
X_train, X_val, y_train, y_val = inner_split

# Select the polynomial order: fit orders 1 to 20 and compare validation MSE

In [11]:
# Model selection: fit one polynomial regression per candidate order on the
# training set, score each on the validation set, and keep the best scorer.
# NOTE(review): the inputs are raw day numbers (74..227); raised to powers up
# to 20 they span many orders of magnitude, so the high-order fits are likely
# ill-conditioned -- consider scaling the input before expanding it.
candidate_orders = range(1, 21)  # polynomial orders 1 through 20
val_mse_by_order = {}
model_by_order = {}

for order in candidate_orders:
    # Expansion is fitted on the training inputs and only applied to validation.
    poly = PolynomialFeatures(degree=order)
    X_train_poly = poly.fit_transform(X_train)
    X_val_poly = poly.transform(X_val)

    # Ordinary least squares on the expanded training features.
    model = LinearRegression()
    model.fit(X_train_poly, y_train)

    # Validation error for this order.
    y_val_pred = model.predict(X_val_poly)
    mse = mean_squared_error(y_val, y_val_pred)
    print(f"Polynomial order: {order}, Validation MSE: {mse}")

    val_mse_by_order[order] = mse
    model_by_order[order] = model

# `min` walks the dict in insertion order and returns the first minimum, so a
# tie is resolved in favour of the lowest order.
best_order = min(val_mse_by_order, key=val_mse_by_order.get)
best_mse = val_mse_by_order[best_order]
best_model = model_by_order[best_order]

print(f"Best polynomial order: {best_order}")
print(f"Validation MSE: {best_mse}")

Polynomial order: 1, Validation MSE: 1.2150034553195335
Polynomial order: 2, Validation MSE: 0.38313552710690324
Polynomial order: 3, Validation MSE: 0.3800358444117739
Polynomial order: 4, Validation MSE: 0.4181822676405261
Polynomial order: 5, Validation MSE: 0.4157344773948194
Polynomial order: 6, Validation MSE: 0.39319154668889017
Polynomial order: 7, Validation MSE: 0.48647774314723463
Polynomial order: 8, Validation MSE: 0.5395470123182999
Polynomial order: 9, Validation MSE: 0.29613257425333994
Polynomial order: 10, Validation MSE: 0.2731080070515161
Polynomial order: 11, Validation MSE: 0.29038927623631533
Polynomial order: 12, Validation MSE: 0.3755576105294413
Polynomial order: 13, Validation MSE: 0.5122247151225124
Polynomial order: 14, Validation MSE: 0.6800308293819323
Polynomial order: 15, Validation MSE: 0.8929572090601576
Polynomial order: 16, Validation MSE: 1.1726937036667378
Polynomial order: 17, Validation MSE: 1.5224676884911585
Polynomial order: 18, Validation MS

# Evaluate the final model on the test set

In [12]:
# Final, one-off evaluation of the selected model on the held-out test set.
#
# The polynomial expansion is fitted on the training inputs and only *applied*
# to the test inputs, the same way the validation features are built during
# model selection. Fitting a preprocessing step on test data is a leakage
# anti-pattern; for PolynomialFeatures the numeric result is expected to be
# unchanged, because the expansion depends only on the number of input columns.
test_poly = PolynomialFeatures(degree=best_order).fit(X_train)
X_test_poly = test_poly.transform(X_test)

# `best_model` was trained on features of degree `best_order`, so the test
# features must use that same degree.
y_test_pred = best_model.predict(X_test_poly)
test_mse = mean_squared_error(y_test, y_test_pred)
print(f"Test MSE: {test_mse}")

Test MSE: 0.9469357117332293
