### 1 Model Complexity and Model Selection
### Student ID: 35224436 | Full name: Yiming Zhang

## 1.1 Question 1 KNN Regressor

### import packages

In [None]:
from scipy.spatial import KDTree
from sklearn.base import BaseEstimator
from sklearn.datasets import load_diabetes
import numpy as np

### KNN Regressor Implementation

In [None]:
class KnnRegressor(BaseEstimator):
    """K-nearest-neighbours regressor backed by a KD-tree.

    A prediction is the unweighted mean of the training targets that belong
    to the ``k`` training points nearest to the query point, using the
    default metric of ``KDTree.query`` (Euclidean distance).

    Parameters
    ----------
    k : int, default=5
        Number of neighbours averaged for each prediction.
    """

    def __init__(self, k=5):
        self.k = k

    def fit(self, x, y):
        """Index the training points and remember their targets.

        Parameters
        ----------
        x : array-like, one row per training point
        y : array-like, one target per row of ``x``

        Returns
        -------
        self
        """
        # Coerce to ndarray: predict() indexes the targets with an integer
        # array, which plain lists and pandas Series do not support.
        self.y_train_ = np.asarray(y)
        self.x_train_kdtree_ = KDTree(x)
        return self

    def predict(self, x):
        """Predict a target for every row of ``x``.

        Parameters
        ----------
        x : array-like, one row per query point

        Returns
        -------
        numpy.ndarray
            Mean target of the ``k`` nearest training points for each row.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1 or larger than the number of training
            points.
        """
        n_train = self.x_train_kdtree_.n
        # KDTree.query pads missing neighbours with the index ``n``, which
        # would otherwise fail later as an obscure out-of-bounds IndexError.
        if not 1 <= self.k <= n_train:
            raise ValueError(
                f"k must be between 1 and the number of training samples "
                f"({n_train}), got {self.k}"
            )
        _, neighbours = self.x_train_kdtree_.query(x, k=self.k)
        # For k == 1 the query result has no neighbour axis; restore it.
        neighbours = neighbours.reshape(len(x), self.k)
        neighbour_labels = self.y_train_[neighbours]
        return np.mean(neighbour_labels, axis=1)

### Test implementation
测试准备，分割测试集和训练集的工具函数

In [None]:
def train_test_split(x, y, train_size=0.6, random_state=None):
    """Randomly partition ``x`` and ``y`` into a training and a testing part.

    ``round(len(x) * train_size)`` distinct row indices are drawn without
    replacement for the training part; the remaining indices are shuffled
    and form the testing part. ``random_state`` seeds the generator.

    Returns ``(x_train, x_test, y_train, y_test)``.
    """
    rng = np.random.default_rng(random_state)
    n_samples = len(x)
    n_train = round(n_samples * train_size)

    train_rows = rng.choice(n_samples, n_train, replace=False)
    # Whatever was not drawn for training becomes the test set.
    test_rows = np.setdiff1d(np.arange(n_samples), train_rows)
    rng.shuffle(test_rows)

    return x[train_rows], x[test_rows], y[train_rows], y[test_rows]


#### Load dataset

In [None]:
# Fetch the diabetes data and separate the features from the target
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target
(X.shape, y.shape)

#### Split dataset

In [None]:
# Hold out 30% of the rows for testing; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=1024,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

#### Guess a K
依据经验法则，$\sqrt{N_{\text{train}}}$（$N_{\text{train}}$ 为训练集大小）往往接近最优 K 的取值。因此我猜测 K = $\sqrt{309} \approx 18$ 为可接受的 K 取值。

#### Test

In [None]:
# Fit the regressor with the guessed K and predict on both data sets
knn = KnnRegressor(k=18).fit(X_train, y_train)
y_hat_train, y_hat_test = map(knn.predict, (X_train, X_test))
(y_hat_train, y_hat_test)

### Evaluation
用 the sum of the squares of the errors 作为 error function 来衡量  training errors and testing errors.

In [None]:
# Sum of squared errors (SSE) between true targets and predictions
def sse(y_true, y_pred):
    """Return the sum of the squared element-wise differences."""
    residuals = y_true - y_pred
    return np.sum(residuals ** 2)

#### 计算error

In [None]:
# SSE of the K = 18 model on the training and on the testing data
sse_train, sse_test = sse(y_train, y_hat_train), sse(y_test, y_hat_test)
(sse_train, sse_test)

#### Further test to find optimal K

In [None]:
def choose_k(X_train, y_train, X_test, y_test, max_k):
    """Evaluate KnnRegressor for every K from 1 to ``max_k`` inclusive.

    For each K a fresh regressor is fitted on the training data and its sum
    of squared errors is recorded on both the training and the testing data.

    Returns two lists ``(train SSEs, test SSEs)``; the entry at position
    ``i`` belongs to K = ``i + 1``.
    """
    train_errors, test_errors = [], []
    for n_neighbours in range(1, max_k + 1):
        model = KnnRegressor(k=n_neighbours)
        model.fit(X_train, y_train)
        train_errors.append(sse(y_train, model.predict(X_train)))
        test_errors.append(sse(y_test, model.predict(X_test)))
    return train_errors, test_errors

# Sweep K = 1..30 and record the train/test SSE for each value
sse_train, sse_test = choose_k(X_train, y_train, X_test, y_test, max_k=30)

# Position i in the list corresponds to K = i + 1; take the first smallest test SSE
k_min = 1 + min(range(len(sse_test)), key=sse_test.__getitem__)
min_sse_test = sse_test[k_min - 1]
print(f"The minimum testing error is {min_sse_test} when K = {k_min}")
print(f"Hence, the optimal K is {k_min}")


## 1.2 Question 2 L-fold Cross Validation

In [None]:
class LFold:
    """L-fold cross-validation splitter over consecutive index blocks.

    The interface follows scikit-learn's splitter protocol:
    ``get_n_splits`` reports the number of folds and ``split`` yields
    ``(train_idx, test_idx)`` pairs of index arrays.

    Parameters
    ----------
    L : int, default=5
        Number of folds.
    """

    def __init__(self, L=5):
        self.L = L

    def get_n_splits(self, x=None, y=None, groups=None):
        """Return the number of folds ``L``.

        The arguments are ignored; they are accepted only so the signature
        matches the splitter protocol.
        """
        return self.L

    def split(self, x, y=None, groups=None):
        """Yield ``(train_idx, test_idx)`` for each of the ``L`` folds.

        Samples are not shuffled: each test fold is a consecutive block of
        indices, and the matching training indices are all remaining ones.
        Every sample lands in exactly one test fold.

        Parameters
        ----------
        x : sized collection
            Only ``len(x)`` is used.
        y, groups : ignored

        Raises
        ------
        ValueError
            On first iteration, if ``L`` is smaller than 1 or larger than
            the number of samples.
        """
        n_samples = len(x)
        if not 1 <= self.L <= n_samples:
            raise ValueError(
                f"L must be between 1 and the number of samples "
                f"({n_samples}), got {self.L}"
            )
        indices = np.arange(n_samples)
        # array_split hands the first ``n_samples % L`` folds one extra
        # sample, so no trailing samples are left out of the test folds.
        for test_idx in np.array_split(indices, self.L):
            train_idx = np.setdiff1d(indices, test_idx)
            yield train_idx, test_idx

## 1.3 Question 3 Automatic Model Selection