In [49]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [50]:
# Load the prepared dataset; the 'time' column is used as the row index.
df_final = pd.read_csv(
    '../Data/df_final.csv',
    index_col='time',
)

In [51]:
# Targets: the two generation series to be predicted.
y_solar = df_final['generation solar']
y_wind = df_final['generation wind onshore']
# Features: every column that is not one of the two targets.
X = df_final.drop(['generation solar', 'generation wind onshore'], axis=1)

## Train and test data

In [52]:
# Chronological train/test split helper.
def split_by_date(X, train_end_date):
    """Split a time-indexed pandas object into train and test parts by date.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Object whose index holds timestamps, either as datetimes or as
        strings that ``pd.to_datetime`` can parse. The caller's object is
        never modified.
    train_end_date : pandas.Timestamp or timestamp-like
        Last timestamp (inclusive) of the training part. A value that is not
        already a ``pd.Timestamp`` is passed to ``pd.Timestamp``.

    Returns
    -------
    tuple
        ``(X_train, X_test)``: the rows whose index is ``<= train_end_date``
        and the rows whose index is ``> train_end_date``.
    """
    if not isinstance(train_end_date, pd.Timestamp):
        train_end_date = pd.Timestamp(train_end_date)

    # Convert a string index to datetimes on a new object, so the caller's
    # index is left untouched. The length guard keeps empty input from
    # raising IndexError on ``index[0]``.
    index = X.index
    if len(index) > 0 and isinstance(index[0], str):
        X = X.set_axis(pd.to_datetime(index), axis=0)

    X_train = X[X.index <= train_end_date]
    X_test = X[X.index > train_end_date]

    return X_train, X_test

# Last timestamp that still belongs to the training period.
train_end_date = pd.Timestamp('2017-12-31 23:00:00+00:00')
# Apply the same chronological cut to the features and to both targets.
X_train, X_test = split_by_date(X, train_end_date)
y_solar_train, y_solar_test = split_by_date(y_solar, train_end_date)
y_wind_train, y_wind_test = split_by_date(y_wind, train_end_date)

## Standardization 

In [53]:
# Learn the scaling statistics from the training features only, then apply
# the same transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Grid Search

In [54]:
# Base k-nearest-neighbours regressor; it is the estimator passed to the grid search below.
knn = KNeighborsRegressor()

In [55]:
# Candidate hyper-parameters: neighbour counts 3, 5, 7, 9, 11 and two distance metrics.
param_grid = dict(
    n_neighbors=[3, 5, 7, 9, 11],
    metric=['euclidean', 'manhattan'],
)
# NOTE(review): cv=5 presumably yields unshuffled K-fold splits, which ignore the
# time ordering of the data; consider sklearn's TimeSeriesSplit — TODO confirm.
cv_knn = GridSearchCV(knn, param_grid, cv=5)
cv_knn.fit(X_train_scaled, y_solar_train)

In [56]:
# Display the best estimator found by the first grid search.
cv_knn.best_estimator_

The best n_neighbors is 11, which is the largest candidate in the grid, so an even larger value may perform better. I will therefore run a second grid search over larger n_neighbors values.

## Grid Search 2

In [68]:
# The previous grid search favoured the Manhattan metric, so it is kept fixed here.
knn_man = KNeighborsRegressor(metric='manhattan')

In [71]:
# Larger neighbour counts for the second search: 11, 21, 31, 41, 51.
param_grid_2 = {'n_neighbors': list(range(11, 52, 10))}

In [72]:
# Second search: the Manhattan-metric regressor over the larger neighbour counts.
cv_knn = GridSearchCV(knn_man, param_grid_2, cv=5)
cv_knn.fit(X_train_scaled, y_solar_train)

In [74]:
# Display the best estimator of the second search; n_neighbors = 21 was selected.
cv_knn.best_estimator_

## Prediction and evaluation 

In [75]:
# Predict solar generation for the test period with the tuned grid-search model.
y_solar_pred = cv_knn.predict(X_test_scaled)
# Mean squared error between the observed and the predicted test values.
mse_solar = mean_squared_error(y_true=y_solar_test, y_pred=y_solar_pred)
print("MSE for generation solar:", mse_solar)

MSE for generation solar: 877589.2327062879
