In [1]:
## load modules
import sys
sys.path.append("..")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from modeling.functions import modelling, log_to_mlflow, get_features 
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.neighbors import KNeighborsRegressor 
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

import seaborn as sns
import pickle 


# Notebook-wide seed constant.
# NOTE(review): presumably meant to be passed wherever a seed / random_state is needed -- confirm.
RSEED = 42




### Read data, drop rows containing NaNs, and create dummy variables for the cardinal wind directions ###

In [2]:
## load the raw feature table; TIMESTAMP is parsed to datetimes while reading,
## and every row containing a NaN is dropped
raw_csv_path = '../data/GEFCom2014Data/Wind/raw_data_incl_features.csv'
data = pd.read_csv(raw_csv_path, parse_dates=['TIMESTAMP']).dropna()

## replace the two cardinal wind-direction columns by dummy columns
## (drop_first=True omits the first category of each column)
data = pd.get_dummies(data, columns=['WD100CARD', 'WD10CARD'], drop_first=True)

data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 175265 entries, 0 to 175433
Data columns (total 47 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   ZONEID         175265 non-null  int64         
 1   TIMESTAMP      175265 non-null  datetime64[ns]
 2   TARGETVAR      175265 non-null  float64       
 3   U10            175265 non-null  float64       
 4   V10            175265 non-null  float64       
 5   U100           175265 non-null  float64       
 6   V100           175265 non-null  float64       
 7   HOUR           175265 non-null  int64         
 8   MONTH          175265 non-null  int64         
 9   WEEKDAY        175265 non-null  int64         
 10  IS_HOLIDAY     175265 non-null  int64         
 11  WS10           175265 non-null  float64       
 12  WS100          175265 non-null  float64       
 13  WD10           175265 non-null  float64       
 14  WD100          175265 non-null  float64       
 15  

Unnamed: 0,ZONEID,TIMESTAMP,TARGETVAR,U10,V10,U100,V100,HOUR,MONTH,WEEKDAY,...,WD10CARD_NNW,WD10CARD_NW,WD10CARD_S,WD10CARD_SE,WD10CARD_SSE,WD10CARD_SSW,WD10CARD_SW,WD10CARD_W,WD10CARD_WNW,WD10CARD_WSW
0,1,2012-01-01 01:00:00,0.0,2.1246,-2.681966,2.86428,-3.666076,1,1,6,...,0,1,0,0,0,0,0,0,0,0
1,1,2012-01-01 02:00:00,0.054879,2.521695,-1.79696,3.344859,-2.464761,2,1,6,...,0,1,0,0,0,0,0,0,0,0
2,1,2012-01-01 03:00:00,0.110234,2.67221,-0.822516,3.508448,-1.214093,3,1,6,...,0,0,0,0,0,0,0,0,1,0
3,1,2012-01-01 04:00:00,0.165116,2.457504,-0.143642,3.215233,-0.355546,4,1,6,...,0,0,0,0,0,0,0,1,0,0
4,1,2012-01-01 05:00:00,0.15694,2.245898,0.389576,2.957678,0.332701,5,1,6,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Day-level random split: shuffle the unique calendar days found in TIMESTAMP
# and assign the first 75 % of the shuffled days to training and the rest to
# testing, so that all rows of a given day end up in the same subset.
TRAIN_FRACTION = .75

np.random.seed(RSEED)  # reuse the notebook-wide seed instead of a second hard-coded 42
row_dates = data.TIMESTAMP.dt.date  # calendar day of every row, computed once and reused below
dates = np.unique(row_dates)
np.random.shuffle(dates)  # shuffles the array of unique days in place

n_train_days = int(len(dates) * TRAIN_FRACTION)  # single split index shared by both slices
dates_train = dates[:n_train_days]
dates_test = dates[n_train_days:]

print(len(dates_train),len(dates_test),len(dates))

data_train = data[row_dates.isin(dates_train)]
data_test = data[row_dates.isin(dates_test)]

# Sanity check: the two subsets must partition the rows of `data`.
assert len(data_train) + len(data_test) == len(data), "train/test subsets do not partition data"

548 183 731


In [4]:
## chronological train-test-split: rows up to and including the cutoff go to
## training, strictly later rows go to testing
## NOTE(review): this reassigns data_train / data_test produced by the day-shuffle
## cell above; the recorded min/max output of the next cell spans 2012-01 to 2013-12
## for BOTH subsets, which does not match a 2013-07-01 cutoff -- confirm which split
## is intended and drop the other one.
split_cutoff = '2013-07-01 00:00:00'
data_train = data.loc[data.TIMESTAMP <= split_cutoff]
data_test = data.loc[data.TIMESTAMP > split_cutoff]

In [5]:
## print the earliest and latest timestamp of each subset (train first, then test)
for subset in (data_train, data_test):
    print(subset.TIMESTAMP.min(), subset.TIMESTAMP.max())

2012-01-01 01:00:00 2013-12-31 18:00:00
2012-01-02 00:00:00 2013-12-29 23:00:00


In [6]:
## get features and zones
feature_dict = get_features(data)
zones = np.unique(data.ZONEID)

In [7]:
## run different models
models = ['KNN']
results = {key : {} for key in feature_dict.keys()}

# loop over various combinations of features

# KNN
if 'KNN' in models:
    #param_grid = {'n_neighbors' : np.arange(2,15), 'weights' : ['uniform','distance'], 'p' : [1,2]}
    param_grid = {'n_neighbors' : np.arange(20,141,5), 'weights' : ['uniform','distance'], 'p' : [1,2]}
    results['no_card_100Norm']['KNN_train'], results['no_card_100Norm']['KNN_test'], KNN_bestparams = \
        modelling(data_train, data_test, feature_dict['no_card_100Norm'], KNeighborsRegressor(), \
                    scaler=MinMaxScaler(), save_model = True, perform_gridCV = True, param_grid = param_grid)
    results['no_card_100Norm']['KNN_train'] = {k : np.round(value,10) for k,value in results['no_card_100Norm']['KNN_train'].items()}
    results['no_card_100Norm']['KNN_test'] = {k : np.round(value,10) for k,value in results['no_card_100Norm']['KNN_test'].items()}


# # save result dictionary in pickle
# with open("../data/GEFCom2014Data/Wind/results.pkl", "wb") as f_out:
#     pickle.dump(results,f_out)

# # load result dictionary from pickle
# with open("../data/GEFCom2014Data/Wind/results.pkl",'rb') as f_in:
#     results = pickle.load(f_in)

Scaler: MinMaxScaler
Scaled X_train min/max: 0.0, 1.0
Scaled X_test min/max: -0.05, 1.0

ZONEID 1
Fitting 5 folds for each of 400 candidates, totalling 2000 fits




[CV] END ...............n_neighbors=40, p=1, weights=uniform; total time=   0.2s
[CV] END ...............n_neighbors=40, p=1, weights=uniform; total time=   0.2s
[CV] END ...............n_neighbors=40, p=1, weights=uniform; total time=   0.2s
[CV] END ..............n_neighbors=40, p=1, weights=distance; total time=   0.2s
[CV] END ..............n_neighbors=40, p=1, weights=distance; total time=   0.2s
[CV] END ..............n_neighbors=40, p=1, weights=distance; total time=   0.2s
[CV] END ...............n_neighbors=40, p=1, weights=uniform; total time=   0.2s
[CV] END ...............n_neighbors=40, p=1, weights=uniform; total time=   0.2s
[CV] END ...............n_neighbors=40, p=2, weights=uniform; total time=   0.1s
[CV] END ...............n_neighbors=40, p=2, weights=uniform; total time=   0.1s
[CV] END ...............n_neighbors=40, p=2, weights=uniform; total time=   0.1s
[CV] END ...............n_neighbors=40, p=2, weights=uniform; total time=   0.1s
[CV] END ..............n_nei

In [8]:
# Display the third value returned by modelling() in the KNN cell.
# In the recorded output it maps the keys 1-10 (presumably ZONEID -- confirm) to a KNeighborsRegressor each.
KNN_bestparams

{1: KNeighborsRegressor(n_neighbors=79, p=1, weights='distance'),
 2: KNeighborsRegressor(n_neighbors=40, p=1, weights='distance'),
 3: KNeighborsRegressor(n_neighbors=56, p=1, weights='distance'),
 4: KNeighborsRegressor(n_neighbors=44, p=1, weights='distance'),
 5: KNeighborsRegressor(n_neighbors=48, p=1, weights='distance'),
 6: KNeighborsRegressor(n_neighbors=44, p=1, weights='distance'),
 7: KNeighborsRegressor(n_neighbors=52, p=1, weights='distance'),
 8: KNeighborsRegressor(n_neighbors=89, p=1, weights='distance'),
 9: KNeighborsRegressor(n_neighbors=67, p=1, weights='distance'),
 10: KNeighborsRegressor(n_neighbors=52, p=1, weights='distance')}

In [9]:
# Display the rounded KNN train and test scores stored for the 'no_card_100Norm' feature set.
# NOTE(review): the recorded train scores are all 0.0 while every selected model uses
# weights='distance' -- presumably each training sample is its own zero-distance neighbour,
# so the train score carries no information here; confirm before comparing train vs. test.
results['no_card_100Norm']

{'KNN_train': {'ZONE1': 0.0,
  'ZONE2': 0.0,
  'ZONE3': 0.0,
  'ZONE4': 0.0,
  'ZONE5': 0.0,
  'ZONE6': 0.0,
  'ZONE7': 0.0,
  'ZONE8': 0.0,
  'ZONE9': 0.0,
  'ZONE10': 0.0,
  'TOTAL': 0.0},
 'KNN_test': {'ZONE1': 0.1828718832,
  'ZONE2': 0.1464576674,
  'ZONE3': 0.1624919182,
  'ZONE4': 0.1834512406,
  'ZONE5': 0.1804339312,
  'ZONE6': 0.1841631003,
  'ZONE7': 0.1426007437,
  'ZONE8': 0.1702797112,
  'ZONE9': 0.162706774,
  'ZONE10': 0.2023620417,
  'TOTAL': 0.1726851103}}