In [99]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor

In [100]:
data_all = pd.read_csv('../data/GEFCom2014Data/Wind/raw_data_incl_features.csv', date_parser='TIMESTAMP')

In [101]:
#data_all.dropna(inplace=True, axis=0)
data_all.fillna(0, inplace=True)
#data_all.info()
data_all_dummies = pd.get_dummies(data_all,columns=['WD100CARD', 'WD10CARD'], drop_first=True)
zone1 = data_all_dummies[data_all_dummies['ZONEID']==1]

In [102]:
features = data_all_dummies.columns
features
features_remove = [ 'ZONEID', 'TIMESTAMP', 'TARGETVAR', 'U10', 'V10', 'U100', 'V100', 
                    'WD10', 'WD100', 'U100NORM', 'V100NORM']
features = [ff for ff in features if ff not in features_remove]

In [103]:
y = zone1['TARGETVAR']
X = zone1[features]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)

In [104]:
def fit_predict(reg, X_train, X_test, y_train, y_test):
    reg.fit(X_train, y_train)
    y_pred_train = reg.predict(X_train)
    y_pred_train = [0 if ff<0 else 1 if ff>1 else ff for ff in y_pred_train]
    y_pred_test = reg.predict(X_test)
    y_pred_test = [0 if ff<0 else 1 if ff>1 else ff for ff in y_pred_test]
    rmse_train = mean_squared_error(y_train, y_pred_train, squared=False)
    print(f'RMSE train: {rmse_train}')
    rmse_test = mean_squared_error(y_test, y_pred_test, squared=False)
    print(f'RMSE test: {rmse_test}')
    return rmse_train, rmse_test

def remove_windy_zeros(X_train, y_train):
    z = pd.concat([X_train, y_train], axis=1)
    z = z[~((z['TARGETVAR']==0) & (z['WS100']>=4))]
    X_train = z[features]
    y_train = z['TARGETVAR']
    return X_train, y_train

In [105]:
def get_train_test_split(data, features, random_state=42, test_size=0.25):
    y = data['TARGETVAR']
    X = data[features]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25)
    return X_train, X_test, y_train, y_test

In [106]:
reg_lr = LinearRegression()
fit_predict(reg_lr, X_train, X_test, y_train, y_test)

RMSE train: 0.18470434495625396
RMSE test: 0.18083657845849982


(0.18470434495625396, 0.18083657845849982)

## Day / Night

In [107]:
#data_all_dummies['DAYNIGHT'] = data_all_dummies['HOUR'].apply(lambda x : 0 if x<8 else 0 if x>19 else 1)

In [108]:
features = data_all_dummies.columns
features
features_remove = [ 'ZONEID', 'TIMESTAMP', 'TARGETVAR', 'U10', 'V10', 'U100', 'V100', 
                    'WD10', 'WD100', 'U100NORM', 'V100NORM']
features = [ff for ff in features if ff not in features_remove]
features

['HOUR',
 'MONTH',
 'WEEKDAY',
 'IS_HOLIDAY',
 'WS10',
 'WS100',
 'WD100CARD_ENE',
 'WD100CARD_ESE',
 'WD100CARD_N',
 'WD100CARD_NE',
 'WD100CARD_NNE',
 'WD100CARD_NNW',
 'WD100CARD_NW',
 'WD100CARD_S',
 'WD100CARD_SE',
 'WD100CARD_SSE',
 'WD100CARD_SSW',
 'WD100CARD_SW',
 'WD100CARD_W',
 'WD100CARD_WNW',
 'WD100CARD_WSW',
 'WD10CARD_ENE',
 'WD10CARD_ESE',
 'WD10CARD_N',
 'WD10CARD_NE',
 'WD10CARD_NNE',
 'WD10CARD_NNW',
 'WD10CARD_NW',
 'WD10CARD_S',
 'WD10CARD_SE',
 'WD10CARD_SSE',
 'WD10CARD_SSW',
 'WD10CARD_SW',
 'WD10CARD_W',
 'WD10CARD_WNW',
 'WD10CARD_WSW']

In [109]:
#X_train, y_train = remove_windy_zeros(X_train, y_train)
reg = LinearRegression()
fit_predict(reg, X_train, X_test, y_train, y_test)

RMSE train: 0.18470434495625396
RMSE test: 0.18083657845849982


(0.18470434495625396, 0.18083657845849982)

In [111]:
# linear model, without hours, but with day/night
rmse_all_train, rmse_all_test = 0,0
count = 0
test_size = 0.25
for idx in range(1,11):
    zone = data_all_dummies[data_all_dummies['ZONEID']==idx]
    X_train, X_test, y_train, y_test = get_train_test_split(zone, features, test_size)
    #X_train, y_train = remove_windy_zeros(X_train, y_train)
    reg = LinearRegression()
    print('Zone ',idx)
    rmse_train, rmse_test = fit_predict(reg, X_train, X_test, y_train, y_test)
    rmse_all_train += rmse_train*len(y_train)
    rmse_all_test += rmse_test*len(y_test)

rmse_all_train /= data_all_dummies.shape[0]*(1-test_size)
rmse_all_test /= data_all_dummies.shape[0]*test_size
print('rmse_all_train',rmse_all_train)
print('rmse_all_test', rmse_all_test)

Zone  1
RMSE train: 0.18470434495625396
RMSE test: 0.18083657845849982
Zone  2
RMSE train: 0.15554560228882194
RMSE test: 0.15202842260076221
Zone  3
RMSE train: 0.15410534153161942
RMSE test: 0.1555744401957098
Zone  4
RMSE train: 0.1774753073705115
RMSE test: 0.185432402099842
Zone  5
RMSE train: 0.18258680681851033
RMSE test: 0.18482578199499852
Zone  6
RMSE train: 0.19002029022610087
RMSE test: 0.19115495821545847
Zone  7
RMSE train: 0.1416880526746128
RMSE test: 0.14024440292856374
Zone  8
RMSE train: 0.17368722472862422
RMSE test: 0.16715420245226123
Zone  9
RMSE train: 0.16692662779311462
RMSE test: 0.16282698735547158
Zone  10
RMSE train: 0.20523495092389699
RMSE test: 0.20157197242865327
rmse_all_train 0.17319745493120667
rmse_all_test 0.17216501487302205


In [118]:
## random forest
rmse_all_train, rmse_all_test = 0,0
count = 0
test_size = 0.25
for idx in range(1,11):
    zone = data_all_dummies[data_all_dummies['ZONEID']==idx]
    X_train, X_test, y_train, y_test = get_train_test_split(zone, features, test_size)
    X_train, y_train = remove_windy_zeros(X_train, y_train)
    reg = ExtraTreesRegressor( 
        n_estimators=20,
        n_jobs=-1, 
        max_depth=15
    )
    print('Zone ',idx)
    rmse_train, rmse_test = fit_predict(reg, X_train, X_test, y_train, y_test)
    rmse_all_train += rmse_train*len(y_train)
    rmse_all_test += rmse_test*len(y_test)

rmse_all_train /= data_all_dummies.shape[0]*(1-test_size)
rmse_all_test /= data_all_dummies.shape[0]*test_size
print('rmse_all_train',rmse_all_train)
print('rmse_all_test', rmse_all_test)

Zone  1
RMSE train: 0.0948590424728361
RMSE test: 0.15506487735225136
Zone  2
RMSE train: 0.0751095022051934
RMSE test: 0.1302312346430733
Zone  3
RMSE train: 0.09733876602981723
RMSE test: 0.14112502001221508
Zone  4
RMSE train: 0.10960456081024969
RMSE test: 0.16963925906337107
Zone  5
RMSE train: 0.10189938368610005
RMSE test: 0.16383429599697094
Zone  6
RMSE train: 0.11409993199189851
RMSE test: 0.1701698908628115
Zone  7
RMSE train: 0.07357621910071598
RMSE test: 0.12035619582804337
Zone  8
RMSE train: 0.08126190389412027
RMSE test: 0.14952024830244975
Zone  9
RMSE train: 0.08458637755255637
RMSE test: 0.14613497973786896
Zone  10
RMSE train: 0.12538033616308245
RMSE test: 0.18222583605596437
rmse_all_train 0.09360801067665417
rmse_all_test 0.15283018378550198


In [71]:
# neural networks
from sklearn.neural_network import MLPRegressor
regr = MLPRegressor(random_state=1, max_iter=500)

rmse_all_train, rmse_all_test = 0,0
count = 0
test_size = 0.25
for idx in range(1,11):
    zone = data_all_dummies[data_all_dummies['ZONEID']==idx]
    X_train, X_test, y_train, y_test = get_train_test_split(zone, features, test_size)
    X_train, y_train = remove_windy_zeros(X_train, y_train)
    print('Zone ',idx)
    rmse_train, rmse_test = fit_predict(regr, X_train, X_test, y_train, y_test)
    rmse_all_train += rmse_train*len(y_train)
    rmse_all_test += rmse_test*len(y_test)

rmse_all_train /= data_all_dummies.shape[0]*(1-test_size)
rmse_all_test /= data_all_dummies.shape[0]*test_size
print('rmse_all_train',rmse_all_train)
print('rmse_all_test', rmse_all_test)

Zone  1
RMSE train: 0.15433579455501575
RMSE test: 0.16632753008740278
Zone  2
RMSE train: 0.13332380081027087
RMSE test: 0.1424671790246357
Zone  3
RMSE train: 0.1361423683321396
RMSE test: 0.14754087359761692
Zone  4
RMSE train: 0.1551498261313936
RMSE test: 0.17448751089792924
Zone  5
RMSE train: 0.1650393405224739
RMSE test: 0.17831736959239883
Zone  6
RMSE train: 0.17242719189695266
RMSE test: 0.18556332706918832
Zone  7
RMSE train: 0.12154486527047029
RMSE test: 0.1325840761085123
Zone  8
RMSE train: 0.13877959672853166
RMSE test: 0.16816585913196
Zone  9
RMSE train: 0.14127821709925908
RMSE test: 0.1619813749700205
Zone  10
RMSE train: 0.17918547071603708
RMSE test: 0.191986177430827
rmse_all_train 0.14632717482152627
rmse_all_test 0.16497070272129627
