In [1]:
import pandas as pd
import seaborn as sns
import openpyxl
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import os
import pickle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout,LSTM, RepeatVector, TimeDistributed
from keras import regularizers
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from vmdpy import VMD

In [2]:
from sklearn.model_selection import KFold, train_test_split, cross_val_score

from sklearn.metrics import mean_squared_error, mean_absolute_error
#import optuna

In [3]:
from Functions.helper_functions import * 

In [4]:
import warnings
# Hide every UserWarning raised from here on.
warnings.filterwarnings(action='ignore', category=UserWarning)
# Disable pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

In [5]:
# Seed both NumPy and TensorFlow: np.random.seed alone does not seed the
# TensorFlow generators used for weight initialisation and dropout.
np.random.seed(42)
tf.random.set_seed(42)

# Data

In [6]:
# Preprocessed training frames, one per wind farm (WP1..WP6).
train_wp1, train_wp2, train_wp3, train_wp4, train_wp5, train_wp6 = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'Data/Preprocessing/WP1_train_preprocessed.csv',
        'Data/Preprocessing/WP2_train_preprocessed.csv',
        'Data/Preprocessing/WP3_train_preprocessed.csv',
        'Data/Preprocessing/WP4_train_preprocessed.csv',
        'Data/Preprocessing/WP5_train_preprocessed.csv',
        'Data/Preprocessing/WP6_train_preprocessed.csv',
    )
)

In [7]:
# Preprocessed test frames, one per wind farm (WP1..WP6).
test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6 = (
    pd.read_csv(csv_path, sep=',')
    for csv_path in (
        'Data/Preprocessing/WP1_test_preprocessed.csv',
        'Data/Preprocessing/WP2_test_preprocessed.csv',
        'Data/Preprocessing/WP3_test_preprocessed.csv',
        'Data/Preprocessing/WP4_test_preprocessed.csv',
        'Data/Preprocessing/WP5_test_preprocessed.csv',
        'Data/Preprocessing/WP6_test_preprocessed.csv',
    )
)
# The 'date' column of the initial test file, as a NumPy array.
test_dates = pd.read_csv('Data/Initial/test.csv', sep=',')['date'].values

In [8]:
# Columns removed from every training frame before it is fed to a model.
to_drop = ['date','wd','forecast_time', 'forecast', "forecast_dist"]

# DNN

In [9]:
# Model architecture:
def build_model(data, n_neurons, n_steps_out=1):
    """Build and compile a two-layer stacked LSTM regressor.

    Parameters
    ----------
    data : object with a 3-entry ``shape``
        Only ``data.shape[1]`` and ``data.shape[2]`` are read; they become the
        ``input_shape`` of the first LSTM layer.
    n_neurons : int
        Number of units in each of the two LSTM layers.
    n_steps_out : int, default 1
        Width of the final Dense layer. The default keeps the two-argument
        calls elsewhere in this notebook valid.

    Returns
    -------
    A compiled ``Sequential`` model (MSE loss, RMSprop optimizer, RMSE metric).

    Raises
    ------
    ValueError
        If ``data`` does not expose a three-dimensional ``shape``.
    """
    # Fail early with a readable message instead of an IndexError/AttributeError
    # raised from inside the layer construction.
    input_dims = tuple(getattr(data, "shape", ()))
    if len(input_dims) != 3:
        raise ValueError(
            "build_model expects 3-D input shaped (samples, timesteps, features); "
            f"got shape {input_dims}"
        )

    model = Sequential()
    model.add(LSTM(n_neurons, activation='relu', return_sequences=True,
                   input_shape=(input_dims[1], input_dims[2])))
    model.add(Dropout(0.1))
    model.add(LSTM(n_neurons, activation='relu', return_sequences=False))
    model.add(Dropout(0.1))

    # Sigmoid bounds every output to (0, 1).
    # NOTE(review): assumes the target is scaled to [0, 1] -- confirm.
    model.add(Dense(n_steps_out, activation='sigmoid'))

    model.compile(loss='mse',
                  optimizer='rmsprop',
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

In [10]:
# Model architecture:
def build_model_seq2seq(data, n_steps_out=None):
    """Build and compile an LSTM encoder-decoder (seq2seq) regressor.

    Parameters
    ----------
    data : object with a 3-entry ``shape``
        ``data.shape[1]`` and ``data.shape[2]`` set the encoder's input shape.
    n_steps_out : int, optional
        Number of times the encoded vector is repeated for the decoder. When
        omitted, the notebook-level variable ``n_steps_out`` is used, which is
        what one-argument calls relied on before this parameter existed.

    Returns
    -------
    A compiled ``Sequential`` model (MSE loss, RMSprop optimizer, RMSE metric)
    whose last layer is ``TimeDistributed(Dense(1))``.

    Raises
    ------
    NameError
        If ``n_steps_out`` is neither passed nor defined at notebook level.
    """
    if n_steps_out is None:
        # The parameter shadows the global of the same name, so read the
        # global explicitly.
        try:
            n_steps_out = globals()["n_steps_out"]
        except KeyError:
            raise NameError(
                "name 'n_steps_out' is not defined; pass it to "
                "build_model_seq2seq or define it before calling"
            ) from None

    model = Sequential()
    model.add(LSTM(200, activation='relu', input_shape=(data.shape[1], data.shape[2])))
    model.add(RepeatVector(n_steps_out))
    model.add(LSTM(200, activation='relu', return_sequences=True))
    model.add(TimeDistributed(Dense(1)))

    model.compile(loss='mse',
                  optimizer='rmsprop',
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])
    return model

In [11]:
def dnn_cross_validation(X, y, epochs, n_neurons):
    """Run a shuffled 10-fold cross-validation of the `build_model` network.

    Parameters
    ----------
    X : array-like
        Model inputs. `build_model` reads ``shape[1]`` and ``shape[2]``, so X
        has to be three-dimensional (e.g. the windows returned by the
        ``split_sequences*`` helpers).
    y : array-like
        Targets, one row per sample in X; its column count sets the width of
        the model's output layer.
    epochs : int
        Training epochs per fold.
    n_neurons : int
        Units per LSTM layer, forwarded to `build_model`.

    Prints the per-fold evaluation and the RMSE / MAE summaries; returns None.
    """
    # Positional ndarray indexing works for 3-D inputs, which DataFrame cannot hold.
    features = np.asarray(X)
    targets = pd.DataFrame(y)

    model = build_model(features, n_neurons, targets.shape[1])
    model.summary()  # prints on its own; wrapping it in print() only adds "None"
    # Snapshot of the untrained weights, restored after every fold.
    model.save_weights('model.h5')

    print('-----------DNN CROSS VALIDATION BEGINNING-----------')
    split = 10
    kf = KFold(n_splits=split, shuffle=True)
    dnn_rmse_scores = []
    dnn_mae_scores = []
    # Split on the data that was passed in (previously the notebook global X1).
    for fold, (train_index, test_index) in enumerate(kf.split(features), start=1):
        X_train, X_test = features[train_index], features[test_index]
        Y_train, Y_test = targets.iloc[train_index], targets.iloc[test_index]

        model.fit(X_train, Y_train, epochs=epochs, verbose=2, batch_size=4096)

        prediction = model.predict(X_test)
        # NOTE(review): `squared=False` is deprecated in recent scikit-learn
        # releases -- confirm against the installed version.
        dnn_rmse_scores.append(mean_squared_error(Y_test, prediction, squared=False))
        dnn_mae_scores.append(mean_absolute_error(Y_test, prediction))
        # Back to the initial weights so the next fold starts untrained.
        model.load_weights('model.h5')
        print(show_evaluation(prediction, Y_test))
        print(f'-------------------FOLD {fold}-----------------')

    print('---------------CROSS VALIDATION COMPLETE-------------')
    print('--------------------------RMSE-----------------------')
    display_scores(dnn_rmse_scores)
    print('--------------------------MAE------------------------')
    display_scores(dnn_mae_scores)

In [12]:
def vmd(y, k):
    """Decompose `y` with vmdpy's VMD and return the modes as DataFrame columns.

    Parameters
    ----------
    y : signal handed to ``VMD`` unchanged.
    k : number of modes requested from ``VMD``.

    Returns
    -------
    pandas.DataFrame with one column per entry of the first array returned by
    ``VMD``, named ``IMFwp1``, ``IMFwp2``, ...
    """
    # Sample VMD parameters (values and meanings as given in the original cell).
    bandwidth_alpha = 1      # moderate bandwidth constraint
    noise_tau = 0.           # noise-tolerance (no strict fidelity enforcement)
    impose_dc = 0            # no DC part imposed
    omega_init = 1           # initialize omegas uniformly
    tolerance = 1e-7

    modes, _modes_hat, _omega = VMD(
        y, bandwidth_alpha, noise_tau, k, impose_dc, omega_init, tolerance
    )

    # One column per mode, numbered from 1.
    return pd.DataFrame(
        {'IMFwp{0}'.format(position + 1): mode for position, mode in enumerate(modes)}
    )

### WP1

In [13]:
# Put the target column 'wp' last and drop the columns that are not model inputs.
wp1_X = train_wp1[[c for c in train_wp1 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)

# Feature frame and target series for WP1.
# (A trailing comma on the X1 line used to turn X1 into a 1-tuple holding the frame.)
X1 = wp1_X.drop('wp', axis=1)
y1 = wp1_X['wp']


In [14]:
#wp1_X = train_wp1[[c for c in train_wp1 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)


In [15]:
#wp1_X.columns

In [16]:
def split_sequences(sequences, n_steps):
    """Cut `sequences` into sliding windows with a single-value target each.

    Every window holds `n_steps` consecutive rows of all columns except the
    last; its target is the last column's value on the window's final row.

    Returns a pair of NumPy arrays ``(X, y)``.
    """
    n_rows = len(sequences)
    # Number of start positions whose window still ends inside the frame.
    n_windows = max(0, min(n_rows, n_rows - n_steps + 1))

    windows = [
        sequences.iloc[start:start + n_steps, :-1]
        for start in range(n_windows)
    ]
    targets = [
        sequences.iloc[start + n_steps - 1, -1]
        for start in range(n_windows)
    ]
    return np.array(windows), np.array(targets)

In [17]:
def split_sequences_seq2seq(sequences, n_steps_in, n_steps_out):
    """Cut `sequences` into input windows with multi-step targets.

    Each input window holds `n_steps_in` consecutive rows of all columns
    except the last. Its target is `n_steps_out` consecutive values of the
    last column, starting on the window's final row.

    Returns a pair of NumPy arrays ``(X, y)``.
    """
    n_rows = len(sequences)
    # Rows covered by one sample: the input window plus the target steps,
    # which overlap by one row.
    span = n_steps_in + n_steps_out - 1
    n_windows = max(0, min(n_rows, n_rows - span + 1))

    inputs, outputs = [], []
    for start in range(n_windows):
        in_stop = start + n_steps_in
        out_stop = in_stop + n_steps_out - 1
        inputs.append(sequences.iloc[start:in_stop, :-1])
        outputs.append(sequences.iloc[in_stop - 1:out_stop, -1])
    return np.array(inputs), np.array(outputs)
 

In [24]:
def split_sequences_seq2seq_yincluded(sequences, n_steps_in, n_steps_out):
    """Like the seq2seq splitter, but the input windows keep every column.

    Each input window holds `n_steps_in` consecutive rows of ALL columns,
    the last one included. Its target is `n_steps_out` consecutive values of
    the last column, starting on the window's final row.

    Returns a pair of NumPy arrays ``(X, y)``.
    """
    n_rows = len(sequences)
    span = n_steps_in + n_steps_out - 1
    n_windows = max(0, min(n_rows, n_rows - span + 1))

    inputs, outputs = [], []
    for start in range(n_windows):
        in_stop = start + n_steps_in
        out_stop = in_stop + n_steps_out - 1
        # NOTE(review): the first target value (row in_stop - 1, last column)
        # is also part of the input window -- confirm this overlap is intended.
        inputs.append(sequences.iloc[start:in_stop, ])
        outputs.append(sequences.iloc[in_stop - 1:out_stop, -1])
    return np.array(inputs), np.array(outputs)
 

In [18]:
def split_sequences_vmf(sequences_in, sequences_out, n_steps_in, n_steps_out):
    """Pair input windows from `sequences_in` with targets from `sequences_out`.

    Parameters
    ----------
    sequences_in : pandas.DataFrame
        Source of the input windows; its last column is excluded from them.
    sequences_out : pandas.Series or pandas.DataFrame
        Source of the targets (e.g. one decomposed mode), aligned with
        `sequences_in` by row position.
    n_steps_in : int
        Rows per input window.
    n_steps_out : int
        Rows per target, starting on the input window's final row.

    Returns
    -------
    (X, y) as NumPy arrays.
    """
    # Only positions present in both inputs can be used.
    # NOTE(review): the decomposition output may be shorter than its input -- confirm.
    n_rows = min(len(sequences_in), len(sequences_out))

    X, y = [], []
    for i in range(n_rows):
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out - 1
        # Stop once the target would run past the available rows.
        if out_end_ix > n_rows:
            break
        X.append(sequences_in.iloc[i:end_ix, :-1])
        # Targets come from sequences_out; the argument used to be ignored and
        # every column of sequences_in was sliced instead.
        y.append(sequences_out.iloc[end_ix - 1:out_end_ix])
    return np.array(X), np.array(y)
 

In [19]:
#n_steps = 36

In [20]:
#X, y = split_sequences(wp1_X, n_steps)

In [21]:
# Decompose the WP1 target into 4 modes (columns IMFwp1..IMFwp4).
vmf_1=vmd(y1,4)

In [22]:
# Window lengths shared by the splitters and the model's output layer below.
n_steps_in = 36
n_steps_out = 48

In [23]:
# Windows paired with the first decomposed mode.
X_vmf,y_vmf = split_sequences_vmf(wp1_X,vmf_1['IMFwp1'],n_steps_in,n_steps_out)

In [25]:
# Windows without the 'wp' column in the inputs.
X, y = split_sequences_seq2seq(wp1_X, n_steps_in, n_steps_out)

In [26]:
# Windows that keep the 'wp' column in the inputs.
X_y, y_y = split_sequences_seq2seq_yincluded(wp1_X, n_steps_in, n_steps_out)

In [27]:
# Stacked LSTM with 200 units per layer and n_steps_out outputs, shaped after X.
model = build_model(X,200,n_steps_out)
#model.fit(X,y,epochs=1,verbose=2)
#print(type((model.predict(X1).reshape(len(X1),))[0]))

In [28]:
# NOTE(review): the captured log below shows the loss rising over the first
# three epochs before the run was interrupted.
model.fit(X,y,epochs=5,verbose=2)

Epoch 1/5
1636/1636 - 335s - loss: 0.3603 - root_mean_squared_error: 0.6003
Epoch 2/5
1636/1636 - 264s - loss: 0.3653 - root_mean_squared_error: 0.6044
Epoch 3/5
1636/1636 - 269s - loss: 0.3756 - root_mean_squared_error: 0.6128
Epoch 4/5


KeyboardInterrupt: 

In [29]:
model.reset_states()

In [None]:
model.fit(X_vmf,y_vmf,epochs=5,verbose=2)

In [None]:
model.reset_states()

In [None]:
model.fit(X_y,y_y,epochs=5,verbose=2)

In [None]:
#X, y = split_sequences_seq2seq(wp1_X, n_steps_in, n_steps_out)

In [None]:
#model = build_model_seq2seq(X)
#model.fit(X,y,epochs=20,verbose=2,batch_size=4096)
#print(type((model.predict(X1).reshape(len(X1),))[0]))

### WP2

In [None]:
# Same preparation for WP2..WP6: put the target column 'wp' last, drop the
# `to_drop` columns, then split into a feature frame and a target series.
# NOTE(review): build_model reads data.shape[2], while these frames are
# two-dimensional -- confirm the dnn_cross_validation calls below still run.
wp2_X = train_wp2[[c for c in train_wp2 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X2 = wp2_X.drop('wp', axis=1)
y2 = wp2_X['wp']

In [None]:
# 20 epochs per fold, 574 units per layer.
dnn_cross_validation(X2, y2, 20, 574)

### WP3

In [None]:
# WP3 features / target (same layout as WP2).
wp3_X = train_wp3[[c for c in train_wp3 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X3 = wp3_X.drop('wp', axis = 1)
y3 = wp3_X['wp']

In [None]:
dnn_cross_validation(X3, y3, 20, 574)

### WP4

In [None]:
# WP4 features / target (same layout as WP2).
wp4_X = train_wp4[[c for c in train_wp4 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X4 = wp4_X.drop('wp', axis = 1)
y4 = wp4_X['wp']

In [None]:
dnn_cross_validation(X4, y4, 20, 574)

### WP5

In [None]:
# WP5 features / target (same layout as WP2).
wp5_X = train_wp5[[c for c in train_wp5 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X5 = wp5_X.drop('wp', axis = 1)
y5 = wp5_X['wp']

In [None]:
dnn_cross_validation(X5, y5, 20, 574)

### WP6

In [None]:
# WP6 features / target (same layout as WP2).
wp6_X = train_wp6[[c for c in train_wp6 if c not in ["wp"]] + ["wp"]].drop(to_drop, axis = 1)
X6 = wp6_X.drop('wp', axis = 1)
y6 = wp6_X['wp']

In [None]:
dnn_cross_validation(X6, y6, 20, 574)

# Predictions

## Functions

In [None]:
# Columns removed from a test frame before prediction (includes the target 'wp').
to_drop_test = ['date','wd','forecast_time', 'forecast', "forecast_dist", 'wp']
def make_prediction_dataset(test, to_drop=to_drop_test, scaler=None):
    """Select and scale the rows of `test` that need a prediction.

    Keeps rows where 'ws', 'u' and 'v' are all present and 'wp' is missing,
    then, for each 'date', only the row with the largest 'forecast_time'.

    Parameters
    ----------
    test : pandas.DataFrame
        Test frame containing at least 'ws', 'u', 'v', 'wp', 'date' and
        'forecast_time', plus every column named in `to_drop`.
    to_drop : list of str
        Columns removed before scaling.
    scaler : object with ``transform``, optional
        An already fitted scaler (e.g. the one fitted on the matching training
        features) applied to the selected rows. When omitted, a new
        ``MinMaxScaler`` is fitted on the test rows themselves, which is the
        original behaviour and puts test features on a different scale than
        the training features.

    Returns
    -------
    The scaled feature matrix returned by the scaler.
    """
    candidates = test.dropna(subset=['ws','u','v'], how='any')  # rows with complete wind inputs
    candidates = candidates[candidates['wp'].isna()]             # rows whose target is unknown
    # Latest forecast first within each date, then keep one row per date.
    candidates = candidates.sort_values(
        by=['date', 'forecast_time'], ascending=[True, False]
    ).drop_duplicates(subset='date')
    features = candidates.drop(to_drop, axis=1)

    if scaler is not None:
        return scaler.transform(features)
    return MinMaxScaler(feature_range=(0,1)).fit_transform(features)

In [None]:
def make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, dates, epochs):
    """Fit each model, predict its test set and assemble the submission frame.

    Parameters
    ----------
    lst_X_trains, lst_y_trains : sequences
        Training inputs / targets, one entry per model.
    lst_tests : sequence
        Prediction inputs, one entry per model.
    lst_models : sequence
        Objects exposing ``fit`` and ``predict``.
    dates : array-like
        Values of the 'date' column of the returned frame.
    epochs : int
        Number of training epochs for every model.

    Returns
    -------
    (df_predictions, lst_models_trained): a DataFrame with 'date' followed by
    one 'wp<k>' column per model, and the list of fitted models.
    """
    lst_prediction = []
    lst_models_trained = []
    runs = zip(lst_X_trains, lst_y_trains, lst_tests, lst_models)
    for i, (X, y, test, model) in enumerate(runs, start=1):
        print(f'--------------Model {i}--------------')
        # Keyword argument on purpose: the third positional parameter of
        # Keras' fit() is batch_size, not epochs.
        model.fit(X, y, epochs=epochs, verbose=0)

        # Computed once instead of once per predicted value.
        y_min, y_max = np.min(y), np.max(y)
        print(f'True:\n\tMin:{y_min}\n\tMax:{y_max}\n\tMean:{np.mean(y)}')

        predictions = np.asarray(model.predict(test)).reshape(len(test),)
        print(f'Prediction:\n\tMin:{predictions.min()}\n\tMax:{predictions.max()}\n\tMean:{np.mean(predictions)}')

        # Negative predictions become the smallest training target; values
        # above the largest training target are capped at it.
        predictions = np.where(predictions < 0, y_min, predictions)
        predictions = np.where(predictions > y_max, y_max, predictions)
        print(f'Prediction corrected:\n\tMin:{predictions.min()}\n\tMax:{predictions.max()}\n\tMean:{np.mean(predictions)}')

        lst_prediction.append(predictions)
        lst_models_trained.append(model)

    # Use the `dates` argument (the notebook global was read before) and one
    # column per model instead of six hard-coded ones.
    columns = {'date': dates}
    for position, predictions in enumerate(lst_prediction, start=1):
        columns[f'wp{position}'] = predictions
    df_predictions = pd.DataFrame(columns)
    return df_predictions, lst_models_trained

## Submission

In [None]:
# One independent [0, 1] min-max scaler per wind farm.
scaler_x1, scaler_x2, scaler_x3, scaler_x4, scaler_x5, scaler_x6 = (
    MinMaxScaler(feature_range=(0,1)) for _ in range(6)
)

In [None]:
# Fit each scaler on its own training features and keep the scaled matrices.
X1_scaled, X2_scaled, X3_scaled, X4_scaled, X5_scaled, X6_scaled = (
    fitted_scaler.fit_transform(feature_frame)
    for fitted_scaler, feature_frame in (
        (scaler_x1, X1),
        (scaler_x2, X2),
        (scaler_x3, X3),
        (scaler_x4, X4),
        (scaler_x5, X5),
        (scaler_x6, X6),
    )
)

In [None]:
# One model per wind farm, 574 units per layer.
# NOTE(review): build_model is declared as (data, n_neurons, n_steps_out) and
# reads data.shape[2]; these two-argument calls on the two-dimensional X1..X6
# look incompatible with it -- confirm before running this cell.
model_1 = build_model(X1,574)
model_2 = build_model(X2,574)
model_3 = build_model(X3,574)
model_4 = build_model(X4,574)
model_5 = build_model(X5,574)
model_6 = build_model(X6,574)

# Parallel lists consumed position by position by make_submission_file.
lst_models = [model_1, model_2, model_3, model_4, model_5, model_6]
lst_X_trains = [X1_scaled, X2_scaled, X3_scaled, X4_scaled, X5_scaled, X6_scaled]
lst_y_trains = [y1, y2, y3, y4, y5, y6]

In [None]:
# Prediction-ready feature matrices, in the same WP1..WP6 order as the models.
lst_tests = [
    make_prediction_dataset(raw_test)
    for raw_test in (test_wp1, test_wp2, test_wp3, test_wp4, test_wp5, test_wp6)
]

In [None]:
# Training epochs handed to make_submission_file.
epochs = 300

In [None]:
# Fit the six models and collect their predictions into one frame.
df_predictions, lst_models_trained = make_submission_file(lst_X_trains, lst_y_trains, lst_tests, lst_models, test_dates,epochs)

In [None]:
# Semicolon-separated submission file, without the index column.
df_predictions.to_csv('Predictions/submission_nb_32_full_dnn.csv', index=False, sep=';')

In [None]:
df_predictions.head()

In [None]:
# Type check of a single predicted value.
type(df_predictions["wp1"][0])

In [None]:
## Saving models

In [None]:
# Persist every trained model under Models/DNN/, one pickle file per wind farm
# (wp1, wp2, ... in list order). A single loop replaces six copy-pasted stanzas
# and follows the actual length of `lst_models_trained`.
# NOTE(review): whether Keras models can be pickled depends on the installed
# TensorFlow/Keras version; `model.save(...)` is the documented way to persist
# them -- confirm before relying on these files.
for wp_index, trained_model in enumerate(lst_models_trained, start=1):
    pkl_model = f"Models/DNN/DNN-wp{wp_index}-3layers-574neurons.pkl"
    with open(pkl_model, 'wb') as file:
        pickle.dump(trained_model, file)