In [1]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly import graph_objs as go
from scipy.stats import pearsonr
from statsmodels.tsa.arima_model import ARIMA
from fbprophet import Prophet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from datetime import timedelta
import keras
from keras.models import Sequential
from keras.layers import LSTM, Dense, Flatten, Dropout, Activation

from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

handler = logging.FileHandler('nn.log')
handler.setLevel(logging.INFO)
logger.addHandler(handler)

%matplotlib inline
init_notebook_mode(connected = True)

Using TensorFlow backend.

inspect.getargspec() is deprecated, use inspect.signature() or inspect.getfullargspec()



In [4]:
class PPPredictor:
    
    def __init__(self):
        self.model = None
    
    def prepare_data(self, data, actual_date):
        df = data.copy()
        df.index = pd.to_datetime(df['Date'])
        df.drop(['Date'], axis=1, inplace=True)
        for col in df.columns:
            df[col].interpolate(method='time', inplace=True)
        df.index = pd.to_datetime(df.index)
        return df[df.index <= pd.to_datetime(actual_date)]

    def fit(self, prepared_data, use_text_model=False):
        model = Sequential()

        model.add(LSTM(
            input_dim=1,
            output_dim=50,
            return_sequences=True))
        model.add(Dropout(0.2))

        model.add(LSTM(
            100,
            return_sequences=False))
        model.add(Dropout(0.2))

        model.add(Dense(
            output_dim=1))
        model.add(Activation('linear'))

        model.compile(loss='mse', optimizer='rmsprop')
        
        df = pd.DataFrame({'ds': prepared_data.index, 'y': prepared_data['PPSpotAvgPrice']}).reset_index().drop(['Date'], axis=1)
        df['ds'] = pd.to_numeric(df['ds'])
        
        self.scaler_X = MinMaxScaler()
        X_train = np.reshape(np.array(df['ds']), (-1, 1))
        X_train = self.scaler_X.fit_transform(X_train)
        
        self.scaler_y = MinMaxScaler()
        y_train = np.reshape(np.array(df['y']), (-1, 1))
        y_train = self.scaler_y.fit_transform(y_train)
        
        X_train = np.reshape(X_train, (X_train.shape[0], 1, 1))
        #y_train = np.reshape(y_train, (y_train.shape[0], 1, 1))
        model.fit(X_train, y_train, batch_size=128, nb_epoch=10, validation_split=0.05)
        
        self.model = model
        return model

    
    def predict(self, date):
        X_test = np.array(pd.to_numeric(pd.Series(pd.to_datetime(date))))
        X_test = np.reshape(X_test, (-1, 1))
        X_test = self.scaler_X.transform(X_test)
        X_test = np.reshape(X_test, (1, 1, 1))
        forecast = self.model.predict(X_test)
        return self.scaler_y.inverse_transform(forecast)

In [5]:
 def get_next_monday(df, date):
    first = True
    while len(df[df.index == str(date).split()[0]]) == 0:
        if first:
            date += timedelta(days=((7 - date.weekday()) % 7))
            first = False
        else:
            date += timedelta(7)
    return date

def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def test_sol(date):
    df = pd.read_csv('./data/retrieved_data.csv')
    ppp = PPPredictor()
    prepared = ppp.prepare_data(df, date)
    ppp.fit(prepared)
    
    df.index = pd.to_datetime(df['Date'])
    df.drop(['Date'], axis=1, inplace=True)
    for col in df.columns:
        df[col].interpolate(method='time', inplace=True)
    
    start = pd.to_datetime(date) + timedelta(3 * 30)
    end = pd.to_datetime(date) + timedelta(4 * 30)
    pred = []
    actual = []
    for dt in pd.date_range(start, end, freq='W'):
        pred.append(ppp.predict(str(dt).split()[0]))
        actual_dt = get_next_monday(df, dt)
        right_val = df[df.index == str(actual_dt).split()[0]]['PPSpotAvgPrice'].iloc[0]
        actual.append(right_val)
    logger.info('MAPE for {}: {:.2f}%'.format(date, mean_absolute_percentage_error(pred, actual)))

for dt in pd.date_range(start=pd.to_datetime('2017-12-01'), 
                        end=pd.to_datetime('2018-07-01'), freq='M'):
    test_sol(str(dt).split()[0])

Train on 201 samples, validate on 11 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


INFO:__main__:MAPE for 2017-12-31: 8.12%


Train on 206 samples, validate on 11 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


INFO:__main__:MAPE for 2018-01-31: 9.34%


Train on 209 samples, validate on 12 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


INFO:__main__:MAPE for 2018-02-28: 10.67%


Train on 213 samples, validate on 12 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


INFO:__main__:MAPE for 2018-03-31: 10.40%


Train on 218 samples, validate on 12 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


INFO:__main__:MAPE for 2018-04-30: 16.75%


Train on 222 samples, validate on 12 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


INFO:__main__:MAPE for 2018-05-31: 18.07%


Train on 226 samples, validate on 12 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


INFO:__main__:MAPE for 2018-06-30: 29.28%
