In [1]:
import os
import pickle

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import Image, display
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

# Notebook-wide matplotlib defaults.
# Default figure size, as (width, height) in inches.
mpl.rcParams['figure.figsize'] = (10,8)
# Draw axes without a grid unless a plot turns it on explicitly.
# (Was `mpl.rcParms`, which raises AttributeError: the attribute is `rcParams`.)
mpl.rcParams['axes.grid'] = False

ModuleNotFoundError: No module named 'tensorflow'

In [3]:
# Read the fare records and preview the first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# CSV was written together with its index - consider `index_col=0`; confirm
# nothing downstream relies on that column first.
df = pd.read_csv(filepath_or_buffer='../_data/pek-sha.csv')
df.head(5)

Unnamed: 0,ID,flightNumber,craftTypeCode,depAirport,traAirport,arrAirport,departureDate,arrivalDate,cabinClass,priceClass,price,rate,createDate,dateDifference
0,14393,HO1252,320,PEK,,SHA,2019-01-04 06:35:00,2019-01-04 08:55:00,C,C,1860,1.0,2019-01-03 14:26:15,1
1,14409,MU5138,33L,PEK,,SHA,2019-01-04 07:00:00,2019-01-04 09:15:00,C,I,1640,0.31,2019-01-03 14:26:15,1
2,14415,MU5138,33L,PEK,,SHA,2019-01-04 07:00:00,2019-01-04 09:15:00,C,J,5360,1.0,2019-01-03 14:26:15,1
3,14429,HU7605,350,PEK,,SHA,2019-01-04 07:20:00,2019-01-04 09:35:00,C,I,1635,0.29,2019-01-03 14:26:15,1
4,14431,HU7605,350,PEK,,SHA,2019-01-04 07:20:00,2019-01-04 09:35:00,C,I,1640,0.29,2019-01-03 14:26:15,1


In [4]:
# Distinct cabin-class codes that occur in the data.
set(df.loc[:, 'cabinClass'])

{'C', 'F', 'Y'}

In [8]:
# Number of rows recorded for flight MU5138 with this exact departure timestamp.
len(
    df.loc[
        df['flightNumber'].eq('MU5138')
        & df['departureDate'].eq('2019-01-07 07:00:00')
    ]
)

40

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

class Forecasting:
    """Sliding-window LSTM forecasting for a single column of a DataFrame.

    Intended call order: construct the object, call
    ``prepareLSTMForecasting`` to build the windows and the train /
    validation / test splits, then call ``LSTMforecasting`` to train a model.
    """

    def __init__(self, df:pd.DataFrame, target):
        """Store the data and the name of the column to forecast.

        Args:
            df: Source data. It is stored by reference, not copied.
            target: Name of the column in ``df`` whose values are forecast.
        """
        self.df = df
        self.target = target
        # Filled by split_train_test with the keys
        # 'xtrain', 'ytrain', 'xval', 'yval', 'xtest', 'ytest'.
        self.tts = dict()
        # Number of consecutive observations fed to the model per sample.
        self.window_size = 5

    def data_to_X_y(self, data, window_size=5):
        """Turn a sequence into (window, next value) training pairs.

        Sample ``i`` consists of ``data[i : i + window_size]`` (each value
        wrapped in its own one-element list) and the label
        ``data[i + window_size]``.

        Args:
            data: Object exposing ``to_numpy()`` (e.g. a pandas Series).
            window_size: Number of consecutive values per input window.

        Returns:
            Tuple ``(X, y)`` of numpy arrays with ``len(data) - window_size``
            samples each (none when the data is not longer than the window).

        Raises:
            ValueError: If ``window_size`` is smaller than 1.
        """
        if window_size < 1:
            raise ValueError('window_size must be at least 1')
        values = data.to_numpy()
        n_samples = len(values) - window_size
        # The window length used to be hard-coded to 5 here, which silently
        # ignored the window_size argument.
        X = [[[v] for v in values[i:i + window_size]] for i in range(n_samples)]
        y = [values[i + window_size] for i in range(n_samples)]
        return np.array(X), np.array(y)

    def split_train_test(self, X, y, train, test, validation):
        """Cut X and y into consecutive, non-overlapping splits.

        ``train``, ``test`` and ``validation`` are sample counts. The samples
        are assigned in order: the first ``train`` samples to training, the
        next ``validation`` samples to validation and the following ``test``
        samples to test. (Previously every split started at index 0, so the
        test and validation sets were subsets of / overlapped the training
        set.) Results are stored in ``self.tts``.

        Raises:
            ValueError: If a count is negative or the counts add up to more
                samples than ``X`` holds.
        """
        if min(train, test, validation) < 0:
            raise ValueError('split sizes must be non-negative')
        val_end = train + validation
        test_end = val_end + test
        if test_end > len(X):
            raise ValueError(
                'split sizes add up to %d samples but only %d are available'
                % (test_end, len(X))
            )
        self.tts['xtrain'], self.tts['ytrain'] = X[:train], y[:train]
        self.tts['xval'], self.tts['yval'] = X[train:val_end], y[train:val_end]
        self.tts['xtest'], self.tts['ytest'] = X[val_end:test_end], y[val_end:test_end]

    def prepareLSTMForecasting(self, date_column, window_size=5, train_split=None, test_split=None, validation_split=None):
        """Index the data by date, build the windows and split them.

        Side effects: replaces the index of ``self.df`` (the DataFrame passed
        to the constructor, modified in place) with the parsed
        ``date_column``, updates ``self.window_size`` and fills ``self.tts``.
        The shapes of the full X and y arrays are printed.

        Args:
            date_column: Column holding timestamps formatted as
                ``%Y-%m-%d %H:%M:%S``.
            window_size: Window length; a falsy value keeps the current
                ``self.window_size``.
            train_split: Number of training samples (required).
            test_split: Number of test samples (required).
            validation_split: Number of validation samples (required).

        Raises:
            TypeError: If any of the three split sizes is omitted. They carry
                a ``None`` default only because Python does not allow
                required parameters after ``window_size=5``.
        """
        if train_split is None or test_split is None or validation_split is None:
            raise TypeError('train_split, test_split and validation_split are required')

        # transform date column to index
        self.df.index = pd.to_datetime(self.df[date_column], format='%Y-%m-%d %H:%M:%S')

        # create time series
        if window_size:
            self.window_size = window_size
        X, y = self.data_to_X_y(self.df[self.target], window_size=self.window_size)
        print(X.shape, y.shape)

        # splitting data
        self.split_train_test(X, y, train_split, test_split, validation_split)

    def plotForecasting(self, x, y):
        """Plot two series on the same axes and show the figure.

        When an input has a non-None ``name`` attribute (e.g. a pandas
        Series) it is used as the line label and a legend is drawn.
        """
        has_label = False
        for series in (x, y):
            label = getattr(series, 'name', None)
            plt.plot(series, label=label)
            has_label = has_label or label is not None
        if has_label:
            plt.legend()
        plt.show()

    def saveModel(self, model, title):
        """Serialize ``model`` twice: with pickle (.pkl) and joblib (.sav).

        Both files are written to the ``saved models/`` directory, which is
        created when missing, and the two paths are printed.

        NOTE(review): whether a Keras model can be pickled depends on the
        installed TensorFlow/Keras version - confirm, or consider
        ``model.save`` instead.
        """
        # joblib was referenced here without being imported in any visible
        # cell; import it lazily so the class itself stays importable
        # when joblib is not installed.
        import joblib

        os.makedirs('saved models', exist_ok=True)
        path_pickle = 'saved models/'+title+'.pkl'
        path_joblib = 'saved models/'+title+'.sav'
        # Context managers close the files even when dumping fails.
        with open(path_pickle, 'wb') as pickle_file:
            pickle.dump(model, pickle_file)
        with open(path_joblib, 'wb') as joblib_file:
            joblib.dump(model, joblib_file)
        print('model saved at',path_pickle,'and',path_joblib)

    def _plotSplit(self, model, split, label):
        """Predict on one stored split and plot predictions against actuals."""
        predictions = model.predict(self.tts['x' + split]).flatten()
        column = label + ' predictions'
        results = pd.DataFrame(data={column: predictions, 'Actuals': self.tts['y' + split]})
        self.plotForecasting(results[column], results['Actuals'])

    def LSTMforecasting(self, epochs=5, plot=False):
        """Build, train, reload and save an LSTM model on the stored splits.

        ``prepareLSTMForecasting`` must have been called first so that
        ``self.tts`` holds the splits (otherwise a KeyError is raised).

        Args:
            epochs: Number of training epochs.
            plot: When true, plot predictions against actual values for the
                train, test and validation splits.

        Returns:
            The model loaded back from the ``model/`` checkpoint path.
        """
        model = Sequential()
        model.add(InputLayer((self.window_size,1)))
        model.add(LSTM(64))
        model.add(Dense(8, 'relu'))
        model.add(Dense(1, 'linear'))
        # summary() prints by itself and returns None, so wrapping it in
        # print() only added a stray "None" line.
        model.summary()

        # Checkpoint callback: with save_best_only=True only the best model
        # seen so far is kept at 'model/'.
        cb = ModelCheckpoint('model/', save_best_only=True)
        model.compile(loss=MeanSquaredError(),optimizer=Adam(learning_rate=0.0001), metrics=[RootMeanSquaredError()])

        # fitting the model (validation_data is documented as a tuple)
        model.fit(
            self.tts['xtrain'],
            self.tts['ytrain'],
            validation_data=(self.tts['xval'], self.tts['yval']),
            epochs=epochs,
            callbacks=[cb],
        )

        # Continue with the checkpointed model rather than the last epoch.
        model = load_model('model/')

        if plot:
            # Each split is plotted from its own predictions; the test and
            # validation plots used to reuse the training predictions and
            # index the training frame with columns that do not exist.
            for split, label in (('train', 'Train'), ('test', 'Test'), ('val', 'Val')):
                self._plotSplit(model, split, label)

        self.saveModel(model, 'lstmforecasting')
        return model