## Import Data

In [1]:
from platform import python_version
print(python_version())

3.6.9


In [2]:
import csv
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tsa.stattools as ts

from arch import arch_model
from matplotlib import pyplot
from numpy.random import seed
from numpy.random import randn
from random import gauss
from random import seed
from statsmodels.graphics.gofplots import qqplot
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller

# seed pseudorandom number generator
seed(1)

dtypes = {
        'IsCanceled':                                    'float64',
        'LeadTime':                                          'float64',
        'StaysInWeekendNights':                                     'float64',
        'StaysInWeekNights':                                     'float64',
        'Adults':                            'float64',
        'Children':                            'float64',
        'Babies':                                  'float64',
        'Meal':                                    'category',
        'Country':                                               'category',
        'MarketSegment':                                    'category',
        'DistributionChannel':                                       'category',
        'IsRepeatedGuest':                               'float64',
        'PreviousCancellations':                                    'float64',
        'PreviousBookingsNotCanceled':                          'float64',
        'ReservedRoomType':                                             'category',
        'AssignedRoomType':                                            'category',
        'BookingChanges':                                                'float64',
        'DepositType':                                              'category',
        'Agent':                                              'category',
        'Company':                                 'category',
        'DaysInWaitingList':                                           'float64',
        'CustomerType':                                           'category',
        'ADR':                                          'float64',
        'RequiredCarParkingSpaces':                                      'float64',
        'TotalOfSpecialRequests':                                              'float64',
        'ReservationStatus':                                                'category'
        }

# create dataset
train_df = pd.read_csv("H1full.csv", dtype=dtypes, converters={'ArrivalDateWeekNumber': '{:0>2}'.format})

In [3]:
a=train_df.head()
b=train_df
b
c=b.sort_values(['ArrivalDateYear','ArrivalDateWeekNumber'], ascending=True)
c=pd.DataFrame(c)
c
type(c)

pandas.core.frame.DataFrame

In [4]:
from pandas import DataFrame  

df = DataFrame(c, columns= ['ArrivalDateYear', 'ArrivalDateWeekNumber']) 
df

Unnamed: 0,ArrivalDateYear,ArrivalDateWeekNumber
0,2015,27
1,2015,27
2,2015,27
3,2015,27
4,2015,27
...,...,...
40055,2017,35
40056,2017,35
40057,2017,35
40058,2017,35


In [5]:
df1 = df['ArrivalDateYear'].map(str) + df['ArrivalDateWeekNumber'].map(str)
print (df1)
df1=pd.DataFrame(df1)

0        201527
1        201527
2        201527
3        201527
4        201527
          ...  
40055    201735
40056    201735
40057    201735
40058    201735
40059    201735
Length: 40060, dtype: object


In [6]:
df2 = DataFrame(c, columns= ['ADR']) 
df2

Unnamed: 0,ADR
0,0.00
1,0.00
2,75.00
3,75.00
4,98.00
...,...
40055,89.75
40056,202.27
40057,153.57
40058,112.80


In [7]:
df3=pd.concat([df1, df2], axis = 1)
df3
df3.columns = ['FullDate', 'ADR']

In [8]:
df3
df3.sort_values(['FullDate','ADR'], ascending=True)

Unnamed: 0,FullDate,ADR
0,201527,0.00
1,201527,0.00
125,201527,0.00
14463,201527,4.00
81,201527,55.68
...,...,...
13770,201735,328.00
13745,201735,341.00
39847,201735,344.67
39870,201735,351.00


In [9]:
df4 = df3.groupby('FullDate').agg("mean")
df4
df4.sort_values(['FullDate'], ascending=True)

Unnamed: 0_level_0,ADR
FullDate,Unnamed: 1_level_1
201527,100.099096
201528,110.399188
201529,128.112023
201530,140.592271
201531,145.563833
...,...
201731,208.712512
201732,211.528351
201733,211.162040
201734,210.284883


In [10]:
data=df4['ADR']
data

FullDate
201527    100.099096
201528    110.399188
201529    128.112023
201530    140.592271
201531    145.563833
             ...    
201731    208.712512
201732    211.528351
201733    211.162040
201734    210.284883
201735    186.204007
Name: ADR, Length: 115, dtype: float64

In [11]:
# https://www.statsmodels.org/stable/generated/statsmodels.stats.diagnostic.acorr_ljungbox.html
res = sm.tsa.ARMA(tseries, (1,1)).fit(disp=-1)
sm.stats.acorr_ljungbox(res.resid, lags=[10])

NameError: name 'sm' is not defined

In [None]:
pyplot.plot(data)
pyplot.title("ADR")
pyplot.show()

## Data Converted to log format to smooth out extreme values and reduce volatility

In [None]:
data=np.log(data)
pyplot.plot(data)
pyplot.title("ADR (log values)")
pyplot.show()

In [None]:
data.rolling(window=3).var().plot(style='r')
pyplot.title("3-month Rolling Variance: All Data")

## Data is first differenced to approximate a Gaussian distribution

In [None]:
data=np.diff(data,1)
data

In [None]:
# plot
pyplot.hist(data)
pyplot.title("Histogram")
pyplot.show()

In [None]:
squared_data = [x**2 for x in data]
# create acf plot
plot_acf(squared_data)
pyplot.show()

In [None]:
plot_pacf(squared_data)
pyplot.show()

## QQ Plot: Visual Screening for Normality

In [None]:
seed(1)
qqplot(data, line='s')
pyplot.title("QQ Plot")
pyplot.show()

## ARCH Modelling

In [None]:
n_test = 10
train, test = data[:-n_test], data[-n_test:]
model = arch_model(train, mean='Zero', vol='ARCH', p=100) # where p = number of lag variances

In [None]:
model

In [None]:
train

In [None]:
test

In [None]:
model_fit = model.fit()
yhat = model_fit.forecast(horizon=n_test)

In [None]:
print(model_fit.summary())

In [None]:
yhat

In [None]:
type(yhat)

In [None]:
test
test=pd.DataFrame(test)

In [None]:
# actual variance
var = [i*0.01 for i in range(0,100)]
pyplot.plot(var[-n_test:])
# forecast variance
pyplot.plot(yhat.variance.values[-1, :])
pyplot.title("Actual vs. Forecasted Variance")
pyplot.show()

## References

### [- Machine Learning Mastery: How to Model Volatility with ARCH and GARCH for Time Series Forecasting in Python](https://machinelearningmastery.com/develop-arch-and-garch-models-for-time-series-forecasting-in-python/)

### [- Met Eireann - Irish Weather Data](https://www.met.ie/climate/available-data/historical-data)