In [None]:
import matplotlib.pyplot as plt
import datetime as dt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.utils.validation import check_is_fitted
import statsmodels.api as sm
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_predict
from datetime import datetime




In [None]:
def wrangle(filepath):
    """Load a yearly workbook and index it by year-end date.

    Parameters
    ----------
    filepath : str or path-like
        Location of the Excel file; it must contain a "Year" column.

    Returns
    -------
    pandas.DataFrame
        The sheet's remaining columns, indexed by a DatetimeIndex named
        "Date" whose entries are 31 December of each "Year" value.
    """
    raw = pd.read_excel(filepath)

    # Stamp every year with its last calendar day, e.g. 1986 -> 1986-12-31.
    year_end = pd.to_datetime(raw["Year"].astype(str) + "-12-31")

    # Promote the date to the index and discard the now-redundant year column.
    df = raw.assign(Date=year_end).set_index("Date").drop(columns="Year")
    return df
    

In [None]:
# Load the 1986-2018 workbook and plot its growth-rate column over time.
Data = wrangle('Documents/Real GDP annual growth rate (1986-2018).xlsx')
gdp_growth = Data['Real GDP growth rate']
gdp_growth.plot(figsize=(12, 5))

Check for stationarity

In [None]:
from statsmodels.tsa.stattools import adfuller

def adfuller_test(dataset):
    """Run an Augmented Dickey-Fuller test on ``dataset`` and print a report.

    The test is called with ``autolag='AIC'``. The report lists the test
    statistic, the p-value, the number of lags used, the number of
    observations used, and the critical values keyed by significance level.
    Nothing is returned.
    """
    adf_stat, p_value, used_lags, n_obs, critical_values = adfuller(dataset, autolag='AIC')[:5]

    print(f'1. ADF Statistic: {adf_stat:f}')
    print(f'2. p-value: {p_value:f}')
    print('3. Num of lags: ', used_lags)
    print('4. Num of Obs: ', n_obs)
    print('Critical Values:')
    for level, threshold in critical_values.items():
        print(f'\t{level}: {threshold:.3f}')
        

In [None]:
# Print the ADF report for the growth-rate column of the 1986-2018 data.
adfuller_test(Data['Real GDP growth rate'])

ADF statistic (element 0 of the result) = -3.429359. The more negative the statistic, the stronger the evidence against a unit root.
p-value = 0.009998 < 0.05 --> reject the null hypothesis that the series has a unit root (is non-stationary).
So the series is treated as stationary.

### Figure out order for ARIMA model

In [None]:
# The series is stationary, so no differencing is applied (d = 0).
# Collect one (p, q, AIC, BIC) tuple per candidate order.
order_aic_bic = []

# Try every ARMA(p, q) with p and q in 0..2.
for p in range(3):
    for q in range(3):
        # Fit ARMA(p, q) with a constant on the growth-rate column.
        # disp=False silences the optimizer's per-fit convergence log,
        # which otherwise floods the cell output for all nine fits.
        model = sm.tsa.statespace.SARIMAX(
            Data['Real GDP growth rate'], order=(p, 0, q), trend='c'
        )
        results = model.fit(disp=False)

        # Record the order together with both information criteria.
        order_aic_bic.append((p, q, results.aic, results.bic))

In [None]:
# Tabulate the grid-search results, one row per (p, q) candidate.
order_df = pd.DataFrame(data=order_aic_bic, columns=['p', 'q', 'AIC', 'BIC'])

# Print the table ranked by each criterion in turn (ascending, so the
# lowest value comes first).
for criterion in ('AIC', 'BIC'):
    print(order_df.sort_values(criterion))

In [None]:
# Fit ARIMA(1, 0, 0), the order used in the cells that follow.
# NOTE(review): the previous comment here said (p, q) = (1, 1) gives the
# smallest AIC, which does not match order=(1, 0, 0). Confirm against the
# AIC/BIC table and use one order consistently.
model = ARIMA(Data['Real GDP growth rate'], order=(1, 0, 0))
# Fit here so the diagnostics describe THIS model. Previously `results`
# was whatever the last grid-search iteration left behind.
results = model.fit()

# plot_diagnostics builds its own figure, so size it directly rather than
# opening a separate empty figure with plt.figure().
results.plot_diagnostics(figsize=(20, 20))
plt.show()
print(results.summary())

### Split Data into Training and Testing

In [None]:
# Chronological split: rows up to and including 2012 train the model,
# rows from 2013 onward are held out for evaluation.
print(Data.shape)
Data_train, Data_test = Data.loc[:'2012'], Data.loc['2013':]
print(Data_train.shape, Data_test.shape)

Train the model:


In [None]:
# Fit ARIMA(1, 0, 0) on the training rows. `model` ends up holding the
# fitted results object, which is what the prediction cell calls.
model = ARIMA(Data_train['Real GDP growth rate'], order=(1, 0, 0)).fit()
model.summary()

### Make prediction on test set:

In [None]:
# Predict the positions immediately after the training sample, one per
# held-out row: from len(Data_train) to the last test position.
start = len(Data_train)
end = len(Data_train) + len(Data_test) - 1
# The former `type='level'` keyword was dropped: it is not an argument of
# the ARIMA results' predict().
pred = model.predict(start=start, end=end)
print(pred)

In [None]:
# Draw the predictions and the held-out actuals on one shared set of axes.
ax = pred.plot(legend=True)
Data_test['Real GDP growth rate'].plot(ax=ax, legend=True)

In [None]:
# Column means of the held-out rows (2013 onward).
Data_test.mean()

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# Root-mean-squared error between the predictions and the held-out values.
mse = mean_squared_error(pred, Data_test['Real GDP growth rate'])
rmse = sqrt(mse)
print(rmse)

In [None]:
# Refit ARIMA(1, 0, 0) on the complete series. `modelX` holds the fitted
# results object.
modelX = ARIMA(Data['Real GDP growth rate'], order=(1, 0, 0)).fit()
Data.tail()

In [None]:
# Predict from 33 positions before the end of the sample through position
# len(Data) + 3, i.e. four steps past the last observation.
# NOTE(review): 33 presumably equals len(Data) (1986-2018 per the file
# name), which would make the start the first observation. Confirm.
# The former `typ='level'` keyword was dropped: it is not an argument of
# the ARIMA results' predict().
pred = modelX.predict(start=len(Data) - 33, end=len(Data) + 3).rename('Arima Prediction')
print(pred)

In [None]:
# Plot the prediction series produced by modelX.
pred.plot(figsize=(12,5), legend=True)

In [None]:

# Overlay the ARIMA prediction series (red) on the observed data.
fig, ax = plt.subplots()
ax.plot(Data)
ax.plot(pred, color='r')
plt.show()

In [None]:
# Move every prediction one position earlier in the index.
# NOTE(review): after shift(-1) the value stored under a given date is the
# model's prediction for the FOLLOWING date, and the final entry becomes
# NaN. This makes the curve look closer to the data in the plot but
# mislabels forecasts by one year. Confirm this is intended before quoting
# values from pred_fix.
pred_fix = pred.shift(-1)
pred_fix

In [None]:
# Overlay the shifted prediction series (red) on the observed data.
fig, ax = plt.subplots()
ax.plot(Data)
ax.plot(pred_fix, color='r')
plt.show()

### PREDICTED RESULTS
- GDP growth rate forecast for 2019: 6.44%
- GDP growth rate forecast for 2020: 6.39%
- GDP growth rate forecast for 2021: 6.35%

### ASSUMING DATA
Assume that an economic crisis occurs in 2019 and Vietnam's GDP growth rate falls to -0.1%.
We therefore set the GDP growth rate for 31/12/2019 to -0.001.

In [None]:
# Load the 1986-2019 workbook with the same wrangle() helper, then display
# the whole frame.
Data2 = wrangle('Documents/GDP annual growth rate (1986-2019).xlsx')
Data2

In [None]:
# Print the ADF report for the growth-rate column of the 1986-2019 data.
adfuller_test(Data2['Real GDP growth rate'])

p-value = 0.222125 > 0.05 => we fail to reject the null hypothesis of a unit root, so there is no evidence that the series is stationary. We therefore treat this data as non-stationary.

### Take the first difference of the data


In [None]:
# First difference (each row minus the previous row); the rows containing
# NaN, including the first row with no predecessor, are dropped.
Data2_diff = Data2.diff().dropna()

In [None]:
# Print the ADF report for the first-differenced growth-rate series.
adfuller_test(Data2_diff['Real GDP growth rate'])

ADF statistic = -3.553859, p-value = 0.006707 < 0.05 => reject the null hypothesis of a unit root, so Data2_diff is stationary.

### Figure out order for ARIMA model

In [None]:
# Data2_diff is already stationary, so no further differencing is applied
# (d = 0).
# Collect one (p, q, AIC, BIC) tuple per candidate order.
order_aic_bic1 = []

# Try every ARMA(p, q) with p and q in 0..3.
for p in range(4):
    for q in range(4):
        # Fit ARMA(p, q) with a constant on the differenced growth rate.
        # disp=False silences the optimizer's per-fit convergence log,
        # which otherwise floods the cell output for all sixteen fits.
        model1 = sm.tsa.statespace.SARIMAX(
            Data2_diff['Real GDP growth rate'], order=(p, 0, q), trend='c'
        )
        results1 = model1.fit(disp=False)

        # Record the order together with both information criteria.
        order_aic_bic1.append((p, q, results1.aic, results1.bic))

In [None]:
# Tabulate the second grid search, one row per (p, q) candidate.
order_df1 = pd.DataFrame(data=order_aic_bic1, columns=['p', 'q', 'AIC', 'BIC'])

# Print the table ranked by each criterion in turn (ascending, so the
# lowest value comes first).
for criterion in ('AIC', 'BIC'):
    print(order_df1.sort_values(criterion))

Choose (p,q) = (0,2)

In [None]:
# Fit the chosen (p, q) = (0, 2) model on the differenced 1986-2019 series.
# Previously this cell built the model on `Data` (the first dataset), never
# fitted it, drew diagnostics from the last grid-search fit, and printed
# the summary of `results` from the first analysis.
model1 = ARIMA(Data2_diff['Real GDP growth rate'], order=(0, 0, 2))
results1 = model1.fit()

# plot_diagnostics builds its own figure, so size it directly rather than
# opening a separate empty figure with plt.figure().
results1.plot_diagnostics(figsize=(20, 20))
plt.show()
print(results1.summary())

### Split Data into Training and Testing

In [None]:
# Chronological split of the differenced series: rows up to and including
# 2018 train the model, rows from 2019 onward are held out.
train, test = Data2_diff.loc[:'2018'], Data2_diff.loc['2019':]

Train the model:

In [None]:
# Fit ARIMA(0, 0, 2) on the training rows of the differenced series.
# `model1` ends up holding the fitted results object.
model1 = ARIMA(train['Real GDP growth rate'], order=(0, 0, 2)).fit()


### Make prediction on test set:

In [None]:
# Predict the positions immediately after the training sample, one per
# held-out row.
start1 = len(train)
end1 = len(train) + len(test) - 1
# Use model1, the model fitted on `train`. The previous code called
# `model`, which was fitted on the first dataset's training rows.
# The former `type='level'` keyword was dropped: it is not an argument of
# the ARIMA results' predict().
pred1 = model1.predict(start=start1, end=end1)
print(pred1)

In [None]:
# Draw the predictions and the held-out actuals on one shared set of axes.
ax = pred1.plot(legend=True)
test['Real GDP growth rate'].plot(ax=ax, legend=True)

In [None]:
# Column means of the held-out rows of the differenced series (2019 onward).
test.mean()

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# Root-mean-squared error between the predictions and the held-out values.
mse = mean_squared_error(pred1, test['Real GDP growth rate'])
rmse = sqrt(mse)
print(rmse)

In [None]:
# Refit ARIMA(0, 0, 2) on the complete differenced series. `modelY` holds
# the fitted results object.
modelY = ARIMA(Data2_diff['Real GDP growth rate'], order=(0, 0, 2))
modelY = modelY.fit()
# Show the tail of the series that was just modelled. The previous code
# displayed Data.tail(), i.e. the first dataset.
Data2_diff.tail()

In [None]:
# Predict from 33 positions before the end of the sample through position
# len(Data2_diff) + 2, i.e. three steps past the last observation.
# NOTE(review): 33 presumably equals len(Data2_diff), which would make the
# start the first observation. Confirm.
# modelY was fitted on the differenced series, so these values are
# predicted differences, not growth-rate levels.
# The former `typ='level'` keyword was dropped: it is not an argument of
# the ARIMA results' predict().
pred1 = modelY.predict(start=len(Data2_diff) - 33, end=len(Data2_diff) + 2).rename('Arima1 Prediction')
print(pred1)

In [None]:
# Plot the prediction series produced by modelY.
pred1.plot(figsize=(12,5), legend=True)

In [None]:
# Overlay the ARIMA prediction series (red) on the differenced data.
fig, ax = plt.subplots()
ax.plot(Data2_diff)
ax.plot(pred1, color='r')
plt.show()

In [None]:
# Move every prediction one position earlier in the index.
# NOTE(review): after shift(-1) the value stored under a given date is the
# model's prediction for the FOLLOWING date, and the final entry becomes
# NaN. Confirm this relabelling is intended before quoting values from
# pred_fix1.
pred_fix1 = pred1.shift(-1)
pred_fix1

In [None]:
# Overlay the shifted prediction series (red) on the differenced data.
fig, ax = plt.subplots()
ax.plot(Data2_diff)
ax.plot(pred_fix1, color='r')
plt.show()

### PREDICT RESULTS:
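Predicted value for 2020 = 0.031711; predicted value for 2021 = -0.000579.
Note: the model was fitted on the first-differenced series (`Data2_diff`), so these numbers are predicted year-on-year changes in the GDP growth rate, not growth rates themselves. To obtain growth-rate forecasts, add the predicted changes cumulatively to the last observed growth rate.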