In [None]:
# Import necessary libraries
import utils
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.api as sm
import time 
import numpy as np

# 1. Obtain Data

## 1.1. Import Data

In [None]:
# Fetch the stock data through the project helper.
# NOTE(review): the three positional arguments are presumably ticker symbol,
# start date and end date -- confirm against utils.get_stock_data.
ticker = "601988.SS"
start_date, end_date = "2007-01-04", "2022-03-18"
bank_china_stock_data = utils.get_stock_data(ticker, start_date, end_date)

## 1.2. Explore Data

In [None]:
# Quick look at the loaded frame: dimensions, per-column summary, first rows.
print(f"Data Shape: {bank_china_stock_data.shape}")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
bank_china_stock_data.info()

# Last expression of the cell -> rendered by the notebook's rich display.
bank_china_stock_data.head()

In [None]:
# Line chart of the closing price series (explicit Figure/Axes interface).
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(bank_china_stock_data['Close'], label='Bank of China Close Price', color='blue')

# Title, axis labels and grid
ax.set_title('Bank of China Closing Price')
ax.set_xlabel('Date')
ax.set_ylabel('Closing Price')
ax.grid(True)

ax.legend()
plt.show()

In [None]:
# Stationarity check on the raw closing price.
# A separate single-column copy is used so later additions to this frame
# do not touch bank_china_stock_data.
close_only = bank_china_stock_data[['Close']]
adf_test_df = close_only.copy()

# Augmented Dickey-Fuller (ADF) test via the project helper
utils.perform_adf_test(adf_test_df['Close'])

In [None]:
# First-order differencing (d=1).
# The working frame is rebuilt from the source data instead of being mutated
# in place, so re-running this cell always yields the same rows (an in-place
# diff + dropna on adf_test_df would discard one more row on every re-run).
adf_test_df = bank_china_stock_data[['Close']].copy()
adf_test_df['Close_diff'] = adf_test_df['Close'].diff()

# diff() leaves a missing value in the first row; drop incomplete rows
adf_test_df = adf_test_df.dropna()

# Perform the ADF test on the differenced data
utils.perform_adf_test(adf_test_df['Close_diff'])

In [None]:
# ACF / PACF plots of the undifferenced closing price, used to help choose
# the p and q orders.
raw_close = bank_china_stock_data['Close']
utils.plot_acf_pacf(raw_close, acf_lags=80, pacf_lags=40, figsize=(10, 6))

In [None]:
# Same ACF / PACF diagnostic, this time on the first-differenced series.
diff_close = adf_test_df['Close_diff']
utils.plot_acf_pacf(diff_close, acf_lags=30, pacf_lags=40, figsize=(10, 6))

## 1.3. Split Data

In [None]:
# Ordered (non-shuffled) split: the leading 80% of rows form the training set,
# the remaining rows form the test set.
train_percentage = 0.8

# Row position at which the frame is cut
split_index = int(train_percentage * len(bank_china_stock_data))

train_data, test_data = (
    bank_china_stock_data.iloc[:split_index],
    bank_china_stock_data.iloc[split_index:],
)

# Report the size of each partition
print(f"Number of training samples: {len(train_data)}")
print(f"Number of testing samples: {len(test_data)}")

# 2. Build Model

## 2.1. Baseline

In [None]:
# Naive baseline: predict the training-set mean closing price for every
# training observation and score it with mean absolute error (MAE).
train_mean = train_data['Close'].mean()
pred_baseline = [train_mean] * len(train_data)
mae_baseline = metrics.mean_absolute_error(train_data['Close'], pred_baseline)

# Built-in round() works for NumPy scalars and plain Python floats alike.
# The `.round()` method only exists on the former, and the scorer is
# documented as returning a float, so `mae_baseline.round(2)` can raise
# AttributeError.
print("Train Mean: {}".format(round(train_mean, 2)))
print("Baseline MAE: {}".format(round(mae_baseline, 2)))

In [None]:
# Training closes against the constant baseline prediction
# (explicit Figure/Axes interface).
fig, ax = plt.subplots(figsize=(12, 6))

dates = train_data.index
ax.plot(dates, train_data['Close'], label='Actual Close Price', color='blue')
ax.plot(dates, pred_baseline, label='Baseline Predictions', color='red', linestyle='--')

# Title, axis labels and grid
ax.set_title('Actual Close Price vs. Baseline Predictions')
ax.set_xlabel('Date')
ax.set_ylabel('Closing Price')
ax.grid(True)

# Legend, then render
ax.legend()
plt.show()


## 2.2. Train ARIMA Model

In [None]:
# Fit a single ARIMA(2, 1, 0) on the training closes.
# The series is passed as a bare array (`.values`), i.e. without its index.
order = (2, 1, 0)
arima_spec = sm.tsa.ARIMA(train_data['Close'].values, order=order)
model = arima_spec.fit()

# Last expression of the cell -> the fit summary is displayed
model.summary()

In [None]:
# Grid search over ARIMA(p, 1, q): fit every candidate on the training closes
# and score it by the in-sample MAE between those closes and the model's
# fitted values. The lowest (rounded) MAE wins.
p_values = [0, 1, 2, 3]
q_values = [0, 1, 2]

mae_grid = {}  # p -> list of MAEs, one per q, in q_values order

# Best combination seen so far
best_p = None
best_q = None
best_mae = float('inf')

# Loop-invariant: extract the training array once instead of on every fit
train_close = train_data['Close'].values

# NOTE: `order` and `model` are reassigned on every iteration, so after the
# loop they refer to the last candidate tried, (3, 1, 2), not the best one.
for p in p_values:
    mae_grid[p] = []
    for q in q_values:
        start_time = time.time()

        # Train an ARIMA model for this (p, q) pair
        order = (p, 1, q)
        model = sm.tsa.ARIMA(train_close, order=order).fit()

        elapsed_time = round(time.time() - start_time, 2)

        # In-sample predictions on the training data
        y_pred = model.fittedvalues

        # Built-in round() works whether the scorer returns a NumPy scalar or
        # a plain Python float; calling `.round(4)` as a method raises
        # AttributeError on the latter.
        mae = round(metrics.mean_absolute_error(train_close, y_pred), 4)
        mae_grid[p].append(mae)

        # Strict '<' keeps the first candidate encountered on ties
        if mae < best_mae:
            best_mae, best_p, best_q = mae, p, q

        print("Order = {}, MAE={}, Elapsed Time={} seconds".format(order, mae, elapsed_time))

print("\nBest p:", best_p)
print("Best q:", best_q)
print("Best MAE:", best_mae)


## 2.3. Prediction

In [None]:
# Walk-forward validation on the test data: at every test step, refit on all
# observations seen so far, forecast one step ahead, then reveal the actual value.

pred_arima_wfv = []                          # one-step-ahead forecasts, in test order
history = list(train_data['Close'].values)   # observations available to the model

test_closes = test_data['Close'].values
for step, (timestamp, true_value) in enumerate(zip(test_data.index, test_closes), start=1):
    # Refit ARIMA(1, 1, 1) on the history accumulated up to this point
    model_fit = sm.tsa.ARIMA(history, order=(1, 1, 1)).fit()

    # One-step-ahead forecast (yhat), stored as a plain float
    yhat = float(model_fit.forecast()[0])
    pred_arima_wfv.append(yhat)

    # Extend the history with the observed value -- not the forecast --
    # so the next refit sees real data only
    history.append(true_value)

    # Progress line for this step
    print("{}: {} True value: {:.2f}, Predicted value: {:.4f}".format(step, timestamp.date(), true_value, yhat))

In [None]:
# Score the walk-forward forecasts against the actual test closes
# using the project's evaluation helper.
y_true = test_data['Close'].values
utils.evaluation_metric(y_true, pred_arima_wfv)