In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">
  <td><a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/forecast/asset_price_forecast.ipynb"><img src="https://avatars.githubusercontent.com/u/33467679?s=200&v=4" width="32px" alt="Colab logo"> Run in Colab</a></td>
  <td><a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/forecast/asset_price_forecast.ipynb"><img src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px" alt="GitHub logo"> View on GitHub</a></td>
  <td><a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/forecast/asset_price_forecast.ipynb"><img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"> Open in Vertex AI Workbench</a></td>
  <td><a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/forecast/asset_price_forecast.ipynb"><img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35"> Open in BQ Studio</a></td>
  <td><a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fai-ml-recipes%2Fmain%2Fnotebooks/forecast/asset_price_forecast.ipynb"><img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"> Open in Colab Enterprise</a></td>
</table>

# Asset Price Forecasting with Apache Spark and Prophet

## Overview
This notebook demonstrates how to forecast asset prices using Apache Spark for data preparation and storage (Iceberg table) and Facebook Prophet for time series forecasting. It covers data loading, transformation, model training, forecasting, and saving the combined historical and forecasted data.

## Setup and Imports

### Initialize Spark Session and Import Libraries
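This cell imports the libraries needed for Apache Spark data manipulation and defines the paths for the input CSV file and the output Iceberg table. Replace `<BUCKET_NAME>` with your own Cloud Storage bucket. The notebook assumes that a `SparkSession` named `spark` is already provided by the runtime; the cells below use it without creating it.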

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import make_date, col, year
from pyspark.sql.functions import max

# Input: raw price CSV in Cloud Storage (replace <BUCKET_NAME> with your bucket).
csv_path = "gs://<BUCKET_NAME>/public-data/finance/gc=f_price.csv"
# Output: warehouse location for the Iceberg data.
iceberg_path = 'gs://<BUCKET_NAME>/warehousing/finance/gc=f_price'

## Data Loading and Preparation

### Load Data
Reads the asset price data from a CSV file into a Spark DataFrame, inferring the schema and including the header.

In [None]:
# Configure the reader once: first row is the header, column types are inferred.
csv_reader = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
)
df = csv_reader.csv(csv_path)

# Row count of the loaded data (triggers a Spark job).
df.count()

### Optimize and Prepare Data
Drops unnecessary columns and extracts the year from the 'date' column, which will be used for partitioning the Iceberg table. Displays a sample of the optimized DataFrame.

In [None]:
# Keep only the columns needed downstream and derive the partition column
# ('year') from the date column in a single chained expression.
df_optimized = (
    df
    .drop('High', 'Low', 'Open', 'Volume')
    .withColumn('year', year(col('date')))
)

# Print a sample of the prepared rows.
df_optimized.show()

### Save Processed Data to Iceberg
Writes the optimized Spark DataFrame to an Apache Iceberg table, partitioning by year. The mode is set to 'overwrite' for initial setup.

In [None]:
# Initial load: replace the table contents with the prepared historical data,
# partitioned by year.
(
    df_optimized.write
    .format('iceberg')
    .mode('overwrite')
    .partitionBy('year')
    .saveAsTable('iceberg_catalog.finance.gold_price')
)

## Asset Price Forecasting with Prophet

### Convert to Pandas DataFrame
Converts the optimized Spark DataFrame to a Pandas DataFrame, which is required for the Prophet library.

In [None]:
# Collect the prepared Spark DataFrame to the driver as a pandas DataFrame;
# the forecasting cells below operate on pandas data.
df_pandas = df_optimized.toPandas()

# Show the number of non-null values per column. count must be *called*:
# without parentheses the cell only displays the bound method object.
df_pandas.count()

### Install and Import Forecasting Libraries
Imports pandas and Prophet for time series forecasting. If Prophet is not already installed in your environment, uncomment the `pip install` line before running this cell.

In [None]:
# !pip install prophet
import pandas as pd
from prophet import Prophet

### Prepare Data for Prophet and Train Model
Renames columns to 'ds' (datestamp) and 'y' (value) as required by Prophet, initializes the Prophet model with a specified confidence interval, and trains the model on the historical data.

In [None]:
# Width of the uncertainty interval Prophet reports around each prediction.
confidence_interval = 0.9

# Prophet expects the timestamp column to be called 'ds' and the target 'y'.
prophet_column_names = {'Date': 'ds', 'Close': 'y'}
series = df_pandas.rename(columns=prophet_column_names)

# Build the model and fit it on the historical series.
model = Prophet(interval_width=confidence_interval)
model.fit(series)

### Generate Future Forecast
Creates a DataFrame with future dates for a specified forecast period and generates predictions using the trained Prophet model.

In [None]:
# Number of periods to forecast beyond the end of the training data.
forecast_period = 365

# Extend the training timeline by forecast_period periods...
future = model.make_future_dataframe(
    periods=forecast_period,
)
# ...and predict over the extended timeline.
forecast = model.predict(future)

### Visualize the Forecast
Plots the historical data, fitted trend, and future forecast, including confidence intervals.

In [None]:
import matplotlib

# Plot the forecast, then label the figure's current axes through a single handle.
fig1 = model.plot(forecast)
axes = fig1.gca()
axes.set_title("Gold Spot Price Forecast", size=16)
axes.set_xlabel("Date")
axes.set_ylabel("Price USD/Ounce")


## Save Forecasted Data

### Prepare Forecast Data for Integration
Filters the forecast to include only future dates, renames columns to match the original DataFrame schema, converts the 'ds' column to a date type, and creates a Spark DataFrame from the forecast.

In [None]:
# Derive the cutoff from the training data instead of hard-coding a date, so the
# cell keeps selecting exactly the future rows when the input data is refreshed.
last_observed_date = pd.to_datetime(series['ds']).max()

# Keep only predictions after the last observed date and rename the columns to
# match the historical schema. rename() returns a new frame, so the column
# assignment below does not write into a slice of `forecast`.
forecast1 = (
    forecast.loc[forecast['ds'] > last_observed_date, ['ds', 'yhat']]
    .rename(columns={'ds': 'Date', 'yhat': 'Close'})
)
# Convert timestamps to plain dates before handing the frame to Spark.
forecast1['Date'] = forecast1['Date'].dt.date

# Build the Spark DataFrame and add the same 'year' partition column that the
# historical data carries.
df_forecast = spark.createDataFrame(forecast1)
df_forecast = df_forecast.withColumn('year', year(col('date')))
df_forecast.show()

### Combine Original and Forecasted Data
Unions the original historical Spark DataFrame with the newly generated forecast Spark DataFrame, ensuring schema compatibility.

In [None]:
# Stack historical and forecast rows; columns are matched by name, not position.
combined_df = (
    df_optimized
    .unionByName(df_forecast)
)

# Inspect the last 20 rows of the combined data.
combined_df.tail(20)

### Append Combined Data to Iceberg Table
Appends the combined historical and forecasted data to the existing Apache Iceberg table, partitioning by year. This allows for unified storage and querying of both past and predicted values.

In [None]:
# The table already holds the historical rows (written earlier with
# mode('overwrite')), so append only the forecast rows. Appending combined_df
# here would store every historical row a second time.
df_forecast.write.format('iceberg').mode('append').partitionBy('year').saveAsTable('iceberg_catalog.finance.gold_price')