In [ ]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


<table align="left">
  <td><a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/classification/asset_price_forecast.ipynb"><img src="https://avatars.githubusercontent.com/u/33467679?s=200&v=4" width="32px" alt="Colab logo"> Run in Colab</a></td>
  <td><a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/classification/asset_price_forecast.ipynb"><img src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px" alt="GitHub logo"> View on GitHub</a></td>
  <td><a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/classification/asset_price_forecast.ipynb"><img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"> Open in Vertex AI Workbench</a></td>
  <td><a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/classification/asset_price_forecast.ipynb"><img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35"> Open in BQ Studio</a></td>
  <td><a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fai-ml-recipes%2Fmain%2Fnotebooks/classification/asset_price_forecast.ipynb"><img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"> Open in Colab Enterprise</a></td>
</table>


# Asset Price Forecast using Iceberg and Prophet


<img src="../../docs/images/forecast/gold-forecast.png" alt="drawing" width="800"/>


## Overview

This notebook demonstrates how to forecast asset prices using PySpark, Iceberg, and Prophet. It covers loading data, performing transformations, storing data in an Iceberg table, training a time-series forecasting model with Prophet, and then integrating the predictions back into the Iceberg table.


## Setup

This section sets up the environment by installing necessary libraries, importing modules, defining global variables, and initializing the SparkSession.


### Install dependencies

Install the required Python packages for PySpark, Google Spark Connect, Dataproc, Pandas, Prophet, and Matplotlib.


In [None]:
%pip install -q pyspark==3.5.0 google-spark-connect google-cloud-dataproc pandas prophet matplotlib


### Import libraries

Import necessary libraries for Prophet, PySpark, and Google Cloud Dataproc.


In [None]:

from prophet import Prophet

from pyspark.sql.functions import col, year

from google.cloud import dataproc_v1
from google.cloud.dataproc_v1 import Session, SparkConnectConfig
from google.cloud.spark_connect import GoogleSparkSession


### Set global variables

Define project-specific variables, including your GCP project ID, desired location, Spark template ID, and GCS paths for input data and Iceberg warehouse.


In [None]:
# --- Google Cloud project settings (replace the placeholders) ---------------
project_id = "<YOUR_GCP_PROJECT>"
location = "<YOUR_LOCATION>"

# ID of the Dataproc session template created/reused later in this notebook.
serverless_spark_template_id = "spark-serverless-runtime"

# --- Input data --------------------------------------------------------------
# CSV with the historical asset prices.
csv_path = "gs://dataproc-metastore-public-binaries/asset_price_forecast/asset_price_forecast.csv"

# --- Iceberg catalog settings (replace the bucket placeholder) --------------
iceberg_catalog_name = "spark_catalog_demo"
iceberg_dataset = "finance"
iceberg_warehouse_gcs_path = "gs://<YOUR_ICEBERG_GCS_BUCKET>/iceberg/data"


### Create a Spark session template

This function creates a Dataproc Spark session template if it doesn't already exist. The template configures Spark with Iceberg extensions and BigQuery Metastore for catalog management.


In [None]:
import google.api_core.exceptions

def create_session_template():
    """Make sure the Dataproc session template used by this notebook exists.

    Builds a Spark Connect session template (runtime version "2.2") whose Spark
    properties enable the Iceberg SQL extensions and register
    ``iceberg_catalog_name`` as an Iceberg catalog using the BigQuery Metastore
    catalog implementation, then submits it to the regional Dataproc endpoint.

    If the API answers with ``AlreadyExists``, the existing template is fetched
    and reported instead; it is not modified.

    Reads the notebook-level variables ``project_id``, ``location``,
    ``serverless_spark_template_id``, ``iceberg_catalog_name`` and
    ``iceberg_warehouse_gcs_path``. Returns nothing; progress is printed.
    """
    # All catalog-specific Spark properties share this key prefix.
    catalog_prefix = f"spark.sql.catalog.{iceberg_catalog_name}"
    spark_properties = {
        "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        catalog_prefix: "org.apache.iceberg.spark.SparkCatalog",
        f"{catalog_prefix}.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog",
        f"{catalog_prefix}.gcp_project": project_id,
        f"{catalog_prefix}.gcp_location": location,
        f"{catalog_prefix}.warehouse": iceberg_warehouse_gcs_path,
        "spark.sql.warehouse.dir": "/tmp/hive/data/warehouse",
    }

    session_template = dataproc_v1.SessionTemplate()
    session_template.name = (
        f"projects/{project_id}/locations/{location}"
        f"/sessionTemplates/{serverless_spark_template_id}"
    )
    session_template.description = "A standard template for interactive PySpark sessions."
    session_template.runtime_config = {"version": "2.2", "properties": spark_properties}
    # An empty Spark Connect config marks this as a Spark Connect session template.
    session_template.spark_connect_session = {}

    # The client must talk to the regional endpoint matching `location`.
    controller = dataproc_v1.SessionTemplateControllerClient(
        client_options={"api_endpoint": f"{location}-dataproc.googleapis.com:443"}
    )
    create_request = dataproc_v1.CreateSessionTemplateRequest(
        parent=controller.common_location_path(project_id, location),
        session_template=session_template,
    )

    try:
        created = controller.create_session_template(request=create_request)
    except google.api_core.exceptions.AlreadyExists:
        print(f"Session template '{serverless_spark_template_id}' already exists.")
        existing = controller.get_session_template(
            request=dataproc_v1.GetSessionTemplateRequest(name=session_template.name)
        )
        print(f"Using existing session template: {existing.name}")
    else:
        print(f"Session template created: {created.name}")


create_session_template()


### Initialize the SparkSession

Create a `GoogleSparkSession` connected to the specified Dataproc session template. This sets up the distributed Spark environment.


In [None]:
# Describe the session to request: a Spark Connect session based on the
# session template referenced by `serverless_spark_template_id`.
session_template_name = f"projects/{project_id}/locations/{location}/sessionTemplates/{serverless_spark_template_id}"
session_config = Session()
session_config.session_template = session_template_name
session_config.spark_connect_session = SparkConnectConfig()

# Create the session, or reuse one if the builder already has it.
spark = (
    GoogleSparkSession.builder
    .projectId(project_id)
    .location(location)
    .googleSessionConfig(session_config)
    .getOrCreate()
)


## Data Loading and Preparation

### Load the dataset

Load the asset price data from the specified CSV path into a Spark DataFrame. The `header` and `inferSchema` options are used to correctly parse the CSV file.


In [None]:
# Read the CSV, taking column names from the first row and letting Spark infer types.
csv_reader = spark.read.options(header="true", inferSchema="true")
df = csv_reader.csv(csv_path)

# Row count of the loaded data, shown as the cell output.
df.count()


### Optimize and transform the DataFrame

Drop the columns that are not needed for forecasting (`High`, `Low`, `Open`, `Volume`) and derive a `year` column from the `Date` column; `year` is used later to partition the Iceberg table.


In [None]:
# Columns that are not used for forecasting.
unused_columns = ['High', 'Low', 'Open', 'Volume']

# Drop them and add the `year` column that the Iceberg table is partitioned by.
df_optimized = (
    df.drop(*unused_columns)
    .withColumn('year', year(col('date')))
)
df_optimized.show()


### Create Iceberg database

Create a new database within the Iceberg catalog if it doesn't already exist. This database will house our asset price table.


In [None]:
# Fully qualified namespace: <catalog>.<database>.
iceberg_namespace = f"{iceberg_catalog_name}.{iceberg_dataset}"
spark.sql(f"CREATE DATABASE IF NOT EXISTS {iceberg_namespace}")


### Write data to Iceberg table

Write the optimized historical data to an Iceberg table named `gold_price`. The table is partitioned by 'year' for optimized querying, and existing data is overwritten.


In [None]:
# Target table: <catalog>.<database>.gold_price
gold_price_table = f'{iceberg_catalog_name}.{iceberg_dataset}.gold_price'

# Overwrite mode replaces any existing table contents with the historical data.
(
    df_optimized.write
    .format('iceberg')
    .mode('overwrite')
    .partitionBy('year')
    .saveAsTable(gold_price_table)
)


## Model Training

### Convert Spark DataFrame to Pandas DataFrame

Convert the Spark DataFrame containing historical asset prices into a Pandas DataFrame, which is required as input for the Prophet library.


In [None]:
# Collect the Spark DataFrame into a pandas DataFrame on the client;
# the Prophet model below is fitted on pandas data.
df_pandas = df_optimized.toPandas()
# Per-column count of non-null values, shown as the cell output.
df_pandas.count()


### Initialize and train the Prophet model

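Instantiate the Prophet model with a 90% uncertainty interval (`interval_width=0.9`) and train it on the prepared historical data. Prophet models trend and seasonality automatically; holiday effects are only included when they are configured explicitly, which this notebook does not do.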


In [None]:
# Width of the uncertainty interval reported with the forecast.
confidence_interval = 0.9

# Prophet expects the timestamp column to be named `ds` and the target `y`.
prophet_column_names = {'Date': 'ds', 'Close': 'y'}
series = df_pandas.rename(columns=prophet_column_names)

model = Prophet(interval_width=confidence_interval)
model.fit(series)


## Prediction and Visualization

### Generate future predictions

Create a DataFrame of future dates for forecasting and then use the trained Prophet model to predict asset prices for these future dates.


In [None]:
# Number of future periods to forecast (days, assuming Prophet's default
# daily frequency for make_future_dataframe).
forecast_period = 365
# Dates to predict for: built from the fitted model plus `forecast_period` extra periods.
future = model.make_future_dataframe(periods=forecast_period)
# Predictions for every date in `future`; later cells read the `ds` and `yhat` columns.
forecast = model.predict(future)


### Plot the forecast

Visualize the historical data and the forecasted asset prices, including the uncertainty intervals. The plot provides a clear overview of the predicted trend.


In [None]:
# Prophet's built-in plot of the forecast; label it so the figure stands alone.
fig1 = model.plot(forecast)
forecast_axes = fig1.gca()
forecast_axes.set_title("Gold Spot Price Forecast", size=16)
forecast_axes.set_xlabel("Date")
forecast_axes.set_ylabel("Price USD/Ounce")


## Integrate Forecasted Data

### Prepare forecast data for storage

Filter the forecast to show only future predictions, rename columns to match the original schema, convert date formats, and convert the Pandas DataFrame back into a Spark DataFrame for integration.


In [None]:
# Keep only the rows that lie beyond the training data. The cutoff is taken
# from the fitted model (the last date it was trained on) instead of a
# hard-coded date, so the cell keeps working when the input CSV is refreshed.
last_history_date = model.history_dates.max()

# Select with .loc and build a new frame via rename/assign, rather than
# assigning into a slice of `forecast` (which triggers SettingWithCopyWarning).
forecast1 = (
    forecast.loc[forecast['ds'] > last_history_date, ['ds', 'yhat']]
    # Match the column names of the historical data.
    .rename(columns={'ds': 'Date', 'yhat': 'Close'})
    # Prophet returns timestamps; keep only the calendar date.
    .assign(Date=lambda frame: frame['Date'].dt.date)
)

# Back to Spark, with the same `year` partition column as the historical data.
df_forecast = spark.createDataFrame(forecast1)
df_forecast = df_forecast.withColumn('year', year(col('date')))
df_forecast.show()


### Combine historical and forecasted data

Merge the original historical data with the newly generated future forecast data into a single Spark DataFrame.


In [None]:
# Stack historical and forecast rows; unionByName matches columns by name
# rather than by position.
combined_df = df_optimized.unionByName(df_forecast)
# Show the last 20 rows of the combined DataFrame as the cell output.
combined_df.tail(20)


### Append combined data to Iceberg table

Append the combined historical and forecasted data to the `gold_price` Iceberg table. This updates the table with the latest predictions without overwriting historical data.


In [None]:
# The historical rows were already written to `gold_price` earlier in this
# notebook (overwrite of `df_optimized`). Appending `combined_df` (history +
# forecast) would therefore store every historical row twice, so only the
# forecast rows are appended here. After this write the table holds the
# historical data plus the forecast.
(
    df_forecast.write
    .format('iceberg')
    .mode('append')
    .partitionBy('year')
    .saveAsTable(f'{iceberg_catalog_name}.{iceberg_dataset}.gold_price')
)
