In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table align="left">
  <td><a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/forecast/asset_price_forecast.ipynb"><img src="https://avatars.githubusercontent.com/u/33467679?s=200&v=4" width="32px" alt="Colab logo"> Run in Colab</a></td>
  <td><a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/forecast/asset_price_forecast.ipynb"><img src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px" alt="GitHub logo"> View on GitHub</a></td>
  <td><a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/forecast/asset_price_forecast.ipynb"><img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"> Open in Vertex AI Workbench</a></td>
  <td><a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/forecast/asset_price_forecast.ipynb"><img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35"> Open in BQ Studio</a></td>
  <td><a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fai-ml-recipes%2Fmain%2Fnotebooks/forecast/asset_price_forecast.ipynb"><img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"> Open in Colab Enterprise</a></td>
</table>

# Asset Price Forecast using Iceberg and Prophet

<img src="../../docs/images/forecast/gold-forecast.png" alt="drawing" width="800"/>

## Overview
This notebook demonstrates how to build an asset price forecasting solution using Dataproc Serverless Spark Connect with Iceberg tables and the Prophet forecasting library. It covers data loading, preprocessing, model training, prediction, and storing results back into Iceberg.

## Setup Environment

### Install Required Libraries
This cell installs the necessary Python libraries for this notebook, including `pyspark`, `google-spark-connect`, `google-cloud-dataproc`, `pandas`, `prophet`, and `matplotlib`.

In [None]:
%pip install -q pyspark==3.5.0 google-spark-connect google-cloud-dataproc pandas prophet matplotlib

### Import Libraries
Import all the required libraries for this notebook.

In [None]:

from prophet import Prophet

from pyspark.sql.functions import col, year

from google.cloud import dataproc_v1
from google.cloud.dataproc_v1 import Session, SparkConnectConfig
from google.cloud.spark_connect import GoogleSparkSession

## Configure Google Cloud Project and Resources

### Set Configuration Variables
Define the Google Cloud project ID, region, Dataproc Serverless template ID, and GCS paths for input data and the Iceberg warehouse. Replace the placeholder values with your actual project details.

In [None]:
# Google Cloud project and location; replace the placeholders before running.
project_id = "<YOUR_GCP_PROJECT>"
location = "<YOUR_LOCATION>"  

# ID of the Dataproc Serverless session template that later cells create and use.
serverless_spark_template_id = "spark-serverless-runtime"

# CSV file with the asset price data, read from Cloud Storage.
csv_path = 'gs://dataproc-metastore-public-binaries/asset_price_forecast/asset_price_forecast.csv'

# Iceberg settings: warehouse path in GCS, Spark catalog name and database name.
# NOTE: "icerberg" is a misspelling of "iceberg"; the name is kept as-is because
# the session template cell refers to this variable by exactly this name.
icerberg_warehouse_gcs_path = "gs://<YOUR_ICEBERG_GCS_BUCKET>/iceberg/data"
iceberg_catalog_name = "spark_catalog_demo"
iceberg_dataset = "finance"

### Create Dataproc Serverless Session Template
This function creates a Dataproc Serverless session template. This template configures Spark Connect to use Iceberg with BigQuery as the metastore, enabling seamless integration between Spark and Iceberg tables stored in Google Cloud Storage.

In [None]:
def create_session_template():
    """Create the Dataproc Serverless session template used by this notebook.

    Builds a ``SessionTemplate`` named
    ``projects/{project_id}/locations/{location}/sessionTemplates/{serverless_spark_template_id}``
    with runtime version 2.2 and Spark properties that register
    ``iceberg_catalog_name`` as an Iceberg ``SparkCatalog`` whose catalog
    implementation is ``BigQueryMetastoreCatalog``. The template is submitted
    through the regional Dataproc endpoint and its name is printed.

    Reads the notebook-level variables ``project_id``, ``location``,
    ``serverless_spark_template_id``, ``iceberg_catalog_name`` and
    ``icerberg_warehouse_gcs_path``.

    Returns:
        None. The created template's resource name is printed.

    Note:
        Errors raised by the API call are not caught here.
        NOTE(review): calling this again once the template exists presumably
        fails with an "already exists" error -- confirm before re-running.
    """
    # Every catalog-specific Spark property shares this prefix.
    catalog_prefix = f"spark.sql.catalog.{iceberg_catalog_name}"
    spark_properties = {
        "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        catalog_prefix: "org.apache.iceberg.spark.SparkCatalog",
        f"{catalog_prefix}.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog",
        f"{catalog_prefix}.gcp_project": project_id,
        f"{catalog_prefix}.gcp_location": location,
        f"{catalog_prefix}.warehouse": icerberg_warehouse_gcs_path,
        "spark.sql.warehouse.dir": "/tmp/hive/data/warehouse",
    }

    template = dataproc_v1.SessionTemplate()
    template.name = (
        f"projects/{project_id}/locations/{location}"
        f"/sessionTemplates/{serverless_spark_template_id}"
    )
    template.description = "A standard template for interactive PySpark sessions."
    template.runtime_config = {"version": "2.2", "properties": spark_properties}
    # Spark Connect session with no additional settings.
    template.spark_connect_session = {}

    # Session templates are regional, so the client targets the regional endpoint.
    client = dataproc_v1.SessionTemplateControllerClient(
        client_options={"api_endpoint": f"{location}-dataproc.googleapis.com:443"}
    )
    request = dataproc_v1.CreateSessionTemplateRequest(
        parent=client.common_location_path(project_id, location),
        session_template=template,
    )

    result = client.create_session_template(request=request)
    # A session *template* was created here, not a session.
    print(f"Session template created: {result.name}")

# Create the template in the configured project and location; the function
# prints the name of the created template.
create_session_template()

### Initialize Spark Connect Session
Create a new Spark Connect session using the previously defined session template. This connects the notebook to a Dataproc Serverless Spark runtime.

In [None]:
# Describe the session: base it on the session template and request Spark Connect.
session_config = Session()
session_config.session_template = f"projects/{project_id}/locations/{location}/sessionTemplates/{serverless_spark_template_id}"
session_config.spark_connect_session = SparkConnectConfig()

# Connect the notebook to a Spark session in the configured project and location.
spark = (
    GoogleSparkSession.builder
    .projectId(project_id)
    .location(location)
    .googleSessionConfig(session_config)
    .getOrCreate()
)

## Data Loading and Preparation

### Load Raw Data from GCS
Load the asset price data from a CSV file stored in Google Cloud Storage into a Spark DataFrame. The `header` option is set to `true` to indicate that the first row is a header, and `inferSchema` is set to `true` to automatically detect column types.

In [None]:
# Read the CSV with its header row and let Spark infer the column types.
df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(csv_path)
)
# Number of rows loaded.
df.count()

### Preprocess Data
Perform basic data preprocessing by dropping unnecessary columns ('High', 'Low', 'Open', 'Volume') and extracting the year from the 'date' column. This optimized DataFrame will be used for forecasting and storage in Iceberg.

In [None]:
# Drop the columns that are not needed for forecasting and add a 'year'
# column, which is later used as the Iceberg partition column.
df_optimized = (
    df
    .drop('High', 'Low', 'Open', 'Volume')
    .withColumn('year', year(col('date')))
)
df_optimized.show()

## Store Data in Iceberg Table

### Create Iceberg Database
Create a new database within your Iceberg catalog. This logical container will hold your Iceberg tables.

In [None]:
# IF NOT EXISTS keeps this cell safe to re-run.
spark.sql(
    f"CREATE DATABASE IF NOT EXISTS {iceberg_catalog_name}.{iceberg_dataset}"
)

### Write Optimized Data to Iceberg Table
Write the preprocessed Spark DataFrame into an Iceberg table. The table is partitioned by 'year' for optimized querying, and the `overwrite` mode ensures that any existing table with the same name is replaced.

In [None]:
# Write the historical data as an Iceberg table partitioned by year,
# using 'overwrite' mode.
(
    df_optimized.write
    .format('iceberg')
    .mode('overwrite')
    .partitionBy('year')
    .saveAsTable(f'{iceberg_catalog_name}.{iceberg_dataset}.gold_price')
)

## Time Series Forecasting with Prophet

### Convert Spark DataFrame to Pandas DataFrame
Convert the Spark DataFrame to a Pandas DataFrame, which is required by the Prophet library for time series modeling.

In [None]:
# Collect the Spark DataFrame into a local pandas DataFrame for Prophet.
df_pandas = df_optimized.toPandas()
# count must be *called*: the bare attribute only displays the bound method
# (and the whole frame inside its repr) instead of the per-column counts.
df_pandas.count()

### Train the Prophet Model
Rename the 'Date' column to 'ds' and the 'Close' column to 'y' to match Prophet's expected input format. Initialize and train the Prophet model with a 90% uncertainty interval (`interval_width=0.9`).

In [None]:
# Width of the uncertainty interval passed to Prophet.
confidence_interval = 0.9

# Prophet expects the date column to be named 'ds' and the target 'y'.
series = df_pandas.rename(columns={'Date': 'ds', 'Close': 'y'})

model = Prophet(interval_width=confidence_interval)
model.fit(series)

### Generate Future Dates and Make Predictions
Generate a DataFrame with future dates for the next 365 days and use the trained Prophet model to make predictions for these future dates, along with upper and lower bounds for the forecast.

In [None]:
# Number of future periods to forecast.
forecast_period = 365
# Frame of dates to predict for, extended by forecast_period periods.
future = model.make_future_dataframe(periods=forecast_period)
# Predictions for every date in `future`.
forecast = model.predict(future)

### Visualize the Forecast
Plot the historical data and the forecasted values, including the confidence interval. The plot is customized with a title and axis labels for clarity.

In [None]:
fig1 = model.plot(forecast)
# Fetch the Axes once instead of calling gca() for every label.
ax = fig1.gca()
ax.set_title("Gold Spot Price Forecast", size=16)
ax.set_xlabel("Date")
# The trailing semicolon keeps the Text(...) repr out of the cell output.
ax.set_ylabel("Price USD/Ounce");


## Store Forecasted Data

### Prepare Forecast Data for Spark
Filter the forecast to include only future predictions, rename the columns to match the original DataFrame, convert the 'Date' column from timestamps to plain dates, and then convert the Pandas DataFrame back into a Spark DataFrame so that it can be combined with the historical data.

In [None]:
# Keep only the predictions that lie after the last date the model was trained
# on. Taking the cutoff from the model instead of hard-coding a date keeps this
# cell correct when the input data changes.
last_observed_date = model.history_dates.max()

# Select, rename and convert in one chain so that no value is assigned into a
# filtered slice (which triggers pandas' SettingWithCopyWarning).
forecast1 = (
    forecast.loc[forecast['ds'] > last_observed_date, ['ds', 'yhat']]
    .rename(columns={'ds': 'Date', 'yhat': 'Close'})
    .assign(Date=lambda frame: frame['Date'].dt.date)
)

# Back to Spark, with the same 'year' column as the historical DataFrame.
df_forecast = spark.createDataFrame(forecast1)
df_forecast = df_forecast.withColumn('year', year(col('date')))
df_forecast.show()

### Combine Historical and Forecasted Data
Combine the original optimized historical data with the newly generated forecasted data into a single Spark DataFrame.

In [None]:
# unionByName matches columns by name rather than by position.
combined_df = df_optimized.unionByName(df_forecast)
# Show the last 20 rows of the combined data.
combined_df.tail(20)

### Append Combined Data to Iceberg Table
Append the combined historical and forecasted data to the existing Iceberg table. This demonstrates how to update your Iceberg table with new data incrementally.

In [None]:
# The table already contains the historical rows (written earlier from
# df_optimized), so only the forecast rows are appended here. Appending
# combined_df would store every historical row a second time. After this write
# the table holds the combined historical and forecasted data.
(
    df_forecast.write
    .format('iceberg')
    .mode('append')
    .partitionBy('year')
    .saveAsTable(f'{iceberg_catalog_name}.{iceberg_dataset}.gold_price')
)