# Asset Price Forecast using Iceberg and Prophet

<img src="../../docs/images/forecast/gold-forecast.png" alt="Gold spot price forecast chart" width="800"/>

In [None]:
%pip install -q pyspark==3.5.0 google-spark-connect google-cloud-dataproc pandas prophet matplotlib

In [None]:

from prophet import Prophet

from pyspark.sql.functions import col, year

from google.cloud import dataproc_v1
from google.cloud.dataproc_v1 import Session, SparkConnectConfig
from google.cloud.spark_connect import GoogleSparkSession

In [None]:
# Google Cloud project and region; replace both placeholders before running.
project_id = "<YOUR_GCP_PROJECT>"
location = "<YOUR_LOCATION>"

# ID of the Dataproc session template this notebook creates (or reuses).
serverless_spark_template_id = "spark-serverless-runtime"

# Source CSV that the notebook loads with Spark.
csv_path = "gs://dataproc-metastore-public-binaries/asset_price_forecast/asset_price_forecast.csv"

# Iceberg settings: the warehouse location (replace the bucket placeholder),
# the Spark catalog name, and the database created inside that catalog.
iceberg_warehouse_gcs_path = "gs://<YOUR_ICEBERG_GCS_BUCKET>/iceberg/data"
iceberg_catalog_name = "spark_catalog_demo"
iceberg_dataset = "finance"

In [None]:
import google.api_core.exceptions

def create_session_template():
    """Create the notebook's Dataproc session template, or reuse an existing one.

    Builds a Spark Connect session template (runtime version 2.2) whose Spark
    properties enable the Iceberg SQL extensions and register a catalog named
    ``iceberg_catalog_name`` that uses the BigQuery metastore catalog
    implementation with ``iceberg_warehouse_gcs_path`` as its warehouse. The
    template is submitted through the regional Dataproc endpoint for
    ``location``.

    If the service reports that the template already exists, the existing
    template is fetched instead of failing. In both cases the template's
    resource name is printed. Nothing is returned.

    Reads the module-level settings ``project_id``, ``location``,
    ``serverless_spark_template_id``, ``iceberg_catalog_name`` and
    ``iceberg_warehouse_gcs_path``.
    """
    template_path = (
        f"projects/{project_id}/locations/{location}"
        f"/sessionTemplates/{serverless_spark_template_id}"
    )

    # Every Iceberg catalog option shares this property prefix.
    catalog_key = f"spark.sql.catalog.{iceberg_catalog_name}"
    spark_properties = {
        "spark.sql.extensions": "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
        catalog_key: "org.apache.iceberg.spark.SparkCatalog",
        f"{catalog_key}.catalog-impl": "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog",
        f"{catalog_key}.gcp_project": project_id,
        f"{catalog_key}.gcp_location": location,
        f"{catalog_key}.warehouse": iceberg_warehouse_gcs_path,
        "spark.sql.warehouse.dir": "/tmp/hive/data/warehouse",
    }

    template = dataproc_v1.SessionTemplate()
    template.description = "A standard template for interactive PySpark sessions."
    template.runtime_config = {"version": "2.2", "properties": spark_properties}
    template.spark_connect_session = {}
    template.name = template_path

    # Session templates are regional, so talk to the endpoint for `location`.
    regional_endpoint = {"api_endpoint": f"{location}-dataproc.googleapis.com:443"}
    client = dataproc_v1.SessionTemplateControllerClient(client_options=regional_endpoint)
    parent_path = client.common_location_path(project_id, location)

    try:
        created = client.create_session_template(
            request=dataproc_v1.CreateSessionTemplateRequest(
                parent=parent_path,
                session_template=template,
            )
        )
    except google.api_core.exceptions.AlreadyExists:
        print(f"Session template '{serverless_spark_template_id}' already exists.")
        existing = client.get_session_template(
            request=dataproc_v1.GetSessionTemplateRequest(name=template_path)
        )
        print(f"Using existing session template: {existing.name}")
    else:
        print(f"Session template created: {created.name}")

create_session_template()

In [None]:
# Describe a Spark Connect session based on the session template, then obtain
# (or reuse) the matching Spark session for this project and location.
session_template_path = (
    f"projects/{project_id}/locations/{location}"
    f"/sessionTemplates/{serverless_spark_template_id}"
)

session_config = Session()
session_config.spark_connect_session = SparkConnectConfig()
session_config.session_template = session_template_path

spark = (
    GoogleSparkSession.builder
    .projectId(project_id)
    .location(location)
    .googleSessionConfig(session_config)
    .getOrCreate()
)

In [None]:
# Load the price history CSV; the first row holds the column names and the
# column types are inferred from the data.
csv_reader = spark.read.options(header="true", inferSchema="true")
df = csv_reader.csv(csv_path)
df.count()

In [None]:
# Drop the columns the forecast does not use and derive a `year` column from
# the date; `year` is the partition column when the table is written.
unused_columns = ('High', 'Low', 'Open', 'Volume')
df_optimized = (
    df.drop(*unused_columns)
    .withColumn('year', year(col('date')))
)
df_optimized.show()

In [None]:
# Make sure the target database exists inside the Iceberg catalog before any
# table is written to it; IF NOT EXISTS keeps the cell safe to re-run.
spark.sql(f"CREATE DATABASE IF NOT EXISTS {iceberg_catalog_name}.{iceberg_dataset}")

In [None]:
# Write the historical prices as an Iceberg table partitioned by year.
# 'overwrite' replaces whatever the table held before.
(
    df_optimized.write
    .format('iceberg')
    .mode('overwrite')
    .partitionBy('year')
    .saveAsTable(f'{iceberg_catalog_name}.{iceberg_dataset}.gold_price')
)

In [None]:
# Collect the Spark DataFrame into a local pandas DataFrame, which is what
# Prophet is fitted on in the next cell.
df_pandas = df_optimized.toPandas()
# Display the number of non-null values per column as a quick sanity check.
df_pandas.count()

In [None]:
# Width of the uncertainty interval Prophet reports around each prediction.
confidence_interval = 0.9

# Prophet expects the timestamp column to be named `ds` and the value `y`.
prophet_columns = {'Date': 'ds', 'Close': 'y'}
series = df_pandas.rename(columns=prophet_columns)

model = Prophet(interval_width=confidence_interval)
model.fit(series)

In [None]:
# Number of periods to forecast past the end of the training data
# (days, at Prophet's default daily frequency).
forecast_period = 365
# `future` holds the training dates followed by `forecast_period` new dates.
future = model.make_future_dataframe(periods=forecast_period)
# Predict a value (`yhat`) and its uncertainty bounds for every date in `future`.
forecast = model.predict(future)

In [None]:
# Plot the fitted history together with the forecast, then label the axes.
fig1 = model.plot(forecast)
ax = fig1.gca()
ax.set_title("Gold Spot Price Forecast", size=16)
ax.set_xlabel("Date")
ax.set_ylabel("Price USD/Ounce")


In [None]:
# Keep only the predictions that lie beyond the training data. The cutoff is
# the last date the model was fitted on (the same date make_future_dataframe
# extends from), so the cell stays correct when the source CSV gains newer
# rows; a hard-coded date would silently drift out of sync.
last_observed_date = model.history_dates.max()

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `forecast` (which raises SettingWithCopyWarning).
forecast1 = forecast.loc[forecast['ds'] > last_observed_date, ['ds', 'yhat']].copy()

# Rename to the column names used by the historical data so both frames can
# be combined by name, and reduce the timestamps to plain dates.
forecast1.columns = ['Date', 'Close']
forecast1['Date'] = forecast1['Date'].dt.date

df_forecast = spark.createDataFrame(forecast1)
# Add the same `year` column the historical frame carries.
df_forecast = df_forecast.withColumn('year', year(col('date')))
df_forecast.show()

In [None]:
# Stack the forecast rows under the historical rows, matching columns by name
# rather than by position.
combined_df = df_optimized.unionByName(df_forecast)
# Fetch the last 20 rows of the combined frame for a quick visual check.
combined_df.tail(20)

In [None]:
# Append only the forecast rows. The table already holds the historical rows
# written from df_optimized by the earlier 'overwrite' write, so appending
# combined_df (history + forecast) would store every historical row twice.
(
    df_forecast.write
    .format('iceberg')
    .mode('append')
    .partitionBy('year')
    .saveAsTable(f'{iceberg_catalog_name}.{iceberg_dataset}.gold_price')
)