In [None]:
# General-purpose Python packages
import json
import streamlit as st

# Snowpark packages (aliased with an `sf_` prefix to avoid clashing with other names)
from snowflake.snowpark import Window as sf_window
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import (
    col,
    count_distinct as sf_count_distinct,
    endswith as sf_endswith,
    lit,
    when as sf_when,
)

# Obtain the currently active Snowpark session once, after all imports are in place.
snowflake_session = get_active_session()

In [None]:
-- Make "DEMO" the current schema for this session; the query further down
-- also filters INFORMATION_SCHEMA on TABLE_SCHEMA = 'DEMO'.
use schema "DEMO";

# Schema Evolution

So far, we have established the following building blocks:

- Infer metadata from a file
- Leverage the metadata to create a new table
- Ingest data into the new table, matching by column name

Now for the main event in this session. Our next demonstration achieves the following:

1. Create a new table that only contains some generic metadata fields for data lineage and monitoring
2. Ingest our example CSV file into this new table, _automatically_ adding all the data fields
3. Ingest our example JSON file into this new table, _automatically_ adding the additional fields
4. Ingest our example Parquet file into this new table, _automatically_ adding the final additional field

First, we quickly review which fields are available in our data:

In [None]:
-- Overview of which columns exist in each of the three demo tables.
-- Result: one row per COLUMN_NAME with a TRUE/NULL flag per source table.
-- Sort order: columns present in all three tables first, then by the first
-- table (CSV -> JSON -> Parquet) that contains the column, then by the
-- column's ordinal position within that table.
with "CTE__RAW" as (
  select
      "TABLE_NAME"
    , "COLUMN_NAME"
    , True as "PIVOT_VALUE"
    -- 0 when the column appears in all three tables, 1 otherwise
    , ((count(distinct "TABLE_NAME") over (partition by "COLUMN_NAME")) != 3)::int as "CUSTOM_ORDER_1"
    , case "TABLE_NAME"
        when 'DATA_FROM_CSV' then 1
        when 'DATA_FROM_JSON' then 2
        when 'DATA_FROM_PARQUET' then 3
      end as "CUSTOM_ORDER_2"
    -- Zero-padded so the concatenated sort key below compares correctly as text
    -- (without padding, position 10 would sort before position 2).
    , lpad("ORDINAL_POSITION"::varchar, 10, '0') as "CUSTOM_ORDER_3"
    , concat(
          "CUSTOM_ORDER_1"
        , "CUSTOM_ORDER_2"
        , "CUSTOM_ORDER_3"
      ) as "CUSTOM_ORDER"
  from "INFORMATION_SCHEMA"."COLUMNS"
  where "TABLE_SCHEMA" = 'DEMO'
    -- Restrict to the pivoted tables: any other table in the schema would
    -- inflate the distinct-table count above, yield a NULL sort key, and
    -- add rows with no flag set to the result.
    and "TABLE_NAME" in (
        'DATA_FROM_CSV'
      , 'DATA_FROM_JSON'
      , 'DATA_FROM_PARQUET'
    )
)
select
    "COLUMN_NAME"
    -- PIVOT names its output columns after the quoted literals, hence "'...'"
  , max("'DATA_FROM_CSV'") as "DATA_FROM_CSV"
  , max("'DATA_FROM_JSON'") as "DATA_FROM_JSON"
  , max("'DATA_FROM_PARQUET'") as "DATA_FROM_PARQUET"
from "CTE__RAW"
  pivot(
    max("PIVOT_VALUE")
    for "TABLE_NAME" in (
        'DATA_FROM_CSV'
      , 'DATA_FROM_JSON'
      , 'DATA_FROM_PARQUET'
    )
  )
-- The pivot keeps the CUSTOM_ORDER* columns as implicit grouping keys, so
-- collapse back to a single row per column name here.
group by "COLUMN_NAME"
order by
    min("CUSTOM_ORDER")