In [None]:
# Import python packages
import streamlit as st
import gzip
import json
import pyarrow.parquet as pq

# Import Snowpark packages
from snowflake.snowpark.context import get_active_session
# Session handle used by the Python cells below to run SQL and upload files
snowflake_session = get_active_session()

In [None]:
-- Set the working schema; later cells reference the stage, file formats and tables by unqualified name
use schema "DEMO";

In [None]:
# (Re)create the stage that holds the example files
create_stage_sql = """
  create or replace stage "STG__DATA";
"""
snowflake_session.sql(create_stage_sql).collect()

# Upload each example file into its own sub-folder of the stage
for local_path, stage_location in (
  ("data/1.csv.gz", '@"STG__DATA"/csv'),
  ("data/2.json.gz", '@"STG__DATA"/json'),
  ("data/3.parquet", '@"STG__DATA"/parquet'),
):
  snowflake_session.file.put(local_path, stage_location, overwrite=True)

# List the stage contents and display them to confirm the uploads
list_stage_sql = """
    list @"STG__DATA";
  """
staged_files = snowflake_session.sql(list_stage_sql).collect()
st.dataframe(staged_files)

In [None]:
# Define function to print the set of shareable QR codes
def print_qr_codes():
  """Render three side-by-side panels, each with a subheader, a QR code image and a link."""
  # One entry per panel: (subheader, image path, image caption, markdown link)
  qr_panels = [
    (
      "View this repository",
      "images/QR-repo.png",
      "This repository",
      "[https://github.com/InterWorks/Snowflake-Build-2024---Schema-Evolution](https://github.com/InterWorks/Snowflake-Build-2024---Schema-Evolution)",
    ),
    (
      "View my company profile",
      "images/QR-IW-profile.png",
      "My company profile",
      "[https://interworks.com/people/chris-hastie](https://interworks.com/people/chris-hastie)",
    ),
    (
      "View my LinkedIn profile",
      "images/QR-LinkedIn.png",
      "My LinkedIn profile",
      "[https://www.linkedin.com/in/chris-hastie/](https://www.linkedin.com/in/chris-hastie/)",
    ),
  ]

  qr_columns = st.columns(3, gap="large")
  for qr_column, (heading, image_path, caption, link) in zip(qr_columns, qr_panels):
    with qr_column:
      st.subheader(heading)
      st.image(image_path, caption=caption)
      st.markdown(link)

In [None]:
# Title slide: the session abstract, rendered as markdown
session_overview = """
  # Schema Evolution for Automated Metadata-Driven Ingestion

  In this session, learn about multiple features in Snowflake for automated data ingestion, using data from multiple file formats. We will:
  
    1. Leverage Snowflake's schema inference functionality to parse the metadata from the various different file formats
    2. Demonstrate several ways that this metadata can be used
    3. Ingest the data into new tables using Snowflake’s column-matching ingestion functionality

  The icing on the cake will then be a **demonstration of Snowflake’s automated schema evolution functionality**, which supports changing workloads as new fields are added to landing tables automatically.
  
  Within all of this, we will also demonstrate how to parse and store a variety of other pieces of metadata during ingestion using metadata columns.
"""
st.markdown(session_overview)

# Follow the abstract with the shareable QR codes
print_qr_codes()

In [None]:
st.header("File Formats - CSV")

st.markdown("""
  The typical file format that we're all familiar with.
  
  For our example, we have a pipe-delimited file of some basic event logging.
""")

# Display the decompressed text of the example file so the raw layout is visible
file_path__csv = r"data/1.csv.gz"
with gzip.open(file_path__csv, "rt") as csv_file:
  st.code(csv_file.read(), language="csv")

# Wording fix: parse_header treats the first *row* of the file as the header
st.markdown("""
  The file format here is fairly simple, however we leverage two options that are less standard:

    - parse_header: Key option that ensures the first row is leveraged as a header when parsing metadata
    - error_on_column_count_mismatch: Option that allows files to have missing/new columns compared to the destination table
""")

In [None]:
-- Pipe-delimited CSV: parse_header takes column names from the header row, and
-- disabling error_on_column_count_mismatch tolerates files with missing/new columns
create or replace file format "FF_CSV"
  type = CSV
  field_delimiter = '|'
  parse_header = TRUE
  error_on_column_count_mismatch = FALSE

In [None]:
st.header("File Formats - JSON")

st.markdown("""
  The typical semi-structured file format that most of us are familiar with.
  
  For our example, we have a file with some more basic event logging.
""")

# Read the gzipped example file and re-serialise it with indentation for display
file_path__json = r"data/2.json.gz"
with gzip.open(file_path__json, "rt") as json_file:
  json_string = json_file.read()
  json_data = json.loads(json_string)
  json_pretty = json.dumps(json_data, indent=2)

st.code(json_pretty, language="json")

# Wording fix: "one options that is" -> "one option that is"
st.markdown("""
  The file format here is more simple than for CSVs, however we leverage one option that is worth explaining:

    - strip_outer_array: Reads each element as its own record instead of reading the entire file into a single record
""")

In [None]:
-- JSON: strip_outer_array reads each element of the outer array as its own record
create or replace file format "FF_JSON"
  type = JSON
  strip_outer_array = TRUE

In [None]:
st.header("File Formats - Parquet")

parquet_intro = """
  The common optimised semi-structured file format that many of us are familiar with.
  
  For our example, we have a file with some more basic event logging.
"""
st.markdown(parquet_intro)

# Load the example file with pyarrow so that its contents can be displayed
file_path__parquet = r"data/3.parquet"
parquet_data = pq.read_table(file_path__parquet)

# NOTE(review): st.code is handed the object returned by pq.read_table rather
# than a string — presumably it is stringified for display; confirm.
st.code(parquet_data, language="parquet")

st.markdown("The file format here is the most simple of our examples")

In [None]:
-- Parquet: no options are set beyond the type
create or replace file format "FF_PARQUET"
  type = PARQUET

In [None]:
st.header("File Formats - Summary")

st.markdown("So we have three file formats:")

# Show the SQL behind each file format side by side.
# NOTE(review): the sql__file_formats__* names are not assigned in any Python
# cell visible here — presumably they are the named SQL cells above; confirm.
ff_col_csv, ff_col_json, ff_col_parquet = st.columns(3, gap="large")
with ff_col_csv:
  st.subheader("CSV")
  # Plain attribute access instead of calling __getattribute__ by hand
  st.code(sql__file_formats__csv.query_executed, "sql")
with ff_col_json:
  st.subheader("JSON")
  st.code(sql__file_formats__json.query_executed, "sql")
with ff_col_parquet:
  st.subheader("Parquet")
  st.code(sql__file_formats__parquet.query_executed, "sql")

# Inferring schemas directly

These files were all uploaded into an internal stage earlier, so we can dive straight into inferring the schema.

Inferring a schema from a file directly is achieved using a table function. This uses a very similar structure regardless of the file format:

```sql
table(
  infer_schema(
      location => '<location of file(s), including stage>'
    , file_format => '<Snowflake File Format object to use when reading the file>'
  )
)
```


## Inferring schemas directly - CSV

Let's quickly see the output from this table function using our example CSV file:

In [None]:
-- Infer the column metadata of the staged CSV file(s) using the "FF_CSV" file format
select *
from table(
  infer_schema(
      location => '@"STG__DATA"/csv'
    , file_format => '"FF_CSV"'
  )
)

## Inferring schemas directly - JSON

And now let's see the output from this table function using our example JSON file:

In [None]:
-- Infer the column metadata of the staged JSON file(s) using the "FF_JSON" file format
select *
from table(
  infer_schema(
      location => '@"STG__DATA"/json'
    , file_format => '"FF_JSON"'
  )
)

## Inferring schemas directly - Parquet

Finally, let's see the output from this table function using our example Parquet file:

In [None]:
-- Infer the column metadata of the staged Parquet file(s) using the "FF_PARQUET" file format
select *
from table(
  infer_schema(
      location => '@"STG__DATA"/parquet'
    , file_format => '"FF_PARQUET"'
  )
)

In [None]:
st.header("Inferring schemas directly - Summary")

st.markdown("So we have three very similar queries that output similar results, however there are some important differences:")

# Show each infer_schema query next to a note on how reliable its output is.
# NOTE(review): the sql__inferring_schemas_directly__* names are not assigned in
# any Python cell visible here — presumably they are the named SQL cells above; confirm.
isd_col_csv, isd_col_json, isd_col_parquet = st.columns(3, gap="large")
with isd_col_csv:
  st.subheader("CSV")
  # Plain attribute access instead of calling __getattribute__ by hand
  st.code(sql__inferring_schemas_directly__csv.query_executed, "sql")
  st.markdown("""
    Metadata inferred from CSV data is the least reliable
      - Number accuracy is estimated
      - Strings may incorrectly be inferred as other types
  """)
with isd_col_json:
  st.subheader("JSON")
  st.code(sql__inferring_schemas_directly__json.query_executed, "sql")
  st.markdown("""
    Metadata inferred from JSON data is still not optimal
      - Number accuracy is estimated
  """)
with isd_col_parquet:
  st.subheader("Parquet")
  st.code(sql__inferring_schemas_directly__parquet.query_executed, "sql")
  st.markdown("""
    Metadata inferred from Parquet files is exact
      - Metadata is stored within the file format
  """)


# Table templates

The output of the table function used to infer metadata from a file can be leveraged to create dictionaries that contain all the metadata for a given file.

```sql
select array_agg(object_construct(*))
from table(
  infer_schema(
      location => '<location of file(s), including stage>'
    , file_format => '<Snowflake File Format object to use when reading the file>'
  )
)
```


## Table templates - CSV

Let's quickly see the table template from our example CSV file:

In [None]:
-- Collapse the inferred CSV column metadata into a single array of objects (the table template)
select array_agg(object_construct(*))
from table(
  infer_schema(
      location => '@"STG__DATA"/csv'
    , file_format => '"FF_CSV"'
  )
)

## Table templates - JSON

And now let's see the table template from our example JSON file:

In [None]:
-- Collapse the inferred JSON column metadata into a single array of objects (the table template)
select array_agg(object_construct(*))
from table(
  infer_schema(
      location => '@"STG__DATA"/json'
    , file_format => '"FF_JSON"'
  )
)

## Table templates - Parquet

Finally, let's see the table template from our example Parquet file:

In [None]:
-- Collapse the inferred Parquet column metadata into a single array of objects (the table template)
select array_agg(object_construct(*))
from table(
  infer_schema(
      location => '@"STG__DATA"/parquet'
    , file_format => '"FF_PARQUET"'
  )
)

In [None]:
st.header("Table templates - Summary")

st.markdown("Again, we have three very similar queries that output similar results:")

# Name of the single result column, as looked up in the template query results
TABLE_TEMPLATE_COLUMN = "ARRAY_AGG(OBJECT_CONSTRUCT(*))"


def _show_table_template(title, sql_cell):
  """Render one summary panel: the executed query followed by its pretty-printed template.

  Args:
    title: Subheader text for the panel.
    sql_cell: Object exposing `query_executed` (the SQL text) and `results`
      (whose first record holds the template as a JSON string).
  """
  st.subheader(title)
  # Plain attribute access instead of calling __getattribute__ by hand
  st.code(sql_cell.query_executed, "sql")
  template_json = sql_cell.results.to_dict("records")[0][TABLE_TEMPLATE_COLUMN]
  # Round-trip through json to display the template with indentation
  st.code(json.dumps(json.loads(template_json), indent=2), "json")


# NOTE(review): the sql__table_templates__* names are not assigned in any Python
# cell visible here — presumably they are the named SQL cells above; confirm.
tt_col_csv, tt_col_json, tt_col_parquet = st.columns(3, gap="large")
with tt_col_csv:
  _show_table_template("CSV", sql__table_templates__csv)
with tt_col_json:
  _show_table_template("JSON", sql__table_templates__json)
with tt_col_parquet:
  _show_table_template("Parquet", sql__table_templates__parquet)

# Create tables

These table templates can be used directly inside a `create table` statement:

```sql
create or replace table "MY_TABLE"
  using template(
    select array_agg(object_construct(*))
    from table(
      infer_schema(
          location => '<location of file(s), including stage>'
        , file_format => '<Snowflake File Format object to use when reading the file>'
      )
    )
  )
  comment = 'Table created using the metadata inferred from the source file(s)'
```


## Create table - CSV

Let's quickly create a table using the table template from our CSV file:

In [None]:
-- Create the landing table with its columns defined by the schema inferred from the staged CSV file(s)
create or replace table "DATA_FROM_CSV"
  using template(
    select array_agg(object_construct(*))
    from table(
      infer_schema(
          location => '@"STG__DATA"/csv'
        , file_format => '"FF_CSV"'
      )
    )
  )
  comment = 'Table created using the metadata inferred from the source file(s) in CSV format'

## Create table - JSON

And now let's create a table using the table template from our example JSON file:

In [None]:
-- Create the landing table with its columns defined by the schema inferred from the staged JSON file(s)
create or replace table "DATA_FROM_JSON"
  using template(
    select array_agg(object_construct(*))
    from table(
      infer_schema(
          location => '@"STG__DATA"/json'
        , file_format => '"FF_JSON"'
      )
    )
  )
  comment = 'Table created using the metadata inferred from the source file(s) in JSON format'

## Create table - Parquet

Finally, let's create a table using the table template from our example Parquet file:

In [None]:
-- Create the landing table with its columns defined by the schema inferred from the staged Parquet file(s)
create or replace table "DATA_FROM_PARQUET"
  using template(
    select array_agg(object_construct(*))
    from table(
      infer_schema(
          location => '@"STG__DATA"/parquet'
        , file_format => '"FF_PARQUET"'
      )
    )
  )
  comment = 'Table created using the metadata inferred from the source file(s) in Parquet format'

# Metadata-driven ingestion

Once we have created our landing tables (whether using a template or by manually defining appropriate fields) data can be ingested:

```sql
copy into "MY_TABLE"
from '@<location of file(s), including stage>'
  file_format = "<Snowflake File Format object to use when reading the file>"
  match_by_column_name = CASE_INSENSITIVE
```

The key option here is "match_by_column_name":

- Removes the need for fields in the file to be in the same order as in the destination table
- Most useful for CSVs where field order used to be far more important
- Allows files to be ingested even if they are missing fields

## Metadata-driven ingestion - CSV

Let's quickly ingest the data from our CSV file into the table we created from its template:

## Metadata-driven ingestion - JSON

And now let's ingest the data from our example JSON file into the table we created from its template:

## Metadata-driven ingestion - Parquet

Finally, let's ingest the data from our example Parquet file into the table we created from its template: