In [None]:
# Import python packages
import streamlit as st
import gzip
import json
import pyarrow.parquet as pq

# Import Snowpark packages
from snowflake.snowpark.context import get_active_session
# Session handle returned by get_active_session(); later cells call
# snowflake_session.file.put(...) on it to upload the sample files.
snowflake_session = get_active_session()

In [None]:
-- Switch to the DEMO schema, then (re)create the stage that will hold the sample files.
USE SCHEMA "DEMO";
CREATE OR REPLACE STAGE "STG__DATA";

In [None]:
# Upload each local sample file into its own sub-folder of the stage.
# Keys are local paths, values are the stage locations they are put into.
stage_uploads = {
  "data/1.csv.gz": '@"STG__DATA"/csv',
  "data/2.json.gz": '@"STG__DATA"/json',
  "data/3.parquet": '@"STG__DATA"/parquet',
}

# file.put returns a list of results per call; flatten them so the cell output
# reports every upload instead of only the last one.
# overwrite=True replaces any copy already on the stage, so re-running is safe.
put_results = [
  put_result
  for local_path, stage_path in stage_uploads.items()
  for put_result in snowflake_session.file.put(local_path, stage_path, overwrite=True)
]
put_results

In [None]:
-- Show the files currently held in the stage.
LIST @"STG__DATA";

In [None]:
st.header("File Formats - CSV")

st.markdown("""
  The typical file format that we're all familiar with.
  
  For our example, we have a pipe-delimited file of some basic event logging.
""")

# Preview the raw contents of the gzipped sample file.
# The encoding is pinned to UTF-8 so the decoded text does not depend on the
# kernel's locale default.
file_path__csv = r"data/1.csv.gz"
with gzip.open(file_path__csv, "rt", encoding="utf-8") as csv_file:
  st.code(csv_file.read(), language="csv")

# Explain the two non-default options used by the file format defined in the next cell.
st.markdown("""
  The file format here is fairly simple, however we leverage two options that are less standard:

    - parse_header: Key option that ensures the first row is leveraged as the header when parsing metadata
    - error_on_column_count_mismatch: Option that allows files to have missing/new columns compared to the destination table
""")

In [None]:
-- File format for the pipe-delimited CSV sample file.
--   parse_header = TRUE                    : use the first row of the file as the column header
--   error_on_column_count_mismatch = FALSE : tolerate files whose column count differs from the target table
create or replace file format "FF_CSV"
  type = CSV
  field_delimiter = '|'
  parse_header = TRUE
  error_on_column_count_mismatch = FALSE

In [None]:
st.header("File Formats - JSON")

st.markdown("""
  The typical semi-structured file format that most of us are familiar with.
  
  For our example, we have a file with some more basic event logging.
""")

# Load the gzipped sample file and re-serialise it with indentation for display.
# The encoding is pinned to UTF-8 so decoding does not depend on the kernel's
# locale default.
file_path__json = r"data/2.json.gz"
with gzip.open(file_path__json, "rt", encoding="utf-8") as json_file:
  # json_string is kept as its own variable in case later cells reference it.
  json_string = json_file.read()

json_data = json.loads(json_string)
json_pretty = json.dumps(json_data, indent=2)

st.code(json_pretty, language="json")

# Explain the one option used by the file format defined in the next cell.
st.markdown("""
  The file format here is more simple than for CSVs, however we leverage one option that is worth explaining:

    - strip_outer_array: Reads each element as its own record instead of reading the entire file into a single record
""")

In [None]:
-- File format for the JSON sample file.
--   strip_outer_array = TRUE : load each element of the outer array as its own record
create or replace file format "FF_JSON"
  type = JSON
  strip_outer_array = TRUE

In [None]:
# Section heading and introduction for the Parquet example.
st.header(body="File Formats - Parquet")

st.markdown(body="""
  The common optimised semi-structured file format that many of us are familiar with.
  
  For our example, we have a file with some more basic event logging.
""")

# Read the sample file into a pyarrow table and show it in a code panel.
file_path__parquet = "data/3.parquet"
parquet_data = pq.read_table(file_path__parquet)

st.code(body=parquet_data, language="parquet")

# No options to explain for this file format.
st.markdown(body="The file format here is the most simple of our examples")

In [None]:
-- File format for the Parquet sample file; only the type is set.
create or replace file format "FF_PARQUET"
  type = PARQUET

In [None]:
st.header("File Formats - Summary")

st.markdown("So we have three file formats:")

# One column per file format, laid out side by side.
ff_col_csv, ff_col_json, ff_col_parquet = st.columns(3, gap="large")

# (column, title, SQL cell result) for each panel. The sql__file_formats__*
# names are defined outside this cell -- presumably the named SQL cells that
# create the file formats; confirm against the notebook's cell names.
summary_panels = (
  (ff_col_csv, "CSV", sql__file_formats__csv),
  (ff_col_json, "JSON", sql__file_formats__json),
  (ff_col_parquet, "Parquet", sql__file_formats__parquet),
)

# Render each panel: a sub-heading plus the object's query_executed value
# highlighted as SQL.
for summary_column, summary_title, sql_cell in summary_panels:
  with summary_column:
    st.subheader(summary_title)
    st.code(sql_cell.query_executed, "sql")