In [0]:
%run ./00_Setup

In [0]:
%sql
-- Text widget holding the root folder of the book store dataset (default: the training volume)
CREATE WIDGET TEXT book_store_path DEFAULT '/Volumes/workspace/strata_lab/entrenamiento/book_store';

-- Remove the books table if one is already present
DROP TABLE IF EXISTS books;

In [0]:
%python
# Copy the widget's current value into a Python variable so Python cells can build paths from it
book_store_path = dbutils.widgets.get(
  "book_store_path"
)

In [0]:
%sql
-- Read only the customers part file(s) whose name starts with part-00000
SELECT
  *
FROM json.`${book_store_path}/customers/part-00000-*.json`;

In [0]:
%sql
-- Read every JSON file under the customers folder as a single result set
SELECT
  *
FROM json.`${book_store_path}/customers/`;

In [0]:
%sql
-- Number of customer records across all JSON files in the folder
SELECT
  COUNT(*)
FROM json.`${book_store_path}/customers/`;

In [0]:
%sql
-- Sample five customer rows, each tagged with the path of the file it was read from
SELECT
  *,
  _metadata.file_path AS source_file
FROM json.`${book_store_path}/customers/`
LIMIT 5;

In [0]:
%sql
-- Sample five rows from the customers files using the plain-text reader instead of the JSON reader
SELECT
  *
FROM text.`${book_store_path}/customers/`
LIMIT 5;

In [0]:
%sql
-- Sample five rows from the customers files using the binaryFile reader (file-level metadata plus content)
SELECT
  *
FROM binaryFile.`${book_store_path}/customers/`
LIMIT 5;

In [0]:
%sql
-- Query the books-csv folder directly with the csv reader and its default options
SELECT
  *
FROM csv.`${book_store_path}/books-csv/`;

In [0]:
%python
# Build the books table from the CSV files with the DataFrame API.
# Original author's notes: Python is used because schema inference for SQL CSV
# table creation is limited, and external table refresh cannot be tested here.
df = (
  spark.read.format("csv")
  .option("header", "true")    # treat the first line of each file as column names
  .option("delimiter", ";")    # fields are separated by semicolons
  # Use the widget path as-is, exactly like the SQL cells do; prepending "dbfs:"
  # would break if the widget ever holds a path that already carries a scheme.
  .load(f"{book_store_path}/books-csv/")
)

# Overwrite instead of the default error-if-exists mode so this cell can be re-run.
# overwriteSchema lets the write replace a books table that was created elsewhere
# in the notebook with different column types.
(
  df.write
  .mode("overwrite")
  .option("overwriteSchema", "true")
  .saveAsTable("books")
)

In [0]:
%sql
-- Inspect the full contents of the books table
SELECT
  *
FROM books;

In [0]:
%sql
-- Column list plus detailed table information for the books table
DESCRIBE TABLE EXTENDED books;

In [0]:
%sql
-- Create (or replace) the customers table from the JSON files in the customers folder.
-- The path uses the book_store_path widget, the same root every other cell reads from;
-- the previous customer_path reference is not a widget defined in this notebook.
CREATE OR REPLACE TABLE customers AS
SELECT *
FROM json.`${book_store_path}/customers/`;

-- Show the resulting table's columns and detailed table information
DESCRIBE EXTENDED customers;

In [0]:
%sql
-- Temporary view over the books CSV files with an explicit column schema.
-- The path comes from the book_store_path widget so the view follows the same
-- dataset root as the rest of the notebook instead of a hard-coded location.
-- inferSchema is omitted because the column list below already fixes the schema.
CREATE OR REPLACE TEMP VIEW books_tmp_vw
(book_id STRING, title STRING, author STRING, category STRING, price DOUBLE)
USING csv
OPTIONS (
  path "${book_store_path}/books-csv/",
  header "true",
  delimiter ";"
);

In [0]:
%sql
-- Materialize the rows exposed by books_tmp_vw as the books table (the result is a Delta table)
CREATE OR REPLACE TABLE books
AS
SELECT
  *
FROM books_tmp_vw;

In [0]:
%sql
-- Inspect the books table after it has been rebuilt from the temporary view
SELECT
  *
FROM books;

# Simplified File Querying

In [0]:
%sql
-- Read the books CSV folder through the read_files table function
-- (header row enabled, semicolon delimiter; schema rescue for mismatches)
SELECT
  *
FROM read_files(
  '${book_store_path}/books-csv',
  delimiter => ';',
  header => 'true',
  format => 'csv'
);

# Metadata columns
`input_file_name()` ya no es funcional; ahora se utiliza `_metadata.file_path` (entre otros campos de `_metadata`) para obtener los atributos de metadatos de los archivos.

In [0]:
%sql
-- Read the books CSV folder with read_files and append every field of the _metadata column
-- (file_path, file_name, file_size, file_block_start, file_block_length, file_modification_time)
SELECT
  *,
  _metadata.*
FROM read_files(
  '${book_store_path}/books-csv',
  delimiter => ';',
  header => 'true',
  format => 'csv'
);