In [1]:
#******************************************************
#*
#* Name:         nb-02-explore-csv-files
#*     
#* Design Phase:
#*     Author:   John Miner
#*     Date:     12-04-2024
#*     Purpose:  Shortcut vs dataframe commands.
#*               Managed vs unmanaged tables.
#* 
#******************************************************

StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 3, Finished, Available, Finished)

In [2]:
%%sql

--
--  1 - SQL shorthand: query the csv file directly by its path.
--      No header or schema options are supplied in this form.
--

select *
from csv.`Files/Stocks/all_stock_data.csv`
limit 5


StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 4, Finished, Available, Finished)

<Spark SQL result set with 5 rows and 9 fields>

In [3]:
#
#  2 - spark.read with no options behaves like the SQL shorthand:
#      the printed schema lists positional string columns (_c0, _c1, ...).
#

file2 = "Files/Stocks/all_stock_data.csv"

# csv reader with every option left at its default
df = spark.read.format("csv").load(file2)

df.printSchema()

StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 5, Finished, Available, Finished)

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)
 |-- _c4: string (nullable = true)
 |-- _c5: string (nullable = true)
 |-- _c6: string (nullable = true)
 |-- _c7: string (nullable = true)
 |-- _c8: string (nullable = true)



In [4]:
%%sql

--
--  3 - Managed table built with the shorthand notation (1m 42s)
--
--  The shorthand exposes positional string columns (_c0 .. _c8), so each
--  one is renamed and cast below. Year and month are cut out of the date
--  text. Rows whose first field is the literal Date (presumably the header
--  line of each file) are filtered out.
--

-- remove any previous copy
drop table if exists csv_stocks_ex1;

-- rebuild from the csv files
create table if not exists csv_stocks_ex1 as
select
    cast(_c0 as date)                   as _date,
    cast(substr(_c0, 1, 4) as integer)  as _year,
    cast(substr(_c0, 6, 2) as integer)  as _month,
    _c1                                 as _ticker,
    cast(_c2 as decimal(12, 4))         as _open,
    cast(_c3 as decimal(12, 4))         as _high,
    cast(_c4 as decimal(12, 4))         as _low,
    cast(_c5 as decimal(12, 4))         as _close,
    cast(_c6 as long)                   as _volume,
    cast(_c7 as decimal(12, 4))         as _dividends,
    cast(_c8 as decimal(12, 4))         as _stock_splits
from csv.`Files/Stocks/*.csv`
where _c0 <> 'Date'



StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 7, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

In [5]:
%%sql

--
--  4 - Extended metadata for the table created in step 3
--

describe extended csv_stocks_ex1


StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 8, Finished, Available, Finished)

<Spark SQL result set with 19 rows and 3 fields>

In [6]:
%%sql

--
--  5 - Preview five rows of the table created in step 3
--

select *
from csv_stocks_ex1
limit 5


StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 9, Finished, Available, Finished)

<Spark SQL result set with 5 rows and 11 fields>

In [7]:
 %%sql

--
--  6 - Unmanaged table declared directly over the csv files (39s)
--

-- remove any previous definition
drop table if exists csv_stocks_ex2;

-- declare the table over the files with the header and inferSchema options on
create table if not exists csv_stocks_ex2
using csv
options (
    path "Files/Stocks/*.csv",
    header "true",
    inferSchema "true"
)

StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 11, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

In [8]:
%%sql

--
--  7 - Preview five rows of the table created in step 6
--

select *
from csv_stocks_ex2
limit 5

StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 12, Finished, Available, Finished)

<Spark SQL result set with 5 rows and 9 fields>

In [9]:
%%sql

--
--  8 - Extended metadata for the table created in step 6
--

describe extended csv_stocks_ex2

StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 13, Finished, Available, Finished)

<Spark SQL result set with 25 rows and 3 fields>

In [10]:
#
#  9 - spark.read has the most options (4m 43s)
#
#  Reads the csv file with an explicit schema, derives the same 11 typed
#  columns as step 3, and saves the result as table csv_stocks_ex3.
#

# define path
path2 = "Files/Stocks/all_stock_data.csv"

# define schema
#
# A user supplied schema is applied to csv fields by position, and the file
# has 9 fields per row (step 2 prints _c0 .. _c8). The schema therefore has to
# list exactly those 9 fields in file order. Declaring _year and _month here,
# which are not in the file, would shift every later column by two places.
#
# Fields are read as text and cast afterwards so the conversion rules are the
# same ones used for csv_stocks_ex1 in step 3.
custom_schema = """
    _date string,
    _ticker string,
    _open string,
    _high string,
    _low string,
    _close string,
    _volume string,
    _dividends string,
    _stock_splits string
"""

# read in csv data (header line skipped, no inference pass over the file)
stocks_raw = (
    spark.read.format("csv")
    .option("sep", ",")
    .option("header", "true")
    .option("inferSchema", "false")
    .schema(custom_schema)
    .load(path2)
)

# cast to the final layout; _year and _month are derived from the date text
df2 = stocks_raw.selectExpr(
    "cast(_date as date) as _date",
    "cast(substr(_date, 1, 4) as integer) as _year",
    "cast(substr(_date, 6, 2) as integer) as _month",
    "_ticker",
    "cast(_open as decimal(12, 4)) as _open",
    "cast(_high as decimal(12, 4)) as _high",
    "cast(_low as decimal(12, 4)) as _low",
    "cast(_close as decimal(12, 4)) as _close",
    "cast(_volume as long) as _volume",
    "cast(_dividends as decimal(12, 4)) as _dividends",
    "cast(_stock_splits as decimal(12, 4)) as _stock_splits",
)

# write table (delta is presumably the session default format - confirm).
# Overwrite mode lets the cell be re-run, matching the drop-then-create
# pattern of the SQL cells. The default mode fails once the table exists.
df2.write.mode("overwrite").saveAsTable("csv_stocks_ex3")


StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 14, Finished, Available, Finished)

In [11]:
%%sql

--
--  10 - Extended metadata for the table saved in step 9
--

describe extended csv_stocks_ex3

StatementMeta(, 8537cb99-0cba-4ac2-90dc-faaa8ff00f64, 15, Finished, Available, Finished)

<Spark SQL result set with 19 rows and 3 fields>

In [1]:
%%sql

--
--  11 - counts should be the same regardless of format
--
--  One labelled row per table. The labels are all different, so union all
--  returns the same three rows as union without the extra distinct step.
--  The label column is named so the result set has a readable header.
--

select 'managed table using sql' as table_type, count(*) as rec_cnt from csv_stocks_ex1
union all
select 'unmanaged table using sql' as table_type, count(*) as rec_cnt from csv_stocks_ex2
union all
select 'managed table using spark' as table_type, count(*) as rec_cnt from csv_stocks_ex3

StatementMeta(, ff72bf39-1400-4729-97eb-1b2e12004bd8, 2, Finished, Available, Finished)

<Spark SQL result set with 3 rows and 2 fields>