In [None]:
#******************************************************
#*
#* Name:         nb-09-explore-orc-files
#*     
#* Design Phase:
#*     Author:   John Miner
#*     Date:     12-04-2024
#*     Purpose:  
#*               1 - create delta table
#*               2 - trickle load table is bad
#*               3 - bulk load table is good
#*               4 - playing around with CRUD statements
#* 
#******************************************************/

In [1]:
#
#  1 - before spark there was pandas
# 

# import lib
import pandas as pd

# read file into df
pdf = pd.read_csv("file:/lakehouse/default/Files/Fisher/Iris.csv")

# show top 5 rows
pdf.head()

StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 3, Finished, Available, Finished)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
%%sql

--
--  2 - create empty delta table
--

-- del
drop table if exists delta_iris_ds;

-- add
create table if not exists delta_iris_ds 
(
    Id int,
    SepalLength float,
    SepalWidth float,
    PetalLength float,
    PetalWidth float,
    Species string
);

-- show blank table
select * from delta_iris_ds;

StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 14, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 6 fields>

In [5]:
%%sql

--
--  3 - show table history
--

describe history delta_iris_ds


StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 9, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 15 fields>

In [6]:
#
#  4 - insert data into delta table (3m 1s)
# 

#  function to call for each row
def process_row(row):

    # make insert stmt
    stmt = f"""
           insert into delta_iris_ds 
           (
               Id,
               SepalLength,
               SepalWidth,
               PetalLength,
               PetalWidth,
               Species
           )
           values 
           (
            {row.Id},
            {row.SepalLengthCm},
            {row.SepalWidthCm},
            {row.PetalLengthCm},
            {row.PetalWidthCm},
            '{row.Species}'
           );
           """
    # debugging           
    # print(stmt)

    # execute
    spark.sql(stmt)

#  apply function to data set
pdf.apply(process_row, axis=1)


StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 10, Finished, Available, Finished)

0      None
1      None
2      None
3      None
4      None
       ... 
145    None
146    None
147    None
148    None
149    None
Length: 150, dtype: object

In [7]:
%%sql

--
--  5 - show table history
--

describe history delta_iris_ds


StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 11, Finished, Available, Finished)

<Spark SQL result set with 151 rows and 15 fields>

In [None]:
#
#  6 - re-run step 2 to drop old + make new table
#


In [11]:
#
#  7 - merge datasets - batches instead of rows (4s)
#

import datetime
from delta.tables import *

# create spark from pandas df
sdf = spark.createDataFrame(pdf)

# align column names
sdf = sdf.withColumnRenamed('SepalLengthCm', 'SepalLength')
sdf = sdf.withColumnRenamed('SepalWidthCm', 'SepalWidth')
sdf = sdf.withColumnRenamed('PetalLengthCm', 'PetalLength')
sdf = sdf.withColumnRenamed('PetalWidthCm', 'PetalWidth')

# grab path to delta table    
path = "Tables/delta_iris_ds"
# path = "abfss://a668a328-9f67-4678-93f2-10d5afdfe3ad@onelake.dfs.fabric.microsoft.com/1bc3c138-54ea-4c7e-b759-03edb4673eba/Tables/delta_iris_ds"

# merge dataframe and existing table
trg = DeltaTable.forPath(spark, path)
trg.alias("trg").merge(sdf.alias("src"), 'trg.Id = src.Id').whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
    

StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 17, Finished, Available, Finished)

In [14]:
%%sql

--
--  8 - show top 5 rows
--

select * from delta_iris_ds where Id < 6 order by Id



StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 20, Finished, Available, Finished)

<Spark SQL result set with 5 rows and 6 fields>

In [16]:
%%sql

--
--  9 - update row
--

update delta_iris_ds set SepalLength = 5.1 where Id = 5;

StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 22, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 1 fields>

In [17]:
%%sql

--
--  10 - delete row
--

delete from delta_iris_ds where Id = 1;

StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 23, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 1 fields>

In [18]:
%%sql

--
--  11 - show top 5 rows
--

select * from delta_iris_ds where Id < 6 order by Id


StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 24, Finished, Available, Finished)

<Spark SQL result set with 4 rows and 6 fields>

In [19]:
%%sql

--
--  12 - show table history
--

describe history delta_iris_ds


StatementMeta(, 02efe25e-e6fc-49da-9e0e-0689a1d3f72b, 25, Finished, Available, Finished)

<Spark SQL result set with 4 rows and 15 fields>