In [2]:
from pyspark.sql import SparkSession, functions as F
from pyspark.pandas import DataFrame
import pandas as pd



In [3]:
# Build (or reuse) a Spark session wired up for Apache Iceberg:
#   * fetch the Iceberg runtime jar built for Spark 3.5 / Scala 2.12,
#   * enable Iceberg's SQL extensions,
#   * back the built-in `spark_catalog` with Iceberg's session catalog (Hive type),
#   * register a second, Hadoop-type catalog called `local` whose warehouse
#     path is the relative directory "warehouse".
spark = (
    SparkSession.builder
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.1")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog")
    .config("spark.sql.catalog.spark_catalog.type", "hive")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "warehouse")
    .getOrCreate()
)
# Options the author left disabled:
#   .config("spark.sql.warehouse.dir", "warehouse")
#   .config("spark.sql.defaultCatalog", "local")

In [4]:
# Sanity check: the Iceberg runtime artifact configured on the session targets Spark 3.5 (Scala 2.12).
spark.version

'3.5.0'

In [5]:
# Make `local` (the Hadoop-type Iceberg catalog configured on the session) the current
# catalog, so unqualified table names in later cells resolve against it.
spark.sql("use local")

DataFrame[]

# READ SOURCE

In [None]:
# df = spark.read \
#     .options(
#         header=True,
#         inferSchema=True,
#         sep=","
#     ) \
#     .csv("src/animes.csv")

In [None]:
# Build a small sample "trips" DataFrame with an explicit schema.
from pyspark.sql.types import StructType, StructField, LongType, FloatType, DoubleType, StringType

# Every column is declared nullable (third argument to `add`).
schema = (
    StructType()
    .add("vendor_id", LongType(), True)
    .add("trip_id", LongType(), True)
    .add("trip_distance", FloatType(), True)
    .add("fare_amount", DoubleType(), True)
    .add("store_and_fwd_flag", StringType(), True)
)

# One tuple per trip, in the same column order as the schema.
data = [
    (1, 1000371, 1.8, 15.32, "N"),
    (2, 1000372, 2.5, 22.15, "N"),
    (2, 1000373, 0.9, 9.01, "N"),
    (1, 1000374, 8.4, 42.13, "Y"),
]

df = spark.createDataFrame(data, schema)
df.show()

# Create the Lakehouse Table

In [None]:
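# One-time setup, left commented out: creates the Iceberg table `example_test` from `df`.
# Run it once on a fresh warehouse — the cells below assume `example_test` already exists.
# df.writeTo("example_test").create()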

# Read the Lakehouse Table

In [None]:
# Print the schema of the stored `example_test` table.
# This cell needs the table to exist already in the current catalog.
spark.table("example_test").printSchema()

# Add a New Row

In [None]:
# Borrow the schema from the stored table so the extra row matches it exactly.
schema = spark.table("example_test").schema

# A single new trip, in the table's column order.
new_data = [(9, 1000999, 9.9, 99.99, "Y")]
new_df = spark.createDataFrame(data=new_data, schema=schema)

In [None]:
# Stack the new row under the existing in-memory rows (`union` matches columns by position).
# NOTE: `df` is rebuilt from itself, so re-running this cell adds the new row again.
df = df.union(new_df)

In [None]:
# Append only the new row to the table.
# After the union, `df` still contains the original sample rows, which the table
# already holds (it was created from `df`); appending `df` would store those rows
# a second time.
new_df.writeTo("example_test").append()

In [None]:
# Preview the in-memory DataFrame (sample rows plus the new one); this does not re-read the table.
df.show()

# Delete Rows (vendor_id = 1)

In [6]:
# Reload the stored table and keep a single copy of each fully identical row.
# NOTE: this rebinds `df`, which earlier cells used for the in-memory sample data.
df = spark.table("example_test").drop_duplicates()

In [7]:
# Show the de-duplicated table contents.
df.show()

+---------+-------+-------------+-----------+------------------+
|vendor_id|trip_id|trip_distance|fare_amount|store_and_fwd_flag|
+---------+-------+-------------+-----------+------------------+
|        2|1000373|          0.9|       9.01|                 N|
|        1|1000371|          1.8|      15.32|                 N|
|        2|1000372|          2.5|      22.15|                 N|
|        1|1000374|          8.4|      42.13|                 Y|
|        9|1000999|          9.9|      99.99|                 Y|
+---------+-------+-------------+-----------+------------------+



In [8]:
# Keep every row whose vendor_id is not 1 (this is the "delete" step, done in memory).
# The former trailing `.select("*")` selected all columns unchanged, so it was dropped.
df = df.filter(df["vendor_id"] != 1)

In [9]:
# Show what is left after removing the vendor_id = 1 rows.
df.show()

+---------+-------+-------------+-----------+------------------+
|vendor_id|trip_id|trip_distance|fare_amount|store_and_fwd_flag|
+---------+-------+-------------+-----------+------------------+
|        2|1000372|          2.5|      22.15|                 N|
|        2|1000373|          0.9|       9.01|                 N|
|        9|1000999|          9.9|      99.99|                 Y|
+---------+-------+-------------+-----------+------------------+



In [14]:
# Replace the table with the filtered rows — this is what persists the "delete".
# NOTE(review): this cell uses the qualified name `local.example_test`, while other cells
# use the bare `example_test`; presumably both name the same table because of the
# earlier `use local` — confirm.
df.writeTo("local.example_test").replace()

# Spark SQL

In [15]:
# Read the table back through Spark SQL to check the result of the replace.
spark.sql(
    """
    SELECT 
        *
    FROM 
        local.example_test
    """
).show()

+---------+-------+-------------+-----------+------------------+
|vendor_id|trip_id|trip_distance|fare_amount|store_and_fwd_flag|
+---------+-------+-------------+-----------+------------------+
|        2|1000373|          0.9|       9.01|                 N|
|        2|1000372|          2.5|      22.15|                 N|
|        9|1000999|          9.9|      99.99|                 Y|
+---------+-------+-------------+-----------+------------------+



In [17]:
# Append two rows with SQL.
# NOTE: INSERT INTO adds rows each time it runs, so re-running this cell inserts them again.
# NOTE(review): the second row reuses trip_id 1000372, which is already in the table —
# confirm that the repeated id is intended.
spark.sql(
    """
INSERT INTO local.example_test
VALUES (1, 1000371, 2, 20, 'N'), 
(2, 1000372, 4, 39, 'Y');    
    """
)

DataFrame[]

In [19]:
# Create (or replace) an Iceberg table with the same five columns as example_test,
# partitioned by vendor_id. The unqualified name relies on the current catalog.
# The statement returns no rows, so `.show()` prints an empty frame.
spark.sql(
    """
        CREATE OR REPLACE TABLE new_table_example(
            vendor_id bigint,
            trip_id bigint,
            trip_distance float,
            fare_amount double,
            store_and_fwd_flag string
        ) USING iceberg
        PARTITIONED BY (vendor_id);
    """
).show()

++
||
++
++



In [23]:
# Copy every row of example_test into the partitioned table.
# NOTE: INSERT INTO adds rows each time it runs, so re-running this cell copies them again.
# The statement returns no rows, so `.show()` prints an empty frame.
spark.sql(
    """
    INSERT INTO new_table_example (
        SELECT * FROM example_test
    )
    """
).show()

++
||
++
++



In [25]:
# Read the partitioned copy back to confirm the rows arrived.
spark.sql(
    """
    SELECT 
        *
    FROM new_table_example
    """
).show()

+---------+-------+-------------+-----------+------------------+
|vendor_id|trip_id|trip_distance|fare_amount|store_and_fwd_flag|
+---------+-------+-------------+-----------+------------------+
|        9|1000999|          9.9|      99.99|                 Y|
|        1|1000371|          2.0|       20.0|                 N|
|        2|1000373|          0.9|       9.01|                 N|
|        2|1000372|          2.5|      22.15|                 N|
|        2|1000372|          4.0|       39.0|                 Y|
+---------+-------+-------------+-----------+------------------+



# Rollback

In [27]:
# List example_test's snapshots through its `snapshots` metadata table.
# The snapshot_id column holds the ids that the rollback procedure below takes as its argument.
spark.sql(
    """
    SELECT *
    FROM example_test.snapshots
    """
).show()

+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|        committed_at|        snapshot_id|          parent_id|operation|       manifest_list|             summary|
+--------------------+-------------------+-------------------+---------+--------------------+--------------------+
|2024-05-05 16:30:...|7756389368573949685|               NULL|   append|warehouse/example...|{spark.app.id -> ...|
|2024-05-05 16:39:...|8750744565593823205|7756389368573949685|   append|warehouse/example...|{spark.app.id -> ...|
|2024-05-05 20:23:...|8121411867246861750|8750744565593823205|   append|warehouse/example...|{spark.app.id -> ...|
| 2024-05-05 20:26:20|4456078782926854740|               NULL|   append|warehouse/example...|{spark.app.id -> ...|
|2024-05-05 20:28:...|6201281357802780287|4456078782926854740|   append|warehouse/example...|{spark.app.id -> ...|
+--------------------+-------------------+-------------------+---------+--------

In [28]:
# "Version 5": the data as it stands before the rollback.
# NOTE(review): this queries new_table_example, not example_test (the table that is
# rolled back). new_table_example was filled from example_test in an earlier cell —
# confirm that showing the copy here is intended.
spark.sql(
    """
    SELECT * FROM new_table_example
    """
).show()

+---------+-------+-------------+-----------+------------------+
|vendor_id|trip_id|trip_distance|fare_amount|store_and_fwd_flag|
+---------+-------+-------------+-----------+------------------+
|        9|1000999|          9.9|      99.99|                 Y|
|        1|1000371|          2.0|       20.0|                 N|
|        2|1000373|          0.9|       9.01|                 N|
|        2|1000372|          2.5|      22.15|                 N|
|        2|1000372|          4.0|       39.0|                 Y|
+---------+-------+-------------+-----------+------------------+



In [31]:
# Roll `example_test` back by one commit.
#
# Snapshot ids are generated by Iceberg when a commit is made, so an id pasted from an
# earlier run (previously 4456078782926854740) only exists in the warehouse that
# produced it. Look up the parent of the current snapshot in the table's `history`
# metadata table instead, then hand that id to the rollback procedure.
latest_history_entry = spark.sql(
    """
        SELECT snapshot_id, parent_id
        FROM example_test.history
        ORDER BY made_current_at DESC
        LIMIT 1
    """
).first()

# `first()` returns None for an empty result; parent_id is NULL when the current
# snapshot has no parent (nothing earlier to return to).
if latest_history_entry is None or latest_history_entry["parent_id"] is None:
    raise ValueError("example_test has no parent snapshot to roll back to")

rollback_snapshot_id = latest_history_entry["parent_id"]

# Kept as the cell's last expression so the procedure's result is still displayed.
spark.sql(
    f"""
        CALL system.rollback_to_snapshot('example_test', {rollback_snapshot_id})
    """
)

DataFrame[previous_snapshot_id: bigint, current_snapshot_id: bigint]

In [34]:
# After the rollback: example_test should no longer contain the two rows inserted with SQL.
# Author's note: "Rollback to version 4 (current version 6)".
# NOTE(review): presumably these numbers are table metadata versions — confirm.
spark.sql(
    """
        SELECT *
        FROM example_test
    """
).show()

+---------+-------+-------------+-----------+------------------+
|vendor_id|trip_id|trip_distance|fare_amount|store_and_fwd_flag|
+---------+-------+-------------+-----------+------------------+
|        2|1000373|          0.9|       9.01|                 N|
|        2|1000372|          2.5|      22.15|                 N|
|        9|1000999|          9.9|      99.99|                 Y|
+---------+-------+-------------+-----------+------------------+

