# Install Snowpark

In [None]:
!pip install snowflake-snowpark-python



---



# Connect to Snowflake via Snowpark (without PySpark)

In [22]:
import time
# --->  REMOVE PYSPARK REFERENCES

# import pyspark.sql.functions as f
# from pyspark.sql import SparkSession
# from pyspark.sql.functions import udf,col
# from pyspark.sql.types import IntegerType
# spark = SparkSession.builder.appName("DataEngeering1").getOrCreate()


# <---  REPLACE WITH SNOWPARK REFERENCES (Rest of code is almost identical)

import snowflake.snowpark.functions as f
from snowflake.snowpark import Session, DataFrame
from snowflake.snowpark.functions import udf, col
from snowflake.snowpark.types import IntegerType
from snowflake.snowpark.functions import call_udf


# <----- Make these changes before running the notebook -------
# Change Connection params to match your environment
# <----------------------------------------------------------------------------

# Names of the demo objects this cell creates (or replaces) and then selects
# as the session's current warehouse / database / schema.
Warehouse_Name = 'MY_DEMO_WH'
DB_name = 'DEMO_SNOWPARK'
Schema_Name = 'Public'

# Connection settings handed to Session.builder.configs().
# Replace every <...> placeholder with values for your own account.
# NOTE(review): avoid committing a real password in this cell; consider
# reading it from an environment variable or prompting for it instead.
CONNECTION_PARAMETER = {
    "host": "<YourAccount>.snowflakecomputing.com",
    "account": "<YourAccount>",
    "user": "<Your_UserID>",
    "password": "<Your_Password>",
    "role": "SYSADMIN",
}


print("Connecting to Snowflake.....\n")
# Fix: the original passed CONNECTION_PARAMETERS, a name that is never
# defined, so this line raised NameError on a fresh kernel.
session = Session.builder.configs(CONNECTION_PARAMETER).create()
print("Connected Successfully!...\n")

# WARNING: CREATE OR REPLACE replaces any existing warehouse / database that
# already has this name. Only point this demo at names you are free to recreate.
sql_cmd = "CREATE OR REPLACE WAREHOUSE {} WAREHOUSE_SIZE = 'X-Small' ".format(Warehouse_Name)
session.sql(sql_cmd).collect()

sql_cmd = "CREATE OR REPLACE DATABASE {}".format(DB_name)
session.sql(sql_cmd).collect()

# Make the freshly created objects the defaults for every later statement.
session.use_database(DB_name)
session.use_schema(Schema_Name)
session.use_warehouse(Warehouse_Name)


# 1 - SCALE COMPUTE UP FROM X-SMALL TO LARGE
#     (the message printed below describes this as 1 node -> 8 nodes)
print("Demo Environment Created \n Resizing to from XS(1 Node) to LARGE(8 Nodes) ..\n")

# WAIT_FOR_COMPLETION = TRUE is part of the statement so that the resize is
# requested to finish before the following cells start submitting work.
sql_cmd = "ALTER WAREHOUSE {} SET WAREHOUSE_SIZE = 'LARGE' WAIT_FOR_COMPLETION = TRUE".format(Warehouse_Name)
session.sql(sql_cmd).collect()

print("Completed!...\n\n")


Connecting to Snowflake.....

Connected Successfully!...

Demo Environment Created 
 Resizing to from XS(1 Node) to LARGE(8 Nodes) ..

Completed!...




## Start Data Engineering Process

In [23]:


# 2 - OPEN THE TWO LARGE SOURCE TABLES (600M & 1M rows)
print("Joining, Aggregating with 2 large tables(600M & 1M rows) & Writing results to new table(80M rows) ..\n")

# Table references into the shared TPCH sample data set.
dfLineItems = session.table("SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM")  # 600 Million Rows
dfSuppliers = session.table("SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.SUPPLIER")  # 1 Million Rows

# Report the size of each input; every count() call runs its own query.
lineitem_row_count = dfLineItems.count()
print('Lineitems Table: %s rows' % lineitem_row_count)
supplier_row_count = dfSuppliers.count()
print('Suppliers Table: %s rows' % supplier_row_count)

# 3 - JOIN TABLES: match each line item to its supplier on the supplier key
supplier_key_match = dfLineItems.col("L_SUPPKEY") == dfSuppliers.col("S_SUPPKEY")
dfJoinTables = dfLineItems.join(dfSuppliers, supplier_key_match)

# 4 - SUMMARIZE PER (SUPPLIER NAME, PART KEY): total, smallest and largest quantity
quantity_aggregates = [
    f.sum("L_QUANTITY").alias("TOTAL_QTY"),
    f.min("L_QUANTITY").alias("MIN_QTY"),
    f.max("L_QUANTITY").alias("MAX_QTY"),
]
dfSummary = dfJoinTables.groupBy("S_NAME", "L_PARTKEY").agg(quantity_aggregates)


Joining, Aggregating with 2 large tables(600M & 1M rows) & Writing results to new table(80M rows) ..

Lineitems Table: 600037902 rows
Suppliers Table: 1000000 rows


### **↑ The join and aggregation above have NOT run yet** (Lazy Execution Model): only the two `count()` calls have executed queries so far.

## 3. Storing the results in a table or showing them triggers compute for the previous steps.

In [24]:
start_time = time.time()
try:
    # 5 - WRITE THE RESULTS TO A NEW TABLE ( 80 Million Rows)
    # <-- The write is the action that makes the lazily built join and
    #     aggregation from the previous cell actually execute.
    dfSummary.write.mode("overwrite").saveAsTable("SALES_SUMMARY")
    print("Target Table Created!...\n\n")

    # 6 - QUERY THE RESULTS (80 Million Rows)
    print("Query the results..\n")
    dfSales = session.table("SALES_SUMMARY")
    dfSales.show()
    # The measured interval covers the write plus the preview query above.
    end_time = time.time()
finally:
    # 7 - SCALE DOWN COMPUTE BACK TO X-SMALL
    # Done in a `finally` so that a failure while writing or querying does
    # not leave the warehouse running at the LARGE size set earlier.
    print("Reducing the warehouse to XS..\n")
    sql_cmd = "ALTER WAREHOUSE {} SET WAREHOUSE_SIZE = 'XSMALL'".format(Warehouse_Name)
    session.sql(sql_cmd).collect()

print("Completed!...\n")

print("--- %s seconds to Join, Summarize & Write Results to a new Table --- \n" % int(end_time - start_time))
print("--- %s Rows Written to SALES_SUMMARY table" % dfSales.count())

Target Table Created!...


Query the results..

--------------------------------------------------------------------------
|"S_NAME"            |"L_PARTKEY"  |"TOTAL_QTY"  |"MIN_QTY"  |"MAX_QTY"  |
--------------------------------------------------------------------------
|Supplier#000146124  |11646101     |331.00       |17.00      |50.00      |
|Supplier#000527051  |10277040     |169.00       |4.00       |45.00      |
|Supplier#000541796  |6541795      |330.00       |3.00       |50.00      |
|Supplier#000512146  |5512145      |238.00       |3.00       |49.00      |
|Supplier#000436838  |19686780     |283.00       |3.00       |50.00      |
|Supplier#000351952  |11351951     |105.00       |13.00      |32.00      |
|Supplier#000438967  |5188961      |163.00       |5.00       |37.00      |
|Supplier#000970465  |14220422     |266.00       |7.00       |48.00      |
|Supplier#000908398  |3158388      |273.00       |5.00       |48.00      |
|Supplier#000205184  |14205183     |221.00       |21

# **Benefits of Snowpark Over Spark & PySpark**
### - **Quick to Migrate** as the code is mostly identical & does not require learning a new language
### - **Cheaper** as compute is serverless & runs only when needed.
### - **Faster** as all unnecessary data movement is eliminated = **Less time** using Compute = **Less Cost**
### - **Easier to use = Less FTE** as little to no maintenance is needed for Compute & Storage.