# Pip Install Snowpark (requires Python 3.8; you only need to run this once)

In [None]:
!pip install snowflake-snowpark-python



---



# 1. Connect via Snowpark

In [None]:
import getpass
import os
import time

import snowflake.snowpark.functions as f
from snowflake.snowpark import Session, DataFrame
from snowflake.snowpark.functions import udf, col
from snowflake.snowpark.functions import call_udf
from snowflake.snowpark.types import IntegerType


# <----- Make these changes before running the notebook -------
# 1. Change Connection params to match your environment
#    EDIT <..> items and add your Account, User, Warehouse & DB name
# 2. Do NOT type your password into this notebook (it would be saved in the
#    .ipynb file). Either export it as the SNOWFLAKE_PASSWORD environment
#    variable before starting Jupyter, or leave that variable unset and you
#    will be prompted for the password when this cell runs.

Warehouse_Name = '<Your_WarehouseName>'
DB_NAME = '<Your_DbName>'


CONNECTION_PARAMETERS = {
    "host": "<YourAccount>.snowflakecomputing.com",
    'account': '<YourAccount>',
    'user': '<Your_UserID>',
    # Resolved at run time: environment variable first, interactive prompt as fallback.
    'password': os.environ.get('SNOWFLAKE_PASSWORD') or getpass.getpass('Snowflake password: '),
    'role': 'SYSADMIN',
}

# --- Finish editing this section before running the code
# <----------------------------------------------------------------------------


# Open the Snowpark session used by every later cell.
print("Connecting to Snowflake.....\n")
session = Session.builder.configs(CONNECTION_PARAMETERS).create()
print("Connected Successfully!...\n\n")

# Select the PUBLIC schema of the chosen database as the session's current schema.
sql_cmd = "USE SCHEMA {}.PUBLIC".format(DB_NAME)
session.sql(sql_cmd).collect()

# Select the warehouse that will run the queries issued through this session.
sql_cmd = "USE WAREHOUSE {}".format(Warehouse_Name)
session.sql(sql_cmd).collect()




## 2. Perform Data Engineering Tasks

In [None]:



# Step 1 - resize the warehouse to MEDIUM (4 compute nodes) ahead of the heavy work.
# The statement is sent with WAIT_FOR_COMPLETION = TRUE.
print("Scale UP compute from XS(1 Node) to MEDIUM(4 Nodes) ..\n")
sql_cmd = "ALTER WAREHOUSE {} SET WAREHOUSE_SIZE = 'MEDIUM' WAIT_FOR_COMPLETION = TRUE".format(Warehouse_Name)
session.sql(sql_cmd).collect()
print("Completed!...\n\n")


# Step 2 - point DataFrames at the two large sample tables.
print("Joining, Aggregating with 2 large tables(600M & 1M rows) & Writing results to new table(80M rows) ..\n")
dfLineItems = session.table("SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.LINEITEM")  # ~600 million rows
dfSuppliers = session.table("SNOWFLAKE_SAMPLE_DATA.TPCH_SF100.SUPPLIER")  # ~1 million rows

# Report the size of each input table (each count() call issues its own query).
lineitem_rows = dfLineItems.count()
print('Lineitems Table: %s rows' % lineitem_rows)
supplier_rows = dfSuppliers.count()
print('Suppliers Table: %s rows' % supplier_rows)


# Step 3 - join line items to their supplier on the supplier key.
supplier_key_match = dfLineItems.col("L_SUPPKEY") == dfSuppliers.col("S_SUPPKEY")
dfJoinTables = dfLineItems.join(dfSuppliers, supplier_key_match)

# Step 4 - per (supplier name, part key): total, smallest and largest quantity.
quantity_stats = [
    f.sum("L_QUANTITY").alias("TOTAL_QTY"),
    f.min("L_QUANTITY").alias("MIN_QTY"),
    f.max("L_QUANTITY").alias("MAX_QTY"),
]
grouped_by_supplier_part = dfJoinTables.groupBy("S_NAME", "L_PARTKEY")
dfSummary = grouped_by_supplier_part.agg(quantity_stats)


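#### **↑ The join & aggregation have NOT been executed** up to this point !!! (Lazy execution model: the resulting dataframe has not yet been used for any I/O ops. Only the warehouse resize and the two row counts above have actually run.)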

## 3. Storing or Showing results triggers the compute & executes the previous steps.

In [None]:
# perf_counter() is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments while the job runs.
start_time = time.perf_counter()

try:
    # 5 - WRITE THE RESULTS TO A NEW TABLE (80 Million Rows)
    # -- This is when all the previous operations are compiled & executed as a single job
    # -- because dfSummary is being written to a new target table
    dfSummary.write.mode("overwrite").saveAsTable("SALES_SUMMARY")
    print("Completed!...\n\n")

    # 6 - QUERY THE RESULTS (80 Million Rows)
    print("Query the results..\n")
    dfSales = session.table("SALES_SUMMARY")
    dfSales.show()
    end_time = time.perf_counter()

    print("Completed!...\n\n")
finally:
    # Always scale the warehouse back down, even when the write or the query
    # above fails; otherwise it would be left at the larger size.
    print("Scale DOWN compute to XS(1 node)..\n")

    sql_cmd = "ALTER WAREHOUSE {} SET WAREHOUSE_SIZE = 'XSMALL'".format(Warehouse_Name)
    session.sql(sql_cmd).collect()

    print("Completed!...\n")

# Only reached when the write and query succeeded, so end_time and dfSales are defined here.
print("--- %s seconds to Join, Summarize & Write Results to a new Table --- \n" % int(end_time - start_time))
print("--- %s Rows Written to SALES_SUMMARY table" % dfSales.count())