# Lab 08 - Redshift
#### Izzy Valdivia
#### 11/22/2025


### Native Optimized CTAS Table:  
`optimized_orders`  
This represents the data that gets loaded into Redshift that gets queries more frequently
 
______________
  
### External Table:  
`spectrum_raw.orders`  
Not in redshift. This was used to load data into redshift, though. 


#### Goal: 
Analyze the tradeoff between query performance and query flexibility when using an optimized CTAS table versus an external table.

In [52]:
import boto3
import time
import json


In [53]:
# Establish connection to redshift 
session = boto3.Session(region_name='us-west-2', profile_name='default')
client = session.client('redshift-data')

# Function to execute the sql query!
def execute_and_wait(sql, cluster_id, database, db_user):
    """Execute SQL and wait for completion, return statement metadata"""
    response = client.execute_statement(
        ClusterIdentifier=cluster_id,
        Database=database,
        DbUser=db_user,
        Sql=sql
    )
    stmt_id = response['Id']

    # Poll until complete
    while True:
        status = client.describe_statement(Id=stmt_id)
        if status['Status'] in ['FINISHED', 'FAILED', 'ABORTED']:
            break
        time.sleep(1)
    return status  # Contains Duration, ResultRows, etc.


def print_duration_in_seconds(result):
    duration_seconds = result['Duration'] / 1_000_000_000  # Convert from nanoseconds
    print(f"{duration_seconds} seconds")

## 4.1 Query Definitions:
### Analysis of `optimized_orders` table: 

In [54]:
query_optimized_sql = """
SELECT
    category,
    region,
    COUNT(*) as order_count,
    SUM(extended_price) as total_revenue,
    AVG(discount_rate) as avg_discount,
    COUNT(DISTINCT product_sku) as unique_products
FROM optimized_orders
WHERE ts >= '2024-06-01' AND ts < '2024-07-01'
    AND category IN ('analytics', 'compute', 'observability')
    AND region IN ('us-east', 'us-west', 'eu-west')
GROUP BY category, region
ORDER BY total_revenue DESC;"""

optimized_result_explain = execute_and_wait(
    f"EXPLAIN {query_optimized_sql}",
    # grab the below values from your stack output
    cluster_id='wk08-redshift-cluster',
    database='dev',
    db_user='rsadmin'
)

query2_optimized_sql = """
SELECT
    category,
    status,
    COUNT(*) as order_count,
    SUM(extended_price) as total_revenue,
    AVG(quantity) as avg_quantity,
    MIN(ts) as earliest_order,
    MAX(ts) as latest_order
FROM optimized_orders
WHERE (
    (ts >= '2023-03-01' AND ts < '2023-03-15')
    OR (ts >= '2023-09-01' AND ts < '2023-09-15')
    OR (ts >= '2024-03-01' AND ts < '2024-03-15')
)
    AND category = 'analytics'
GROUP BY category, status
ORDER BY total_revenue DESC;
"""

# Execute actual query and measure duration
optimized_query_result = execute_and_wait(
    query_optimized_sql,
    cluster_id='wk08-redshift-cluster',
    database='dev',
    db_user='rsadmin'
)

# Execute actual query and measure duration
optimized_query2_result = execute_and_wait(
    query_optimized_sql,
    cluster_id='wk08-redshift-cluster',
    database='dev',
    db_user='rsadmin'
)
duration_seconds = optimized_query_result['Duration'] / 1_000_000_000  # Convert from nanoseconds
duration2_seconds = optimized_query2_result['Duration'] / 1_000_000_000  # Convert from nanoseconds


print("Optimized table query 1 time: ")
print_duration_in_seconds(optimized_query_result)
print(" ")
print("Optimized table query 2 time: ")
print_duration_in_seconds(optimized_query2_result)


Optimized table query 1 time: 
0.451773412 seconds
 
Optimized table query 2 time: 
0.024278919 seconds


In [55]:
analyze_sql = "ANALYZE optimized_orders"
analyze_result = execute_and_wait(analyze_sql, cluster_id='wk08-redshift-cluster',
    database='dev',
    db_user='rsadmin')

print_duration_in_seconds(analyze_result)

table_stats_sql = "SELECT * FROM svv_table_info WHERE \"table\" = 'optimized_orders'"
table_stats = execute_and_wait(table_stats_sql, cluster_id='wk08-redshift-cluster',
    database='dev',
    db_user='rsadmin')

print_duration_in_seconds(table_stats)

0.21158678 seconds
1.828933462 seconds


### Analysis of `specturm_raw.orders` table:

In [56]:
query_external_sql = """
SELECT
    category,
    region,
    COUNT(*) as order_count,
    SUM(extended_price) as total_revenue,
    AVG(discount_rate) as avg_discount,
    COUNT(DISTINCT product_sku) as unique_products
FROM spectrum_raw.orders
WHERE ts >= '2024-06-01' AND ts < '2024-07-01'
    AND category IN ('analytics', 'compute', 'observability')
    AND region IN ('us-east', 'us-west', 'eu-west')
GROUP BY category, region
ORDER BY total_revenue DESC;"""

query2_external_sql = """
SELECT
    category,
    status,
    COUNT(*) as order_count,
    SUM(extended_price) as total_revenue,
    AVG(quantity) as avg_quantity,
    MIN(ts) as earliest_order,
    MAX(ts) as latest_order
FROM spectrum_raw.orders
WHERE (
    (ts >= '2023-03-01' AND ts < '2023-03-15')
    OR (ts >= '2023-09-01' AND ts < '2023-09-15')
    OR (ts >= '2024-03-01' AND ts < '2024-03-15')
)
    AND category = 'analytics'
GROUP BY category, status
ORDER BY total_revenue DESC;
"""


external_result = execute_and_wait(
    query_external_sql,
    cluster_id='wk08-redshift-cluster',
    database='dev',
    db_user='rsadmin'
)

external_result2 = execute_and_wait(
    query2_external_sql,
    cluster_id='wk08-redshift-cluster',
    database='dev',
    db_user='rsadmin'
)
print("External table query 1 time: ")
print_duration_in_seconds(external_result)
print(" ")
print("External table query 2 time: ")
print_duration_in_seconds(external_result2)

External table query 1 time: 
43.436711703 seconds
 
External table query 2 time: 
39.102723374 seconds


In [57]:
analyze_sql = "ANALYZE spectrum_raw.orders"
analyze_result = execute_and_wait(analyze_sql, cluster_id='wk08-redshift-cluster',
    database='dev',
    db_user='rsadmin')

print_duration_in_seconds(analyze_result)

table_stats_sql = "SELECT * FROM svv_table_info WHERE \"table\" = 'spectrum_raw.orders'"
table_stats = execute_and_wait(table_stats_sql, cluster_id='wk08-redshift-cluster',
    database='dev',
    db_user='rsadmin')

print_duration_in_seconds(table_stats)

-1e-09 seconds
1.263937752 seconds
