# Exercise 3: Spark SQL

## Learning Objectives
- Load data from CSV, JSON, and Parquet formats
- Execute SQL queries on DataFrames
- Join datasets from different sources
- Write results back to HDFS in optimized formats

---

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, sum, avg, count, desc
from pyspark.sql.types import *

# Create (or reuse, via getOrCreate) the Spark session for this lab.
# It runs against YARN and requests two executors with 1g of memory each;
# spark.sql.adaptive.enabled is switched on for the SQL queries below.
spark = (
    SparkSession.builder
    .appName("Spark SQL Lab")
    .master("yarn")
    .config("spark.executor.memory", "1g")
    .config("spark.executor.instances", "2")
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

# Print the application id so the job can be located in the cluster UIs.
print(f"Spark SQL session ready: {spark.sparkContext.applicationId}")

## Part 1: Loading Data from Multiple Formats

In [None]:
# Read the transactions CSV from HDFS.
#   header      -> the first line supplies the column names
#   inferSchema -> Spark derives column types from the data instead of
#                  leaving every column as a string
transactions = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv("hdfs:///user/student/data/transactions.csv")
)

# Sanity check: row count plus the schema Spark inferred.
print(f"Transactions: {transactions.count()} rows")
transactions.printSchema()

In [None]:
# Read the product catalog from a JSON file on HDFS; no schema is given,
# so Spark infers it from the records.
# NOTE(review): the JSON reader expects one JSON object per line by default.
# Confirm catalog.json is in that form (a pretty-printed document would need
# the multiLine option).
products = spark.read.format("json").load("hdfs:///user/student/data/catalog.json")

# Sanity check: row count plus the inferred schema.
print(f"Products: {products.count()} rows")
products.printSchema()

In [None]:
# Expose both DataFrames to spark.sql() under the table names used by the
# queries in the rest of the notebook. Existing views with the same name
# are replaced.
for view_name, frame in (("transactions", transactions), ("products", products)):
    frame.createOrReplaceTempView(view_name)

## Part 2: Running SQL Queries

In [None]:
# Per-region sales summary: transaction count, total sales and average sale
# (both rounded to 2 decimals), with the highest-grossing region first.
region_sales_sql = """
    SELECT 
        store_region,
        COUNT(*) as num_transactions,
        ROUND(SUM(total_amount), 2) as total_sales,
        ROUND(AVG(total_amount), 2) as avg_sale
    FROM transactions
    GROUP BY store_region
    ORDER BY total_sales DESC
"""
spark.sql(region_sales_sql).show()

In [None]:
# Monthly trend: transaction_date is parsed with the 'yyyy-MM-dd' pattern,
# split into year and month, and the transaction count and rounded revenue
# are reported per (year, month) in chronological order.
monthly_revenue_sql = """
    SELECT 
        YEAR(TO_DATE(transaction_date, 'yyyy-MM-dd')) as year,
        MONTH(TO_DATE(transaction_date, 'yyyy-MM-dd')) as month,
        COUNT(*) as transactions,
        ROUND(SUM(total_amount), 2) as revenue
    FROM transactions
    GROUP BY year, month
    ORDER BY year, month
"""
# Display up to 24 rows rather than show()'s default of 20.
spark.sql(monthly_revenue_sql).show(24)

In [None]:
# Breakdown by payment method and online flag: how many transactions fall
# into each (payment_method, is_online) pair and their average amount.
payment_method_sql = """
    SELECT 
        payment_method,
        is_online,
        COUNT(*) as count,
        ROUND(AVG(total_amount), 2) as avg_amount
    FROM transactions
    GROUP BY payment_method, is_online
    ORDER BY payment_method, is_online
"""
spark.sql(payment_method_sql).show()

## Part 3: Joining Datasets

In [None]:
# Enrich each transaction with its product's name and category by joining
# on product_id, and derive two columns:
#   total_cost = cost_price * quantity
#   profit     = total_amount - total_cost
# A plain JOIN is an inner join, so transactions whose product_id has no
# match in products do not appear in the result.
sales_join_sql = """
    SELECT 
        t.transaction_id,
        t.transaction_date,
        p.product_name,
        p.category,
        t.quantity,
        t.unit_price,
        t.total_amount,
        p.cost_price * t.quantity as total_cost,
        t.total_amount - (p.cost_price * t.quantity) as profit
    FROM transactions t
    JOIN products p ON t.product_id = p.product_id
"""
sales_with_products = spark.sql(sales_join_sql)

# Preview the first 10 joined rows.
sales_with_products.show(10)

In [None]:
# Make the joined result queryable from SQL as "sales_detail".
sales_with_products.createOrReplaceTempView("sales_detail")

# Per-category profitability: transaction count, revenue, total profit and
# average profit per sale (rounded to 2 decimals), most profitable first.
category_profit_sql = """
    SELECT 
        category,
        COUNT(*) as transactions,
        ROUND(SUM(total_amount), 2) as revenue,
        ROUND(SUM(profit), 2) as total_profit,
        ROUND(AVG(profit), 2) as avg_profit_per_sale
    FROM sales_detail
    GROUP BY category
    ORDER BY total_profit DESC
"""
spark.sql(category_profit_sql).show()

### 🔍 Checkpoint Question 3
Look at the Spark UI (SQL tab). Find the join query.
- What type of join was used (broadcast, sort-merge, shuffle-hash)?
- Why do you think Spark chose that strategy?

**Your Answer:**

## Part 4: Writing Data in Optimized Formats

In [None]:
# Persist the joined sales data to HDFS as Parquet (a columnar, compressed
# format). "overwrite" replaces any output left by a previous run.
sales_with_products.write.parquet(
    "hdfs:///user/student/output/sales_parquet",
    mode="overwrite",
)

print("Written to Parquet!")

In [None]:
# Write the same data again, this time partitioned by the category column
# (useful when later reads filter on category). "overwrite" replaces any
# output left by a previous run.
sales_with_products.write.parquet(
    "hdfs:///user/student/output/sales_by_category",
    mode="overwrite",
    partitionBy="category",
)

print("Written with partitioning!")

In [None]:
# Check the output: list the HDFS directory written by the partitioned
# Parquet write above. The leading "!" is the IPython/Jupyter shell escape,
# so this line only works inside a notebook, not as plain Python.
!hdfs dfs -ls /user/student/output/sales_by_category/

In [None]:
# Verification: load the unpartitioned Parquet output back from HDFS and
# report how many rows it contains.
parquet_data = spark.read.format("parquet").load(
    "hdfs:///user/student/output/sales_parquet"
)
print(f"Read back {parquet_data.count()} rows from Parquet")

In [None]:
# Shut down the Spark session created at the top of the notebook now that
# the exercise is finished.
spark.stop()