In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# Explicit JAVA_HOME / SPARK_HOME overrides are kept for reference but disabled,
# so findspark.init() below is called without a pinned Spark location.
# NOTE(review): the SPARK_HOME path assumes the tarball was unpacked under /content — confirm before enabling.
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


# findspark is initialised before pyspark is imported further down in this cell.
import findspark
findspark.init()
# The return value is discarded here (a mid-cell expression is not displayed).
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# Create the notebook-wide Spark session, or reuse the one already running in this kernel.
# The UI port is fixed at 4050 so it can be forwarded to the browser later.
spark = (
    SparkSession.builder
    .appName("daily_practice")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

# Last expression of the cell: renders the session summary.
spark

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [75.2 kB]
Get:12 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,604 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/mai

In [2]:
from google.colab import output
# Expose kernel port 4050 (the value given to spark.ui.port when the session is built)
# and open the Spark UI jobs page at /jobs/index.html.
# NOTE(review): the recorded output of this cell suggests `serve_kernel_port_as_iframe`
# as an alternative — confirm the Spark UI renders in an iframe before switching.
output.serve_kernel_port_as_window(4050, path ='/jobs/index.html')

Try `serve_kernel_port_as_iframe` instead.


<IPython.core.display.Javascript object>

# 🚩 **Day 5 - 2025/04/21**

### Q1: Missing Record Detection
Find records that are expected to exist but are missing from a dataset. You have sales data for multiple stores over several months, but some stores didn't report data for certain months. Identify all store/month combinations that are missing.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, lit, sequence, to_date

# Monthly sales reports: one row per (store, month) that actually reported.
# report_date is supplied as a "yyyy-MM-dd" string.
sales = spark.createDataFrame(
    data=[
        # store 1
        (1, "2024-01-15", 1200),
        (1, "2024-02-15", 1450),
        (1, "2024-04-15", 1300),
        # store 2
        (2, "2024-01-15", 950),
        (2, "2024-03-15", 1100),
        # store 3
        (3, "2024-01-15", 800),
        (3, "2024-02-15", 850),
        (3, "2024-03-15", 900),
        (3, "2024-04-15", 950),
    ],
    schema=["store_id", "report_date", "sales_amount"],
)
```

Expected output (all store/month combinations that are missing):
```
+--------+----------------+
|store_id|missing_month   |
+--------+----------------+
|1       |2024-03-15      |
|2       |2024-02-15      |
|2       |2024-04-15      |
+--------+----------------+
```

### Q2: Time Series Forecasting Preparation
Prepare data for a time series forecasting model by creating a complete series with moving averages and lagged values for multiple time windows. Convert daily temperature readings into features suitable for predicting future temperatures.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, avg, date_format, to_date
from pyspark.sql.window import Window

# Ten consecutive daily temperature readings; date is a "yyyy-MM-dd" string.
temp_readings = spark.createDataFrame(
    data=[
        ("2024-01-01", 32.5),
        ("2024-01-02", 31.8),
        ("2024-01-03", 33.2),
        ("2024-01-04", 34.0),
        ("2024-01-05", 32.1),
        ("2024-01-06", 30.9),
        ("2024-01-07", 31.5),
        ("2024-01-08", 32.7),
        ("2024-01-09", 33.8),
        ("2024-01-10", 34.5),
    ],
    schema=["date", "temperature"],
)
```

Expected output (should include 3-day moving average, 7-day moving average, 1-day lag, 3-day lag, and 7-day lag; moving averages are trailing windows that include the current day, rounded to one decimal place, and rows are shown newest first):
```
+----------+------------+------------------+------------------+-------------+-------------+-------------+
|date      |temperature |avg_temp_3day     |avg_temp_7day     |lag_1day     |lag_3day     |lag_7day     |
+----------+------------+------------------+------------------+-------------+-------------+-------------+
|2024-01-10|34.5        |33.7              |32.8              |33.8         |31.5         |33.2         |
|2024-01-09|33.8        |32.7              |32.6              |32.7         |30.9         |31.8         |
|2024-01-08|32.7        |31.7              |32.3              |31.5         |32.1         |32.5         |
...
+----------+------------+------------------+------------------+-------------+-------------+-------------+
```


### Q3: Incremental ETL with Change Data Capture
Implement a change data capture (CDC) process that accurately merges a source table into a target table, accounting for inserts, updates, and deletes based on operation flags.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

# Current state of the product table that the changes are merged into.
target = spark.createDataFrame(
    data=[
        (1, "Widget A", 19.99, "2023-12-01"),
        (2, "Widget B", 24.99, "2023-12-05"),
        (3, "Widget C", 14.99, "2023-12-10"),
        (4, "Widget D", 29.99, "2023-12-15"),
    ],
    schema=["id", "product_name", "price", "last_updated"],
)

# Change feed to apply to `target`; the trailing `operation` flag is
# I = insert, U = update, D = delete.
source_cdc = spark.createDataFrame(
    data=[
        (2, "Widget B Premium", 27.99, "2024-01-05", "U"),
        (3, "Widget C", 14.99, "2023-12-10", "D"),
        (5, "Widget E", 22.99, "2024-01-07", "I"),
        (6, "Widget F", 17.99, "2024-01-10", "I"),
    ],
    schema=["id", "product_name", "price", "last_updated", "operation"],
)
```

Expected output (after merging changes):
```
+---+----------------+-----+------------+
|id |product_name    |price|last_updated|
+---+----------------+-----+------------+
|1  |Widget A        |19.99|2023-12-01  |
|2  |Widget B Premium|27.99|2024-01-05  |
|4  |Widget D        |29.99|2023-12-15  |
|5  |Widget E        |22.99|2024-01-07  |
|6  |Widget F        |17.99|2024-01-10  |
+---+----------------+-----+------------+
```

### Q4: Complex Sessionization Algorithm
Implement a custom sessionization algorithm for web clickstream data that groups user activities into sessions based on both time gaps between consecutive events and page type transitions. A new session starts when either the time gap exceeds 30 minutes OR when a user transitions from a product page to the homepage.

```python
from pyspark.sql import SparkSession
# Spark's `sum` is imported under an alias: importing it unqualified would shadow
# Python's builtin sum() for every later cell run in the same kernel.
from pyspark.sql.functions import col, lag, when, sum as spark_sum, lit, row_number
from pyspark.sql.window import Window

# Sample clickstream data: one row per page view, timestamp as a "yyyy-MM-dd HH:mm:ss" string.
# A new session starts when the gap to the previous event exceeds 30 minutes
# or when the user moves from a product page to the homepage.
clicks = spark.createDataFrame([
    ("user1", "2024-01-01 10:00:00", "homepage"),
    ("user1", "2024-01-01 10:05:30", "product_page"),
    ("user1", "2024-01-01 10:08:45", "product_page"),
    ("user1", "2024-01-01 10:30:00", "homepage"),      # New session (page transition)
    ("user1", "2024-01-01 10:32:15", "product_page"),
    ("user1", "2024-01-01 11:15:00", "product_page"),  # New session (time gap > 30 min)
    ("user2", "2024-01-01 14:20:10", "homepage"),
    ("user2", "2024-01-01 14:22:30", "product_page"),
    ("user2", "2024-01-01 14:55:00", "homepage"),      # New session (page transition; the 32.5 min gap also exceeds 30 min)
    ("user2", "2024-01-01 14:57:20", "product_page")
], ["user_id", "timestamp", "page_type"])
```

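Expected output (`session_duration` is the number of minutes between the first and last event of the session, repeated on every row of that session):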
```
+-------+-------------------+------------+----------+------------------+
|user_id|timestamp          |page_type   |session_id|session_duration  |
+-------+-------------------+------------+----------+------------------+
|user1  |2024-01-01 10:00:00|homepage    |1         |8.75              |
|user1  |2024-01-01 10:05:30|product_page|1         |8.75              |
|user1  |2024-01-01 10:08:45|product_page|1         |8.75              |
|user1  |2024-01-01 10:30:00|homepage    |2         |2.25              |
|user1  |2024-01-01 10:32:15|product_page|2         |2.25              |
|user1  |2024-01-01 11:15:00|product_page|3         |0.0               |
|user2  |2024-01-01 14:20:10|homepage    |1         |2.33              |
|user2  |2024-01-01 14:22:30|product_page|1         |2.33              |
|user2  |2024-01-01 14:55:00|homepage    |2         |2.33              |
|user2  |2024-01-01 14:57:20|product_page|2         |2.33              |
+-------+-------------------+------------+----------+------------------+
```

### Q5: Nested ETL and Schema Evolution Detection
You're building a data pipeline that must detect schema changes in nested JSON data structures and report differences between the current data schema and the target table schema. Design a solution that identifies added, modified, or removed fields (including nested fields) and suggests ALTER TABLE statements to evolve the schema.

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType, MapType
import json

# Target table schema (current): the schema the incoming data is compared against.
target_schema = StructType([
    StructField("id", StringType(), False),
    StructField("name", StringType(), True),
    StructField("details", StructType([
        StructField("category", StringType(), True),
        StructField("price", FloatType(), True),
        StructField("tags", ArrayType(StringType()), True)
    ]), True),
    StructField("inventory", MapType(StringType(), IntegerType()), True)
])

# New data with schema changes: adds details.weight, details.dimensions and a
# top-level reviews array that are not present in target_schema.
new_data_json = """
[
  {
    "id": "item1",
    "name": "Product X",
    "details": {
      "category": "Electronics",
      "price": 299.99,
      "tags": ["new", "featured"],
      "weight": 1.5,
      "dimensions": {"length": 10, "width": 5, "height": 2}
    },
    "inventory": {"store1": 20, "store2": 15},
    "reviews": [{"user": "user1", "rating": 4.5, "comment": "Great product"}]
  }
]
"""
# The SparkContext is taken from the existing `spark` session; a bare `sc` variable
# is never defined in this notebook and would raise NameError.
new_data = spark.read.json(spark.sparkContext.parallelize([new_data_json]))
```

Hint: You'll need to write a recursive function to compare schema objects and identify differences at all levels of nesting. Your output should include a DataFrame showing all schema differences and the SQL ALTER TABLE statements needed to evolve the schema.