In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
#Check this site for the latest download link https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

import os
import sys
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


import findspark
findspark.init()
findspark.find()

import pyspark

from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.window import Window

spark= SparkSession \
       .builder \
       .appName("daily_practice") \
       .config('spark.ui.port', '4050') \
       .getOrCreate()


Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:6 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:7 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Hit:8 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,844 kB]
Get:12 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,244 kB]
Get:13 https://ppa.launchpadcontent.net/graphics-drive

In [2]:
from google.colab import output
output.serve_kernel_port_as_window(4050, path ='/jobs/index.html')

Try `serve_kernel_port_as_iframe` instead. [0m


<IPython.core.display.Javascript object>

# 🚩 **Day 5 - 2025/04/21**

### Q1: Missing Record Detection
Find records that are expected to exist but are missing from a dataset. You have sales data for multiple stores over several months, but some stores didn't report data for certain months. Identify all store/month combinations that are missing.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, lit, sequence, to_date

# Sample data
sales = spark.createDataFrame([
    (1, "2024-01-15", 1200),
    (1, "2024-02-15", 1450),
    (1, "2024-04-15", 1300),
    (2, "2024-01-15", 950),
    (2, "2024-03-15", 1100),
    (3, "2024-01-15", 800),
    (3, "2024-02-15", 850),
    (3, "2024-03-15", 900),
    (3, "2024-04-15", 950)
], ["store_id", "report_date", "sales_amount"])
```

Expected output (all store/month combinations that are missing):
```
+--------+----------------+
|store_id|missing_month   |
+--------+----------------+
|1       |2024-03-15      |
|2       |2024-02-15      |
|2       |2024-04-15      |
+--------+----------------+
```

In [None]:
# Sample data
sales = spark.createDataFrame([
    (1, "2024-01-15", 1200),
    (1, "2024-02-15", 1450),
    (1, "2024-04-15", 1300),
    (2, "2024-01-15", 950),
    (2, "2024-03-15", 1100),
    (3, "2024-01-15", 800),
    (3, "2024-02-15", 850),
    (3, "2024-03-15", 900),
    (3, "2024-04-15", 950)
], ["store_id", "report_date", "sales_amount"])

In [None]:
sales.printSchema()

root
 |-- store_id: long (nullable = true)
 |-- report_date: string (nullable = true)
 |-- sales_amount: long (nullable = true)



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, lit, sequence, to_date, expr

# Get all distinct store IDs
store_ids = sales.select("store_id").distinct()

# Create a single row DataFrame with date range
date_range = spark.createDataFrame([("2024-01-15", "2024-04-15")], ["start_date", "end_date"])
date_range = date_range.withColumn("start_date", to_date("start_date"))
date_range = date_range.withColumn("end_date", to_date("end_date"))

# Cross join to get all stores with the date range
expected_dates_base = store_ids.crossJoin(date_range)

# Generate sequence of monthly dates for each store
expected_report_dates = expected_dates_base.withColumn(
    "date_sequence",
    sequence("start_date", "end_date", expr("interval 1 month"))
).withColumn(
    "expected_date",
    explode("date_sequence")
).select("store_id", "expected_date")

# Anti-join to find missing dates
missing_reports = expected_report_dates.join(
    sales.withColumn("date", to_date("report_date")),
    (expected_report_dates.store_id == sales.store_id) &
    (expected_report_dates.expected_date == to_date(sales.report_date)),
    "leftanti"
).select(
    expected_report_dates.store_id,
    expected_report_dates.expected_date.alias("missing_month")
)

missing_reports.show()

+--------+-------------+
|store_id|missing_month|
+--------+-------------+
|       1|   2024-03-15|
|       2|   2024-02-15|
|       2|   2024-04-15|
+--------+-------------+



### Q2: Time Series Forecasting Preparation
Prepare data for a time series forecasting model by creating a complete series with moving averages and lagged values for multiple time windows. Convert daily temperature readings into features suitable for predicting future temperatures.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, avg, date_format, to_date
from pyspark.sql.window import Window

# Sample data - daily temperature readings
temp_readings = spark.createDataFrame([
    ("2024-01-01", 32.5),
    ("2024-01-02", 31.8),
    ("2024-01-03", 33.2),
    ("2024-01-04", 34.0),
    ("2024-01-05", 32.1),
    ("2024-01-06", 30.9),
    ("2024-01-07", 31.5),
    ("2024-01-08", 32.7),
    ("2024-01-09", 33.8),
    ("2024-01-10", 34.5)
], ["date", "temperature"])
```

Expected output (should include 3-day moving average, 7-day moving average, 1-day lag, 3-day lag, and 7-day lag):
```
+----------+------------+------------------+------------------+-------------+-------------+-------------+
|date      |temperature |avg_temp_3day     |avg_temp_7day     |lag_1day     |lag_3day     |lag_7day     |
+----------+------------+------------------+------------------+-------------+-------------+-------------+
|2024-01-10|34.5        |34.3              |32.9              |33.8         |32.1         |32.5         |
|2024-01-09|33.8        |33.7              |32.7              |32.7         |30.9         |31.8         |
|2024-01-08|32.7        |32.7              |32.1              |31.5         |34.0         |null         |
...
+----------+------------+------------------+------------------+-------------+-------------+-------------+
```


In [None]:
from pyspark.sql.functions import col, lag, avg, to_date
from pyspark.sql.window import Window

# Sample data - daily temperature readings
temp_readings = spark.createDataFrame([
    ("2024-01-01", 32.5),
    ("2024-01-02", 31.8),
    ("2024-01-03", 33.2),
    ("2024-01-04", 34.0),
    ("2024-01-05", 32.1),
    ("2024-01-06", 30.9),
    ("2024-01-07", 31.5),
    ("2024-01-08", 32.7),
    ("2024-01-09", 33.8),
    ("2024-01-10", 34.5)
], ["date", "temperature"])

# Convert date to date type
temp_readings = temp_readings.withColumn("date", to_date("date"))

# Define windows
window_3day = Window.orderBy("date").rowsBetween(-2, 0)  # Current + 2 previous days
window_7day = Window.orderBy("date").rowsBetween(-6, 0)  # Current + 6 previous days
window_lag = Window.orderBy("date")  # For lag calculations

# Apply transformations
result = temp_readings \
    .withColumn("avg_temp_3day", avg("temperature").over(window_3day)) \
    .withColumn("avg_temp_7day", avg("temperature").over(window_7day)) \
    .withColumn("lag_1day", lag("temperature", 1).over(window_lag)) \
    .withColumn("lag_3day", lag("temperature", 3).over(window_lag)) \
    .withColumn("lag_7day", lag("temperature", 7).over(window_lag)) \
    .orderBy(col("date").desc())  # Optional: show most recent first

result.show()

+----------+-----------+------------------+------------------+--------+--------+--------+
|      date|temperature|     avg_temp_3day|     avg_temp_7day|lag_1day|lag_3day|lag_7day|
+----------+-----------+------------------+------------------+--------+--------+--------+
|2024-01-10|       34.5|33.666666666666664|32.785714285714285|    33.8|    31.5|    33.2|
|2024-01-09|       33.8|32.666666666666664| 32.60000000000001|    32.7|    30.9|    31.8|
|2024-01-08|       32.7|              31.7| 32.31428571428571|    31.5|    32.1|    32.5|
|2024-01-07|       31.5|              31.5|32.285714285714285|    30.9|    34.0|    NULL|
|2024-01-06|       30.9|32.333333333333336|32.416666666666664|    32.1|    33.2|    NULL|
|2024-01-05|       32.1|              33.1|             32.72|    34.0|    31.8|    NULL|
|2024-01-04|       34.0|              33.0|            32.875|    33.2|    32.5|    NULL|
|2024-01-03|       33.2|              32.5|              32.5|    31.8|    NULL|    NULL|
|2024-01-0

### Q3: Incremental ETL with Change Data Capture
Implement a change data capture (CDC) process that accurately merges a source table into a target table, accounting for inserts, updates, and deletes based on operation flags.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

# Target table - current state of data
target = spark.createDataFrame([
    (1, "Widget A", 19.99, "2023-12-01"),
    (2, "Widget B", 24.99, "2023-12-05"),
    (3, "Widget C", 14.99, "2023-12-10"),
    (4, "Widget D", 29.99, "2023-12-15")
], ["id", "product_name", "price", "last_updated"])

# Source CDC table with change operations (I=Insert, U=Update, D=Delete)
source_cdc = spark.createDataFrame([
    (2, "Widget B Premium", 27.99, "2024-01-05", "U"),
    (3, "Widget C", 14.99, "2023-12-10", "D"),
    (5, "Widget E", 22.99, "2024-01-07", "I"),
    (6, "Widget F", 17.99, "2024-01-10", "I")
], ["id", "product_name", "price", "last_updated", "operation"])
```

Expected output (after merging changes):
```
+---+----------------+-----+------------+
|id |product_name    |price|last_updated|
+---+----------------+-----+------------+
|1  |Widget A        |19.99|2023-12-01  |
|2  |Widget B Premium|27.99|2024-01-05  |
|4  |Widget D        |29.99|2023-12-15  |
|5  |Widget E        |22.99|2024-01-07  |
|6  |Widget F        |17.99|2024-01-10  |
+---+----------------+-----+------------+
```

In [None]:
from pyspark.sql.functions import col, lit, when

# Target table - current state of data
target = spark.createDataFrame([
    (1, "Widget A", 19.99, "2023-12-01"),
    (2, "Widget B", 24.99, "2023-12-05"),
    (3, "Widget C", 14.99, "2023-12-10"),
    (4, "Widget D", 29.99, "2023-12-15")
], ["id", "product_name", "price", "last_updated"])

# Source CDC table with change operations (I=Insert, U=Update, D=Delete)
source_cdc = spark.createDataFrame([
    (2, "Widget B Premium", 27.99, "2024-01-05", "U"),
    (3, "Widget C", 14.99, "2023-12-10", "D"),
    (5, "Widget E", 22.99, "2024-01-07", "I"),
    (6, "Widget F", 17.99, "2024-01-10", "I")
], ["id", "product_name", "price", "last_updated", "operation"])

inserts = source_cdc.filter(F.col("operation") == "I").select("id", "product_name", "price", "last_updated")

updates = source_cdc.filter(F.col("operation") == "U").select("id", "product_name", "price", "last_updated")

deletes = source_cdc.filter(F.col("operation") == "D").select("id", "product_name", "price", "last_updated")

# first select only those target records that are not being updated and then just add the updated records
target_updated = target.join(updates, updates.id == target.id, "leftanti")\
                        .union(updates)

# perform a left anti join to remove deleted records and then add the inserts
target_final = target_updated.join(deletes, deletes.id == target_updated.id, "leftanti")\
                      .union(inserts)

target_final.show()

+---+----------------+-----+------------+
| id|    product_name|price|last_updated|
+---+----------------+-----+------------+
|  1|        Widget A|19.99|  2023-12-01|
|  4|        Widget D|29.99|  2023-12-15|
|  2|Widget B Premium|27.99|  2024-01-05|
|  5|        Widget E|22.99|  2024-01-07|
|  6|        Widget F|17.99|  2024-01-10|
+---+----------------+-----+------------+



### Q4: Complex Sessionization Algorithm
Implement a custom sessionization algorithm for web clickstream data that groups user activities into sessions based on both time gaps between consecutive events and page type transitions. A new session starts when either the time gap exceeds 30 minutes OR when a user transitions from a product page to the homepage.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag, when, sum, lit, row_number
from pyspark.sql.window import Window

# Sample clickstream data
clicks = spark.createDataFrame([
    ("user1", "2024-01-01 10:00:00", "homepage"),
    ("user1", "2024-01-01 10:05:30", "product_page"),
    ("user1", "2024-01-01 10:08:45", "product_page"),
    ("user1", "2024-01-01 10:30:00", "homepage"),      # New session (page transition)
    ("user1", "2024-01-01 10:32:15", "product_page"),
    ("user1", "2024-01-01 11:15:00", "product_page"),  # New session (time gap > 30 min)
    ("user2", "2024-01-01 14:20:10", "homepage"),
    ("user2", "2024-01-01 14:22:30", "product_page"),
    ("user2", "2024-01-01 14:55:00", "homepage"),      # New session (page transition)
    ("user2", "2024-01-01 14:57:20", "product_page")
], ["user_id", "timestamp", "page_type"])
```

Expected output:
```
+-------+-------------------+------------+----------+------------------+
|user_id|timestamp          |page_type   |session_id|session_duration  |
+-------+-------------------+------------+----------+------------------+
|user1  |2024-01-01 10:00:00|homepage    |1         |8.75              |
|user1  |2024-01-01 10:05:30|product_page|1         |8.75              |
|user1  |2024-01-01 10:08:45|product_page|1         |8.75              |
|user1  |2024-01-01 10:30:00|homepage    |2         |2.25              |
|user1  |2024-01-01 10:32:15|product_page|2         |2.25              |
|user1  |2024-01-01 11:15:00|product_page|3         |0.0               |
|user2  |2024-01-01 14:20:10|homepage    |1         |2.33              |
|user2  |2024-01-01 14:22:30|product_page|1         |2.33              |
|user2  |2024-01-01 14:55:00|homepage    |2         |2.33              |
|user2  |2024-01-01 14:57:20|product_page|2         |2.33              |
+-------+-------------------+------------+----------+------------------+
```

In [None]:
from pyspark.sql.functions import col, lag, when, sum, lit, row_number, expr
from pyspark.sql.window import Window

# Sample clickstream data
clicks = spark.createDataFrame([
    ("user1", "2024-01-01 10:00:00", "homepage"),
    ("user1", "2024-01-01 10:05:30", "product_page"),
    ("user1", "2024-01-01 10:08:45", "product_page"),
    ("user1", "2024-01-01 10:30:00", "homepage"),      # New session (page transition)
    ("user1", "2024-01-01 10:32:15", "product_page"),
    ("user1", "2024-01-01 11:15:00", "product_page"),  # New session (time gap > 30 min)
    ("user2", "2024-01-01 14:20:10", "homepage"),
    ("user2", "2024-01-01 14:22:30", "product_page"),
    ("user2", "2024-01-01 14:55:00", "homepage"),      # New session (page transition)
    ("user2", "2024-01-01 14:57:20", "product_page")
], ["user_id", "timestamp", "page_type"])

clicks = clicks.withColumn("timestamp", F.to_timestamp("timestamp"))
clicks.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- page_type: string (nullable = true)



In [None]:
from pyspark.sql.functions import to_timestamp, expr, sum, first, last, round

# Create window for session analysis
window = Window.partitionBy("user_id").orderBy("timestamp")

# Identify new sessions
sessions_df = clicks \
    .withColumn("timestamp", to_timestamp(col("timestamp"))) \
    .withColumn("prev_page_type", lag("page_type", 1).over(window)) \
    .withColumn("prev_timestamp", lag("timestamp", 1).over(window)) \
    .withColumn("time_gap", expr("(unix_timestamp(timestamp) - unix_timestamp(prev_timestamp)) / 60")) \
    .withColumn("is_new_session",
                when((col("time_gap") > 30) |
                     ((col("prev_page_type") == "product_page") & (col("page_type") == "homepage")),
                     1).otherwise(0))

# Create cumulative session ID
sessions_df = sessions_df \
    .withColumn("session_id",
                sum("is_new_session").over(Window.partitionBy("user_id").orderBy("timestamp")) + 1)

# Calculate session duration
window_session = Window.partitionBy("user_id", "session_id")

final_df = sessions_df \
    .withColumn("session_start", first("timestamp").over(window_session)) \
    .withColumn("session_end", last("timestamp").over(window_session)) \
    .withColumn("session_duration",
                round(expr("(unix_timestamp(session_end) - unix_timestamp(session_start)) / 60"), 2)) \
    .select("user_id", "timestamp", "page_type", "session_id", "session_duration") \
    .orderBy("user_id", "timestamp")

final_df.show()

+-------+-------------------+------------+----------+----------------+
|user_id|          timestamp|   page_type|session_id|session_duration|
+-------+-------------------+------------+----------+----------------+
|  user1|2024-01-01 10:00:00|    homepage|         1|            8.75|
|  user1|2024-01-01 10:05:30|product_page|         1|            8.75|
|  user1|2024-01-01 10:08:45|product_page|         1|            8.75|
|  user1|2024-01-01 10:30:00|    homepage|         2|            2.25|
|  user1|2024-01-01 10:32:15|product_page|         2|            2.25|
|  user1|2024-01-01 11:15:00|product_page|         3|             0.0|
|  user2|2024-01-01 14:20:10|    homepage|         1|            2.33|
|  user2|2024-01-01 14:22:30|product_page|         1|            2.33|
|  user2|2024-01-01 14:55:00|    homepage|         2|            2.33|
|  user2|2024-01-01 14:57:20|product_page|         2|            2.33|
+-------+-------------------+------------+----------+----------------+



### Q5: Nested ETL and Schema Evolution Detection
You're building a data pipeline that must detect schema changes in nested JSON data structures and report differences between the current data schema and the target table schema. Design a solution that identifies added, modified, or removed fields (including nested fields) and suggests ALTER TABLE statements to evolve the schema.

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType, MapType
import json

# Target table schema (current)
target_schema = StructType([
    StructField("id", StringType(), False),
    StructField("name", StringType(), True),
    StructField("details", StructType([
        StructField("category", StringType(), True),
        StructField("price", FloatType(), True),
        StructField("tags", ArrayType(StringType()), True)
    ]), True),
    StructField("inventory", MapType(StringType(), IntegerType()), True)
])

# New data with schema changes
new_data_json = """
[
  {
    "id": "item1",
    "name": "Product X",
    "details": {
      "category": "Electronics",
      "price": 299.99,
      "tags": ["new", "featured"],
      "weight": 1.5,
      "dimensions": {"length": 10, "width": 5, "height": 2}
    },
    "inventory": {"store1": 20, "store2": 15},
    "reviews": [{"user": "user1", "rating": 4.5, "comment": "Great product"}]
  }
]
"""
new_data = spark.read.json(sc.parallelize([new_data_json]))
```

Hint: You'll need to write a recursive function to compare schema objects and identify differences at all levels of nesting. Your output should include a DataFrame showing all schema differences and the SQL ALTER TABLE statements needed to evolve the schema.

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, ArrayType, MapType
import json

sparkContext=spark.sparkContext

# Target table schema (current)
target_schema = StructType([
    StructField("id", StringType(), False),
    StructField("name", StringType(), True),
    StructField("details", StructType([
        StructField("category", StringType(), True),
        StructField("price", FloatType(), True),
        StructField("tags", ArrayType(StringType()), True)
    ]), True),
    StructField("inventory", MapType(StringType(), IntegerType()), True)
])

# New data with schema changes
new_data_json = """
[
  {
    "id": "item1",
    "name": "Product X",
    "details": {
      "category": "Electronics",
      "price": 299.99,
      "tags": ["new", "featured"],
      "weight": 1.5,
      "dimensions": {"length": 10, "width": 5, "height": 2}
    },
    "inventory": {"store1": 20, "store2": 15},
    "reviews": [{"user": "user1", "rating": 4.5, "comment": "Great product"}]
  }
]
"""
new_data = spark.read.json(sparkContext.parallelize([new_data_json]))
new_data.printSchema()

root
 |-- details: struct (nullable = true)
 |    |-- category: string (nullable = true)
 |    |-- dimensions: struct (nullable = true)
 |    |    |-- height: long (nullable = true)
 |    |    |-- length: long (nullable = true)
 |    |    |-- width: long (nullable = true)
 |    |-- price: double (nullable = true)
 |    |-- tags: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- weight: double (nullable = true)
 |-- id: string (nullable = true)
 |-- inventory: struct (nullable = true)
 |    |-- store1: long (nullable = true)
 |    |-- store2: long (nullable = true)
 |-- name: string (nullable = true)
 |-- reviews: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- comment: string (nullable = true)
 |    |    |-- rating: double (nullable = true)
 |    |    |-- user: string (nullable = true)



In [13]:
new_data_schema = new_data.schema
new_data_schema

StructType([StructField('details', StructType([StructField('category', StringType(), True), StructField('dimensions', StructType([StructField('height', LongType(), True), StructField('length', LongType(), True), StructField('width', LongType(), True)]), True), StructField('price', DoubleType(), True), StructField('tags', ArrayType(StringType(), True), True), StructField('weight', DoubleType(), True)]), True), StructField('id', StringType(), True), StructField('inventory', StructType([StructField('store1', LongType(), True), StructField('store2', LongType(), True)]), True), StructField('name', StringType(), True), StructField('reviews', ArrayType(StructType([StructField('comment', StringType(), True), StructField('rating', DoubleType(), True), StructField('user', StringType(), True)]), True), True)])

In [11]:
target_schema

StructType([StructField('id', StringType(), False), StructField('name', StringType(), True), StructField('details', StructType([StructField('category', StringType(), True), StructField('price', FloatType(), True), StructField('tags', ArrayType(StringType(), True), True)]), True), StructField('inventory', MapType(StringType(), IntegerType(), True), True)])

In [24]:
def flatten_schema(schema, prefix=""):
    fields = []
    for field in schema.fields:
        field_path = f"{prefix}.{field.name}" if prefix else field.name
        if isinstance(field.dataType, StructType):
            fields += flatten_schema(field.dataType, field_path)
        else:
            fields.append((field_path, field.dataType, field.nullable))
    return fields

flatten_schema(target_schema)

[('id', StringType(), False),
 ('name', StringType(), True),
 ('details.category', StringType(), True),
 ('details.price', FloatType(), True),
 ('details.tags', ArrayType(StringType(), True), True),
 ('inventory', MapType(StringType(), IntegerType(), True), True)]

In [25]:
target_fields = dict((path, (dtype, nullable)) for path, dtype, nullable in flatten_schema(target_schema))
new_fields = dict((path, (dtype, nullable)) for path, dtype, nullable in flatten_schema(new_data.schema))

target_fields

{'id': (StringType(), False),
 'name': (StringType(), True),
 'details.category': (StringType(), True),
 'details.price': (FloatType(), True),
 'details.tags': (ArrayType(StringType(), True), True),
 'inventory': (MapType(StringType(), IntegerType(), True), True)}

In [26]:
added = set(new_fields) - set(target_fields)
removed = set(target_fields) - set(new_fields)
modified = {k for k in (set(new_fields) & set(target_fields))
            if new_fields[k] != target_fields[k]}


In [31]:
print(f"Fields added: {added}\nFields removed: {removed}\nFields modified: {modified}")

Fields added: {'inventory.store1', 'reviews', 'details.dimensions.height', 'inventory.store2', 'details.dimensions.width', 'details.dimensions.length', 'details.weight'}
Fields removed: {'inventory'}
Fields modified: {'details.price', 'id'}


In [37]:
from pyspark.sql.types import *

def pyspark_type_to_sql(dtype):
    if isinstance(dtype, StringType):
        return "STRING"
    elif isinstance(dtype, IntegerType):
        return "INT"
    elif isinstance(dtype, FloatType):
        return "FLOAT"
    elif isinstance(dtype, DoubleType):
        return "DOUBLE"
    elif isinstance(dtype, LongType):
        return "BIGINT"
    elif isinstance(dtype, BooleanType):
        return "BOOLEAN"
    elif isinstance(dtype, ArrayType):
        return f"ARRAY<{pyspark_type_to_sql(dtype.elementType)}>"
    elif isinstance(dtype, MapType):
        return f"MAP<{pyspark_type_to_sql(dtype.keyType)}, {pyspark_type_to_sql(dtype.valueType)}>"
    elif isinstance(dtype, StructType):
        fields = [f"{f.name}: {pyspark_type_to_sql(f.dataType)}" for f in dtype.fields]
        return f"STRUCT<{', '.join(fields)}>"
    else:
        return "UNKNOWN"


for field in added:
    dtype = new_fields[field][0]
    sql_type = pyspark_type_to_sql(dtype)
    print(f"ALTER TABLE my_table ADD COLUMN {field} {sql_type};")

for field in removed:
    dtype = target_fields[field][0]
    sql_type = pyspark_type_to_sql(dtype)
    print(f"ALTER TABLE my_table DROP COLUMN {field};")

for field in modified:
    dtype = new_fields[field][0]
    sql_type = pyspark_type_to_sql(dtype)
    print(f"ALTER TABLE my_table ALTER COLUMN {field} {sql_type};")


ALTER TABLE my_table ADD COLUMN inventory.store1 BIGINT;
ALTER TABLE my_table ADD COLUMN reviews ARRAY<STRUCT<comment: STRING, rating: DOUBLE, user: STRING>>;
ALTER TABLE my_table ADD COLUMN details.dimensions.height BIGINT;
ALTER TABLE my_table ADD COLUMN inventory.store2 BIGINT;
ALTER TABLE my_table ADD COLUMN details.dimensions.width BIGINT;
ALTER TABLE my_table ADD COLUMN details.dimensions.length BIGINT;
ALTER TABLE my_table ADD COLUMN details.weight DOUBLE;
ALTER TABLE my_table DROP COLUMN inventory;
ALTER TABLE my_table ALTER COLUMN details.price DOUBLE;
ALTER TABLE my_table ALTER COLUMN id STRING;
