1 Ingestion & Time Fields

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, datediff

spark = SparkSession.builder.appName("CourseAnalytics").getOrCreate()

# Load into PySpark with inferred schema
enrollments_df = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/course_enrollments.csv")

# Convert EnrollDate and CompletionDate to date type
enrollments_df = enrollments_df \
    .withColumn("EnrollDate", to_date("EnrollDate")) \
    .withColumn("CompletionDate", to_date("CompletionDate"))

# Add DaysToComplete column if completed
enrollments_df = enrollments_df.withColumn("DaysToComplete", 
    datediff("CompletionDate", "EnrollDate"))

enrollments_df.show(truncate=False)

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+
|EnrollID|UserID|CourseID|CourseName       |Category    |EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+
|E001    |U001  |C001    |Python Basics    |Programming |2024-04-01|2024-04-10    |100            |4     |9             |
|E002    |U002  |C002    |Excel for Finance|Productivity|2024-04-02|NULL          |45             |NULL  |NULL          |
|E003    |U001  |C003    |ML with PySpark  |Data Science|2024-04-03|NULL          |30             |NULL  |NULL          |
|E004    |U003  |C001    |Python Basics    |Programming |2024-04-04|2024-04-20    |100            |5     |16            |
|E005    |U004  |C004    |Digital Marketing|Marketing   |2024-04-05|2024-04-16    |100            |4     |11            |
+--------+------+-------

2 User Learning Path Progress

In [0]:
from pyspark.sql.functions import avg, count, col, when

# Count of courses per user + avg progress
user_progress = enrollments_df.groupBy("UserID") \
    .agg(
        count("*").alias("CoursesEnrolled"),
        avg("ProgressPercent").alias("AvgProgress")
    )

# Flag IsCompleted = ProgressPercent = 100
enrollments_df = enrollments_df.withColumn("IsCompleted", col("ProgressPercent") == 100)

user_progress.show()
enrollments_df.select("EnrollID", "UserID", "ProgressPercent", "IsCompleted").show()

+------+---------------+-----------+
|UserID|CoursesEnrolled|AvgProgress|
+------+---------------+-----------+
|  U004|              1|      100.0|
|  U002|              1|       45.0|
|  U003|              1|      100.0|
|  U001|              2|       65.0|
+------+---------------+-----------+

+--------+------+---------------+-----------+
|EnrollID|UserID|ProgressPercent|IsCompleted|
+--------+------+---------------+-----------+
|    E001|  U001|            100|       true|
|    E002|  U002|             45|      false|
|    E003|  U001|             30|      false|
|    E004|  U003|            100|       true|
|    E005|  U004|            100|       true|
+--------+------+---------------+-----------+



3 Engagement Scoring

In [0]:
# Replace null Rating with 0 before computing
enrollments_df = enrollments_df.fillna({"Rating": 0})

#Create a score: ProgressPercent * Rating
enrollments_df = enrollments_df.withColumn("EngagementScore", 
    col("ProgressPercent") * col("Rating"))

enrollments_df.select("EnrollID", "UserID", "ProgressPercent", "Rating", "EngagementScore").show()

+--------+------+---------------+------+---------------+
|EnrollID|UserID|ProgressPercent|Rating|EngagementScore|
+--------+------+---------------+------+---------------+
|    E001|  U001|            100|     4|            400|
|    E002|  U002|             45|     0|              0|
|    E003|  U001|             30|     0|              0|
|    E004|  U003|            100|     5|            500|
|    E005|  U004|            100|     4|            400|
+--------+------+---------------+------+---------------+



4 Identify Drop-offs

In [0]:
#Filter all records with ProgressPercent < 50 and CompletionDate is null
dropouts_df = enrollments_df.filter(
    (col("ProgressPercent") < 50) & (col("CompletionDate").isNull())
)

#Create a view called Dropouts
dropouts_df.createOrReplaceTempView("Dropouts")
spark.sql("SELECT * FROM Dropouts").show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+---------------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|              0|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+---------------+



5 Joins with Metadata

In [0]:
catalog_df = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/course_catalog.csv")

joined_df = enrollments_df.join(catalog_df, "CourseID")

# Join to find average progress per instructor
joined_df.groupBy("Instructor").agg(avg("ProgressPercent").alias("AvgProgress")).show()

# Show who teaches the most enrolled course
from pyspark.sql.functions import count

most_enrolled = joined_df.groupBy("CourseName", "Instructor") \
    .agg(count("*").alias("TotalEnrollments")) \
    .orderBy(col("TotalEnrollments").desc())

most_enrolled.show(1)

+-------------+-----------+
|   Instructor|AvgProgress|
+-------------+-----------+
|  Zoya Sheikh|      100.0|
|   Sana Gupta|       45.0|
| Ibrahim Khan|       30.0|
|Abdullah Khan|      100.0|
+-------------+-----------+

+-------------+-------------+----------------+
|   CourseName|   Instructor|TotalEnrollments|
+-------------+-------------+----------------+
|Python Basics|Abdullah Khan|               2|
+-------------+-------------+----------------+
only showing top 1 row



6 Delta Lake Practice

In [0]:
from delta.tables import DeltaTable

# Save as Delta Table enrollments_delta
enrollments_df.write.format("delta").mode("overwrite").save("/tmp/enrollments_delta")

delta_table = DeltaTable.forPath(spark, "/tmp/enrollments_delta")

# Update: Set all ratings to 5 where Course = 'Python Basics'
delta_table.update(
    condition="CourseName = 'Python Basics'",
    set={"Rating": "5"}
)

# Delete: All rows where ProgressPercent = 0
delta_table.delete("ProgressPercent = 0")

# Show describe history
spark.sql("DESCRIBE HISTORY delta.`/tmp/enrollments_delta`").show()

+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|          timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+-------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      3|2025-06-19 05:26:33|4028198190791787|azuser3553_mml.lo...| OPTIMIZE|{predicate -> [],...|NULL|{1052067078041127}|0611-043506-43vn1hs6|          1|SnapshotIsolation|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      2|2025-06-19 05:26:31

7 Window Functions

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank, lead

# Use dense_rank() to rank courses by number of enrollments
course_rank_window = Window.orderBy(col("TotalEnrollments").desc())

course_counts = enrollments_df.groupBy("CourseID", "CourseName") \
    .agg(count("*").alias("TotalEnrollments")) \
    .withColumn("Rank", dense_rank().over(course_rank_window))

course_counts.show()

# lead() to find next course by each user
lead_window = Window.partitionBy("UserID").orderBy("EnrollDate")
enrollments_df = enrollments_df.withColumn("NextCourseID", lead("CourseID").over(lead_window))

enrollments_df.select("UserID", "CourseID", "EnrollDate", "NextCourseID").show()

+--------+-----------------+----------------+----+
|CourseID|       CourseName|TotalEnrollments|Rank|
+--------+-----------------+----------------+----+
|    C001|    Python Basics|               2|   1|
|    C004|Digital Marketing|               1|   2|
|    C002|Excel for Finance|               1|   2|
|    C003|  ML with PySpark|               1|   2|
+--------+-----------------+----------------+----+

+------+--------+----------+------------+
|UserID|CourseID|EnrollDate|NextCourseID|
+------+--------+----------+------------+
|  U001|    C001|2024-04-01|        C003|
|  U001|    C003|2024-04-03|        NULL|
|  U002|    C002|2024-04-02|        NULL|
|  U003|    C001|2024-04-04|        NULL|
|  U004|    C004|2024-04-05|        NULL|
+------+--------+----------+------------+



8 SQL Logic for Dashboard Views

In [0]:
from pyspark.sql.functions import to_date, avg

# Daily enrollments
enrollments_df.withColumn("EnrollDay", to_date("EnrollDate")) \
    .groupBy("EnrollDay").count().createOrReplaceTempView("daily_enrollments")

# Category performance
enrollments_df.groupBy("Category") \
    .agg(avg("Rating").alias("AvgRating")) \
    .createOrReplaceTempView("category_performance")

# Create a temporary view for course enrollments
enrollments_df.createOrReplaceTempView("course_enrollments")

# Top 3 courses by enrollments
spark.sql("""
    SELECT CourseName, COUNT(*) as TotalEnrollments
    FROM course_enrollments
    GROUP BY CourseName
    ORDER BY TotalEnrollments DESC
    LIMIT 3
""").createOrReplaceTempView("top_3_courses")

spark.sql("SELECT * FROM daily_enrollments").show()
spark.sql("SELECT * FROM category_performance").show()
spark.sql("SELECT * FROM top_3_courses").show()

+----------+-----+
| EnrollDay|count|
+----------+-----+
|2024-04-02|    1|
|2024-04-01|    1|
|2024-04-04|    1|
|2024-04-05|    1|
|2024-04-03|    1|
+----------+-----+

+------------+---------+
|    Category|AvgRating|
+------------+---------+
| Programming|      4.5|
|Productivity|      0.0|
|   Marketing|      4.0|
|Data Science|      0.0|
+------------+---------+

+-----------------+----------------+
|       CourseName|TotalEnrollments|
+-----------------+----------------+
|    Python Basics|               2|
|Digital Marketing|               1|
|Excel for Finance|               1|
+-----------------+----------------+



9 Time Travel

In [0]:
# View previous version before update/delete
spark.read.format("delta").option("versionAsOf", 0).load("/tmp/enrollments_delta").show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|            400|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|              0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|            16|            500|
|    E005|  U004|    C004|Digital 

10 Export Reporting

In [0]:
#Write to JSON, partitioned by Category
enrollments_df.write.mode("overwrite").partitionBy("Category").json("output/enrollments_json")

#Create summary DataFrame 
summary_df = enrollments_df.groupBy("CourseName").agg(
    count("*").alias("TotalEnrollments"),
    avg("Rating").alias("AvgRating"),
    avg("ProgressPercent").alias("AvgProgress")
)

summary_df.show()

#Save as parquet
summary_df.write.mode("overwrite").parquet("output/course_summary_parquet")

+-----------------+----------------+---------+-----------+
|       CourseName|TotalEnrollments|AvgRating|AvgProgress|
+-----------------+----------------+---------+-----------+
|Digital Marketing|               1|      4.0|      100.0|
|    Python Basics|               2|      4.5|      100.0|
|Excel for Finance|               1|      0.0|       45.0|
|  ML with PySpark|               1|      0.0|       30.0|
+-----------------+----------------+---------+-----------+

