In [1]:
!pip install pyspark==3.5.1 delta-spark==3.1.0

Collecting delta-spark==3.1.0
  Downloading delta_spark-3.1.0-py3-none-any.whl.metadata (1.9 kB)
Downloading delta_spark-3.1.0-py3-none-any.whl (21 kB)
Installing collected packages: delta-spark
Successfully installed delta-spark-3.1.0


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, datediff, count, avg, when, lit, dense_rank, lead
from pyspark.sql.window import Window
from delta import *

# Step 1: Build a Delta-enabled Spark session.
# The two config entries register the Delta SQL extension and the Delta catalog;
# configure_spark_with_delta_pip() is applied to the builder before getOrCreate().
builder = (
    SparkSession.builder
    .appName("Course Analytics")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [4]:
# Step 2: Load CSV files
# Both files are read with the first row as header and with column types inferred.
enroll_path = "/content/drive/My Drive/course_enrolments.csv"
catalog_path = "/content/drive/My Drive/course_catalog.csv"

csv_options = {"header": True, "inferSchema": True}
df = spark.read.csv(enroll_path, **csv_options)
catalog = spark.read.csv(catalog_path, **csv_options)

In [5]:
# Step 3: Date conversion and DaysToComplete
# Parse both date columns first, then derive the enrol-to-completion gap in days.
enroll_date = to_date("EnrollDate")
completion_date = to_date("CompletionDate")
df = (
    df.withColumn("EnrollDate", enroll_date)
      .withColumn("CompletionDate", completion_date)
      .withColumn("DaysToComplete", datediff("CompletionDate", "EnrollDate"))
)

In [6]:
# Step 4: User Learning Path
# Per-user summary: number of enrolments and mean ProgressPercent across them.
# No intermediate "IsCompleted" flag is derived here because the aggregation
# below never reads one.
user_progress = (
    df.groupBy("UserID")
      .agg(
          count("*").alias("CoursesEnrolled"),
          avg("ProgressPercent").alias("AvgProgress"),
      )
)
user_progress.show()


+------+---------------+-----------+
|UserID|CoursesEnrolled|AvgProgress|
+------+---------------+-----------+
|  U004|              1|      100.0|
|  U002|              1|       45.0|
|  U003|              1|      100.0|
|  U001|              2|       65.0|
+------+---------------+-----------+



In [7]:
# Step 5: Engagement Score
# Missing ratings become 0, so EngagementScore (ProgressPercent * Rating) is 0 for unrated rows.
# NOTE(review): Rating is overwritten in place, so later averages over Rating
# count these rows as 0 — confirm that is intended.
rating_or_zero = when(col("Rating").isNotNull(), col("Rating")).otherwise(0)
df = df.withColumn("Rating", rating_or_zero)
df = df.withColumn("EngagementScore", col("ProgressPercent") * col("Rating"))

In [8]:
# Step 6: Dropouts
# A dropout is an enrolment with under 50% progress and no completion date.
low_progress = col("ProgressPercent") < 50
not_completed = col("CompletionDate").isNull()
dropouts = df.where(low_progress & not_completed)
dropouts.createOrReplaceTempView("Dropouts")

In [10]:
# Step 7: Join with Catalog
# Left join keeps every enrolment even when the catalog has no matching CourseID.
df_joined = df.join(catalog, on="CourseID", how="left")
instructor_progress = df_joined.groupBy("Instructor").agg(
    avg("ProgressPercent").alias("AvgProgress")
)
instructor_progress.show()

# Top course by enrolment count, shown together with its catalog attributes.
enrollment_counts = df.groupBy("CourseID").count()
most_enrolled = enrollment_counts.orderBy(col("count").desc()).limit(1)
most_enrolled.join(catalog, "CourseID").show()

+-------------+-----------+
|   Instructor|AvgProgress|
+-------------+-----------+
|  Zoya Sheikh|      100.0|
|   Sana Gupta|       45.0|
| Ibrahim Khan|       30.0|
|Abdullah Khan|      100.0|
+-------------+-----------+

+--------+-----+-------------+-------------+--------+
|CourseID|count|   Instructor|DurationHours|   Level|
+--------+-----+-------------+-------------+--------+
|    C001|    2|Abdullah Khan|            8|Beginner|
+--------+-----+-------------+-------------+--------+



In [12]:
# Step 8: Delta Save, Update, Delete
from delta.tables import DeltaTable

delta_path = "/content/drive/My Drive/enrollments_delta"

# Write the current enrolments as a Delta table in overwrite mode, then open it for in-place edits.
df.write.mode("overwrite").format("delta").save(delta_path)
delta_table = DeltaTable.forPath(spark, delta_path)

# Update: set Rating to 5 on every "Python Basics" row
is_python_basics = col("CourseName") == "Python Basics"
delta_table.update(condition=is_python_basics, set={"Rating": lit(5)})

# Delete: remove rows whose progress is exactly 0
has_no_progress = col("ProgressPercent") == 0
delta_table.delete(condition=has_no_progress)

# History: print the table history with untruncated columns
delta_table.history().show(truncate=False)

+-------+-------------------+------+--------+---------+----------------------------------------------------+----+--------+---------+-----------+--------------+-------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-----------------------------------+
|version|timestamp          |userId|userName|operation|operationParameters                                 |job |notebook|clusterId|readVersion|isolationLevel|isBlindAppend|operationMetrics                                                                                                                                                                                                                                                                             

In [13]:
# Step 9: Window Functions
# Rank courses by enrolment count (highest first) using dense_rank.
course_counts = df.groupBy("CourseID").count()
rank_window = Window.orderBy(col("count").desc())
ranked = course_counts.withColumn("Rank", dense_rank().over(rank_window))
ranked.show()

# Within each user's enrolments ordered by EnrollDate, attach the next row's CourseName.
lead_window = Window.partitionBy("UserID").orderBy("EnrollDate")
next_course = lead("CourseName").over(lead_window)
df = df.withColumn("NextCourse", next_course)

+--------+-----+----+
|CourseID|count|Rank|
+--------+-----+----+
|    C001|    2|   1|
|    C003|    1|   2|
|    C004|    1|   2|
|    C002|    1|   2|
+--------+-----+----+



In [14]:
# Step 10: SQL Views
# Expose the enrolments DataFrame to SQL, then define three temp views on top of it.
df.createOrReplaceTempView("enrollments")

# Enrolment count per EnrollDate.
daily_enrollments_sql = """
    CREATE OR REPLACE TEMP VIEW daily_enrollments AS
    SELECT EnrollDate, COUNT(*) AS Total FROM enrollments GROUP BY EnrollDate
"""

# Mean Rating per Category.
category_performance_sql = """
    CREATE OR REPLACE TEMP VIEW category_performance AS
    SELECT Category, AVG(Rating) AS AvgRating FROM enrollments GROUP BY Category
"""

# Three courses with the highest enrolment counts.
top_3_courses_sql = """
    CREATE OR REPLACE TEMP VIEW top_3_courses AS
    SELECT CourseName, COUNT(*) AS Total FROM enrollments GROUP BY CourseName ORDER BY Total DESC LIMIT 3
"""

spark.sql(daily_enrollments_sql)
spark.sql(category_performance_sql)
spark.sql(top_3_courses_sql)

DataFrame[]

In [15]:
# Step 11: Time Travel
# Read the Delta table at delta_path as of version 0.
delta_reader = spark.read.format("delta").option("versionAsOf", 0)
version0 = delta_reader.load(delta_path)
version0.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|            400|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|              0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|            16|            500|
|    E005|  U004|    C004|Digital 

In [16]:
# Step 12: Export to JSON partitioned by Category
# Overwrite mode: existing output at this path is replaced.
json_writer = df.write.partitionBy("Category").mode("overwrite")
json_writer.json("/content/drive/My Drive/course_json_output")

In [17]:
# Step 13: Summary and export as Parquet
# One row per course: enrolment count, mean rating and mean progress.
course_metrics = [
    count("*").alias("TotalEnrollments"),
    avg("Rating").alias("AvgRating"),
    avg("ProgressPercent").alias("AvgProgress"),
]
summary = df.groupBy("CourseName").agg(*course_metrics)

summary.write.parquet("/content/drive/My Drive/course_summary_parquet", mode="overwrite")
