In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Set-2").getOrCreate()
spark

**Ingestion & Time Fields**

In [5]:
#Load into PySpark with inferred schema
from google.colab import drive
drive.mount('/content/drive')

df_enrollment = spark.read.csv('/content/drive/MyDrive/course_enrollments.csv',header = True,inferSchema = True)
df_enrollment.show()

Mounted at /content/drive
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|     4|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+



In [6]:
# Convert EnrollDate and CompletionDate to date type
from pyspark.sql.functions import col, to_date, datediff, when

df_enrollment = df_enrollment.withColumn("EnrollDate", to_date(col("EnrollDate"), "yyyy-MM-dd")).withColumn("CompletionDate", to_date(col("CompletionDate"), "yyyy-MM-dd"))

#Add DaysToComplete column if completed
df_enrollment = df_enrollment.withColumn("DaysToComplete", when(col("CompletionDate").isNotNull(), datediff(col("CompletionDate"), col("EnrollDate"))))
df_enrollment.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|          NULL|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|          NULL|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|            16|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-04-05|    2024-04-16|            100|     4|            11|
+--------+------+-------

**User Learning Path Progress**

In [7]:
# Group by UserID : count of courses enrolled
from pyspark.sql.functions import count

df_enrollment.groupBy("UserID").agg(count("CourseID").alias("enrolled")).show()

+------+--------+
|UserID|enrolled|
+------+--------+
|  U004|       1|
|  U002|       1|
|  U003|       1|
|  U001|       2|
+------+--------+



In [8]:
# Avg progress % across all enrollments
from pyspark.sql.functions import avg

df_enrollment.groupBy("EnrollID").agg(avg("ProgressPercent").alias("AvgProgressPercent")).show()

+--------+------------------+
|EnrollID|AvgProgressPercent|
+--------+------------------+
|    E004|             100.0|
|    E002|              45.0|
|    E003|              30.0|
|    E001|             100.0|
|    E005|             100.0|
+--------+------------------+



In [9]:
# Flag IsCompleted = ProgressPercent = 100
from pyspark.sql.functions import col

df_enrollment = df_enrollment.withColumn("IsCompleted", (col("ProgressPercent") == 100).cast("int"))
df_enrollment.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|          1|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|          NULL|          0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|          NULL|          0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100|     5|            16|          1|
|    E005|  U004|    C004|Digital Marketing|   Marketing|2024-

**Engagement Scoring**

In [10]:
df_enrollment = df_enrollment.withColumn("EngagementScore",when(col("Rating").isNotNull(), col("ProgressPercent") * col("Rating")).otherwise(0))
df_enrollment.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|          1|            400|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|  NULL|          NULL|          0|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|  NULL|          NULL|          0|              0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100| 

In [11]:
# Replace null Rating with 0 before computing

df_enrollment = df_enrollment.fillna({'Rating': 0})
df_enrollment.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|          1|            400|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|          0|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|          0|              0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100| 

**Identify Drop-offs**

In [12]:
# Filter all records with ProgressPercent < 50 and CompletionDate is null

df_enrollment.filter((col("ProgressPercent") < 50) & col("CompletionDate").isNull()).show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|          0|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|          0|              0|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+



In [13]:
# Create a view called Dropouts

df_enrollment.createOrReplaceTempView("Dropouts")
spark.sql("SELECT * FROM Dropouts").show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|          1|            400|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|          0|              0|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|          0|              0|
|    E004|  U003|    C001|    Python Basics| Programming|2024-04-04|    2024-04-20|            100| 

**Joins with Metadata**

In [14]:
df_catalog = spark.read.csv('/content/drive/MyDrive/course_catalog.csv',header = True,inferSchema = True)
df_catalog.show()

+--------+-------------+-------------+------------+
|CourseID|   Instructor|DurationHours|       Level|
+--------+-------------+-------------+------------+
|    C001|Abdullah Khan|            8|    Beginner|
|    C002|   Sana Gupta|            5|    Beginner|
|    C003| Ibrahim Khan|           10|Intermediate|
|    C004|  Zoya Sheikh|            6|    Beginner|
+--------+-------------+-------------+------------+



In [16]:
#Join to find average progress per instructor

joined_df = df_enrollment.join(df_catalog, on="CourseID", how="inner")

from pyspark.sql.functions import avg
joined_df.groupBy("Instructor").agg(avg("ProgressPercent").alias("AvgProgress")).show()

+-------------+-----------+
|   Instructor|AvgProgress|
+-------------+-----------+
|  Zoya Sheikh|      100.0|
|   Sana Gupta|       45.0|
| Ibrahim Khan|       30.0|
|Abdullah Khan|      100.0|
+-------------+-----------+



In [19]:
# Show who teaches the most enrolled course
from pyspark.sql.functions import count

joined_df.groupBy("Instructor").agg(count("*").alias("Enrollments")).orderBy("Enrollments", ascending=False).limit(1).show()

+-------------+-----------+
|   Instructor|Enrollments|
+-------------+-----------+
|Abdullah Khan|          2|
+-------------+-----------+



**Delta Lake Practice**


In [None]:
# Save as Delta Table enrollments_delta
# Apply:
# Update: Set all ratings to 5 where Course = 'Python Basics'
# Delete: All rows where ProgressPercent = 0
# Show DESCRIBE HISTORY

df_enrollment.write.format("delta").mode("overwrite").save("/tmp/enrollments_delta")

from delta.tables import DeltaTable
from pyspark.sql.functions import lit

delta_table = DeltaTable.forPath(spark, "/tmp/enrollments_delta")

delta_table.update(condition = col("CourseName") == "Python Basics",set = { "Rating": lit(5) })

delta_table.delete(condition = col("ProgressPercent") == 0)

spark.sql("DESCRIBE HISTORY delta.`/tmp/enrollments_delta`").show(truncate=False)

**Window Functions**

In [20]:
# Use dense_rank() to rank courses by number of enrollments
from pyspark.sql.window import Window
from pyspark.sql.functions import count, dense_rank

course_counts = df_enrollment.groupBy("CourseID").agg(count("*").alias("Enrollments"))

rank_window = Window.orderBy(course_counts["Enrollments"].desc())
ranked_courses = course_counts.withColumn("Rank", dense_rank().over(rank_window))
ranked_courses.show()

+--------+-----------+----+
|CourseID|Enrollments|Rank|
+--------+-----------+----+
|    C001|          2|   1|
|    C003|          1|   2|
|    C004|          1|   2|
|    C002|          1|   2|
+--------+-----------+----+



In [21]:
# lead() to find next course by each user (sorted by EnrollDate)
from pyspark.sql.functions import lead

user_window = Window.partitionBy("UserID").orderBy("EnrollDate")
df_enrollment = df_enrollment.withColumn("NextCourseID", lead("CourseID").over(user_window))
df_enrollment.show()

+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+------------+
|EnrollID|UserID|CourseID|       CourseName|    Category|EnrollDate|CompletionDate|ProgressPercent|Rating|DaysToComplete|IsCompleted|EngagementScore|NextCourseID|
+--------+------+--------+-----------------+------------+----------+--------------+---------------+------+--------------+-----------+---------------+------------+
|    E001|  U001|    C001|    Python Basics| Programming|2024-04-01|    2024-04-10|            100|     4|             9|          1|            400|        C003|
|    E003|  U001|    C003|  ML with PySpark|Data Science|2024-04-03|          NULL|             30|     0|          NULL|          0|              0|        NULL|
|    E002|  U002|    C002|Excel for Finance|Productivity|2024-04-02|          NULL|             45|     0|          NULL|          0|              0|        NULL|
|    E004|  U003|    C

**SQL Logic for Dashboard Views**

In [None]:
# Create views:
# daily_enrollments
# category_performance (avg rating by category)
# top_3_courses

spark.sql("""CREATE OR REPLACE VIEW daily_enrollments AS
SELECT
    EnrollDate,
    COUNT(*) AS TotalEnrollments
FROM enrollments_delta
GROUP BY EnrollDate
ORDER BY EnrollDate;
""")

spark.sql("""CREATE OR REPLACE VIEW category_performance AS
SELECT
    Category,
    ROUND(AVG(Rating), 2) AS AvgRating
FROM enrollments_delta
WHERE Rating IS NOT NULL
GROUP BY Category
ORDER BY AvgRating DESC;""")

spark.sql("""CREATE OR REPLACE VIEW top_3_courses AS
SELECT
    CourseID,
    CourseName,
    COUNT(*) AS Enrollments
FROM enrollments_delta
GROUP BY CourseID, CourseName
ORDER BY Enrollments DESC
LIMIT 3;""")

**Time Travel**

In [None]:
# View previous version before update/delete

spark.sql("SELECT * FROM delta.`/tmp/enrollments_delta` VERSION AS OF 0;")

spark.sql("DESCRIBE HISTORY delta.`/tmp/enrollments_delta`;")

In [None]:
# Use VERSION AS OF and TIMESTAMP AS OF
spark.sql('''SELECT * FROM delta.`/tmp/enrollments_delta` TIMESTAMP AS OF '2024-04-11T10:00:00';''')

**Export Reporting**

In [23]:
df_enrollment.write.partitionBy("Category").mode("overwrite").json("/content/drive/MyDrive/enrollments_json_by_category")

In [24]:
from pyspark.sql.functions import count, avg

summary_df = df_enrollment.groupBy("CourseName").agg(count("*").alias("TotalEnrollments"),avg("Rating").alias("AvgRating"),avg("ProgressPercent").alias("AvgProgress"))

summary_df.write.mode("overwrite").parquet("/content/drive/MyDrive/course_summary_parquet")