In [1]:

from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName("june 16th")\
.getOrCreate()

spark

**Exercise Set – Online Course Use Case
Data Loading**

In [2]:
# 1. Load the data with schema inference enabled.

from google.colab import drive
drive.mount('/content/drive')

df_course = spark.read.csv('/content/drive/MyDrive/course_enrollments.csv',header= True,inferSchema=True)
df_course.show()
df_course.printSchema()

Mounted at /content/drive
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|2024-05-18|             90|   4.6|Completed|
+------------+-----------+------------

In [4]:
# 2. Manually define schema and compare both approaches.

from pyspark.sql.types import StringType, StructField, StructType, IntegerType, DoubleType, DateType

manual_schema = StructType([
    StructField("EnrollmentID",StringType(),True),
    StructField("StudentName",StringType(), True),
    StructField("CourseName",StringType(), True),
    StructField("Category",StringType(), True),
    StructField("EnrollmentDate",DateType(), True),
    StructField("ProgressPercent",IntegerType(), True),
    StructField("Rating",DoubleType(), True),
    StructField("Status",StringType(), True)
])

df_maual = spark.read.csv('/content/drive/MyDrive/course_enrollments.csv',header= True,schema=manual_schema)

df_maual.show()
df_maual.printSchema()

+------------+-----------+--------------------+-----------+--------------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollmentDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+--------------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|    2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|    2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|    2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|    2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|    2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|    2024-05-18|             90|   4.6|Completed|
+------------+-----------+--

**Filtering and Transformation**

In [6]:
# 3. Filter records where ProgressPercent < 50 .

df_maual.filter(df_maual['ProgressPercent'] < 50).show()

+------------+-----------+-------------------+-----------+--------------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollmentDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+--------------+---------------+------+--------+
|      ENR003|     Aakash|Power BI Essentials|  Analytics|    2024-05-13|             30|   3.8|  Active|
|      ENR004|       Neha|        Java Basics|Programming|    2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+--------------+---------------+------+--------+



In [7]:
# 4. Replace null ratings with average rating.
from pyspark.sql.functions import avg

avg_rating = df_maual.select(avg("Rating")).first()[0]

df_no_null = df_maual.fillna({"Rating":avg_rating})
df_no_null.show()

+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollmentDate|ProgressPercent|           Rating|   Status|
+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|    2024-05-10|             80|              4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|    2024-05-12|            100|              4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|    2024-05-13|             30|              3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|    2024-05-15|              0|4.359999999999999| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|    2024-05-17|             60|              4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|P

In [8]:
# 5. Add column IsActive → 1 if Status is Active, else 0.
from pyspark.sql.functions import when

df_no_null = df_no_null.withColumn("IsActive",when(df_no_null["Status"] == "Active",1).otherwise(0))
df_no_null.show()

+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollmentDate|ProgressPercent|           Rating|   Status|IsActive|
+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+--------+
|      ENR001|     Aditya|Python for Beginners|Programming|    2024-05-10|             80|              4.5|   Active|       1|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|    2024-05-12|            100|              4.7|Completed|       0|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|    2024-05-13|             30|              3.8|   Active|       1|
|      ENR004|       Neha|         Java Basics|Programming|    2024-05-15|              0|4.359999999999999| Inactive|       0|
|      ENR005|       Zara|Machine Learning 101|         AI|    2024-05-17|             60|              

**Aggregations & Metrics**

In [9]:
# 6. Find average progress by course.
from pyspark.sql.functions import avg

df_no_null.groupBy("CourseName").agg(avg("ProgressPercent").alias("AvgProgress")).show()

+--------------------+-----------+
|          CourseName|AvgProgress|
+--------------------+-----------+
|Data Analysis wit...|      100.0|
|         Java Basics|        0.0|
|Machine Learning 101|       60.0|
|Python for Beginners|       85.0|
| Power BI Essentials|       30.0|
+--------------------+-----------+



In [10]:
# 7. Get count of students in each course category.
from pyspark.sql.functions import count

df_no_null.groupBy("category").agg(count("EnrollmentID").alias("CountEnrollment")).show()

+-----------+---------------+
|   category|CountEnrollment|
+-----------+---------------+
|Programming|              3|
|         AI|              1|
|  Analytics|              2|
+-----------+---------------+



In [13]:
# 8. Identify the most enrolled course.
from pyspark.sql.functions import desc, count

df_no_null.groupBy("CourseName").agg(count("EnrollmentID").alias("CountEnrollment")).orderBy(desc("CountEnrollment")).limit(1).show()

+--------------------+---------------+
|          CourseName|CountEnrollment|
+--------------------+---------------+
|Python for Beginners|              2|
+--------------------+---------------+



**Joins**

In [14]:
# 9. Create second CSV: course_details.csv
df_details = spark.read.csv('/content/drive/MyDrive/course_details.csv',header= True,inferSchema=True)

df_details.show()

+--------------------+-------------+----------+
|          CourseName|DurationWeeks|Instructor|
+--------------------+-------------+----------+
|Python for Beginners|            4|    Rakesh|
|Data Analysis wit...|            3|    Anjali|
| Power BI Essentials|            5|     Rekha|
|         Java Basics|            6|     Manoj|
|Machine Learning 101|            8|     Samir|
+--------------------+-------------+----------+



In [15]:
# 10. Join course_enrollments with course_details to include duration and instructor.

joined_df = df_no_null.join(df_details, on = "CourseName",how = "inner")
joined_df.show()

+--------------------+------------+-----------+-----------+--------------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollmentDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+--------------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENR001|     Aditya|Programming|    2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENR002|     Simran|  Analytics|    2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENR003|     Aakash|  Analytics|    2024-05-13|             30|              3.8|   Active|       1|            5|     Rekha|
|         Java Basics|      ENR004|       Neha|Programming|    2024-05-15|        

**Window Functions**

In [16]:
# 11. Rank students in each course based on ProgressPercent .
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

course_window = Window.partitionBy("CourseName").orderBy(df_no_null["ProgressPercent"].desc())

df_ranked = df_no_null.withColumn("ProgressPercent",rank().over(course_window))
df_ranked.show()

+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollmentDate|ProgressPercent|           Rating|   Status|IsActive|
+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+--------+
|      ENR002|     Simran|Data Analysis wit...|  Analytics|    2024-05-12|              1|              4.7|Completed|       0|
|      ENR004|       Neha|         Java Basics|Programming|    2024-05-15|              1|4.359999999999999| Inactive|       0|
|      ENR005|       Zara|Machine Learning 101|         AI|    2024-05-17|              1|              4.2|   Active|       1|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|    2024-05-13|              1|              3.8|   Active|       1|
|      ENR006|    Ibrahim|Python for Beginners|Programming|    2024-05-18|              1|              

In [17]:
# 12. Get lead and lag of EnrollDate by Category.
from pyspark.sql.functions import lead,lag

date_window = Window.partitionBy("Category").orderBy("EnrollmentDate")

df_leg_lag = df_no_null.withColumn("NextEnrollmentDate", lead("EnrollmentDate").over(date_window))\
.withColumn("PrevEnrollmentDate",lag("EnrollmentDate").over(date_window))

df_leg_lag.show()

+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+--------+------------------+------------------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollmentDate|ProgressPercent|           Rating|   Status|IsActive|NextEnrollmentDate|PrevEnrollmentDate|
+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+--------+------------------+------------------+
|      ENR005|       Zara|Machine Learning 101|         AI|    2024-05-17|             60|              4.2|   Active|       1|              NULL|              NULL|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|    2024-05-12|            100|              4.7|Completed|       0|        2024-05-13|              NULL|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|    2024-05-13|             30|              3.8|   Active|       1|              NULL|        2024-05-12|
|   

**Pivoting & Formatting**

In [18]:
# 13. Pivot data to show total enrollments by Category and Status.

pivot_df = df_no_null.groupBy("Category").pivot("Status").count()
pivot_df.show()

+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|     NULL|    NULL|
|  Analytics|     1|        1|    NULL|
+-----------+------+---------+--------+



In [21]:
# 14. Extract year and month from EnrollDate .
from pyspark.sql.functions import year, month

extract_date = df_no_null.withColumn("EnrollYear", year("EnrollmentDate")) \
                         .withColumn("EnrollMonth", month("EnrollmentDate"))

extract_date.show()

+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+--------+----------+-----------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollmentDate|ProgressPercent|           Rating|   Status|IsActive|EnrollYear|EnrollMonth|
+------------+-----------+--------------------+-----------+--------------+---------------+-----------------+---------+--------+----------+-----------+
|      ENR001|     Aditya|Python for Beginners|Programming|    2024-05-10|             80|              4.5|   Active|       1|      2024|          5|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|    2024-05-12|            100|              4.7|Completed|       0|      2024|          5|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|    2024-05-13|             30|              3.8|   Active|       1|      2024|          5|
|      ENR004|       Neha|         Java Basics|Programming|    2024-05-15|              0|4.35

**Cleaning and Deduplication**

In [22]:
# 15. Drop rows where Status is null or empty.
from pyspark.sql.functions import col, trim

df_cleaned = df_maual.filter((col("Status").isNotNull()) & (trim(col("Status")) != ""))
df_cleaned.show()

+------------+-----------+--------------------+-----------+--------------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollmentDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+--------------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|    2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|    2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|    2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|    2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|    2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|    2024-05-18|             90|   4.6|Completed|
+------------+-----------+--

In [24]:
# 16. Remove duplicate enrollments using dropDuplicates() .

df_remove_duplicate = df_cleaned.dropDuplicates(["EnrollmentID"])
df_remove_duplicate.show()

+------------+-----------+--------------------+-----------+--------------+---------------+------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollmentDate|ProgressPercent|Rating|   Status|
+------------+-----------+--------------------+-----------+--------------+---------------+------+---------+
|      ENR001|     Aditya|Python for Beginners|Programming|    2024-05-10|             80|   4.5|   Active|
|      ENR002|     Simran|Data Analysis wit...|  Analytics|    2024-05-12|            100|   4.7|Completed|
|      ENR003|     Aakash| Power BI Essentials|  Analytics|    2024-05-13|             30|   3.8|   Active|
|      ENR004|       Neha|         Java Basics|Programming|    2024-05-15|              0|  NULL| Inactive|
|      ENR005|       Zara|Machine Learning 101|         AI|    2024-05-17|             60|   4.2|   Active|
|      ENR006|    Ibrahim|Python for Beginners|Programming|    2024-05-18|             90|   4.6|Completed|
+------------+-----------+--

**Export**

In [25]:
# 17. Write the final cleaned DataFrame to:
# CSV (overwrite mode)
# JSON (overwrite mode)
# Parquet (snappy compression)

df_remove_duplicate.write.mode("overwrite").csv('/content/drive/MyDrive/final_output_csv')

df_remove_duplicate.write.mode("overwrite").json('/content/drive/MyDrive/final_output_json')

df_remove_duplicate.write.mode("overwrite").option("compression","snappy").parquet('/content/drive/MyDrive/final_output_parquet')