In [1]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Initialize Spark Session
spark = SparkSession.builder.appName("CourseAnalysis").getOrCreate()



In [2]:
# Load with schema inference
inferred_df = spark.read.option("header", True).csv("course_enrollments.csv")
print("Inferred Schema:")
inferred_df.printSchema()

Inferred Schema:
root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: string (nullable = true)
 |-- ProgressPercent: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Status: string (nullable = true)



In [3]:
# Define schema manually
manual_schema = StructType([
    StructField("EnrollmentID", StringType(), True),
    StructField("StudentName", StringType(), True),
    StructField("CourseName", StringType(), True),
    StructField("Category", StringType(), True),
    StructField("EnrollDate", DateType(), True),
    StructField("ProgressPercent", IntegerType(), True),
    StructField("Rating", DoubleType(), True),
    StructField("Status", StringType(), True)
])

# Load data with manual schema
manual_df = spark.read.option("header", True).schema(manual_schema).csv("course_enrollments.csv")
print("\nManual Schema:")
manual_df.printSchema()


Manual Schema:
root
 |-- EnrollmentID: string (nullable = true)
 |-- StudentName: string (nullable = true)
 |-- CourseName: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- EnrollDate: date (nullable = true)
 |-- ProgressPercent: integer (nullable = true)
 |-- Rating: double (nullable = true)
 |-- Status: string (nullable = true)



In [4]:
low_progress = manual_df.filter(col("ProgressPercent") < 50)
print("\nStudents with Progress < 50%:")
low_progress.show()


Students with Progress < 50%:
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|EnrollmentID|StudentName|         CourseName|   Category|EnrollDate|ProgressPercent|Rating|  Status|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+
|      ENRQ03|     Aakash|Power BI Essentials|  Analytics|2024-05-13|             30|   3.8|  Active|
|      ENRQ04|       Meha|        Java Basics|Programming|2024-05-15|              0|  NULL|Inactive|
+------------+-----------+-------------------+-----------+----------+---------------+------+--------+



In [5]:
avg_rating = manual_df.select(avg("Rating")).collect()[0][0]
df_with_ratings = manual_df.withColumn("Rating", when(col("Rating").isNull(), avg_rating).otherwise(col("Rating")))
print("\nData after filling null ratings:")
df_with_ratings.show()


Data after filling null ratings:
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+
|      ENRQ01|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|
|      ENRQ02|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|
|      ENRQ03|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|
|      ENRQ04|       Meha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|
|      ENRQ05|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|   Active|
|      ENRQ06|    Ibrahim|Python for Beginners

In [6]:
final_df = df_with_ratings.withColumn("IsActive", when(col("Status") == "Active", 1).otherwise(0))
print("\nData with IsActive column:")
final_df.show()


Data with IsActive column:
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|EnrollmentID|StudentName|          CourseName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|
+------------+-----------+--------------------+-----------+----------+---------------+-----------------+---------+--------+
|      ENRQ01|     Aditya|Python for Beginners|Programming|2024-05-10|             80|              4.5|   Active|       1|
|      ENRQ02|     Simran|Data Analysis wit...|  Analytics|2024-05-12|            100|              4.7|Completed|       0|
|      ENRQ03|     Aakash| Power BI Essentials|  Analytics|2024-05-13|             30|              3.8|   Active|       1|
|      ENRQ04|       Meha|         Java Basics|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|
|      ENRQ05|       Zara|Machine Learning 101|         AI|2024-05-17|             60|              4.2|

In [7]:
avg_progress = final_df.groupBy("CourseName").agg(avg("ProgressPercent").alias("AvgProgress"))
print("\nAverage Progress by Course:")
avg_progress.show()


Average Progress by Course:
+--------------------+-----------+
|          CourseName|AvgProgress|
+--------------------+-----------+
|Data Analysis wit...|      100.0|
|         Java Basics|        0.0|
|Machine Learning 101|       60.0|
|Python for Beginners|       85.0|
| Power BI Essentials|       30.0|
+--------------------+-----------+



In [8]:
category_counts = final_df.groupBy("Category").count().orderBy("count", ascending=False)
print("\nEnrollments by Category:")
category_counts.show()


Enrollments by Category:
+-----------+-----+
|   Category|count|
+-----------+-----+
|Programming|    3|
|  Analytics|    2|
|         AI|    1|
+-----------+-----+



In [9]:
most_enrolled = final_df.groupBy("CourseName").count().orderBy("count", ascending=False).first()
print(f"\nMost Enrolled Course: {most_enrolled['CourseName']} ({most_enrolled['count']} enrollments)")


Most Enrolled Course: Python for Beginners (2 enrollments)


In [10]:
course_details = spark.read.option("header", True).csv("course_details.csv")
enriched_df = final_df.join(course_details, "CourseName", "left")
print("\nData after joining with course details:")
enriched_df.show()


Data after joining with course details:
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+
|Python for Beginners|      ENRQ01|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|
|Data Analysis wit...|      ENRQ02|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|
| Power BI Essentials|      ENRQ03|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Active|       1|            5|     Rekha|
|         Java Basics|      ENRQ04|       Meha|Programming|2024-0

In [11]:
from pyspark.sql.window import Window
window_spec = Window.partitionBy("CourseName").orderBy(col("ProgressPercent").desc())
ranked_df = enriched_df.withColumn("Rank", rank().over(window_spec))
print("\nStudents Ranked by Progress:")
ranked_df.show()


Students Ranked by Progress:
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|Rank|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----+
|Data Analysis wit...|      ENRQ02|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|   1|
|         Java Basics|      ENRQ04|       Meha|Programming|2024-05-15|              0|4.359999999999999| Inactive|       0|            6|     Manoj|   1|
|Machine Learning 101|      ENRQ05|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|   1|
| Power BI Essentials|      ENRQ03|     Aakash

In [12]:
category_window = Window.partitionBy("Category").orderBy("EnrollDate")
date_diff_df = enriched_df.withColumn("NextEnrollment", lead("EnrollDate").over(category_window)) \
                         .withColumn("PrevEnrollment", lag("EnrollDate").over(category_window))
print("\nEnrollment Dates with Lead/Lag:")
date_diff_df.show()


Enrollment Dates with Lead/Lag:
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+--------------+--------------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|NextEnrollment|PrevEnrollment|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+--------------+--------------+
|Machine Learning 101|      ENRQ05|       Zara|         AI|2024-05-17|             60|              4.2|   Active|       1|            8|     Samir|          NULL|          NULL|
|Data Analysis wit...|      ENRQ02|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|    2024-05-13|          NULL|
| Power BI Essentials|      ENRQ03|     Aakash|  Analytics|2024-05-13|  

In [13]:
pivot_df = enriched_df.groupBy("Category").pivot("Status").count().na.fill(0)
print("\nPivoted Enrollment Counts:")
pivot_df.show()


Pivoted Enrollment Counts:
+-----------+------+---------+--------+
|   Category|Active|Completed|Inactive|
+-----------+------+---------+--------+
|Programming|     1|        1|       1|
|         AI|     1|        0|       0|
|  Analytics|     1|        1|       0|
+-----------+------+---------+--------+



In [14]:
date_extracted = enriched_df.withColumn("EnrollYear", year("EnrollDate")) \
                           .withColumn("EnrollMonth", month("EnrollDate"))
print("\nData with Year & Month Extracted:")
date_extracted.show()


Data with Year & Month Extracted:
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|Python for Beginners|      ENRQ01|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|      2024|          5|
|Data Analysis wit...|      ENRQ02|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|      2024|          5|
| Power BI Essentials|      ENRQ03|     Aakash|  Analytics|2024-05-13|             30|              3.8| 

In [15]:
cleaned_df = date_extracted.filter(col("Status").isNotNull() & (col("Status") != ""))
print("\nAfter Dropping Null/Empty Status:")
cleaned_df.show()


After Dropping Null/Empty Status:
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|Python for Beginners|      ENRQ01|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|      2024|          5|
|Data Analysis wit...|      ENRQ02|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|      2024|          5|
| Power BI Essentials|      ENRQ03|     Aakash|  Analytics|2024-05-13|             30|              3.8| 

In [16]:
deduped_df = cleaned_df.dropDuplicates(["EnrollmentID", "StudentName", "CourseName"])
print("\nAfter Removing Duplicates:")
deduped_df.show()


After Removing Duplicates:
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|          CourseName|EnrollmentID|StudentName|   Category|EnrollDate|ProgressPercent|           Rating|   Status|IsActive|DurationWeeks|Instructor|EnrollYear|EnrollMonth|
+--------------------+------------+-----------+-----------+----------+---------------+-----------------+---------+--------+-------------+----------+----------+-----------+
|Python for Beginners|      ENRQ01|     Aditya|Programming|2024-05-10|             80|              4.5|   Active|       1|            4|    Rakesh|      2024|          5|
|Data Analysis wit...|      ENRQ02|     Simran|  Analytics|2024-05-12|            100|              4.7|Completed|       0|            3|    Anjali|      2024|          5|
| Power BI Essentials|      ENRQ03|     Aakash|  Analytics|2024-05-13|             30|              3.8|   Activ

In [17]:
deduped_df.write.mode("overwrite").csv("course_enrollments_cleaned.csv")
print("CSV saved successfully!")

CSV saved successfully!


In [18]:
deduped_df.write.mode("overwrite").json("course_enrollments_cleaned.json")
print("JSON saved successfully!")

JSON saved successfully!


In [19]:
deduped_df.write.mode("overwrite").parquet("course_enrollments_cleaned.parquet", compression="snappy")
print("Parquet saved successfully!")

Parquet saved successfully!


In [20]:
from google.colab import files

# Download CSV
!ls course_enrollments_cleaned.csv/part-*.csv | head -1 | xargs -I {} cp {} "final_output.csv"
files.download("final_output.csv")

# Download JSON (zipped)
!zip -r json_output.zip course_enrollments_cleaned.json
files.download("json_output.zip")

# Download Parquet (zipped)
!zip -r parquet_output.zip course_enrollments_cleaned.parquet
files.download("parquet_output.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: course_enrollments_cleaned.json/ (stored 0%)
  adding: course_enrollments_cleaned.json/_SUCCESS (stored 0%)
  adding: course_enrollments_cleaned.json/.part-00000-2c73a208-7681-4818-aea4-e6745f757e84-c000.json.crc (stored 0%)
  adding: course_enrollments_cleaned.json/._SUCCESS.crc (stored 0%)
  adding: course_enrollments_cleaned.json/part-00000-2c73a208-7681-4818-aea4-e6745f757e84-c000.json (deflated 73%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  adding: course_enrollments_cleaned.parquet/ (stored 0%)
  adding: course_enrollments_cleaned.parquet/_SUCCESS (stored 0%)
  adding: course_enrollments_cleaned.parquet/.part-00000-042158a9-612b-4864-97e7-1f59a2555c7e-c000.snappy.parquet.crc (stored 0%)
  adding: course_enrollments_cleaned.parquet/._SUCCESS.crc (stored 0%)
  adding: course_enrollments_cleaned.parquet/part-00000-042158a9-612b-4864-97e7-1f59a2555c7e-c000.snappy.parquet (deflated 57%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>