In [1]:
import pandas as pd
# Ten sample enrollments, one per student, spread over four courses.
# Enrollment and student IDs are sequential, so they are generated rather than typed out.
enrollments_data = {
    "enrollment_id": [f"E{n:03d}" for n in range(1, 11)],
    "student_id": [f"S{n:03d}" for n in range(1, 11)],
    "course_id": [
        "C001", "C001", "C002", "C002", "C003",
        "C001", "C003", "C004", "C004", "C002",
    ],
    # All enrollments fall in June 2025; only the day of the month varies.
    "enrollment_date": [
        f"2025-06-{day:02d}" for day in (1, 3, 5, 7, 10, 12, 15, 17, 20, 22)
    ],
}



In [2]:
# Persist the enrollment records as CSV (without the pandas index column)
# so the Spark cells further down can load them from disk.
enrollments_frame = pd.DataFrame(enrollments_data)
enrollments_frame.to_csv('enrollment.csv', index=False)

In [4]:
# Progress snapshot for each of the ten enrollments, listed in enrollment-ID order.
progress_data = {
    "enrollment_id": [f"E{n:03d}" for n in range(1, 11)],
    "completion_percentage": [95, 20, 88, 92, 30, 100, 10, 0, 96, 65],
    # All activity dates fall in July 2025; only the day of the month varies.
    "last_active_date": [
        f"2025-07-{day:02d}" for day in (15, 5, 12, 18, 11, 20, 3, 1, 25, 21)
    ],
}

# Write the records to disk without the pandas index column.
progress_frame = pd.DataFrame(progress_data)
progress_frame.to_csv("progress.csv", index=False)


In [9]:
# Course catalogue, written one course per row so each record can be read at a glance.
course_fields = ("course_id", "course_name", "instructor", "description")
course_rows = [
    ("C001", "Python for Beginners", "John Doe",
     "Intro to Python with basics like loops and functions."),
    ("C002", "Data Science Fundamentals", "Jane Smith",
     "Learn data analysis and basic statistics."),
    ("C003", "Web Development Basics", "Alan Turing",
     "HTML, CSS, and JavaScript for websites."),
    ("C004", "Machine Learning 101", "Grace Hopper",
     "Overview of ML models and concepts."),
    ("FSD", "Website Development", "HARI",
     "Frontend and backend development skills."),
    ("EMBEDDED", "Hardware + Software", "KARPAGAM",
     "Build embedded systems with C and sensors."),
]

# Pivot the rows into a column-oriented dict of lists (field name -> values).
courses_data = {
    field: list(values) for field, values in zip(course_fields, zip(*course_rows))
}

# Write the catalogue to disk without the pandas index column.
courses_frame = pd.DataFrame(courses_data)
courses_frame.to_csv("courses.csv", index=False)


In [10]:
from google.colab import files

# Hand each generated input CSV to Colab's download helper, in the order they were written.
for csv_name in ("enrollment.csv", "progress.csv", "courses.csv"):
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Load large enrollment and progress data using PySpark

In [12]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-active session if there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName('CourseAnalysis')
    .getOrCreate()
)

In [13]:
# Every file has a header row; let Spark infer the column types from the data.
csv_options = dict(header=True, inferSchema=True)

# NOTE: the name `enrollemnt` is misspelled, but later cells reference it, so it is kept as-is.
enrollemnt = spark.read.csv('enrollment.csv', **csv_options)
progress = spark.read.csv('progress.csv', **csv_options)
course = spark.read.csv('courses.csv', **csv_options)

In [15]:
# Preview the enrollment and progress tables with Spark's default column truncation.
for preview in (enrollemnt, progress):
    preview.show()

# Course rows are printed without truncation so the full text of each column is visible.
course.show(truncate=False)

+-------------+----------+---------+---------------+
|enrollment_id|student_id|course_id|enrollment_date|
+-------------+----------+---------+---------------+
|         E001|      S001|     C001|     2025-06-01|
|         E002|      S002|     C001|     2025-06-03|
|         E003|      S003|     C002|     2025-06-05|
|         E004|      S004|     C002|     2025-06-07|
|         E005|      S005|     C003|     2025-06-10|
|         E006|      S006|     C001|     2025-06-12|
|         E007|      S007|     C003|     2025-06-15|
|         E008|      S008|     C004|     2025-06-17|
|         E009|      S009|     C004|     2025-06-20|
|         E010|      S010|     C002|     2025-06-22|
+-------------+----------+---------+---------------+

+-------------+---------------------+----------------+
|enrollment_id|completion_percentage|last_active_date|
+-------------+---------------------+----------------+
|         E001|                   95|      2025-07-15|
|         E002|                   20|

Join tables to get course-wise progress

In [16]:
# Attach progress to each enrollment. The inner join keeps only enrollment IDs
# that appear in both tables.
joined = enrollemnt.join(
    progress,
    on='enrollment_id',
    how='inner',
)


In [22]:
from pyspark.sql.functions import when,col

# Derive two 'Yes'/'No' flags from the completion percentage:
#   is_completed -> 'Yes' at 100% or above
#   is_dropped   -> 'Yes' below 30% (exactly 30% is not flagged as dropped)
completion = col("completion_percentage")
completed_flag = when(completion >= 100, 'Yes').otherwise('No')
dropped_flag = when(completion < 30, 'Yes').otherwise('No')

joined = (
    joined
    .withColumn("is_completed", completed_flag)
    .withColumn("is_dropped", dropped_flag)
)

In [23]:
# Add the course details to each enrollment. The left join keeps every enrollment row
# even when its course_id has no match in the course table.
full = joined.join(
    course,
    on='course_id',
    how='left',
)
full.show()

+---------+-------------+----------+---------------+---------------------+----------------+------------+----------+--------------------+------------+--------------------+
|course_id|enrollment_id|student_id|enrollment_date|completion_percentage|last_active_date|is_completed|is_dropped|         course_name|  instructor|         description|
+---------+-------------+----------+---------------+---------------------+----------------+------------+----------+--------------------+------------+--------------------+
|     C001|         E001|      S001|     2025-06-01|                   95|      2025-07-15|          No|        No|Python for Beginners|    John Doe|Intro to Python w...|
|     C001|         E002|      S002|     2025-06-03|                   20|      2025-07-05|          No|       Yes|Python for Beginners|    John Doe|Intro to Python w...|
|     C002|         E003|      S003|     2025-06-05|                   88|      2025-07-12|          No|        No|Data Science Fund...|  Jane Sm

Group by course to count total enrolled and completed students

In [25]:
from pyspark.sql.functions import count

# when() without otherwise() yields null for non-matching rows and count() skips nulls,
# so each of these expressions counts only the rows whose flag is 'Yes'.
completed_count = count(when(col("is_completed") == 'Yes', True))
dropped_count = count(when(col("is_dropped") == 'Yes', True))

# One row per course name: total enrollments plus the completed and dropped tallies.
summary = full.groupBy('course_name').agg(
    count('*').alias('total enrolled'),
    completed_count.alias("completed"),
    dropped_count.alias("dropped"),
)
summary.show()

+--------------------+--------------+---------+-------+
|         course_name|total enrolled|completed|dropped|
+--------------------+--------------+---------+-------+
|Web Development B...|             2|        0|      1|
|Data Science Fund...|             3|        0|      0|
|Machine Learning 101|             2|        0|      1|
|Python for Beginners|             3|        1|      1|
+--------------------+--------------+---------+-------+



In [26]:
# Spark writes a directory of part files rather than a single CSV. Coalescing to one
# partition first keeps the summary in a single part file inside "course_summary".
single_partition_summary = summary.coalesce(1)
summary_writer = (
    single_partition_summary.write
    .mode("overwrite")
    .option("header", "true")
)
summary_writer.csv("course_summary")

In [28]:
import shutil
import glob

# The Spark write above produces a directory ('course_summary/') containing a
# part-*.csv file; copy that file to a stable, single-file name for download.
# glob.glob returns an empty list when nothing matches, so check explicitly
# instead of letting `[0]` fail with an uninformative IndexError. Sorting makes
# the choice deterministic should more than one part file ever be present.
part_files = sorted(glob.glob('course_summary/part-*.csv'))
if not part_files:
    raise FileNotFoundError(
        "No 'part-*.csv' file found in 'course_summary/'; "
        "run the cell that writes the summary with Spark first."
    )
path = part_files[0]

# Kept as the last expression so the cell still displays the destination path.
shutil.copy(path,'course_summary.csv')


'course_summary.csv'

In [29]:
from google.colab import files

# Hand the single-file course summary to Colab's download helper.
summary_csv = 'course_summary.csv'
files.download(summary_csv)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>