In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Education")
    .getOrCreate()
)

In [None]:
# Every input file has a header row; column types are inferred by Spark.
_csv_options = {"inferSchema": True, "header": True}

students_df = spark.read.csv("../data/students.csv", **_csv_options)
courses_df = spark.read.csv("../data/courses.csv", **_csv_options)
grades_df = spark.read.csv("../data/grades.csv", **_csv_options)

In [None]:
# Print the first five rows of each loaded DataFrame as a quick sanity check.
for preview_df in (students_df, courses_df, grades_df):
    preview_df.show(5)

#### Analysis:
- Find the average grade for each course.
- Use DataFrame operations to find the top 3 students with the highest average grades.

In [None]:
# Mean grade per course, rounded to two decimal places.
# `round` and `avg` are the pyspark.sql.functions versions (star-imported above).
course_avg_grade = round(avg("grade"), 2).alias("Average Grade")
grades_df.groupBy("course_id").agg(course_avg_grade).show()

In [None]:
# Rank students by their (rounded) mean grade and display the three highest.
student_avg_grade = round(avg("grade"), 2).alias("Average Grade")
(
    grades_df
    .groupBy("student_id")
    .agg(student_avg_grade)
    .orderBy("Average Grade", ascending=False)
    .limit(3)
    .show()
)

#### Join and Filtering:
- Join the grades and courses tables to create a new DataFrame (`grades_with_courses_df`) containing each student's grade along with the course details.
- Filter `grades_with_courses_df` to keep only the records where the grade is below 70.

In [None]:
# Attach course details to every grade record.
# Joining on the column *name* (instead of an equality expression between the
# two frames) keeps a single `course_id` column in the result, so later
# references to `course_id` are not ambiguous.
grades_with_courses_df = grades_df.join(courses_df, on="course_id", how="inner")

In [None]:
# Keep only the grade records scoring under 70 and display them.
low_grades_df = grades_with_courses_df.where(col("grade") < 70)
low_grades_df.show()

#### Date Manipulation:
- Convert the date_of_birth column in the students table to DateType.
- Find the instructor with the highest average student grade.

In [None]:
# Convert `date_of_birth` to DateType.
# In Spark datetime patterns, uppercase `M` is month-of-year; lowercase `m` is
# minute-of-hour, so the previous pattern "yyyy-m-d" never parsed the month.
# A single `M`/`d` accepts both padded and unpadded values.
# NOTE(review): assumes the CSV stores dates as year-month-day — confirm against the data.
students_df = students_df.withColumn("date_of_birth", to_date("date_of_birth", "yyyy-M-d"))

In [None]:
# Average the grades per instructor and display the single highest-scoring one.
instructor_avg_df = grades_with_courses_df.groupBy("instructor").agg(
    avg("grade").alias("Average Grade")
)
instructor_avg_df.orderBy(col("Average Grade").desc()).limit(1).show()