In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, countDistinct, min, sum, desc

In [None]:
# Step 1: Build the SparkSession (getOrCreate hands back an already-running
# session if there is one) that the later cells use via `spark`.
spark = (
    SparkSession.builder
    .appName("Student Performance Analysis")
    .getOrCreate()
)

In [None]:
# Step 2: Read the student-marks CSV into the `data` DataFrame.
# NOTE(review): the path is absolute and hard-coded; adjust it when the file
# lives somewhere else.
file_path = "/content/Student_Marks.csv"
data = spark.read.csv(
    file_path,
    header=True,       # take column names from the first line of the file
    inferSchema=True,  # let Spark derive the column types from the values
)

In [None]:
# Step 3: Display the schema Spark inferred for the loaded DataFrame
# (column names, types and nullability), preceded by a plain-text heading.
print("Schema:")
data.printSchema()

Schema:
root
 |-- number_courses: integer (nullable = true)
 |-- time_study: double (nullable = true)
 |-- Marks: double (nullable = true)



In [None]:
# Step 4: Preview the data - print a heading, then the first five rows
print("First five rows:")
data.show(n=5)

First five rows:
+--------------+----------+------+
|number_courses|time_study| Marks|
+--------------+----------+------+
|             3|     4.508|19.202|
|             4|     0.096| 7.734|
|             4|     3.133|13.811|
|             6|     7.909|53.018|
|             8|     7.811|55.299|
+--------------+----------+------+
only showing top 5 rows



In [None]:
# Step 5: Perform data analysis using Spark SQL

# Make the DataFrame queryable from SQL under the view name "student_marks"
# (an existing view with that name is replaced).
data.createOrReplaceTempView("student_marks")

# Total number of students: counts every row of the view, i.e. it treats
# each row as one student.
total_students = spark.sql(
    "SELECT COUNT(*) AS TotalStudents FROM student_marks"
)
total_students.show()

+-------------+
|TotalStudents|
+-------------+
|          100|
+-------------+



In [None]:
# Average marks for each distinct value of number_courses.
# The view has no subject column; number_courses is the grouping key here.
# ORDER BY makes the row order stable across runs - GROUP BY on its own
# returns the groups in no guaranteed order.
average_marks_subject = spark.sql("""
    SELECT number_courses, AVG(Marks) AS AverageMarks
    FROM student_marks
    GROUP BY number_courses
    ORDER BY number_courses
""")
average_marks_subject.show()

+--------------+------------------+
|number_courses|      AverageMarks|
+--------------+------------------+
|             6|29.863062499999998|
|             3|18.433318181818183|
|             5|17.641000000000002|
|             4| 19.02995238095238|
|             8|33.835375000000006|
|             7|29.401466666666668|
+--------------+------------------+



In [None]:
# Minimum and maximum marks for each distinct value of number_courses.
# The view has no subject column; number_courses is the grouping key here.
# ORDER BY makes the row order stable across runs - GROUP BY on its own
# returns the groups in no guaranteed order.
min_max_marks = spark.sql("""
    SELECT number_courses, MIN(Marks) AS MinMarks, MAX(Marks) AS MaxMarks
    FROM student_marks
    GROUP BY number_courses
    ORDER BY number_courses
""")
min_max_marks.show()

+--------------+--------+--------+
|number_courses|MinMarks|MaxMarks|
+--------------+--------+--------+
|             6|  10.522|  53.018|
|             3|   5.609|  43.978|
|             5|   9.333|  36.746|
|             4|   7.336|  44.099|
|             8|  15.038|  55.299|
|             7|  12.027|  54.321|
+--------------+--------+--------+



In [None]:
# Number of students (rows) for each distinct value of number_courses.
# number_courses is a per-row count of courses, not a course identifier, so
# this is "students taking N courses" rather than "students in course X".
# ORDER BY makes the row order stable across runs - GROUP BY on its own
# returns the groups in no guaranteed order.
students_per_course = spark.sql("""
    SELECT number_courses, COUNT(*) AS TotalStudents
    FROM student_marks
    GROUP BY number_courses
    ORDER BY number_courses
""")
students_per_course.show()

+--------------+-------------+
|number_courses|TotalStudents|
+--------------+-------------+
|             6|           16|
|             3|           22|
|             5|           10|
|             4|           21|
|             8|           16|
|             7|           15|
+--------------+-------------+

