In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.functions import col, lit

# Create Spark session
spark = SparkSession.builder.appName("Quiz3").getOrCreate()
print("Spark Session Created Successfully!")

# Read CSV file with header
df = spark.read.options(header='True').csv("StudentData.csv")
print("Basic DataFrame loaded:")
df.show()

Spark Session Created Successfully!
Basic DataFrame loaded:
+---+------+----------------+------+------+-----+--------------------+
|age|gender|            name|course|  roll|marks|               email|
+---+------+----------------+------+------+-----+--------------------+
| 28|Female| Hubert Oliveras|    DB| 02984|   59|Annika Hoffman_Na...|
| 29|Female|Toshiko Hillyard| Cloud| 12899|   62|Margene Moores_Ma...|
| 28|  Male|  Celeste Lollis|    PF| 21267|   45|Jeannetta Golden_...|
| 29|Female|    Elenore Choy|    DB| 32877|   29|Billi Clore_Mitzi...|
| 28|  Male|  Sheryll Towler|   DSA| 41487|   41|Claude Panos_Judi...|
| 28|  Male|  Margene Moores|   MVC| 52771|   32|Toshiko Hillyard_...|
| 28|  Male|     Neda Briski|   OOP| 61973|   69|Alberta Freund_El...|
| 28|Female|    Claude Panos| Cloud| 72409|   85|Sheryll Towler_Al...|
| 28|  Male|  Celeste Lollis|   MVC| 81492|   64|Nicole Harwood_Cl...|
| 29|  Male|  Cordie Harnois|   OOP| 92882|   51|Judie Chipps_Clem...|
| 29|Female|     

In [2]:
# total number of students enrolled in each course
course_count_df = df.groupBy("course").count()
print("Total number of students enrolled in each course:")
course_count_df.show()

Total number of students enrolled in each course:
+------+-----+
|course|count|
+------+-----+
|    PF|  166|
|    DB|  157|
|   MVC|  157|
|   DSA|  176|
| Cloud|  192|
|   OOP|  152|
+------+-----+



In [3]:
# total number of male and female students in each course
gender_course_count_df = df.groupBy("course", "gender").count()
print("Total number of male and female students in each course:")
gender_course_count_df.show()

Total number of male and female students in each course:
+------+------+-----+
|course|gender|count|
+------+------+-----+
|   OOP|  Male|   70|
|    DB|  Male|   82|
| Cloud|Female|  106|
|   MVC|  Male|   86|
|   DSA|Female|   98|
|    PF|  Male|   97|
|   MVC|Female|   71|
| Cloud|  Male|   86|
|    PF|Female|   69|
|   DSA|  Male|   78|
|    DB|Female|   75|
|   OOP|Female|   82|
+------+------+-----+



In [9]:
# total marks achieved in each course by gender
df_with_int_marks = df.withColumn("marks", col("marks").cast("int"))
marks_course_gender_df = df_with_int_marks.groupBy("course", "gender").sum("marks")
print("Total marks achieved in each course by gender:")
marks_course_gender_df.show()

Total marks achieved in each course by gender:
+------+------+----------+
|course|gender|sum(marks)|
+------+------+----------+
|   OOP|  Male|      4234|
|    DB|  Male|      5073|
| Cloud|Female|      6316|
|   MVC|  Male|      5241|
|   DSA|Female|      6124|
|    PF|  Male|      5960|
|   MVC|Female|      4344|
| Cloud|  Male|      5127|
|    PF|Female|      3973|
|   DSA|  Male|      4826|
|    DB|Female|      4197|
|   OOP|Female|      4682|
+------+------+----------+



In [10]:
# min and max and average marks achieved in each course by each age group
age_course_stats_df = df_with_int_marks.groupBy("course", "age").agg(
    {"marks": "min", "marks": "max", "marks": "avg"}
)
print("Min, Max and Average marks achieved in each course by each age group:")
age_course_stats_df.show()

Min, Max and Average marks achieved in each course by each age group:
+------+---+------------------+
|course|age|        avg(marks)|
+------+---+------------------+
|   MVC| 29| 61.56470588235294|
| Cloud| 29|             61.25|
|   DSA| 28|  64.6867469879518|
|    DB| 29|59.346666666666664|
|    PF| 28| 63.75949367088607|
|    DB| 28| 58.76829268292683|
|   OOP| 29|59.729729729729726|
|   DSA| 29| 60.01075268817204|
|   OOP| 28| 57.64102564102564|
|    PF| 29|56.275862068965516|
| Cloud| 28|             58.08|
|   MVC| 28| 60.44444444444444|
+------+---+------------------+

