In [0]:
# Dataset: Inline CSV – student_scores.csv
csv_data = """student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F
"""

dbutils.fs.put("dbfs:/tmp/student_scores.csv", csv_data, overwrite=True)

Wrote 234 bytes.


True

In [0]:
df = spark.read.option("header", True).option("inferSchema", True).csv("dbfs:/tmp/student_scores.csv")
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



In [0]:
df.write.mode("overwrite").format("delta").save("/tmp/delta/student_scores")

In [0]:
spark.sql("DROP TABLE IF EXISTS student_scores")

DataFrame[]

In [0]:
spark.sql("CREATE TABLE default.student_scores USING DELTA")

DataFrame[]

In [0]:
from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "/tmp/delta/student_scores")
df = delta_table.toDF()
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



In [0]:
# 1. Show all students and their scores.
df = spark.read.format("delta").load("/tmp/delta/student_scores")
df.createOrReplaceTempView("student_scores")

spark.sql("SELECT name, score FROM student_scores").show()

+-----+-----+
| name|score|
+-----+-----+
|Ankit|   85|
|Divya|   92|
|Rahul|   78|
|Sneha|   65|
|Aryan|   55|
| Isha|   88|
|Tanvi|   91|
|Kunal|   72|
|Megha|   60|
|Rohan|   40|
+-----+-----+



In [0]:
# 2. Count number of students in each subject.
spark.sql("""
SELECT subject, COUNT(*) AS student_count
FROM student_scores
GROUP BY subject
""").show()

+-------+-------------+
|subject|student_count|
+-------+-------------+
|Science|            3|
|   Math|            4|
|English|            3|
+-------+-------------+



In [0]:
# 3. Find average score per subject.
spark.sql("""
SELECT subject, ROUND(AVG(score), 2) AS avg_score
FROM student_scores
GROUP BY subject
""").show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     73.0|
|   Math|    70.25|
|English|    75.33|
+-------+---------+



In [0]:
# 4. List all students who scored more than 80.
spark.sql("SELECT * FROM student_scores WHERE score > 80").show()
     

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
+----------+-----+-------+-----+-----+



In [0]:
# 5. Show student(s) with the highest score in each subject.
spark.sql("""SELECT subject, name, score FROM (SELECT *, RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rnk FROM student_scores) WHERE rnk = 1""").show()
     

+-------+-----+-----+
|subject| name|score|
+-------+-----+-----+
|English| Isha|   88|
|   Math|Tanvi|   91|
|Science|Divya|   92|
+-------+-----+-----+



In [0]:
# 6. Display grades with count of students in each.
spark.sql("""SELECT grade, COUNT(*) AS student_count FROM student_scores GROUP BY grade""").show()
     

+-----+-------------+
|grade|student_count|
+-----+-------------+
|    F|            1|
|    B|            2|
|    D|            1|
|    C|            2|
|    A|            4|
+-----+-------------+



In [0]:
# 7. Show names of students who failed (grade F).
spark.sql("""SELECT * FROM student_scores WHERE grade = 'F'""").show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



In [0]:
# 8. List students with score between 60 and 90.
spark.sql("""SELECT * FROM student_scores where score BETWEEN 60 AND 90""").show()
     

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         6| Isha|English|   88|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
+----------+-----+-------+-----+-----+



In [0]:
# 9. Rank students within each subject based on scores.
spark.sql("""SELECT subject, name, score, RANK() OVER (PARTITION BY subject ORDER BY score DESC) AS rank FROM student_scores""").show()
     

+-------+-----+-----+----+
|subject| name|score|rank|
+-------+-----+-----+----+
|English| Isha|   93|   1|
|English|Rahul|   83|   2|
|English|Megha|   65|   3|
|   Math|Tanvi|   91|   1|
|   Math|Ankit|   85|   2|
|   Math|Sneha|   65|   3|
|Science|Divya|   92|   1|
|Science|Kunal|   72|   2|
|Science|Aryan|   55|   3|
+-------+-----+-----+----+



In [0]:
# 10. Increase score of all English subject students by 5.
spark.sql("""UPDATE student_scores SET score = score+5 WHERE subject = 'English'""")
spark.sql("""SELECT * FROM student_scores""").show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|        10|Rohan|   Math|   40|    F|
|         3|Rahul|English|   83|    B|
|         6| Isha|English|   93|    A|
|         9|Megha|English|   65|    C|
+----------+-----+-------+-----+-----+



In [0]:
# 11. Delete all records where score is less than 50.
spark.sql("""DELETE FROM student_scores WHERE score < 50""")
spark.sql("""SELECT * FROM student_scores""")

DataFrame[student_id: int, name: string, subject: string, score: int, grade: string]

In [0]:
# 12. Add a new column pass_status (PASS if score >= 50 else FAIL)
df = spark.sql("""
SELECT *, 
       CASE WHEN score >= 50 THEN 'PASS' ELSE 'FAIL' END AS pass_status
FROM student_scores
""").show()

+----------+-----+-------+-----+-----+-----------+
|student_id| name|subject|score|grade|pass_status|
+----------+-----+-------+-----+-----+-----------+
|         1|Ankit|   Math|   85|    A|       PASS|
|         2|Divya|Science|   92|    A|       PASS|
|         3|Rahul|English|   78|    B|       PASS|
|         4|Sneha|   Math|   65|    C|       PASS|
|         5|Aryan|Science|   55|    D|       PASS|
|         6| Isha|English|   88|    A|       PASS|
|         7|Tanvi|   Math|   91|    A|       PASS|
|         8|Kunal|Science|   72|    B|       PASS|
|         9|Megha|English|   60|    C|       PASS|
|        10|Rohan|   Math|   40|    F|       FAIL|
+----------+-----+-------+-----+-----+-----------+



In [0]:
# 13. Create a temporary view and run SQL to get average scores.
spark.sql("""
CREATE OR REPLACE TEMP VIEW student_scores_v1 AS
SELECT subject, ROUND(AVG(score), 2) AS avg_score
FROM student_scores
GROUP BY subject""")

DataFrame[]

In [0]:
# 14. Convert updated DataFrame into a new Delta table called student_scores_v2 .
delta_v2_path = "/tmp/delta_student_scores_v2"

if df is not None: df.write.format("delta").mode("overwrite").save(delta_v2_path)
print("New Delta table 'student_scores_v2' created.")


New Delta table 'student_scores_v2' created.


In [0]:
# 15. Write the final data to Parquet and JSON formats.
df.write.mode("overwrite").parquet("/tmp/student_scores_parquet")
df.write.mode("overwrite").json("/tmp/student_scores_json")

print("Data written to both Parquet and JSON formats.")

Data written to both Parquet and JSON formats.
