# STEP 1


In [0]:
# Sample student score records used as the raw input for this notebook.
csv_data = """student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F
"""
# dbutils.fs.put writes the string to DBFS itself, so no local file handle is
# needed; opening one here would only leave an empty file on the driver.
dbutils.fs.put("dbfs:/tmp/student_scores.csv", csv_data, overwrite=True)


Wrote 234 bytes.


In [0]:
# Load the CSV, using the first row as column names and letting Spark infer
# the column types from the data.
df = spark.read.csv("/tmp/student_scores.csv", header=True, inferSchema=True)
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



# STEP 2

In [0]:
# Persist the loaded scores in Delta format, replacing whatever is already
# stored at this path.
df.write.save("/tmp/delta/student_scores", format="delta", mode="overwrite")

# STEP 3

In [0]:
from pyspark.sql import SparkSession
# Register the Delta files as a catalog table. DROP and CREATE use the same
# fully qualified name so that a re-run always removes exactly the table it is
# about to recreate, regardless of the session's current catalog/schema.
spark.sql("DROP TABLE IF EXISTS spark_catalog.default.student_scores")
spark.sql(
    "CREATE TABLE spark_catalog.default.student_scores "
    "USING DELTA LOCATION '/tmp/delta/student_scores'"
)

DataFrame[]

# BASIC TASKS

In [0]:
from delta.tables import DeltaTable
# Open the Delta table written earlier by its storage path and expose it as a
# DataFrame. `delta_path`, `delta_table` and `df` are reused by later cells.
delta_path = "/tmp/delta/student_scores"
delta_table = DeltaTable.forPath(spark, delta_path)
# `df` replaces the DataFrame that was read from the CSV above.
df = delta_table.toDF()
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
|        10|Rohan|   Math|   40|    F|
+----------+-----+-------+-----+-----+



In [0]:

# Basic exploration of the scores DataFrame; each query is preceded by a caption.
print("STUDENTS AND SCORES")
name_and_score = df.select("name", "score")
name_and_score.show()

print("NUMBER OF STUDENTS IN EACH SUBJECT")
by_subject = df.groupBy("subject")
by_subject.count().show()

print("AVERAGE SCORE PER SUBJECT")
# The dict form of agg() yields the same `avg(score)` column name.
by_subject.agg({"score": "avg"}).show()

print("STUDENTS WITH SCORE >80")
df.where("score > 80").show()



STUDENTS AND SCORES
+-----+-----+
| name|score|
+-----+-----+
|Ankit|   85|
|Divya|   92|
|Rahul|   78|
|Sneha|   65|
|Aryan|   55|
| Isha|   88|
|Tanvi|   91|
|Kunal|   72|
|Megha|   60|
|Rohan|   40|
+-----+-----+

NUMBER OF STUDENTS IN EACH SUBJECT
+-------+-----+
|subject|count|
+-------+-----+
|Science|    3|
|   Math|    4|
|English|    3|
+-------+-----+

AVERAGE SCORE PER SUBJECT
+-------+-----------------+
|subject|       avg(score)|
+-------+-----------------+
|Science|             73.0|
|   Math|            70.25|
|English|75.33333333333333|
+-------+-----------------+

STUDENTS WITH SCORE >80
+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         6| Isha|English|   88|    A|
|         7|Tanvi|   Math|   91|    A|
+----------+-----+-------+-----+-----+



# ADVANCED QUERIES

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

print("student(s) with the highest score in each subject")
# Rank rows inside each subject from highest to lowest score; tied scores
# receive the same rank, so several students can be rank 1.
windoww = Window.partitionBy("subject").orderBy(col("score").desc())
ranked_scores = df.withColumn("rank", rank().over(windoww))
ranked_scores.where(col("rank") == 1).show()

print("grades with count of students in each.")
grade_counts = df.groupBy("grade").count()
grade_counts.show()

print("names of students failed")
failed_students = df.where(col("grade") == 'F')
failed_students.select("name").show()

print("Students with score between 60 and 90")
# between() is inclusive on both ends, matching >= 60 and <= 90.
df.where(col("score").between(60, 90)).show()

print("rank students within subject based on scores")
ranked_scores.show()

student(s) with the highest score in each subject
+----------+-----+-------+-----+-----+----+
|student_id| name|subject|score|grade|rank|
+----------+-----+-------+-----+-----+----+
|         6| Isha|English|   88|    A|   1|
|         7|Tanvi|   Math|   91|    A|   1|
|         2|Divya|Science|   92|    A|   1|
+----------+-----+-------+-----+-----+----+

grades with count of students in each.
+-----+-----+
|grade|count|
+-----+-----+
|    F|    1|
|    B|    2|
|    D|    1|
|    C|    2|
|    A|    4|
+-----+-----+

names of students failed
+-----+
| name|
+-----+
|Rohan|
+-----+

Students with score between 60 and 90
+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         3|Rahul|English|   78|    B|
|         4|Sneha|   Math|   65|    C|
|         6| Isha|English|   88|    A|
|         8|Kunal|Science|   72|    B|
|         9|Megha|English|   60|    C|
+----------+-----+--

# UPDATE & DELETE TASKS

In [0]:
from pyspark.sql.functions import when
# Work on a freshly opened DeltaTable handle. The name `delta_table` is rebound
# to a DataFrame at the end of this cell, so calling .update()/.delete() on it
# would fail the second time the cell is run.
scores_delta = DeltaTable.forPath(spark, delta_path)

print("ADDING +5 TO ENGLISH")
# NOTE: this modifies the stored table; every run adds another 5 points.
scores_delta.update(
    condition=col("subject") == "English",
    set={"score": col("score") + 5},
)
scores_delta.toDF().show()

print("AFTER DELETION")
# Permanently removes rows with a score below 50 from the stored table.
scores_delta.delete(condition=col("score") < 50)
scores_delta.toDF().show()

print("PASS STATUS")
# pass_status is derived on a DataFrame only; this cell does not write the new
# column back to the Delta table.
scores_with_status = scores_delta.toDF().withColumn(
    "pass_status",
    when(col("score") >= 50, "PASS").otherwise("FAIL"),
)
# Later cells read this DataFrame through the name `delta_table`
# (delta_table.show(), delta_table.write...), so keep that binding.
delta_table = scores_with_status
delta_table.show()

ADDING +5 TO ENGLISH
+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
|        10|Rohan|   Math|   40|    F|
|         3|Rahul|English|   83|    B|
|         6| Isha|English|   93|    A|
|         9|Megha|English|   65|    C|
+----------+-----+-------+-----+-----+

AFTER DELETION
+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         3|Rahul|English|   83|    B|
|         6| Isha|English|   93|    A|
|         9|Megha|English|   65|    C|
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math| 

# TRANSFORMATION AND VIEWS

In [0]:
# `df` was created from the Delta table via toDF(); the output below reflects
# the table after the earlier update and delete.
df.show()

+----------+-----+-------+-----+-----+
|student_id| name|subject|score|grade|
+----------+-----+-------+-----+-----+
|         3|Rahul|English|   83|    B|
|         6| Isha|English|   93|    A|
|         9|Megha|English|   65|    C|
|         1|Ankit|   Math|   85|    A|
|         2|Divya|Science|   92|    A|
|         4|Sneha|   Math|   65|    C|
|         5|Aryan|Science|   55|    D|
|         7|Tanvi|   Math|   91|    A|
|         8|Kunal|Science|   72|    B|
+----------+-----+-------+-----+-----+



In [0]:
# Expose the scores DataFrame to Spark SQL under a temporary view name, then
# compute the per-subject average with a SQL query.
df.createOrReplaceTempView("student_scores_view")
print("AVERAGE SCORES")
spark.sql("""
    SELECT subject, avg(score)
    FROM student_scores_view
    GROUP BY subject
""").show()



VERAGE SCORES
+-------+-----------------+
|subject|       avg(score)|
+-------+-----------------+
|Science|             73.0|
|   Math|80.33333333333333|
|English|80.33333333333333|
+-------+-----------------+



In [0]:
# At this point `delta_table` holds the DataFrame with the added pass_status
# column (assigned in the update/delete cell), not a DeltaTable object.
delta_table.show()

+----------+-----+-------+-----+-----+-----------+
|student_id| name|subject|score|grade|pass_status|
+----------+-----+-------+-----+-----+-----------+
|         3|Rahul|English|   83|    B|       PASS|
|         6| Isha|English|   93|    A|       PASS|
|         9|Megha|English|   65|    C|       PASS|
|         1|Ankit|   Math|   85|    A|       PASS|
|         2|Divya|Science|   92|    A|       PASS|
|         4|Sneha|   Math|   65|    C|       PASS|
|         5|Aryan|Science|   55|    D|       PASS|
|         7|Tanvi|   Math|   91|    A|       PASS|
|         8|Kunal|Science|   72|    B|       PASS|
+----------+-----+-------+-----+-----+-----------+



In [0]:
# Save the DataFrame that includes pass_status as a second Delta table,
# replacing any previous contents, then read it back to confirm the write.
v2_path = "/tmp/delta/student_scores_v2"
delta_table.write.save(v2_path, format="delta", mode="overwrite")

delta_v2 = DeltaTable.forPath(spark, v2_path)
print("student_scores_v2 Delta table:")
delta_v2.toDF().show()

student_scores_v2 Delta table:
+----------+-----+-------+-----+-----+-----------+
|student_id| name|subject|score|grade|pass_status|
+----------+-----+-------+-----+-----+-----------+
|         3|Rahul|English|   83|    B|       PASS|
|         6| Isha|English|   93|    A|       PASS|
|         9|Megha|English|   65|    C|       PASS|
|         1|Ankit|   Math|   85|    A|       PASS|
|         2|Divya|Science|   92|    A|       PASS|
|         4|Sneha|   Math|   65|    C|       PASS|
|         5|Aryan|Science|   55|    D|       PASS|
|         7|Tanvi|   Math|   91|    A|       PASS|
|         8|Kunal|Science|   72|    B|       PASS|
+----------+-----+-------+-----+-----+-----------+



In [0]:
# Export the same DataFrame in two additional formats; existing output at
# either path is overwritten.
delta_table.write.parquet("/tmp/output/student_scores_parquet", mode="overwrite")
delta_table.write.json("/tmp/output/student_scores_json", mode="overwrite")

print("Data saved in Parquet and JSON formats.")


Data saved in Parquet and JSON formats.
