In [None]:

import pyspark
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Session builder with the Delta Lake SQL extension and catalog plugged in.
builder = (
    SparkSession.builder
    .appName("StudentScoresDelta")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Pass the builder through the delta-spark pip helper, then start the session
# (or attach to an already running one).
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()


In [None]:

# Inline sample data set: one row per student with subject, numeric score and letter grade.
csv_data = '''student_id,name,subject,score,grade
1,Ankit,Math,85,A
2,Divya,Science,92,A
3,Rahul,English,78,B
4,Sneha,Math,65,C
5,Aryan,Science,55,D
6,Isha,English,88,A
7,Tanvi,Math,91,A
8,Kunal,Science,72,B
9,Megha,English,60,C
10,Rohan,Math,40,F
'''

# Scratch location of the CSV; bound once so the write and the read cannot drift apart.
csv_path = "/tmp/student_scores.csv"

# Write as UTF-8 explicitly. Without `encoding`, open() uses the platform's locale
# encoding, whereas Spark's CSV reader decodes as UTF-8 unless told otherwise, so a
# non-ASCII name added to the sample data could otherwise be read back garbled.
with open(csv_path, "w", encoding="utf-8") as csv_file:
    csv_file.write(csv_data)

# First line is the header; column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(csv_path)
)
df.show()


In [None]:

# Persist `df` in Delta format, replacing whatever is already stored at the target path.
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.save("/tmp/delta/student_scores")


In [None]:

# Drop any existing catalog entry first so this cell can be re-run, then register
# the Delta files at the given location under the table name `student_scores`.
drop_stmt = "DROP TABLE IF EXISTS student_scores"
create_stmt = "CREATE TABLE student_scores USING DELTA LOCATION '/tmp/delta/student_scores'"
spark.sql(drop_stmt)
spark.sql(create_stmt)


In [None]:
# Project name, subject and score for every row of the table.
name_subject_score = spark.sql("SELECT name, subject, score FROM student_scores")
name_subject_score.show()

In [None]:
# Number of rows (students) per subject.
students_per_subject = spark.sql(
    "SELECT subject, COUNT(*) as student_count FROM student_scores GROUP BY subject"
)
students_per_subject.show()

In [None]:
# Mean score per subject.
avg_score_by_subject = spark.sql(
    "SELECT subject, AVG(score) as avg_score FROM student_scores GROUP BY subject"
)
avg_score_by_subject.show()

In [None]:
# Students whose score is strictly greater than 80.
high_scorers = spark.sql("SELECT name, score FROM student_scores WHERE score > 80")
high_scorers.show()

In [None]:

# Top scorer(s) per subject: rank rows within each subject by descending score and
# keep rank 1. RANK() gives tied rows the same rank, so ties for first are all returned.
top_scorer_query = """
SELECT subject, name, score FROM (
    SELECT *, RANK() OVER (PARTITION BY subject ORDER BY score DESC) as rnk 
    FROM student_scores
) WHERE rnk = 1
"""
spark.sql(top_scorer_query).show()


In [None]:
# Number of rows per letter grade.
grade_counts = spark.sql("SELECT grade, COUNT(*) as count FROM student_scores GROUP BY grade")
grade_counts.show()

In [None]:
# Names of students whose grade column is exactly 'F'.
failing_students = spark.sql("SELECT name FROM student_scores WHERE grade = 'F'")
failing_students.show()

In [None]:
# Students scoring from 60 to 90; SQL BETWEEN includes both bounds.
mid_range_scores = spark.sql(
    "SELECT name, score FROM student_scores WHERE score BETWEEN 60 AND 90"
)
mid_range_scores.show()

In [None]:

# Every student together with their rank inside their own subject (1 = highest score).
rank_within_subject_query = """
SELECT subject, name, score, RANK() OVER (PARTITION BY subject ORDER BY score DESC) as rank 
FROM student_scores
"""
spark.sql(rank_within_subject_query).show()


In [None]:

# `DeltaTable` is imported in the notebook's top import cell so that every
# dependency is visible in one place.

# Handle on the Delta table stored at this path. This is the same location the
# `student_scores` SQL table was created over, so SQL queries on that table read
# the same files.
deltaTable = DeltaTable.forPath(spark, "/tmp/delta/student_scores")

# In-place UPDATE: add 5 to the score of every row whose subject is 'English'.
# Not idempotent: each additional run of this cell adds another 5 points.
deltaTable.update(
    condition="subject = 'English'",
    set={"score": "score + 5"},
)


In [None]:
# Remove every row whose score is below 50 from the Delta table.
low_score_predicate = "score < 50"
deltaTable.delete(low_score_predicate)

In [None]:

# `when` and `col` are imported in the notebook's top import cell.

# Label each row PASS when score >= 50, FAIL otherwise.
pass_status_col = when(col("score") >= 50, "PASS").otherwise("FAIL")

# Read the Delta table's current contents and append the derived column in a single
# assignment, so re-running this cell always rebuilds `df` from the table.
# Note: this rebinds `df`, which previously held the frame loaded from the CSV file.
# In a top-to-bottom run the earlier delete of rows with score < 50 means every
# remaining row is labelled PASS.
df = deltaTable.toDF().withColumn("pass_status", pass_status_col)
df.show()


In [None]:
# Expose the current `df` to SQL under a temporary view name (replaces any existing view).
view_name = "student_view"
df.createOrReplaceTempView(view_name)

In [None]:
# Mean score per subject, computed over the temporary view rather than the table.
view_avg_by_subject = spark.sql(
    "SELECT subject, AVG(score) as avg_score FROM student_view GROUP BY subject"
)
view_avg_by_subject.show()

In [None]:

# Persist the current `df` as a second Delta directory, replacing previous contents.
v2_writer = df.write.format("delta").mode("overwrite")
v2_writer.save("/tmp/delta/student_scores_v2")

# Re-register the catalog entry: drop it if present, then create it over the new files.
drop_v2_stmt = "DROP TABLE IF EXISTS student_scores_v2"
create_v2_stmt = "CREATE TABLE student_scores_v2 USING DELTA LOCATION '/tmp/delta/student_scores_v2'"
spark.sql(drop_v2_stmt)
spark.sql(create_v2_stmt)


In [None]:

# Export the current `df` in two additional formats; each target directory is
# overwritten if it already exists. A fresh writer is taken from `df` for each format.
parquet_writer = df.write.mode("overwrite")
parquet_writer.parquet("/tmp/output/student_scores_parquet")

json_writer = df.write.mode("overwrite")
json_writer.json("/tmp/output/student_scores_json")
