In [1]:
!pip install delta-spark==3.2.0 -q
import pyspark
from delta import *
from pyspark.sql.functions import *

# Create a SparkSession with Delta Lake extensions
# The '.config(...)' lines are crucial for enabling Delta Lake's features
builder = pyspark.sql.SparkSession.builder.appName("DeltaTutorial") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Get or create the SparkSession
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("Spark and Delta Lake are ready!")

Spark and Delta Lake are ready!


In [2]:
# Build a small in-memory DataFrame to persist as a managed Delta table
data = [("Amit", 28), ("Priya", 32), ("Rahul", 25)]
df = spark.createDataFrame(data, ["name", "age"])

# Save as a managed table in Delta format.
# mode("overwrite") keeps this cell re-runnable: the default save mode
# (error if exists) raises when managed_people is already present.
df.write.format("delta").mode("overwrite").saveAsTable("managed_people")

# Show the table contents
spark.sql("SELECT * FROM managed_people").show()

# DESCRIBE DETAIL returns a single row describing the table; read its
# 'location' column to see where the table's files were written.
location = spark.sql("DESCRIBE DETAIL managed_people").first()["location"]
print("Managed Table Location:", location)

+-----+---+
| name|age|
+-----+---+
|Priya| 32|
|Rahul| 25|
| Amit| 28|
+-----+---+

Managed Table Location: file:/content/spark-warehouse/managed_people


In [4]:
# Drop the managed table. IF EXISTS makes the cell safe to re-run or to
# execute when the table was never created in this session.
spark.sql("DROP TABLE IF EXISTS managed_people")

DataFrame[]

In [6]:
# Mount Google Drive so files under MyDrive are reachable at /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Example: CSV file path in your Drive
csv_path = "/content/drive/MyDrive/students.csv"

# Read the CSV into a Spark DataFrame.
# header=true takes column names from the first line; inferSchema=true makes
# Spark detect column types so the marks columns load as numbers instead of
# strings (the CSV reader's default is to type every column as string).
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(csv_path)
)
df.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+-----+----+-------+-------+
| Name|Math|Science|English|
+-----+----+-------+-------+
| Amit|  78|     85|     74|
|Priya|  92|     89|     96|
|Kiran|  65|     70|     60|
|Anita|  88|     95|     90|
| Ravi|  55|     60|     58|
+-----+----+-------+-------+



In [7]:
# Build a small in-memory DataFrame to persist as an external Delta table
data = [("Amit", 28), ("Priya", 32), ("Rahul", 25)]
df = spark.createDataFrame(data, ["name", "age"])

# Path for storing external table data (you choose the location).
# This lives on Google Drive, so Drive must already be mounted at /content/drive.
external_path = "/content/drive/MyDrive/unmanaged_people"

# Write the Delta files to the chosen path.
# mode("overwrite") keeps the cell re-runnable: dropping an external table
# removes only its catalog entry, so files from an earlier run are still at
# this path and the default save mode would fail on them.
df.write.format("delta").mode("overwrite").save(external_path)

# Register an external (unmanaged) table that points at that path.
# IF NOT EXISTS avoids an error when the table is still registered.
spark.sql(f"""
    CREATE TABLE IF NOT EXISTS unmanaged_people
    USING DELTA
    LOCATION '{external_path}'
""")

# Show the table contents
spark.sql("SELECT * FROM unmanaged_people").show()

# DESCRIBE DETAIL returns a single row describing the table; its 'location'
# column should match external_path.
location = spark.sql("DESCRIBE DETAIL unmanaged_people").first()["location"]
print("Unmanaged Table Location:", location)


+-----+---+
| name|age|
+-----+---+
|Priya| 32|
|Rahul| 25|
| Amit| 28|
+-----+---+

Unmanaged Table Location: file:/content/drive/MyDrive/unmanaged_people


In [9]:
# Drop the external table's catalog entry. IF EXISTS makes the cell safe to
# re-run. The Delta files under the external path are left in place.
spark.sql("DROP TABLE IF EXISTS unmanaged_people")

DataFrame[]