In [None]:
from pyspark.sql import SparkSession

master_url = "spark://spark-master:7077"

# Build (or reuse) the SparkSession that attaches this notebook to the cluster.
#   * spark.driver.host - hostname the cluster uses to reach the driver running
#     in this notebook (presumably the Jupyter container's hostname; confirm
#     against the deployment).
#   * spark.sql.extensions / spark.sql.catalog.spark_catalog - enable Delta Lake's
#     SQL extension and Delta-aware session catalog. The Delta table cell further
#     down (`format("delta")` + `saveAsTable`) needs both; without them Delta
#     refuses the operation.
# NOTE(review): this assumes the Delta Lake jars are already on the driver and
# executor classpath (e.g. baked into the image or set via spark-defaults).
# If they are not, add a matching `spark.jars.packages` entry here.
spark = (
    SparkSession.builder
    .master(master_url)
    .appName("JupyterSparkCluster")
    .config("spark.driver.host", "jupyter-lab")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
    .getOrCreate()
)

# Get the SparkContext from the session and limit log output to warnings and above
sc = spark.sparkContext
sc.setLogLevel("WARN")

# You can check the Spark UI (http://localhost:4040 for the application UI)
# to see this application running.
print(f"Spark App Name: {sc.appName}")
print(f"Spark Master: {sc.master}")


In [None]:
# --- Smoke test: run a small job through the Spark session ---
# Build a three-row DataFrame from in-memory Python data (no external files needed)
columns = ["Name", "Age"]
data = [("Alice", 28), ("Bob", 35), ("Charlie", 42)]
df = spark.createDataFrame(data, schema=columns)

print("Sample DataFrame:")
df.show()

# Derive a new column from the existing "Age" column
df_older = df.withColumn("Age_in_10_years", df["Age"] + 10)

print("Transformed DataFrame:")
df_older.show()



In [None]:
# Persist a small DataFrame as a Delta table, then read it back
columns = ["Name", "Age"]
data = [("Alice", 28), ("Bob", 35), ("Charlie", 42)]
df = spark.createDataFrame(data, schema=columns)

# "overwrite" mode replaces the table's existing contents when the cell is re-run
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("test_delta_table")
)

# Read the table back by name to confirm the write succeeded
spark.read.table("test_delta_table").show()

In [None]:
# Stop the SparkSession when you are finished; this ends the application and
# releases the resources it holds on the cluster. After this call `spark` and
# `sc` are no longer usable - re-run the session-creation cell to start again.
spark.stop()