In [None]:
%%pyspark emr-s.EMR-new
import pyspark

In [None]:
%%pyspark emr-s.EMR-new
from pyspark.sql import SparkSession

# Step 1: Initialize the SparkSession
# getOrCreate() hands back the already-running session when one exists,
# otherwise it builds a new one with the app name given here.
spark = SparkSession.builder \
    .appName("Create and Display Table") \
    .getOrCreate()

# Step 2: Create a DataFrame
# Each tuple is one row; values line up positionally with `columns` below.
data = [
    (1, "John Doe", 30),
    (2, "Jane Smith", 25),
    (3, "Sam Brown", 35)
]

columns = ["id", "name", "age"]

df = spark.createDataFrame(data, schema=columns)

# Step 3: Create a Temporary Table/View
# The DataFrame API method is createOrReplaceTempView (capital "T", "TempView");
# it registers `df` under the name "people" so it can be queried with SQL,
# replacing any existing temp view of that name (safe to re-run this cell).
df.createOrReplaceTempView("people")

# Step 4: Query the Table
result = spark.sql("SELECT * FROM people")

# Step 5: Show the Data
result.show()

# Step 6 (Optional): Save as a Persistent Table (if required)
# Uncomment the following line to save to a Hive table or a database
# result.write.saveAsTable("people_table")  # Requires Hive support


# Create a table in Glue Database

In [None]:
%%pyspark emr-s.EMR-new
from pyspark.sql import SparkSession

# --- Session -----------------------------------------------------------------
# Obtain a Hive-enabled SparkSession so saveAsTable() below goes through the
# Hive-compatible metastore catalog.
# NOTE(review): the app name mentions Iceberg, but this cell writes plain Parquet.
spark = (
    SparkSession.builder
    .appName("Create and Display Iceberg Table")
    .config("spark.sql.catalogImplementation", "hive")
    .enableHiveSupport()
    .getOrCreate()
)

# --- Target table ------------------------------------------------------------
# Database and table the sample rows are written to.
glue_database = "glue_db"  # Replace with your Glue database name
glue_table = "demographics"

# --- Sample data -------------------------------------------------------------
# Column names first, then the rows; each tuple follows the column order.
columns = ["id", "name", "age"]

data = [
    (1, "John Doe", 30),
    (2, "Jane Smith", 25),
    (3, "Sam Brown", 35),
]

df = spark.createDataFrame(data, schema=columns)

# --- Write -------------------------------------------------------------------
# Persist the DataFrame as a Parquet-backed table named <database>.<table>;
# "overwrite" mode replaces existing contents, so re-running the cell is safe.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable(f"{glue_database}.{glue_table}")
)

# --- Read back ---------------------------------------------------------------
# Query the table that was just written and print its rows.
result = spark.sql(f"SELECT * FROM {glue_database}.{glue_table}")

result.show()


In [None]:
%%pyspark emr-s.EMR-new
from pyspark.sql import SparkSession

# Step 1: Initialize the SparkSession
# The builder chain is wrapped in parentheses instead of using backslash
# continuations: a backslash must be the very last character on its line, so a
# trailing comment after it is a SyntaxError. Parentheses allow inline comments.
spark = (
    SparkSession.builder
    .appName("Create and Display Glue Iceberg Table")
    .config("spark.sql.catalogImplementation", "hive")
    # Register a Spark catalog named "glue" implemented by Iceberg's SparkCatalog.
    .config("spark.sql.catalog.glue", "org.apache.iceberg.spark.SparkCatalog")
    # Back that catalog with Iceberg's GlueCatalog implementation.
    .config("spark.sql.catalog.glue.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog")
    # Warehouse location for the "glue" catalog -- replace with your own S3 path.
    .config("spark.sql.catalog.glue.warehouse", "s3://sagemaker-data-studio-tutorial/demographics_iceberg/")
    .enableHiveSupport()
    .getOrCreate()
)

# Step 2: Create a DataFrame
# Each tuple is one row; values line up positionally with `columns` below.
data = [
    (1, "John Doe", 30),
    (2, "Jane Smith", 25),
    (3, "Sam Brown", 35)
]

columns = ["id", "name", "age"]

df = spark.createDataFrame(data, schema=columns)

# Step 3: Write the DataFrame to an Iceberg table
glue_database = "glue_db"  # Replace with your Glue database name
glue_table = "demographics_iceberg_tbl"

# Step 4: Write the DataFrame to the Glue catalog as an Iceberg table
# The leading "glue." selects the catalog configured in Step 1; "overwrite"
# replaces any existing contents so the cell can be re-run.
df.write \
    .format("iceberg") \
    .mode("overwrite") \
    .saveAsTable(f"glue.{glue_database}.{glue_table}")

# Step 5: Query the Iceberg table
result = spark.sql(f"SELECT * FROM glue.{glue_database}.{glue_table}")

# Step 6: Show the Data
result.show()


In [None]:
%%pyspark emr-s.EMR-new
