# 1. Configurar ambiente

In [0]:
# Read the notebook widgets that say where the dataset should be stored.
# The unpacking order matches the tuple of widget names below.
catalog, schema, table, volume = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("CATALOG", "SCHEMA", "TABLE", "VOLUME")
)

# Three-part table name: <catalog>.<schema>.<table>
location = ".".join((catalog, schema, table))
print(f"Saving Recipe dataset to: {location}")

# 2. Cargar dataset

In [0]:
# Requires kagglehub with its pandas extra; install it if missing:
#   pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# File to read from the Kaggle dataset
file_path = "RecipeNLG_dataset.csv"

# Kaggle dataset handle (<owner>/<dataset>)
dataset_handle = "paultimothymooney/recipenlg"

# Load the latest version as a pandas DataFrame and drop the "Unnamed: 0"
# column (presumably a leftover index column from the CSV -- TODO confirm).
# The read and the drop stay chained so the un-dropped frame is not kept alive.
raw = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    dataset_handle,
    file_path,
).drop(columns=["Unnamed: 0"])

# Write the DataFrame as CSV into the volume, without the pandas index
volume_csv_path = f"/Volumes/{catalog}/{schema}/{volume}/RecipeNLG_dataset.csv"
raw.to_csv(volume_csv_path, index=False)

In [0]:
# Preview the first five rows of the downloaded dataset
display(raw.head(n=5))

# 3. Guardar dataset como tabla Delta

In [0]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import ArrayType, StringType

# Upper bound on the number of rows copied into the Delta table.
MAX_ROWS = 1000

# Read back the CSV stored in the volume.
# - multiLine: quoted fields may span several physical lines.
# - quote / escape both set to '"': a doubled quote inside a quoted field is
#   read as a literal quote character.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("multiLine", "true")
    .option("quote", '"')
    .option("escape", '"')
    .load(f"/Volumes/{catalog}/{schema}/{volume}/RecipeNLG_dataset.csv")
    # Rows without a title cannot be keyed, so discard them.
    .dropna(subset=["title"])
    .limit(MAX_ROWS)
    # `title` is used as the table's key and Databricks does not enforce
    # primary-key uniqueness, so keep a single row per title here. This runs
    # after the limit to avoid de-duplicating the whole CSV; the table may
    # therefore hold slightly fewer than MAX_ROWS rows.
    .dropDuplicates(["title"])
)

display(df)

# Replace any previous contents (and schema) of the target table.
df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(location)
print("Dataset saved as a Delta table!")

# 4. Modificar metadatos

In [0]:
# Enable Change Data Feed on the table
spark.sql(f"ALTER TABLE {location} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)")

# Disallow NULL values in the key column
spark.sql(f"ALTER TABLE {location} ALTER COLUMN title SET NOT NULL")

# Remove a primary key left over from a previous run, if any, so that
# re-running this cell does not fail with "constraint already exists".
spark.sql(f"ALTER TABLE {location} DROP CONSTRAINT IF EXISTS title_pk")

# Declare the primary key. NOTE: Databricks treats primary keys as
# informational and does not enforce uniqueness; the data written to the
# table must already contain unique titles.
spark.sql(f"ALTER TABLE {location} ADD CONSTRAINT title_pk PRIMARY KEY (title)")