<a href="https://colab.research.google.com/github/Mondin0/data-eng/blob/main/CeL_Data_Eng_Delta_lake_parte_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Delta Lake
## Aspectos adicionales y avanzados

In [None]:
!pip install deltalake
!pip install pyarrow

Collecting deltalake
  Downloading deltalake-0.25.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)
Downloading deltalake-0.25.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (44.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deltalake
Successfully installed deltalake-0.25.4


In [None]:
from deltalake import DeltaTable, write_deltalake
from deltalake.table import TableOptimizer
import pyarrow as pa
import pandas as pd

### Inicializar tabla

Crear una tabla o archivo Delta Lake vacío con el método [create](https://delta-io.github.io/delta-rs/api/delta_table/#deltalake.DeltaTable.create)

Es útil para inicializar la tabla o archivo con el esquema deseado, el particionado a aplicar si corresponde, metadatos adicionales, [configuraciones propias](https://docs.delta.io/latest/table-properties.html) de Delta Lake, incluso constraints, etc.

In [None]:
# Print the signature and docstring of DeltaTable.create to review the options
# it accepts before calling it in the next cell.
help(DeltaTable.create)

Help on method create in module deltalake.table:

create(table_uri: Union[str, pathlib.Path], schema: Union[pyarrow.lib.Schema, deltalake._internal.Schema], mode: Literal['error', 'append', 'overwrite', 'ignore'] = 'error', partition_by: Union[List[str], str, NoneType] = None, name: Optional[str] = None, description: Optional[str] = None, configuration: Optional[Mapping[str, Optional[str]]] = None, storage_options: Optional[Dict[str, str]] = None, custom_metadata: Optional[Dict[str, str]] = None, raise_if_key_not_exists: bool = True) -> 'DeltaTable' class method of deltalake.table.DeltaTable
    `CREATE` or `CREATE_OR_REPLACE` a delta table given a table_uri.
    
    Args:
        table_uri: URI of a table
        schema: Table schema
        mode: How to handle existing data. Default is to error if table already exists.
            If 'append', returns not support error if table exists.
            If 'overwrite', will `CREATE_OR_REPLACE` table.
            If 'ignore', will not do a

In [None]:
# Create the products table empty (no rows yet), declaring its layout up front.

# Column names and Arrow data types, in the order they are stored.
products_schema = pa.schema([
    ("product_id", pa.int64()),
    ("product_name", pa.string()),
    ("product_category", pa.string()),
    ("product_description", pa.string()),
    ("product_brand", pa.string()),
])

# Optional free-form tags; they are recorded on the CREATE TABLE commit
# and show up as extra columns in the table history.
products_tags = {
    "domain": "retail",
    "source": "CRM",
    "data_owner": "Equipo de ventas"
}

# Delta table properties. Values are always passed as strings.
products_table_config = {
    "delta.appendOnly": "false",
    "delta.deletedFileRetentionDuration": "interval 7 day",
    "delta.logRetentionDuration": "interval 7 day",
    "delta.enableChangeDataFeed": "true",
    "delta.autoOptimize.optimizeWrite": "true"
}

# The default mode ("error") makes this call fail if the table already exists.
DeltaTable.create(
    "datalakehouse/bronze/retail/products",
    schema=products_schema,
    # Optional: one directory per distinct product_category value.
    partition_by=["product_category"],
    # Optional human-readable description stored in the table metadata.
    description="Tabla de datos crudos de productos de la tienda",
    custom_metadata=products_tags,
    configuration=products_table_config,
)

DeltaTable()

Una vez creada la tabla o archivo, se pueden escribir datos en ella

In [None]:
# First batch of products. The column order mirrors the schema the table was created with.
product_columns = [
    "product_id",
    "product_name",
    "product_category",
    "product_description",
    "product_brand",
]
first_batch_rows = [
    [1, "Laptop", "Computers", "Laptop de 15 pulgadas", "HP"],
    [2, "Mouse", "Computers", "Mouse inalámbrico", "Logitech"],
    [3, "Teclado", "Computers", "Teclado mecánico", "Razer"],
    [4, "Smartphone", "Phones", "Smartphone de gama media", "Samsung"],
    [5, "Smartwatch", "Wearables", "Smartwatch con GPS", "Apple"],
]
df_products = pd.DataFrame(data=first_batch_rows, columns=product_columns)

# Append the batch to the existing table with the Rust writer engine.
write_deltalake(
    "datalakehouse/bronze/retail/products",
    df_products,
    engine="rust",
    mode="append",
)

In [None]:
# Second batch of products, appended as a separate commit so the table
# gains another version to time-travel to.
product_columns = [
    "product_id",
    "product_name",
    "product_category",
    "product_description",
    "product_brand",
]
second_batch_rows = [
    [6, "Tablet", "Computers", "Tablet de 10 pulgadas", "Apple"],
    [7, "Monitor", "Computers", "Monitor de 24 pulgadas", "Samsung"],
]
df_products_new = pd.DataFrame(data=second_batch_rows, columns=product_columns)

# Append the batch to the existing table with the Rust writer engine.
write_deltalake(
    "datalakehouse/bronze/retail/products",
    df_products_new,
    engine="rust",
    mode="append",
)

In [None]:
# Open a handle on the products table; with no version argument it reads the latest commit.
products_dt = DeltaTable("datalakehouse/bronze/retail/products")

In [None]:
# Materialize the table as a pandas DataFrame and display it
# (fine here because the demo table only has a handful of rows).
products_dt.to_pandas()

Unnamed: 0,product_id,product_name,product_category,product_description,product_brand
0,6,Tablet,Computers,Tablet de 10 pulgadas,Apple
1,7,Monitor,Computers,Monitor de 24 pulgadas,Samsung
2,5,Smartwatch,Wearables,Smartwatch con GPS,Apple
3,1,Laptop,Computers,Laptop de 15 pulgadas,HP
4,2,Mouse,Computers,Mouse inalámbrico,Logitech
5,3,Teclado,Computers,Teclado mecánico,Razer
6,4,Smartphone,Phones,Smartphone de gama media,Samsung


### Time travel

In [None]:
# Show the commit log as a DataFrame: one row per table version, newest first.
pd.DataFrame(products_dt.history())

Unnamed: 0,timestamp,operation,operationParameters,clientVersion,operationMetrics,version,domain,source,data_owner
0,1730852617940,WRITE,"{'mode': 'Append', 'partitionBy': '[""product_c...",delta-rs.0.21.0,"{'execution_time_ms': 1, 'num_added_files': 1,...",2,,,
1,1730852587212,WRITE,"{'mode': 'Append', 'partitionBy': '[""product_c...",delta-rs.0.21.0,"{'execution_time_ms': 5, 'num_added_files': 3,...",1,,,
2,1730852505754,CREATE TABLE,"{'protocol': '{""minReaderVersion"":1,""minWriter...",delta-rs.0.21.0,,0,retail,CRM,Equipo de ventas


In [None]:
# Time travel: point the handle at version 0 (the CREATE TABLE commit), which has no rows yet.
# load_as_version changes products_dt in place, so it stays on this version until reloaded.
products_dt.load_as_version(0)
products_dt.to_pandas()

Unnamed: 0,product_id,product_name,product_category,product_description,product_brand


In [None]:
# Time travel: version 1 is the table right after the first append (five products).
# load_as_version changes products_dt in place, so it stays on this version until reloaded.
products_dt.load_as_version(1)
products_dt.to_pandas()

Unnamed: 0,product_id,product_name,product_category,product_description,product_brand
0,5,Smartwatch,Wearables,Smartwatch con GPS,Apple
1,1,Laptop,Computers,Laptop de 15 pulgadas,HP
2,2,Mouse,Computers,Mouse inalámbrico,Logitech
3,3,Teclado,Computers,Teclado mecánico,Razer
4,4,Smartphone,Phones,Smartphone de gama media,Samsung


In [None]:
# Time travel: version 2 is the table after the second append (all seven products).
# load_as_version changes products_dt in place, so it stays on this version until reloaded.
products_dt.load_as_version(2)
products_dt.to_pandas()

Unnamed: 0,product_id,product_name,product_category,product_description,product_brand
0,6,Tablet,Computers,Tablet de 10 pulgadas,Apple
1,7,Monitor,Computers,Monitor de 24 pulgadas,Samsung
2,5,Smartwatch,Wearables,Smartwatch con GPS,Apple
3,1,Laptop,Computers,Laptop de 15 pulgadas,HP
4,2,Mouse,Computers,Mouse inalámbrico,Logitech
5,3,Teclado,Computers,Teclado mecánico,Razer
6,4,Smartphone,Phones,Smartphone de gama media,Samsung


### Optimización

Cada escritura genera un nuevo archivo, esto puede llevar a una gran cantidad de archivos (small file problem), lo que puede ser ineficiente, por lo que se puede compactarlos para reducir la cantidad de los mismos, mejorar la eficiencia de lectura y escritura, y reducir el costo de almacenamiento.
Se utiliza el método [compact](https://delta-io.github.io/delta-rs/api/delta_table/delta_table_optimizer/#deltalake.table.TableOptimizer.compact)

In [None]:
# Re-open the table so the handle is on the latest version
# (the time-travel cells above left it pinned to an explicit one).
products_dt = DeltaTable("datalakehouse/bronze/retail/products")
# Merge the small files of each partition into larger ones; the returned
# metrics dict is displayed as the cell output.
products_dt.optimize.compact()

{'numFilesAdded': 1,
 'numFilesRemoved': 2,
 'filesAdded': '{"avg":2006.0,"max":2006,"min":2006,"totalFiles":1,"totalSize":2006}',
 'filesRemoved': '{"avg":1851.5,"max":1854,"min":1849,"totalFiles":2,"totalSize":3703}',
 'partitionsOptimized': 1,
 'numBatches': 2,
 'totalConsideredFiles': 4,
 'totalFilesSkipped': 2,
 'preserveInsertionOrder': True}

Esta operación genera un nuevo archivo, que contiene los datos compactados. Sin embargo, los archivos antiguos no son eliminados, por lo que se puede utilizar el método [vacuum](https://delta-io.github.io/delta-rs/api/delta_table/#deltalake.DeltaTable.vacuum) para eliminarlos.

In [None]:
# Physically delete the data files that the current table version no longer
# references, and return their paths.
# retention_hours=0 together with enforce_retention_duration=False removes them
# immediately, instead of honouring the 7-day "delta.deletedFileRetentionDuration"
# configured when the table was created. dry_run=False actually deletes the
# files rather than only listing the candidates.
# NOTE(review): presumably this breaks time travel to older versions that used
# the deleted files — acceptable for a demo, confirm before doing it on real data.
products_dt.vacuum(retention_hours=0, dry_run=False, enforce_retention_duration=False)

['product_category=Computers/part-00001-6f0a8143-7799-4db2-a0b1-66cf8f84aa89-c000.snappy.parquet',
 'product_category=Computers/part-00001-5f0b1c94-3c2e-4f7f-834e-4a49a2cb0bc8-c000.snappy.parquet']

In [None]:
# Check that the optimization steps (compact + vacuum) did not lose any rows.
products_dt.to_pandas()

Unnamed: 0,product_id,product_name,product_category,product_description,product_brand
0,1,Laptop,Computers,Laptop de 15 pulgadas,HP
1,2,Mouse,Computers,Mouse inalámbrico,Logitech
2,3,Teclado,Computers,Teclado mecánico,Razer
3,6,Tablet,Computers,Tablet de 10 pulgadas,Apple
4,7,Monitor,Computers,Monitor de 24 pulgadas,Samsung
5,5,Smartwatch,Wearables,Smartwatch con GPS,Apple
6,4,Smartphone,Phones,Smartphone de gama media,Samsung


In [None]:
# Show the commit log again: the OPTIMIZE and VACUUM operations appear as new
# versions on top of the earlier writes.
pd.DataFrame(products_dt.history())

Unnamed: 0,timestamp,operation,operationParameters,operationMetrics,clientVersion,version,readVersion,source,data_owner,domain
0,1730852961526,VACUUM END,{'status': 'COMPLETED'},"{'numDeletedFiles': 2, 'numVacuumedDirectories...",delta-rs.0.21.0,5,,,,
1,1730852961524,VACUUM START,"{'defaultRetentionMillis': '604800000', 'reten...","{'numFilesToDelete': 2, 'sizeOfDataToDelete': ...",delta-rs.0.21.0,4,,,,
2,1730852915932,OPTIMIZE,"{'predicate': '[]', 'targetSize': '104857600'}","{'filesAdded': '{""avg"":2006.0,""max"":2006,""min""...",delta-rs.0.21.0,3,2.0,,,
3,1730852617940,WRITE,"{'mode': 'Append', 'partitionBy': '[""product_c...","{'execution_time_ms': 1, 'num_added_files': 1,...",delta-rs.0.21.0,2,,,,
4,1730852587212,WRITE,"{'partitionBy': '[""product_category""]', 'mode'...","{'execution_time_ms': 5, 'num_added_files': 3,...",delta-rs.0.21.0,1,,,,
5,1730852505754,CREATE TABLE,"{'protocol': '{""minReaderVersion"":1,""minWriter...",,delta-rs.0.21.0,0,,CRM,Equipo de ventas,retail


### Schema evolution

In [None]:
# A new product carrying an extra column (product_price) that the table schema does not have yet.
priced_columns = [
    "product_id",
    "product_name",
    "product_category",
    "product_description",
    "product_brand",
    "product_price",
]
priced_rows = [
    [20, "Smartphone", "Phones", "Smartphone de gama alta", "Apple", 999.99],
]
df_products_new_2 = pd.DataFrame(data=priced_rows, columns=priced_columns)

# schema_mode="merge" lets the append add the new column to the table schema
# instead of rejecting the write; rows written earlier read back with a
# missing value in product_price.
write_deltalake(
    "datalakehouse/bronze/retail/products",
    df_products_new_2,
    engine="rust",
    schema_mode="merge",
    mode="append",
)

In [None]:
# Open a fresh handle so the read reflects the merged schema, then load every row;
# the result includes the new product_price column.
DeltaTable("datalakehouse/bronze/retail/products").to_pandas()

Unnamed: 0,product_id,product_name,product_description,product_brand,product_category,product_price
0,20,Smartphone,Smartphone de gama alta,Apple,Phones,999.99
1,1,Laptop,Laptop de 15 pulgadas,HP,Computers,
2,2,Mouse,Mouse inalámbrico,Logitech,Computers,
3,3,Teclado,Teclado mecánico,Razer,Computers,
4,1,Laptop,Laptop de 15 pulgadas,HP,Computers,
5,2,Mouse,Mouse inalámbrico,Logitech,Computers,
6,3,Teclado,Teclado mecánico,Razer,Computers,
7,6,Tablet,Tablet de 10 pulgadas,Apple,Computers,
8,7,Monitor,Monitor de 24 pulgadas,Samsung,Computers,
9,6,Tablet,Tablet de 10 pulgadas,Apple,Computers,
