# CONFIGURACION DELTA SPARK

In [1]:
import os
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

def ensure_directories_exist(warehouse_dir, metastore_db_path):
    """
    Ensures the necessary directories for the warehouse and metastore exist.

    Parameters:
        warehouse_dir (str): Path to the warehouse directory (Spark catalog).
        metastore_db_path (str): Path to the metastore database.
    """
    os.makedirs(warehouse_dir, exist_ok=True)
    os.makedirs(os.path.dirname(metastore_db_path), exist_ok=True)

def create_spark_session(app_name="DeltaCatalog", warehouse_dir="./warehouse-spark/spark_catalog", 
                         metastore_db_path="./warehouse-spark/metastore_db"):
    """
    Creates and initializes a SparkSession with Delta Lake support and persistent metastore.

    Parameters:
        app_name (str): Name of the Spark application.
        warehouse_dir (str): Path to the Spark catalog warehouse directory.
        metastore_db_path (str): Path to the persistent metastore database (Derby).

    Returns:
        SparkSession: Configured SparkSession instance.
    """
    # Ensure required directories exist
    ensure_directories_exist(warehouse_dir, metastore_db_path)
    
    # Configure SparkSession with Delta Lake
    builder = SparkSession.builder \
        .appName(app_name) \
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.sql.parquet.compression.codec", "gzip") \
        .config("spark.databricks.delta.optimizeWrite.enabled", "true") \
        .config("spark.databricks.delta.autoCompact.enabled", "true") \
        .config("spark.sql.warehouse.dir", os.path.abspath(warehouse_dir)) \
        .config("javax.jdo.option.ConnectionURL", f"jdbc:derby:{os.path.abspath(metastore_db_path)};create=true") \
        .config("spark.sql.catalogImplementation", "hive") \
        .enableHiveSupport()
    
    # Initialize Spark with Delta
    spark = configure_spark_with_delta_pip(builder).getOrCreate()
    
    print(f"SparkSession created with Delta and persistent metastore at: {warehouse_dir}")
    return spark

# SESION DE SPARK

In [2]:
# Example usage
app_name = 'Delta Spark'
# Ruta para el directorio del metastore
warehouse_dir = "./warehouse-spark/spark_catalog"
metastore_db_path = "./warehouse-spark/metastore_db"

spark_session = create_spark_session(
    app_name=app_name,
    warehouse_dir=warehouse_dir,
    metastore_db_path=metastore_db_path
)



:: loading settings :: url = jar:file:/usr/local/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f9258edd-d8b9-4842-b9b1-c275b1a5a874;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.2.1 in central
	found io.delta#delta-storage;3.2.1 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 157ms :: artifacts dl 8ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.2.1 from central in [default]
	io.delta#delta-storage;3.2.1 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   

SparkSession created with Delta and persistent metastore at: ./warehouse-spark/spark_catalog


# [SPARK USER INTERFACE](http://localhost:4040/)

[CLICK HERE](http://localhost:4040/)

# CREAR DATABASE AND DELTA TABLES

In [9]:
def create_database_schema(spark_session, database_name):
    """
    Creates a database if it does not already exist.

    Parameters:
        spark_session (SparkSession): The active SparkSession.
        database_name (str): The name of the database to be created.
    """
    spark_session.sql(f"CREATE DATABASE IF NOT EXISTS {database_name}")
    print(f"Database '{database_name}' created or already exists.")

def list_databases(spark_session):
    """
    Lists all the databases in the Spark catalog.

    Parameters:
        spark_session (SparkSession): The active Spark session.
    """
    databases = spark_session.catalog.listDatabases()
    print('The following databases are present in the Spark catalog:')

    for db in databases:
        print(db.name)

# Ejemplo de uso
database_name = 'delta_spark_database'
create_database(spark_session, database_name)

print()

list_databases(spark_session)

Database 'delta_spark_database' created or already exists.

The following databases are present in the Spark catalog:
default
delta_spark_database
delta_spark_schema


# CREATE SPARK DATAFRAME

In [10]:
def create_dataframe_from_list_dict_using_alphabetical_order_from_columns(spark_session, list_data_dict):
    """
    Creates a Spark DataFrame from a list of dictionaries, reordering columns in alphabetical order.

    Parameters:
        spark_session (SparkSession): The active Spark session.
        list_data_dict (list): A list of dictionaries, where each dictionary represents a row.

    Returns:
        DataFrame: The created Spark DataFrame with columns in alphabetical order.
    """
    if list_data_dict:
        df = spark_session.createDataFrame(list_data_dict)
        return df
    else:
        raise ValueError("The input list is empty.")

def create_dataframe_from_list_dict(spark_session, list_data_dict):
    """
    Creates a Spark DataFrame from a list of dictionaries, preserving the order of the keys.

    Parameters:
        spark_session (SparkSession): The active Spark session.
        list_data_dict (list): A list of dictionaries, where each dictionary represents a row.

    Returns:
        DataFrame: The created Spark DataFrame with columns in the order of the keys.
    """
    if not list_data_dict:
        raise ValueError("The input list is empty.")

    # Get the order of keys from the first dictionary
    columns_order = list(list_data_dict[0].keys())

    # Create the DataFrame and reorder columns
    df = spark_session.createDataFrame(list_data_dict)
    df = df.select(*columns_order)  # Reorder columns explicitly

    return df

def split_spark_dataframe(spark_dataframe, num_parts):
    """
    Splits a Spark DataFrame into the specified number of parts, ensuring each part has at least one row.
    If the requested number of parts exceeds the total rows, it creates as many balanced parts as possible.

    Parameters:
        spark_dataframe (DataFrame): The Spark DataFrame to be split.
        num_parts (int): spark_dataframeThe desired number of parts to split the DataFrame into.

    Returns:
        List[DataFrame]: A list containing the split DataFrames.
    """
    total_rows = spark_dataframe.count()

    if total_rows == 0:
        print("The DataFrame is empty. No parts created.")
        return []

    # Adjust number of parts if more parts are requested than rows
    actual_parts = min(num_parts, total_rows)

    # Calculate base rows per part and distribute remaining rows
    rows_per_part = total_rows // actual_parts
    extra_rows = total_rows % actual_parts

    split_dataframes = []
    start_row = 0

    for i in range(actual_parts):
        # Calculate rows for the current part
        rows_in_this_part = rows_per_part + (1 if i < extra_rows else 0)
        end_row = start_row + rows_in_this_part

        # Select the rows for the current part
        split_dataframes.append(
            spark_dataframe.limit(end_row).subtract(spark_dataframe.limit(start_row))
        )
        start_row = end_row  # Update start row for the next part

    print(f"Successfully created {len(split_dataframes)} DataFrames.")
    return split_dataframes

# Example of usage
list_data_dict = [
    {'id': 1, 'name': 'Nathalie', 'age': 0},
    {'id': 2, 'name': 'Cora', 'age': 3},
    {'id': 3, 'name': 'Gaby', 'age': 10},
    {'id': 4, 'name': 'Muneca', 'age': 42},
    {'id': 5, 'name': 'Principe', 'age': 46},
    {'id': 6, 'name': 'Ana', 'age': 25},
    {'id': 7, 'name': 'Cecilia', 'age': 30},
    {'id': 8, 'name': 'Lucia', 'age': 18},
    {'id': 9, 'name': 'Zeus', 'age': 5},
    {'id': 10, 'name': 'Guadalupe', 'age': 15},
    {'id': 11, 'name': 'Augusto', 'age': 28},
    {'id': 12, 'name': 'Muiscas', 'age': 23},
    {'id': 13, 'name': 'Jorge', 'age': 31}
]

# crea el datafarme
spark_dataframe = create_dataframe_from_list_dict(spark_session, list_data_dict)

spark_dataframe.show()

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  1| Nathalie|  0|
|  2|     Cora|  3|
|  3|     Gaby| 10|
|  4|   Muneca| 42|
|  5| Principe| 46|
|  6|      Ana| 25|
|  7|  Cecilia| 30|
|  8|    Lucia| 18|
|  9|     Zeus|  5|
| 10|Guadalupe| 15|
| 11|  Augusto| 28|
| 12|  Muiscas| 23|
| 13|    Jorge| 31|
+---+---------+---+



# DIVIDIR DATAFRAME

In [5]:
# divide el dataframe
num_parts = 2
df_sql, df_delta = split_spark_dataframe(spark_dataframe, num_parts)

df_sql.show()

df_delta.show()

Successfully created 2 DataFrames.
+---+--------+---+
| id|    name|age|
+---+--------+---+
|  1|Nathalie|  0|
|  2|    Cora|  3|
|  3|    Gaby| 10|
|  4|  Muneca| 42|
|  5|Principe| 46|
|  6|     Ana| 25|
|  7| Cecilia| 30|
+---+--------+---+

+---+---------+---+
| id|     name|age|
+---+---------+---+
|  8|    Lucia| 18|
|  9|     Zeus|  5|
| 10|Guadalupe| 15|
| 11|  Augusto| 28|
| 12|  Muiscas| 23|
| 13|    Jorge| 31|
+---+---------+---+



# CREAR TABLAS DELTA

In [12]:
def create_delta_table_with_spark_dataframe(spark_session, database_name, table_name, spark_dataframe, warehouse_dir):
    """
    Creates a Delta table in a specified database from a Spark DataFrame if it does not already exist.

    Parameters:
        spark_session (SparkSession): The active SparkSession.
        database_name (str): The name of the database where the table will be created.
        table_name (str): The name of the table to be created.
        spark_dataframe (DataFrame): The Spark DataFrame from which the schema is derived.
        warehouse_dir (str): The root directory for the warehouse.
    """
    # Validate that the database exists
    available_databases = [db.name for db in spark_session.catalog.listDatabases()]
    if database_name not in available_databases:
        raise ValueError(f"The database '{database_name}' does not exist. Available databases: {available_databases}")
    
    # Set the active database
    spark_session.sql(f"USE {database_name}")
    
    # Define the table path based on the database and table name
    table_path = f"{warehouse_dir}/{database_name}/{table_name}"
    
    # Get the schema of the DataFrame
    table_schema = ", ".join([f"{field.name} {field.dataType.simpleString()}" for field in spark_dataframe.schema.fields])
    
    # Create the Delta table
    spark_session.sql(f"""
        CREATE TABLE IF NOT EXISTS {table_name} ({table_schema})
        USING DELTA
        LOCATION '{table_path}'
    """)
    
    print(f"Table '{table_name}' created or already exists at '{table_path}' in database '{database_name}'.")

def save_dataframe_as_parquet(spark_dataframe, file_name, file_path):
    """
    Saves a Spark DataFrame to the specified path in Parquet format.

    Parameters:
        spark_dataframe (DataFrame): The Spark DataFrame to be saved.
        file_name (str): The name of the file (or dataset) to be created.
        file_path (str): The location where the file will be stored.
    """
    # Save the Spark DataFrame as a Parquet file at the specified location
    spark_dataframe.write.format("parquet").mode("overwrite").save(file_path)
    
    print(f"DataFrame saved as Parquet at '{file_path}/{file_name}.parquet'.")

def create_delta_table_in_database(spark_session, database_name, table_name, spark_dataframe, warehouse_dir):
    """
    Creates a Delta table in a specified database from a Spark DataFrame.

    Parameters:
        spark_session (SparkSession): The active SparkSession.
        database_name (str): The name of the database where the table will be created.
        table_name (str): The name of the table to be created.
        spark_dataframe (DataFrame): The Spark DataFrame to be written as a table.
        warehouse_dir (str): The root directory for the warehouse.
    """
    # Validate that the database exists
    available_databases = [db.name for db in spark_session.catalog.listDatabases()]
    if database_name not in available_databases:
        raise ValueError(f"The database '{database_name}' does not exist. Available databases: {available_databases}")
    
    # Set the active database
    spark_session.sql(f"USE {database_name}")
    
    # Define the full table name and table path
    full_table_name = f"{database_name}.{table_name}"
    table_path = f"{warehouse_dir}/{database_name}/{table_name}"
    
    # Save the DataFrame to the Delta table in the catalog
    spark_dataframe.write.format("delta").mode("overwrite").saveAsTable(full_table_name)
    
    print(f"Table '{table_name}' created or already exists in database '{database_name}' at '{table_path}'.")

# GUARDAR DELTA TABLE CON SQL

In [13]:
database_sql = 'default'
table_sql = 'delta_sql'
dataframe_sql = df_sql
sql_warehouse_dir = warehouse_dir

create_delta_table_with_spark_dataframe(spark_session, database_sql, table_sql, dataframe_sql, sql_warehouse_dir)

24/11/27 22:44:09 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`default`.`delta_sql` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
24/11/27 22:44:09 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
24/11/27 22:44:09 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
24/11/27 22:44:09 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/11/27 22:44:09 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


Table 'delta_sql' created or already exists at './warehouse-spark/spark_catalog/default/delta_sql' in database 'default'.


# GUARDAR DELTA TABLE CON DATAFRAME

In [14]:
database_delta = 'delta_spark_database'
table_delta = 'delta_dataframe'
dataframe_delta = df_delta
delta_warehouse_dir = warehouse_dir

create_delta_table_in_database(spark_session, database_delta, table_delta, dataframe_delta, delta_warehouse_dir)

                                                                                24/11/27 22:47:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                24/11/27 22:47:21 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`delta_spark_database`.`delta_dataframe` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


Table 'delta_dataframe' created or already exists in database 'delta_spark_database' at './warehouse-spark/spark_catalog/delta_spark_database/delta_dataframe'.


# GUARDAR DATAFRAME COMO PARQUET

In [15]:
parquet_dataframe = spark_dataframe
file_name = 'spark_dataframe.parquet'
file_path = './spark_files'

save_dataframe_as_parquet(spark_dataframe, file_name, file_path)

DataFrame saved as Parquet at './spark_files/spark_dataframe.parquet.parquet'.


# LISTAR LAS TABLAS DE LA BASE DE DATOS

In [22]:
def list_tables_in_database(spark_session, database_name):
    """
    Lists all tables in the specified database in Spark.

    Parameters:
        spark_session (SparkSession): The active SparkSession.
        database_name (str): The name of the database whose tables will be listed.

    Returns:
        list: A list of table names in the specified database.
    """
    # Set the current database to the specified database
    spark_session.sql(f"USE {database_name}")
    
    # List all tables in the database
    tables = spark_session.catalog.listTables(database_name)
    
    # Extract table names from the list of table objects
    table_names = [table.name for table in tables]

    for table in table_names:
        print(f'Database : {database_name}, table : {table}')
    
    return table_names

list_tables_in_database(spark_session, database_sql)

list_tables_in_database(spark_session, database_delta)

Database : default, table : delta_sql
Database : delta_spark_database, table : delta_dataframe


['delta_dataframe']