In [42]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col


# Get a Spark session; getOrCreate() returns the already-active session when one exists
spark = (
    SparkSession.builder
    .appName("Fabric_Lakehouse_Read_bronce")
    .getOrCreate()
)

# Location of the bronze-layer Parquet data
parquet_path = "Files/bronce/datos.parquet"

# Load the Parquet data into a DataFrame
df = (
    spark.read
    .format("parquet")
    .load(parquet_path)
)

# Preview the loaded rows
df.show()

# Report how many rows were loaded
print("Número de filas:", df.count())



StatementMeta(, 6b9a1872-50ac-457d-845a-740201bbd234, 44, Finished, Available, Finished)

+-------------+--------------------+--------+-----------+-------------------+---------------------+-----------+--------------------+
|           IP|                  ID|Variable|Measurement|          Timestamp|EventProcessedUtcTime|PartitionId|EventEnqueuedUtcTime|
+-------------+--------------------+--------+-----------+-------------------+---------------------+-----------+--------------------+
|198.202.124.3|HPWREN:LP-WXT536:...|      Ta|       6.1C|2024-11-02 15:30:36| 2024-11-02 15:52:...|          1|2024-11-02 15:52:...|
|198.202.124.3|HPWREN:LP-WXT536:...|      Ua|     100.0P|2024-11-02 15:30:36| 2024-11-02 15:52:...|          1|2024-11-02 15:52:...|
|198.202.124.3|HPWREN:LP-WXT536:...|      Ua|     100.0P|2024-11-02 15:31:16| 2024-11-02 16:17:...|          2|2024-11-02 16:17:...|
|198.202.124.3|HPWREN:LP-WXT536:...|      Ta|       6.1C|2024-11-02 15:31:26| 2024-11-02 16:24:...|          3|2024-11-02 16:24:...|
|198.202.124.3|HPWREN:LP-WXT536:...|      Pa|     882.8H|2024-11-02 1

In [43]:
# Keep only the 'Variable', 'Measurement' and 'Timestamp' columns
df = df.select(
    ["Variable", "Measurement", "Timestamp"]
)
df.show()

StatementMeta(, 6b9a1872-50ac-457d-845a-740201bbd234, 45, Finished, Available, Finished)

+--------+-----------+-------------------+
|Variable|Measurement|          Timestamp|
+--------+-----------+-------------------+
|      Ta|       6.1C|2024-11-02 15:30:36|
|      Ua|     100.0P|2024-11-02 15:30:36|
|      Ua|     100.0P|2024-11-02 15:31:16|
|      Ta|       6.1C|2024-11-02 15:31:26|
|      Pa|     882.8H|2024-11-02 15:31:26|
|      Ta|       6.1C|2024-11-02 15:31:36|
|      Pa|     882.8H|2024-11-02 15:31:36|
|      Ta|       6.2C|2024-11-02 15:30:06|
|      Vr|     3.636V|2024-11-02 15:31:23|
|      Dm|       168D|2024-11-02 15:31:25|
|      Sm|       3.5M|2024-11-02 15:31:25|
|      Ua|     100.0P|2024-11-02 15:31:25|
|      Rc|      1.07M|2024-11-02 15:31:25|
|      Rd|      1800s|2024-11-02 15:31:25|
|      Hc|       0.0M|2024-11-02 15:31:25|
|      Vh|      23.9V|2024-11-02 15:31:23|
|      Dn|       157D|2024-11-02 15:30:25|
|      Dm|       167D|2024-11-02 15:30:25|
|      Dx|       175D|2024-11-02 15:30:25|
|      Sn|       1.2M|2024-11-02 15:30:25|
+--------+-

In [44]:
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Unit-code tables shared by several variables. Each use below takes a copy
# via dict(...) so that no two variables alias the same dict object.
_WIND_SPEED_UNITS = {'M': 'm/s', 'K': 'km/h', 'S': 'mph', 'N': 'knots'}
_WIND_DIRECTION_UNITS = {'D': 'deg'}
_TEMPERATURE_UNITS = {'C': '°C', 'F': '°F'}
_DURATION_UNITS = {'S': 's'}
_RAIN_RATE_UNITS = {'M': 'mm/h', 'I': 'in/h'}
_HAIL_RATE_UNITS = {'M': 'hits/cm²h', 'I': 'hits/in²h', 'H': 'hits/h'}
_VOLTAGE_UNITS = {'V': 'V'}

# Per-variable lookup: variable code -> {unit code -> display unit}
variable_unit_mapping = {
    # Wind speed: minimum / average / maximum
    'Sn': dict(_WIND_SPEED_UNITS),
    'Sm': dict(_WIND_SPEED_UNITS),
    'Sx': dict(_WIND_SPEED_UNITS),
    # Wind direction: minimum / average / maximum
    'Dn': dict(_WIND_DIRECTION_UNITS),
    'Dm': dict(_WIND_DIRECTION_UNITS),
    'Dx': dict(_WIND_DIRECTION_UNITS),
    # Air pressure
    'Pa': {'H': 'hPa', 'P': 'Pa', 'B': 'bar', 'M': 'mmHg', 'I': 'inHg'},
    # Air temperature / internal temperature
    'Ta': dict(_TEMPERATURE_UNITS),
    'Tp': dict(_TEMPERATURE_UNITS),
    # Relative humidity
    'Ua': {'P': '%RH'},
    # Rain accumulation / duration / intensity / peak intensity
    'Rc': {'M': 'mm', 'I': 'in'},
    'Rd': dict(_DURATION_UNITS),
    'Ri': dict(_RAIN_RATE_UNITS),
    'Rp': dict(_RAIN_RATE_UNITS),
    # Hail accumulation / duration / intensity / peak intensity
    'Hc': {'M': 'hits/cm²', 'I': 'hits/in²', 'H': 'hits'},
    'Hd': dict(_DURATION_UNITS),
    'Hi': dict(_HAIL_RATE_UNITS),
    'Hp': dict(_HAIL_RATE_UNITS),
    # Heating temperature / heating voltage
    'Th': dict(_TEMPERATURE_UNITS),
    'Vh': {'V': 'V', 'N': 'N', 'W': 'W', 'F2': 'F2'},
    # Supply voltage / 3.5 V reference voltage
    'Vs': dict(_VOLTAGE_UNITS),
    'Vr': dict(_VOLTAGE_UNITS),
}


def map_units(variable, unit):
    """Translate a unit code into its display unit for the given variable.

    Args:
        variable: Variable code used as key in ``variable_unit_mapping``.
        unit: Unit code to translate.

    Returns:
        The mapped display unit, or ``unit`` unchanged when either the
        variable or the unit code has no entry in the mapping.
    """
    units_for_variable = variable_unit_mapping.get(variable)
    if units_for_variable is None:
        # Unknown variable: pass the unit code through untouched
        return unit
    return units_for_variable.get(unit, unit)

# Wrap map_units as a Spark UDF whose result is a string column
map_units_udf = F.udf(map_units, returnType=StringType())


# Variable code -> human-readable meaning
variable_meaning_mapping = {
    'Sn': 'Wind speed minimum',
    'Sm': 'Wind speed average',
    'Sx': 'Wind speed maximum',
    'Dn': 'Wind direction minimum',
    'Dm': 'Wind direction average',
    'Dx': 'Wind direction maximum',
    'Pa': 'Air pressure',
    'Ta': 'Air temperature',
    'Tp': 'Internal temperature',  # was misspelled 'Internal temperatur'
    'Ua': 'Relative humidity',
    'Rc': 'Rain accumulation',
    'Rd': 'Rain duration',
    'Ri': 'Rain intensity',
    'Rp': 'Rain peak intensity',
    'Hc': 'Hail accumulation',
    'Hd': 'Hail duration',
    'Hi': 'Hail intensity',
    'Hp': 'Hail peak intensity',
    'Th': 'Heating temperature',
    'Vh': 'Heating voltage',
    'Vs': 'Supply voltage',
    'Vr': '3.5 V ref. voltage',
}


def get_variable_meaning(variable):
    """Return the human-readable meaning of a variable code.

    Args:
        variable: Variable code used as key in ``variable_meaning_mapping``.

    Returns:
        The descriptive name for the code, or ``"Unknown variable"`` when the
        code has no entry in the mapping.
    """
    return variable_meaning_mapping.get(variable, "Unknown variable")

# Wrap get_variable_meaning as a Spark UDF whose result is a string column
get_variable_meaning_udf = F.udf(get_variable_meaning, returnType=StringType())

# Enrich each row in one chained expression:
#   1. nombre_variable: meaning looked up from the Variable code
#   2. Mapped_Unit: display unit looked up from the Variable code and the
#      last character of Measurement, which is taken as the unit code
#   3. keep the original columns plus the two new ones, in display order
df = (
    df
    .withColumn(
        "nombre_variable",
        get_variable_meaning_udf(F.col("Variable")),
    )
    .withColumn(
        "Mapped_Unit",
        map_units_udf(F.col("Variable"), F.col("Measurement").substr(-1, 1)),
    )
    .select("Variable", "nombre_variable", "Measurement", "Mapped_Unit", "Timestamp")
)

# Preview the enriched rows
df.show()


StatementMeta(, 6b9a1872-50ac-457d-845a-740201bbd234, 46, Finished, Available, Finished)

+--------+--------------------+-----------+-----------+-------------------+
|Variable|     nombre_variable|Measurement|Mapped_Unit|          Timestamp|
+--------+--------------------+-----------+-----------+-------------------+
|      Ta|     Air temperature|       6.1C|         °C|2024-11-02 15:30:36|
|      Ua|   Relative humidity|     100.0P|        %RH|2024-11-02 15:30:36|
|      Ua|   Relative humidity|     100.0P|        %RH|2024-11-02 15:31:16|
|      Ta|     Air temperature|       6.1C|         °C|2024-11-02 15:31:26|
|      Pa|        Air pressure|     882.8H|        hPa|2024-11-02 15:31:26|
|      Ta|     Air temperature|       6.1C|         °C|2024-11-02 15:31:36|
|      Pa|        Air pressure|     882.8H|        hPa|2024-11-02 15:31:36|
|      Ta|     Air temperature|       6.2C|         °C|2024-11-02 15:30:06|
|      Vr|  3.5 V ref. voltage|     3.636V|          V|2024-11-02 15:31:23|
|      Dm|Wind direction av...|       168D|        deg|2024-11-02 15:31:25|
|      Sm|  

In [None]:
# Extract the numeric part of Measurement into its own column.
# The pattern accepts an optional leading sign so that negative readings
# (e.g. '-6.1C') keep their sign instead of being stored as '6.1'.
# regexp_extract returns a string (empty when nothing matches); the column is
# left as a string, as before. Cast it (e.g. to double) where arithmetic is needed.
df = df.withColumn(
    "Measurement_Numeric",
    F.regexp_extract(F.col("Measurement"), r'([-+]?\d*\.?\d+)', 0),
)

# Choose the columns (and their order) for the cleaned output
df_cleaned = df.select(
    "Variable",
    "nombre_variable",
    "Measurement_Numeric",
    "Measurement",
    "Mapped_Unit",
    "Timestamp",
)

# Preview the cleaned rows
df_cleaned.show()


StatementMeta(, 6b9a1872-50ac-457d-845a-740201bbd234, 47, Finished, Available, Finished)

+--------+--------------------+-------------------+-----------+-----------+-------------------+
|Variable|     nombre_variable|Measurement_Numeric|Measurement|Mapped_Unit|          Timestamp|
+--------+--------------------+-------------------+-----------+-----------+-------------------+
|      Ta|     Air temperature|                6.1|       6.1C|         °C|2024-11-02 15:30:36|
|      Ua|   Relative humidity|              100.0|     100.0P|        %RH|2024-11-02 15:30:36|
|      Ua|   Relative humidity|              100.0|     100.0P|        %RH|2024-11-02 15:31:16|
|      Ta|     Air temperature|                6.1|       6.1C|         °C|2024-11-02 15:31:26|
|      Pa|        Air pressure|              882.8|     882.8H|        hPa|2024-11-02 15:31:26|
|      Ta|     Air temperature|                6.1|       6.1C|         °C|2024-11-02 15:31:36|
|      Pa|        Air pressure|              882.8|     882.8H|        hPa|2024-11-02 15:31:36|
|      Ta|     Air temperature|         

In [46]:
import shutil
import os

# Destination of the cleaned data inside the 'silver' folder
path = "Files/silver"  # 'silver' folder
parquet_file = f"{path}/datos_silver.parquet"  # Parquet output location

# Write the cleaned DataFrame as Parquet.
# No os.path.exists / os.makedirs / os.remove pre-steps are used here:
#   * those calls act on the driver's local filesystem, not on the storage
#     that Spark resolves this relative path against, so they could only
#     create a stray local directory (NOTE(review): confirm for this workspace);
#   * Spark writes Parquet output as a directory of part files, which
#     os.remove cannot delete;
#   * mode("overwrite") already creates the destination and replaces any
#     existing output.
df_cleaned.write.mode("overwrite").parquet(parquet_file)

print(f"Archivo Parquet guardado en {parquet_file}")




StatementMeta(, 6b9a1872-50ac-457d-845a-740201bbd234, 48, Finished, Available, Finished)

Archivo Parquet guardado en Files/silver/datos_silver.parquet
