In [1]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

## Empezamos configurando Spark

In [2]:
# Spark configuration: run on YARN with explicit executor sizing and parallelism.
# NOTE(review): spark.dynamicAllocation.maxExecutors is only honoured when dynamic
# allocation is enabled; that flag is not set here -- confirm it is enabled cluster-side.
conf = SparkConf().setMaster("yarn").setAll([
    ("spark.executor.cores", 5),
    ("spark.sql.shuffle.partitions", 200),
    ("spark.default.parallelism", 200),
    ("spark.executor.memory", "7g"),
    ("spark.dynamicAllocation.maxExecutors", 20),
])

# Build the session (or reuse an existing one). The app name is applied after the
# conf, so it takes precedence over any name the conf may carry.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Test_PySpark")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## Creación de un DataFrame a partir de los datos almacenados en HDFS de la capa Bronze

In [4]:
# Load the Bronze-layer CSV files into a DataFrame. The first line of each file is
# treated as the header, and column types are inferred from the data (inference
# makes Spark read the input an additional time).
df = spark.read.csv(
    "/datos/gittba26/gittba05/Bronze/BNB",
    header=True,
    inferSchema=True,
)

                                                                                

In [5]:
# Print the column names and types of the DataFrame read from the Bronze CSV files.
df.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: double (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [6]:
# Preview the first 20 rows in tabular form (long cell values are truncated).
df.show(n=20, truncate=True, vertical=False)

+-------------------+------+------+------+------+---------+----+-----+
|           datetime|  open|  high|   low| close|   volume|year|month|
+-------------------+------+------+------+------+---------+----+-----+
|2025-10-01 02:00:00|1008.9|1036.4|1003.5|1027.1| 14695.68|2025|   10|
|2025-10-02 02:00:00|1028.2|1099.5|1023.1|1090.8|20843.915|2025|   10|
|2025-10-03 02:00:00|1090.2|1192.7|1084.5|1189.7|44120.044|2025|   10|
|2025-10-04 02:00:00|1189.8|1190.6|1136.3|1151.0|  18602.3|2025|   10|
|2025-10-05 02:00:00|1151.1|1187.9|1144.2|1167.5|13627.366|2025|   10|
|2025-10-06 02:00:00|1167.5|1239.6|1162.9|1223.7|28499.434|2025|   10|
|2025-10-07 02:00:00|1223.7|1355.1|1205.5|1304.8|74441.805|2025|   10|
|2025-10-08 02:00:00|1305.9|1334.5|1265.2|1308.2|  56335.5|2025|   10|
|2025-10-09 02:00:00|1308.0|1318.7|1225.0|1255.9|62487.178|2025|   10|
|2025-10-10 02:00:00|1256.1|1280.9| 891.1|1104.3| 94237.71|2025|   10|
|2025-10-11 02:00:00|1102.0|1183.4|1077.9|1136.8|52931.492|2025|   10|
|2025-

In [7]:
# Total number of rows loaded from the Bronze layer.
df.count()

1461

In [12]:
# List the contents of the Bronze BNB directory in HDFS.
!hdfs dfs -ls /datos/gittba26/gittba05/Bronze/BNB

Found 4 items
drwxr-xr-x   - gittba_bdt09 supergroup          0 2026-01-28 19:44 /datos/gittba26/gittba05/Bronze/BNB/year=2022
drwxr-xr-x   - gittba_bdt09 supergroup          0 2026-01-28 19:44 /datos/gittba26/gittba05/Bronze/BNB/year=2023
drwxr-xr-x   - gittba_bdt09 supergroup          0 2026-01-28 19:44 /datos/gittba26/gittba05/Bronze/BNB/year=2024
drwxr-xr-x   - gittba_bdt09 supergroup          0 2026-01-28 19:44 /datos/gittba26/gittba05/Bronze/BNB/year=2025


## Creamos los parquets de la capa Silver a partir de los .CSV de la capa Bronze

In [17]:
silver_path = "/datos/gittba26/gittba05/Silver/BNB"

# Persist the Bronze DataFrame as Parquet in the Silver layer, partitioned by
# year and month. "overwrite" replaces whatever already exists at silver_path.
df.write.parquet(
    silver_path,
    mode="overwrite",
    partitionBy=["year", "month"],
)

                                                                                

## Leemos los parquets para comprobar que se han guardado bien

In [18]:
silver_path = "/datos/gittba26/gittba05/Silver/BNB"

# Read back the Parquet dataset stored in the Silver layer.
df_silver = spark.read.parquet(silver_path)

# List the distinct (year, month) pairs present in Silver, ordered by year then month.
(
    df_silver
    .select("year", "month")
    .distinct()
    .orderBy("year", "month")
    .show(n=200, truncate=False)
)



+----+-----+
|year|month|
+----+-----+
|2022|1    |
|2022|2    |
|2022|3    |
|2022|4    |
|2022|5    |
|2022|6    |
|2022|7    |
|2022|8    |
|2022|9    |
|2022|10   |
|2022|11   |
|2022|12   |
|2023|1    |
|2023|2    |
|2023|3    |
|2023|4    |
|2023|5    |
|2023|6    |
|2023|7    |
|2023|8    |
|2023|9    |
|2023|10   |
|2023|11   |
|2023|12   |
|2024|1    |
|2024|2    |
|2024|3    |
|2024|4    |
|2024|5    |
|2024|6    |
|2024|7    |
|2024|8    |
|2024|9    |
|2024|10   |
|2024|11   |
|2024|12   |
|2025|1    |
|2025|2    |
|2025|3    |
|2025|4    |
|2025|5    |
|2025|6    |
|2025|7    |
|2025|8    |
|2025|9    |
|2025|10   |
|2025|11   |
|2025|12   |
+----+-----+



                                                                                

In [19]:
# Verify that the Bronze and Silver layers contain the same number of rows.
# (This compares row counts, not the number of files.)
bronze_path = "/datos/gittba26/gittba05/Bronze/BNB"

# Re-read Bronze from storage so the comparison reflects what is on disk now.
df_bronze = spark.read.option("header","true").option("inferSchema","true").csv(bronze_path)

# Each count() runs a Spark job, so keep the results instead of recomputing them.
bronze_rows = df_bronze.count()
print("Rows Bronze:", bronze_rows)
silver_rows = df_silver.count()
print("Rows Silver:", silver_rows)

# Fail loudly on a mismatch so a "Run All" cannot silently pass over lost or
# duplicated rows in the Bronze -> Silver conversion.
assert bronze_rows == silver_rows, (
    f"Row count mismatch between Bronze ({bronze_rows}) and Silver ({silver_rows})"
)

                                                                                

Rows Bronze: 1461
Rows Silver: 1461
