In [0]:
from pyspark.sql import SparkSession

# Obtener la sesión de Spark
spark = SparkSession.builder.getOrCreate()

# Obtener la configuración del clúster
conf = spark.sparkContext.getConf()

# Número de cores disponibles
num_cores = int(conf.get("spark.executor.instances", "0")) * int(conf.get("spark.executor.cores", "0"))
print(f"Número de cores disponibles: {num_cores}")

# Memoria disponible por executor
mem_por_executor = conf.get("spark.executor.memory", "0g")
print(f"Memoria disponible por executor: {mem_por_executor}")

# Memoria total disponible en el clúster
num_executors = int(conf.get("spark.executor.instances", "0"))
mem_total = num_executors * int(mem_por_executor[:-1])  # Remover la última letra (que indica la unidad, por ejemplo, 'g' de gigabytes)
print(f"Memoria total disponible en el clúster: {mem_total} GB")



Número de cores disponibles: 0
Memoria disponible por executor: 8278m
Memoria total disponible en el clúster: 0 GB


In [0]:
#Lectura desde archviso texto plano
summary_df = spark.read.csv('dbfs:/FileStore/curso_databricks/2015_summary.csv', header=True, inferSchema=True)

In [0]:
summary_df.show(10)

+-----------------+-------------------+-----+
|DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+-----------------+-------------------+-----+
|    United States|            Romania|   15|
|    United States|            Croatia|    1|
|    United States|            Ireland|  344|
|            Egypt|      United States|   15|
|    United States|              India|   62|
|    United States|          Singapore|    1|
|    United States|            Grenada|   62|
|       Costa Rica|      United States|  588|
|          Senegal|      United States|   40|
|          Moldova|      United States|    1|
+-----------------+-------------------+-----+
only showing top 10 rows



In [0]:
summary_df.printSchema()

root
 |-- DEST_COUNTRY_NAME: string (nullable = true)
 |-- ORIGIN_COUNTRY_NAME: string (nullable = true)
 |-- count: integer (nullable = true)



In [0]:
#Lectura desde archvio de texto plano: data
persona_df = spark.read.format("csv").option("header","true").option("delimiter","|").option("encoding", "ISO-8859-1").load('dbfs:/FileStore/curso_databricks/persona.data')

In [0]:
persona_df.show(10)

+---+---------+--------------+--------------------+-------------+----+-------+----------+
| ID|   NOMBRE|      TELEFONO|              CORREO|FECHA_INGRESO|EDAD|SALARIO|ID_EMPRESA|
+---+---------+--------------+--------------------+-------------+----+-------+----------+
|  1|     Carl|1-745-633-9145|arcu.Sed.et@ante....|   2004-04-23|  32|  20095|         5|
|  2|Priscilla|      155-2498|Donec.egestas.Ali...|   2019-02-17|  34|   9298|         2|
|  3|  Jocelyn|1-204-956-8594|amet.diam@loborti...|   2002-08-01|  27|  10853|         3|
|  4|    Aidan|1-719-862-9385|euismod.et.commod...|   2018-11-06|  29|   3387|        10|
|  5|  Leandra|      839-8044|at@pretiumetrutru...|   2002-10-10|  41|  22102|         1|
|  6|     Bert|      797-4453|a.felis.ullamcorp...|   2017-04-25|  70|   7800|         7|
|  7|     Mark|1-680-102-6792|Quisque.ac@placer...|   2006-04-21|  52|   8112|         5|
|  8|    Jonah|      214-2975|eu.ultrices.sit@v...|   2017-10-07|  23|  17040|         5|
|  9|    H

In [0]:
persona_df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- NOMBRE: string (nullable = true)
 |-- TELEFONO: string (nullable = true)
 |-- CORREO: string (nullable = true)
 |-- FECHA_INGRESO: string (nullable = true)
 |-- EDAD: string (nullable = true)
 |-- SALARIO: string (nullable = true)
 |-- ID_EMPRESA: string (nullable = true)



In [0]:
#Lectura de archvios semi-estructurados JSON

transacciones_df = spark.read.format("json").option("multiline", False).load('dbfs:/FileStore/curso_databricks/transacciones.json')

In [0]:
transacciones_df.show(10)

+--------------+--------------------+--------------------+
|       EMPRESA|             PERSONA|         TRANSACCION|
+--------------+--------------------+--------------------+
|   {5, Amazon}|{[{59, 9811935}, ...|{2021-01-23, 2628.0}|
|      {9, IBM}|{[{50, 9912937}, ...|{2021-01-23, 4261.0}|
|  {7, Samsung}|{[{53, 9769557}, ...|{2021-01-23, 1429.0}|
|   {5, Amazon}|{[{51, 9733329}, ...|{2021-01-23, 3385.0}|
|   {4, Toyota}|{[{52, 9091334}, ...|{2021-01-23, 3514.0}|
|      {9, IBM}|{[{59, 9708669}, ...| {2021-01-23, 823.0}|
|{2, Microsoft}|{null, 47, 31, Ry...|{2021-01-23, 3724.0}|
|    {10, Sony}|{[{51, 9443174}],...|{2021-01-23, 3429.0}|
|   {4, Toyota}|{[{54, 9375039}, ...|{2021-01-23, 4267.0}|
|      {9, IBM}|{[{59, 9227653}, ...| {2021-01-23, 796.0}|
+--------------+--------------------+--------------------+
only showing top 10 rows



In [0]:
transacciones_df.show(10, False)

+--------------+------------------------------------------------------------------------------------------------------+--------------------+
|EMPRESA       |PERSONA                                                                                               |TRANSACCION         |
+--------------+------------------------------------------------------------------------------------------------------+--------------------+
|{5, Amazon}   |{[{59, 9811935}, {53, 9423163}], 33, 26, Brenden, 20549.0}                                            |{2021-01-23, 2628.0}|
|{9, IBM}      |{[{50, 9912937}, {54, 9046676}, {55, 9874284}, {58, 9746053}, {53, 9058704}], 31, 21, Carissa, 1952.0}|{2021-01-23, 4261.0}|
|{7, Samsung}  |{[{53, 9769557}, {59, 9754523}, {52, 9063371}, {55, 9301624}, {56, 9770100}], 42, 73, Fiona, 9960.0}  |{2021-01-23, 1429.0}|
|{5, Amazon}   |{[{51, 9733329}, {57, 9619332}, {51, 9087416}, {50, 9486747}], 59, 14, Allen, 16289.0}                |{2021-01-23, 3385.0}|
|{4, Toyota} 

In [0]:
transacciones_df.printSchema()

root
 |-- EMPRESA: struct (nullable = true)
 |    |-- ID_EMPRESA: string (nullable = true)
 |    |-- NOMBRE_EMPRESA: string (nullable = true)
 |-- PERSONA: struct (nullable = true)
 |    |-- CONTACTO: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- PREFIJO: string (nullable = true)
 |    |    |    |-- TELEFONO: string (nullable = true)
 |    |-- EDAD: long (nullable = true)
 |    |-- ID_PERSONA: string (nullable = true)
 |    |-- NOMBRE_PERSONA: string (nullable = true)
 |    |-- SALARIO: double (nullable = true)
 |-- TRANSACCION: struct (nullable = true)
 |    |-- FECHA: string (nullable = true)
 |    |-- MONTO: double (nullable = true)

