# 1. Configuración base

In [1]:
from pyspark.sql import SparkSession
import time
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from pyspark.sql.functions import col
# Create (or reuse, via getOrCreate) the SparkSession shared by every cell below.
# The master is "local[6]", so the whole lab runs inside this single machine.
spark = (
    SparkSession.builder
    .appName("spark_labs")
    .master("local[6]")
    # Executor sizing requested for the session.
    .config("spark.executor.cores", "2")
    .config("spark.executor.memory", "4g")
    # Partition counts for RDD operations and for SQL/DataFrame shuffles.
    .config("spark.default.parallelism", "12")
    .config("spark.sql.shuffle.partitions", "12")
    # Turn on Adaptive Query Execution.
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

# Print the Spark UI address so the jobs launched below can be inspected.
print(spark.sparkContext.uiWebUrl)


http://host.docker.internal:4040


# 2. No optimizado

In [2]:
# Baseline ("not optimized") run: read the sales CSV using its header row for
# column names and letting Spark infer the column types.
df = spark.read.csv("./src/ventas.csv", header=True, inferSchema=True)

# Keep only the rows whose amount ("monto") is greater than 100.
df_filtrado = df.filter(col("monto") > 100)

# Two independent aggregations built on the same filtered DataFrame.
# Nothing is cached here, so each show() below re-evaluates the
# read + filter lineage on its own.
df_group1 = df_filtrado.groupBy("pais").sum("monto")
df_group2 = df_filtrado.groupBy("producto").sum("monto")

# Time the action that computes and displays the totals per country.
start = time.time()
df_group1.show()
print("Tiempo groupBy pais:", time.time() - start)

# Time the action that computes and displays the totals per product.
# The label now names the column actually grouped on (it previously said "pais").
start = time.time()
df_group2.show()
print("Tiempo groupBy producto:", time.time() - start)


+----+----------+
|pais|sum(monto)|
+----+----------+
|  CL|     850.0|
|  PE|    1770.5|
|  MX|    2800.0|
|  CO|     700.0|
+----+----------+

Tiempo groupBy pais: 0.5998163223266602
+--------+----------+
|producto|sum(monto)|
+--------+----------+
| Monitor|    2250.0|
|  Laptop|    3870.5|
+--------+----------+

Tiempo groupBy pais: 0.1193540096282959


# 3. Optimizado


In [3]:
# Optimized run: re-read the same sales CSV (header row for column names,
# column types inferred by Spark).
df = spark.read.csv("./src/ventas.csv", header=True, inferSchema=True)

# Shared base for both aggregations:
#   - manual column pruning: keep only the three columns used below,
#   - keep only the rows whose amount ("monto") is greater than 100,
#   - mark the result for caching so both aggregations can reuse it.
# cache() is lazy: the data is stored when the first action below runs.
df_base = df.select("pais", "producto", "monto").where(col("monto") > 100).cache()

# Totals per country: define the (lazy) aggregation, then time the action.
df_group1 = df_base.groupBy("pais").sum("monto")
start = time.time()
df_group1.show()
print("Tiempo groupBy pais:", time.time() - start)

# Totals per product: define the (lazy) aggregation, then time the action.
df_group2 = df_base.groupBy("producto").sum("monto")
start = time.time()
df_group2.show()
print("Tiempo groupBy producto:", time.time() - start)

+----+----------+
|pais|sum(monto)|
+----+----------+
|  CL|     850.0|
|  PE|    1770.5|
|  MX|    2800.0|
|  CO|     700.0|
+----+----------+

Tiempo groupBy pais: 0.2645833492279053
+--------+----------+
|producto|sum(monto)|
+--------+----------+
| Monitor|    2250.0|
|  Laptop|    3870.5|
+--------+----------+

Tiempo groupBy producto: 0.09758281707763672
