In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
from pyspark.sql import functions as F
from datetime import date

# Start (or reuse) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("senior_data_engineer_challenge")
    .getOrCreate()
)

# Schema of the sales transactions. Every column is declared nullable.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("transaction_id", StringType()),
        ("customer_id", StringType()),
        ("product_id", StringType()),
        ("quantity", IntegerType()),
        ("price_per_unit", FloatType()),
        ("transaction_date", DateType()),
        ("store_location", StringType()),
    )
])

# Sample rows, in schema order:
# (transaction_id, customer_id, product_id, quantity, price_per_unit,
#  transaction_date, store_location)
data = [
    ("T001", "C101", "P101", 2, 10.50, date(2024, 1, 5), "New York"),
    ("T002", "C102", "P102", 1, 25.00, date(2024, 1, 5), "Los Angeles"),
    ("T003", "C101", "P103", 5, 5.00, date(2024, 1, 6), "New York"),
    ("T004", "C103", "P101", 3, 10.50, date(2024, 1, 7), "Chicago"),
    ("T005", "C102", "P104", 1, 50.00, date(2024, 1, 7), "Los Angeles"),
    ("T006", "C101", "P102", 4, 25.00, date(2024, 1, 8), "New York"),
    ("T007", "C104", "P103", 10, 5.00, date(2024, 1, 8), "New York"),
    ("T008", "C105", "P105", 2, 75.00, date(2024, 1, 9), "Chicago"),
    ("T009", "C103", "P102", 1, 25.00, date(2024, 1, 9), "Chicago"),
    ("T010", "C101", "P101", 1, 10.50, date(2024, 1, 10), "New York"),
]

# Build the DataFrame from the sample rows. transaction_date is supplied as
# datetime.date values under a DateType column, so no to_date() conversion
# is applied afterwards.
df = spark.createDataFrame(data, schema)

# Print the DataFrame for a quick visual check.
df.show()

1. Tarea 1: Operaciones Básicas y Manipulación de Columnas
- Objetivo: Calcular el valor total de cada transacción (cantidad × precio por unidad) y crear una columna que clasifique la transacción como "Small" (valor total ≤ 40), "Medium" (valor total mayor que 40 y hasta 70) o "Large" (valor total mayor que 70).

In [0]:
# Inclusive upper bounds of total_value for the "Small" and "Medium" buckets;
# any value above MEDIUM_MAX_VALUE is classified as "Large".
SMALL_MAX_VALUE = 40
MEDIUM_MAX_VALUE = 70

# Add two columns to the transactions:
#   total_value      = quantity * price_per_unit
#   transaction_size = "Small" / "Medium" / "Large", derived from total_value
df_total_transactions = (
    df.withColumn("total_value", F.col("quantity") * F.col("price_per_unit"))
      .withColumn(
          "transaction_size",
          # The when() branches are evaluated in order, so the "Medium" branch
          # only sees values already known to be above SMALL_MAX_VALUE.
          # There is deliberately no otherwise(): a row whose total_value is
          # null (quantity and price_per_unit are nullable) matches no branch
          # and keeps a null size instead of being mislabelled as "Large".
          F.when(F.col("total_value") <= SMALL_MAX_VALUE, "Small")
           .when(F.col("total_value") <= MEDIUM_MAX_VALUE, "Medium")
           .when(F.col("total_value") > MEDIUM_MAX_VALUE, "Large"),
      )
)

# show() only prints and returns None, so it must not be chained into the
# assignment above; otherwise df_total_transactions would be None.
df_total_transactions.show()

2. Tarea 2: Agregaciones y Filtrado
- Objetivo: Encontrar la cantidad total vendida de cada producto y filtrar para mostrar solo los productos cuya venta total supera las 10 unidades.

In [0]:
# Only products whose total units sold exceed this threshold are kept.
MIN_TOTAL_UNITS_SOLD = 10

# Total units sold and number of transactions per product, restricted to
# products that sold more than MIN_TOTAL_UNITS_SOLD units overall.
df_product_sales = (
    # Group by product_id: the aggregation is per product, not per quantity.
    df.groupBy("product_id")
      .agg(
          F.sum(F.col("quantity")).alias("total_quantity_sold"),
          F.count("*").alias("number_of_transactions"),
      )
      # Filter on the aggregated total, after the sum has been computed.
      .filter(F.col("total_quantity_sold") > MIN_TOTAL_UNITS_SOLD)
)

# show() only prints and returns None, so it is called separately to keep
# df_product_sales bound to the DataFrame.
df_product_sales.show()
