Property 	    Description

*event_time* 	Time at which the event happened (in UTC).

*event_type* 	Type of event; see "Event types" below for the possible values.

*product_id* 	ID of a product

*category_id* 	Product's category ID

*category_code* 	Product's category taxonomy (code name), when it could be determined. Usually present for meaningful categories and omitted for various kinds of accessories.

*brand* 	Lowercased brand name. May be missing.

*price* 	Price of the product as a float. Always present.

*user_id* 	Permanent user ID.

Event types
Events can be:

    view - a user viewed a product
    cart - a user added a product to shopping cart
    remove_from_cart - a user removed a product from shopping cart
    purchase - a user purchased a product


In [1]:
from pyspark.sql import SparkSession
# Start the Spark session for this notebook (getOrCreate reuses an existing one).
spark = (
    SparkSession.builder
    .appName("MiAplicacion")
    .getOrCreate()
)

In [2]:
from pyspark.sql import functions as F
import os

# Paths of every CSV file that sits directly inside the clean_data directory.
csv_files = [
    os.path.join("clean_data", name)
    for name in os.listdir("clean_data")
    if name.endswith('.csv')
]

# Read all files into one DataFrame: the first row of each file is treated as
# the header and column types are inferred from the data.
df = (
    spark.read
    .option("header", 'True')
    .option("inferSchema", 'True')
    .csv(csv_files)
)

In [3]:
# Total number of rows loaded across all CSV files.
df.count()

18093117

In [None]:
# 1. Hard-coded configuration
target_product = 5809910  # Product whose cart companions we want to find

# 2. Keep only add-to-cart events and cache them (they are read twice below)
cart_events = df.where(F.col("event_type") == "cart").cache()

try:
    # 3. Sessions in which the target product was added to the cart
    target_sessions = (
        cart_events.where(F.col("product_id") == target_product)
        .select("user_session")
        .distinct()
    )

    # 4. Join
    co_occurring_products = (
        # Restrict the cart events to the target sessions, keyed on user_session
        cart_events.join(target_sessions, on="user_session", how="inner")

        # Leave the target product itself out of the result
        .where(F.col("product_id") != target_product)

        # One row per product; "count" is the number of matching cart events
        .groupBy("product_id")
        .count()

        # Most frequent products first
        .sort(F.col("count").desc())
    )

    # 5. Show the results (optional)
    co_occurring_products.show(truncate=False)

finally:
    # 6. Release the cached cart events
    cart_events.unpersist()


In [5]:

# Display the ranking again (first 20 rows, column values not truncated).
co_occurring_products.show(truncate=False)

+----------+-----+
|product_id|count|
+----------+-----+
|5809912   |4863 |
|5809911   |3372 |
|5816170   |2681 |
|5816172   |1416 |
|5700037   |1383 |
|5816166   |1373 |
|5854897   |1337 |
|5802432   |996  |
|5816169   |959  |
|5751422   |933  |
|5816178   |870  |
|5854812   |860  |
|5854832   |802  |
|5814046   |762  |
|5792800   |744  |
|5815662   |725  |
|5849033   |724  |
|5816175   |714  |
|5850625   |653  |
|5686925   |638  |
+----------+-----+
only showing top 20 rows



In [None]:
from pyspark.sql import DataFrame

# Same analysis, with the configuration passed in as function arguments
def get_co_occurring_products(df: DataFrame, target_product: int) -> DataFrame:
    """Rank the products added to the cart in the same sessions as a target product.

    Args:
        df: Event data with at least the ``event_type``, ``product_id`` and
            ``user_session`` columns.
        target_product: ``product_id`` of the product to find companions for.

    Returns:
        A lazy DataFrame with columns ``product_id`` and ``count``, sorted by
        ``count`` in descending order. ``count`` is the number of cart events
        for that product within sessions that also contain a cart event for
        ``target_product``; the target product itself is excluded.

    Note:
        Nothing is cached here. This function only builds transformations and
        runs no action, so a cache()/unpersist() pair inside it would be
        released before the returned DataFrame is ever evaluated. Callers that
        evaluate the result several times can cache the returned DataFrame.
    """
    cart_events = df.filter(F.col("event_type") == "cart")

    # Sessions in which the target product was added to the cart.
    target_sessions = (
        cart_events.filter(F.col("product_id") == target_product)
        .select("user_session")
        .distinct()
    )

    return (
        # Joining on user_session keeps only cart events from those sessions.
        cart_events.join(target_sessions, "user_session")
        .filter(F.col("product_id") != target_product)
        .groupBy("product_id")
        .agg(F.count("*").alias("count"))
        .orderBy(F.desc("count"))
    )