In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("MiAplicacion").getOrCreate()

25/04/01 12:00:56 WARN Utils: Your hostname, mauro-desktop resolves to a loopback address: 127.0.1.1; using 192.168.0.21 instead (on interface wlxf4ec3892e485)
25/04/01 12:00:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/01 12:00:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import os
csv_files = [os.path.join("archive", f) for f in os.listdir("archive") if f.endswith('.csv')]

df = spark.read.options(header='True', inferSchema='True').csv(csv_files)

25/04/01 12:01:10 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [4]:
from pyspark.sql import functions as F

# Nulos

In [5]:
# Muestra conteo de nulos por columna
df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns]).show()



+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|     20339246|8757117|    0|      0|        4598|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+



                                                                                

In [6]:
# Imputamos los valores nulos ya que las columnas en cuestion no son críticas.
df_clean = df.fillna({
    "category_code": "unknown",
    "brand": "unknown"
})

# Al ser pocas filas, optamos por eliminarlas. 
df_clean = df_clean.filter(F.col("user_session").isNotNull())

In [7]:
# Verificación

df_clean.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df_clean.columns]).show()



+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+
|         0|         0|         0|          0|            0|    0|    0|      0|           0|
+----------+----------+----------+-----------+-------------+-----+-----+-------+------------+



                                                                                

# Outliers

In [8]:
# Estadísticas básicas de precio
df_clean.select(
    F.mean("price").alias("avg_price"),
    F.stddev("price").alias("std_price"),
    F.min("price").alias("min_price"),
    F.max("price").alias("max_price")
).show()

# Filtrar precios "razonables" (ej: entre 0.01 y 10,000)
df_clean = df_clean.filter((F.col("price") > 0.01) & (F.col("price") < 10000))



+----------------+------------------+---------+---------+
|       avg_price|         std_price|min_price|max_price|
+----------------+------------------+---------+---------+
|8.53489224700748|19.382060565203542|   -79.37|   327.78|
+----------------+------------------+---------+---------+



                                                                                

In [9]:
# Eliminamos precios negativos, podrían ser reembolsos, cupones etc. Pero deberia estar especificado de otra forma.

df_clean = df_clean.filter(F.col("price") >= 0)

In [10]:
# Estadísticas básicas de precio
df_clean.select(
    F.mean("price").alias("avg_price"),
    F.stddev("price").alias("std_price"),
    F.min("price").alias("min_price"),
    F.max("price").alias("max_price")
).show()

# Filtrar precios "razonables" (ej: entre 0.01 y 10,000)
df_clean = df_clean.filter((F.col("price") > 0.01) & (F.col("price") < 10000))



+-----------------+-----------------+---------+---------+
|        avg_price|        std_price|min_price|max_price|
+-----------------+-----------------+---------+---------+
|8.578329096134077|19.42124199976834|     0.05|   327.78|
+-----------------+-----------------+---------+---------+



                                                                                

In [11]:
quantiles = df_clean.approxQuantile("price", [0.25, 0.75, 0.99], 0.01)
Q1, Q3, P99 = quantiles[0], quantiles[1], quantiles[2]
IQR = Q3 - Q1

upper_limit = Q3 + 1.5 * IQR
print(f"Límite superior (IQR): {upper_limit}, Percentil 99%: {P99}")



Límite superior (IQR): 14.64, Percentil 99%: 327.78


                                                                                

A pesar de que puede ser extraño, comprendemos que puede haber cosmeticos de hasta mas de 300 dolares. Por lo que no tomamos medidas respecto a valores atipicos en la columna 'price'.

In [12]:
df_clean.filter(F.col("price")>300).filter((F.col("brand") != "strong")  & (F.col("brand") != "unknown")).show()



+-------------------+----------+----------+-------------------+-------------+------+-----+---------+--------------------+
|         event_time|event_type|product_id|        category_id|category_code| brand|price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+-------------+------+-----+---------+--------------------+
|2019-11-14 16:54:04|      view|   5635474|1487580008984608779|      unknown|entity|307.6|571598796|18af287e-6279-41c...|
|2019-10-07 10:00:58|      view|   5635474|1487580008984608779|      unknown|entity|307.6|440589741|726655f1-fa53-42e...|
|2019-10-09 14:50:11|      view|   5635474|1487580008984608779|      unknown|entity|307.6|558545066|464127c4-1cfb-423...|
|2019-10-09 18:43:02|      view|   5635474|1487580008984608779|      unknown|entity|307.6|480150376|72b5f64a-618c-4a8...|
|2019-10-15 09:50:40|      view|   5635474|1487580008984608779|      unknown|entity|307.6|560482622|583ea02a-fd3f-44d...|
|2019-10-19 15:10:14|   

                                                                                

In [13]:
from pyspark.sql.window import Window

# Agregar columna de eventos por día
user_daily_events = df_clean.withColumn(
    "events_per_day", 
    F.count("*").over(Window.partitionBy("user_id", F.to_date("event_time")))
)

# Filtrar usuarios con >100 eventos/día y contar ocurrencias
high_activity_users = user_daily_events.filter(F.col("events_per_day") > 100)
print(f"Usuarios con >100 eventos/día: {high_activity_users.select('user_id').distinct().count()}")
high_activity_users.select("user_id", "events_per_day").distinct().show(10)

                                                                                

Usuarios con >100 eventos/día: 14704




+---------+--------------+
|  user_id|events_per_day|
+---------+--------------+
|236659975|           108|
|307789062|           109|
|343320891|           171|
|414453512|           102|
|438606371|           106|
|479888694|           143|
|483574352|           183|
|495786914|           123|
|500517651|           108|
|527692709|           106|
+---------+--------------+
only showing top 10 rows



                                                                                

In [14]:
# Filtrar usuarios con <100 eventos/día y contar ocurrencias
high_activity_users = user_daily_events.filter(F.col("events_per_day") < 100)
print(f"Usuarios con >100 eventos/día: {high_activity_users.select('user_id').distinct().count()}")
high_activity_users.select("user_id", "events_per_day").distinct().show(10)

25/04/01 12:03:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:03:46 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

Usuarios con >100 eventos/día: 1635617


25/04/01 12:04:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

+---------+--------------+
|  user_id|events_per_day|
+---------+--------------+
| 31156111|             1|
| 34915661|             1|
| 67944478|             1|
|152961343|             1|
|153316955|            17|
|196042408|             4|
|204142009|             1|
|204166748|             7|
|228684512|             2|
|230933404|            16|
+---------+--------------+
only showing top 10 rows



                                                                                

Vamos a tomar una decision sensata sin mayores complicaciones por el momento. Hay aproximadamente 1.6 millones de usuarios, de los cuales 15 mil usuarios tienen mas de 100 sesiones por dia. Vamos a eliminar los registros con mas de 150 sesiones, con el fin de preservar algun posible cliente extremo o revendedor pero limitando la posibilidad de bots obvios en la plataforma.

In [15]:
# 1. Calcular eventos diarios por usuario
df_clean = df_clean.withColumn(
    "events_per_day", 
    F.count("*").over(Window.partitionBy("user_id", F.to_date("event_time")))
)

# 2. Filtrar SOLO registros donde eventos_per_day < 150
df_clean = df_clean.filter(F.col("events_per_day") < 150).drop("events_per_day")

In [16]:
from pyspark.sql.functions import datediff

session_duration = df_clean.groupBy("user_session").agg(
    F.min("event_time").alias("start_time"),
    F.max("event_time").alias("end_time"),
    datediff(F.max("event_time"), F.min("event_time")).alias("duration_days")
)

# Top 10 sesiones más largas
session_duration.orderBy(F.desc("duration_days")).show(10)

25/04/01 12:04:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:29 WARN RowBasedKeyValueBatch: Calling spill() on

+--------------------+-------------------+-------------------+-------------+
|        user_session|         start_time|           end_time|duration_days|
+--------------------+-------------------+-------------------+-------------+
|099fefe4-a74c-4da...|2019-10-01 03:52:13|2020-02-29 14:31:51|          151|
|53e8dd6f-0a8a-408...|2019-10-01 03:40:33|2020-02-29 05:23:50|          151|
|0e706683-d627-4f4...|2019-10-01 11:18:38|2020-02-29 12:20:42|          151|
|f03388cd-bea8-419...|2019-10-01 20:37:43|2020-02-29 00:02:17|          151|
|52b30a79-923b-461...|2019-10-01 06:07:13|2020-02-29 08:06:45|          151|
|2d6fa5e7-e91c-488...|2019-10-01 05:07:55|2020-02-29 08:18:37|          151|
|32d76835-fa23-4e5...|2019-10-01 16:06:58|2020-02-29 14:44:47|          151|
|285e8547-29b3-49d...|2019-10-01 08:05:27|2020-02-29 13:53:06|          151|
|38785db2-b8d6-4c3...|2019-10-01 15:05:11|2020-02-29 13:30:03|          151|
|2c1569d4-8ab3-414...|2019-10-01 11:12:49|2020-02-29 10:50:25|          151|

                                                                                

Observamos una cantidad anomala de dias de duracion en algunas sesiones, algo que no es posible. Una sesion no puede durar 5 meses. Vamos a explorar la cantidad de sesiones por rangos de dias mas aceptables.

In [17]:
from pyspark.sql.functions import count, when

# Calcular la duración en días de cada sesión (si no lo has hecho)
session_duration = df_clean.groupBy("user_session").agg(
    (F.datediff(F.max("event_time"), F.min("event_time"))).alias("duration_days")
)

# Contar sesiones en cada rango
duration_bins = session_duration.select(
    F.count(F.when(F.col("duration_days") < 15, True)).alias("<15_days"),
    F.count(F.when(F.col("duration_days") > 15, True)).alias(">15_days"),
    F.count(F.when(F.col("duration_days") > 30, True)).alias(">30_days"),
    F.count(F.when(F.col("duration_days") > 50, True)).alias(">50_days"),
    F.count(F.when(F.col("duration_days") > 100, True)).alias(">100_days"),
    F.count("*").alias("total_sessions")
).show()

25/04/01 12:04:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:51 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:52 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:04:54 WARN RowBasedKeyValueBatch: Calling spill() on

+--------+--------+--------+--------+---------+--------------+
|<15_days|>15_days|>30_days|>50_days|>100_days|total_sessions|
+--------+--------+--------+--------+---------+--------------+
| 4345715|    9970|    7314|    5741|     1920|       4355940|
+--------+--------+--------+--------+---------+--------------+



                                                                                

Vamos a tomar una decision contemplando que es muy raro que haya sesiones que duren tanto tiempo, es muy probable que sea un error de tracking, cookies mal configuradas y falta de renovacion de sesiones. Por ello, las sesiones que duren mas de 15 dias seran eliminadas.

In [18]:
df_clean = df_clean.join(
    session_duration.filter(F.col("duration_days") <= 15),
    "user_session",
    "inner"
)

In [37]:
# Eliminar la columna 'duration_days'
df_clean = df_clean.drop("duration_days")

# Inspección y normalización de variables categóricas

In [None]:
# Verificar valores únicos en `event_type
df_clean.select("event_type").distinct().show()

25/04/01 12:25:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:25:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:25:01 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:25:02 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:25:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:25:03 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:25:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:25:04 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:25:04 WARN RowBasedKeyValueBatch: Calling spill() on

+----------------+
|      event_type|
+----------------+
|        purchase|
|            view|
|            cart|
|remove_from_cart|
+----------------+



                                                                                

In [None]:
# Verificar valores únicos en `category_code`
df_clean.select("category_code").distinct().show(truncate=False)

25/04/01 12:37:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:37:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:37:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:37:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:37:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:37:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:37:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:37:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:37:31 WARN RowBasedKeyValueBatch: Calling spill() on

+--------------------------------------+
|category_code                         |
+--------------------------------------+
|stationery.cartrige                   |
|furniture.living_room.cabinet         |
|unknown                               |
|furniture.bathroom.bath               |
|accessories.bag                       |
|appliances.personal.hair_cutter       |
|appliances.environment.air_conditioner|
|appliances.environment.vacuum         |
|furniture.living_room.chair           |
|accessories.cosmetic_bag              |
|apparel.glove                         |
|appliances.personal.massager          |
|sport.diving                          |
+--------------------------------------+



                                                                                

Los codigos de categoría estan en orden en cuanto a su escritura.

In [25]:
# Verificar valores únicos en `brand`
df_clean.select("brand").distinct().show(truncate=False)

25/04/01 12:39:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:39:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:39:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:39:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:39:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:39:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:39:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:39:40 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:39:40 WARN RowBasedKeyValueBatch: Calling spill() on

+------------+
|brand       |
+------------+
|beautix     |
|farmona     |
|dr.gloderm  |
|profhenna   |
|bioderma    |
|invisibobble|
|philips     |
|riche       |
|nova        |
|oniq        |
|lebelage    |
|vilenta     |
|fancy       |
|jaguar      |
|tertio      |
|siberina    |
|koreatida   |
|jas         |
|rocknailstar|
|depilflax   |
+------------+
only showing top 20 rows



                                                                                

Normalización de marcas

In [29]:
from pyspark.sql.functions import trim
from pyspark.sql.functions import lower

df_clean = df_clean.withColumn("brand", lower(df_clean["brand"]))
df_clean = df_clean.withColumn("brand", trim(df_clean["brand"]))

In [None]:
df_clean.select("brand").distinct() \
    .filter(F.col("brand").rlike("[^a-zA-Z0-9 ]")) \
    .show(truncate=False)


25/04/01 12:52:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:52:27 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:52:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:52:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:52:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:52:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:52:30 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:52:31 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 12:52:31 WARN RowBasedKeyValueBatch: Calling spill() on

+-------------+
|brand        |
+-------------+
|dr.gloderm   |
|f.o.x        |
|beauty-free  |
|marutaka-foot|
|art-visage   |
|i-laq        |
|bpw.style    |
|de.lux       |
|s.care       |
|vl-gel       |
|yu-r         |
+-------------+



                                                                                

Hay marcas con caracteres especiales, pero realmente existen y esta bien preservar su sentido.

# Fechas

In [None]:
# 1. Mostrar el rango de fechas
date_range = df_clean.select(
    F.min("event_time").alias("fecha_minima"),
    F.max("event_time").alias("fecha_maxima")
).show()




+-------------------+-------------------+
|       fecha_minima|       fecha_maxima|
+-------------------+-------------------+
|2019-09-30 21:00:00|2020-02-29 20:59:59|
+-------------------+-------------------+



                                                                                

Las fechas minima y maxima se corresponden correctamente, con la descripción de los datos

In [None]:
# 1. Verificar el formato UTC exacto
utc_format_check = df_clean.withColumn(
    "is_valid_utc",
    F.col("event_time").rlike(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$')
)

# Mostrar conteo de formatos válidos/inválidos
utc_format_check.groupBy("is_valid_utc").count().show()




+------------+--------+
|is_valid_utc|   count|
+------------+--------+
|        true|20692840|
+------------+--------+



                                                                                

Todas las fechas respetan el formato UTC

In [38]:
df_clean.columns

['user_session',
 'event_time',
 'event_type',
 'product_id',
 'category_id',
 'category_code',
 'brand',
 'price',
 'user_id']

Vamos a separar la columna fecha en sus distintas partes, año, mes, dia, hora, a fin de facilitar analisis y agrupaciones futuras.

In [None]:

# 1. Extraer componentes temporales
componentes = [
    ("event_year", F.year("event_time")),
    ("event_month", F.month("event_time")),
    ("event_day", F.dayofmonth("event_time")),
    ("event_hour", F.hour("event_time")),
    ("event_dayofweek", F.dayofweek("event_time"))
]

for col_name, expr in componentes:
    df_clean = df_clean.withColumn(col_name, expr)

# 2. Verificación rápida de resultados
df_clean.select("event_time", "event_timestamp", "event_year", "event_month", "event_day", "event_hour").show(5, truncate=False)

25/04/01 22:36:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 22:36:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 22:36:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 22:36:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 22:36:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 22:36:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 22:36:23 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 22:36:24 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/04/01 22:36:24 WARN RowBasedKeyValueBatch: Calling spill() on

+-------------------+-------------------+----------+-----------+---------+----------+
|event_time         |event_timestamp    |event_year|event_month|event_day|event_hour|
+-------------------+-------------------+----------+-----------+---------+----------+
|2019-10-23 06:07:38|2019-10-23 06:07:38|2019      |10         |23       |6         |
|2019-10-23 06:12:50|2019-10-23 06:12:50|2019      |10         |23       |6         |
|2019-10-23 06:44:43|2019-10-23 06:44:43|2019      |10         |23       |6         |
|2019-10-23 06:57:19|2019-10-23 06:57:19|2019      |10         |23       |6         |
|2019-10-23 06:57:21|2019-10-23 06:57:21|2019      |10         |23       |6         |
+-------------------+-------------------+----------+-----------+---------+----------+
only showing top 5 rows



                                                                                

In [None]:
# Eliminar la columna event_time, ya que no usamos mas formato UTC sino que conservamos el timestamp
df_clean = df_clean.drop("event_time")

Estamos en condiciones de exportar nuestros datos limpios para poder proceder al analisis de los mismos.

In [27]:
df_clean.write.csv(path='clean_data',header='True',mode='overwrite')

spark.stop()

ERROR:root:KeyboardInterrupt while sending command.>               (0 + 0) / 19]
Traceback (most recent call last):
  File "/home/mauro/Code/repositories/ecommerce-events-analysis/env/lib/python3.12/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/mauro/Code/repositories/ecommerce-events-analysis/env/lib/python3.12/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.12/socket.py", line 707, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

[Stage 176:> (5 + 6) / 19][Stage 181:> (0 + 0) / 15][Stage 183:> (0 + 0) / 12]5]