# Elección de películas a preguntar

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql import functions as F, Window

# Single Spark session for the notebook, running in local mode ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

Lo primero que hago es cargar mi DataFrame inicial.

In [2]:
# Load the full ratings dataset, reusing the `spark` session created in the
# first cell (SparkSession is already imported there, and getOrCreate() would
# only hand back that same session).
df_full = spark.read.parquet("/home/jovyan/work/datasets/df_ratings_full.parquet")

# Quick sanity check: row count, schema and a small sample.
print(df_full.count())
df_full.printSchema()
df_full.show(5)

1000209
root
 |-- userId: long (nullable = true)
 |-- filmId: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- date: string (nullable = true)
 |-- film: string (nullable = true)
 |-- genders: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- year: long (nullable = true)
 |-- gender: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- age_group_mean: long (nullable = true)
 |-- age_group_letter: string (nullable = true)

+------+------+------+----------+--------------------+--------------------+----+------+--------------------+--------------+----------------+
|userId|filmId|rating|      date|                film|             genders|year|gender|          occupation|age_group_mean|age_group_letter|
+------+------+------+----------+--------------------+--------------------+----+------+--------------------+--------------+----------------+
|     1|  1193|     5|31/12/2000|One Flew Over the...|             [Drama]|1975|     F|Estu

He decidido que voy a escoger películas famosas; para ello, tomaré la cantidad de valoraciones como referencia.

Así que tomo la película, su nombre y sus géneros y ordeno de mayor a menor cantidad de valoraciones. Como más adelante me va a venir bien su orden, también genero una columna de posición.

In [3]:
# Rank films by popularity (number of ratings).
# filmId is a secondary sort key so that films tied on num_notas always get the
# same position; ordering by num_notas alone leaves ties in arbitrary order.
# NOTE: a window without partitionBy moves every row to a single partition;
# that is acceptable here because the input is already aggregated per film.
w = Window.orderBy(F.desc("num_notas"), F.asc("filmId"))

top_pelis = (
    df_full
    .groupBy("filmId", "film", "genders")
    .agg(F.count("*").alias("num_notas"))   # one row per film with its rating count
    .withColumn("posicion", F.row_number().over(w))
    .orderBy("posicion")                    # same order as the ranking itself
)

top_pelis.show(20, truncate=False)


+------+----------------------------------------------+-----------------------------------------+---------+--------+
|filmId|film                                          |genders                                  |num_notas|posicion|
+------+----------------------------------------------+-----------------------------------------+---------+--------+
|2858  |American Beauty                               |[Comedy, Drama]                          |3428     |1       |
|260   |Star Wars: Episode IV - A New Hope            |[Action, Adventure, Fantasy, Sci-Fi]     |2991     |2       |
|1196  |Star Wars: Episode V - The Empire Strikes Back|[Action, Adventure, Drama, Sci-Fi, War]  |2990     |3       |
|1210  |Star Wars: Episode VI - Return of the Jedi    |[Action, Adventure, Romance, Sci-Fi, War]|2883     |4       |
|480   |Jurassic Park                                 |[Action, Adventure, Sci-Fi]              |2672     |5       |
|2028  |Saving Private Ryan                           |[Action, 

Dado que tengo películas muy similares entre sí e incluso de la misma franquicia, voy a filtrar cuáles son las que tienen más votos por cada género

In [4]:
# Number of most-rated films considered when picking one winner per genre.
TOP_N = 100

# filmId breaks ties on num_notas so the cut at TOP_N is reproducible.
top100 = top_pelis.orderBy(F.desc("num_notas"), F.asc("filmId")).limit(TOP_N)

# Within each genre, rank films by rating count (ties again resolved by filmId).
genre_window = Window.partitionBy("genre").orderBy(F.desc("num_notas"), F.asc("filmId"))

per_genre = (
    top100
    .withColumn("genre", F.explode("genders"))            # one row per (film, genre)
    .withColumn("rn", F.row_number().over(genre_window))
    .filter(F.col("rn") == 1)                             # keep the top film of each genre
)

per_genre.show(50, truncate=False)


+------+----------------------------------------------+-----------------------------------------+---------+--------+----------+---+
|filmId|film                                          |genders                                  |num_notas|posicion|genre     |rn |
+------+----------------------------------------------+-----------------------------------------+---------+--------+----------+---+
|260   |Star Wars: Episode IV - A New Hope            |[Action, Adventure, Fantasy, Sci-Fi]     |2991     |2       |Action    |1  |
|260   |Star Wars: Episode IV - A New Hope            |[Action, Adventure, Fantasy, Sci-Fi]     |2991     |2       |Adventure |1  |
|1     |Toy Story                                     |[Animation, Children's, Comedy]          |2077     |30      |Animation |1  |
|1097  |E.T. the Extra-Terrestrial                    |[Children's, Drama, Fantasy, Sci-Fi]     |2269     |21      |Children's|1  |
|2858  |American Beauty                               |[Comedy, Drama]      

Ahora, como veo que está la misma película para varios géneros, voy a quedarme con las que se repiten en una mayor cantidad de ellos y luego, de las restantes, con las que tenían mayor cantidad de valoraciones en general

In [8]:
# For every film that won at least one genre, count how many genres it won
# ("cant"), then list the films with most wins first and, among equals, the
# ones with more ratings first.
genre_wins = F.count("*").alias("cant")
films_by_genre_wins = per_genre.groupBy("filmId", "film", "num_notas").agg(genre_wins)
top_in_per_genre = films_by_genre_wins.orderBy(
    F.col("cant").desc(),
    F.col("num_notas").desc(),
)

top_in_per_genre.show(10, truncate=False)

+------+----------------------------------------------+---------+----+
|filmId|film                                          |num_notas|cant|
+------+----------------------------------------------+---------+----+
|260   |Star Wars: Episode IV - A New Hope            |2991     |4   |
|2858  |American Beauty                               |3428     |2   |
|1617  |L.A. Confidential                             |2288     |2   |
|1196  |Star Wars: Episode V - The Empire Strikes Back|2990     |1   |
|1210  |Star Wars: Episode VI - Return of the Jedi    |2883     |1   |
|589   |Terminator 2: Judgment Day                    |2649     |1   |
|608   |Fargo                                         |2513     |1   |
|1097  |E.T. the Extra-Terrestrial                    |2269     |1   |
|2716  |Ghostbusters                                  |2181     |1   |
|1     |Toy Story                                     |2077     |1   |
+------+----------------------------------------------+---------+----+
only s

El resultado no me ha convencido porque, al tomar solo las 5 primeras, tendría un 60% de películas de la misma franquicia, así que voy a quedarme con las 3 primeras y, arbitrariamente, con E.T. y Toy Story, ya que considero que son las más variadas de las 10 primeras.

# Análisis de tiempos

# Read both measurement files (paths are relative to the working directory),
# taking column names from the header row and letting Spark infer the types.
metrics_append = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("apps_mediciones/metrics_append.csv")
)
metrics_update = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("apps_mediciones/metrics_update_earliest.csv")
)

metrics_append.show(5)
metrics_update.show(5)

In [19]:
# Measurement files of the "_old" runs.
append_old_csv = "/home/jovyan/work/apps_mediciones/metrics_append_old.csv"
update_old_csv = "/home/jovyan/work/apps_mediciones/metrics_update_earliest_old.csv"

metrics_append = spark.read.csv(append_old_csv, header=True, inferSchema=True)
metrics_update = spark.read.csv(update_old_csv, header=True, inferSchema=True)

# Pin the numeric columns to explicit types on both frames.
for column, dtype in (("epoch_id", "long"), ("count", "long"), ("duration_ms", "double")):
    metrics_append = metrics_append.withColumn(column, F.col(column).cast(dtype))
    metrics_update = metrics_update.withColumn(column, F.col(column).cast(dtype))

metrics_append.show(5)
metrics_update.show(5)

+--------+-----+-----------+--------------------+
|epoch_id|count|duration_ms|           timestamp|
+--------+-----+-----------+--------------------+
|       0|    0|   897.2393|2025-11-30 15:49:...|
|       0|    0|   810.1819|2025-11-30 15:53:...|
|       1|    1|   979.5644|2025-11-30 15:53:...|
|       2|  126|   378.7649|2025-11-30 15:53:...|
|       3|   61|   342.4324|2025-11-30 15:53:...|
+--------+-----+-----------+--------------------+
only showing top 5 rows

+--------+------+-----------+--------------------+
|epoch_id| count|duration_ms|           timestamp|
+--------+------+-----------+--------------------+
|       0|135185|  6048.8424|2025-11-30 16:00:...|
|       1|     1|   193.6341|2025-11-30 16:00:...|
|       2|    41|   224.8487|2025-11-30 16:00:...|
|       3|    49|    110.961|2025-11-30 16:00:...|
|       4|    27|   189.1427|2025-11-30 16:00:...|
+--------+------+-----------+--------------------+
only showing top 5 rows



In [21]:
def stats(df, column: str = "duration_ms"):
    """Return the mean and the median of a numeric column of a Spark DataFrame.

    Args:
        df: Spark DataFrame holding the measurements.
        column: Name of the numeric column to summarise. Defaults to
            "duration_ms", the column used by the callers in this notebook.

    Returns:
        A ``(mean, median)`` tuple. The median is ``None`` when
        ``approxQuantile`` yields no value (e.g. an empty DataFrame) instead
        of raising ``IndexError``.
    """
    mean_val = df.select(F.mean(column).alias("mean")).first()["mean"]
    # A relative error of 0.0 requests the exact quantile rather than an approximation.
    quantiles = df.approxQuantile(column, [0.5], 0.0)
    median_val = quantiles[0] if quantiles else None
    return mean_val, median_val

# Summarise both runs; the append metrics are reported as "Optimizado" and the
# update metrics as "No optimizado".
mean_a, median_a = stats(metrics_append)
mean_u, median_u = stats(metrics_update)

for label, mean_ms, median_ms in (
    ("Optimizado", mean_a, median_a),
    ("No optimizado", mean_u, median_u),
):
    print(f"{label}:  media: {mean_ms:.2f} ms, mediana: {median_ms:.2f} ms")


Optimizado:  media: 269.84 ms, mediana: 143.09 ms
No optimizado:  media: 285.72 ms, mediana: 120.36 ms


In [24]:
# Gap between the two means, expressed as a percentage of the "Optimizado" mean.
mejora_pct = (mean_u / mean_a - 1) * 100
print(f"Porcentaje de mejora: {mejora_pct}%")

Porcentaje de mejora: 5.882682399853589%


In [27]:
def _load_metrics(csv_path):
    """Read a metrics CSV and pin its numeric columns to fixed types.

    Both measurement files go through this single helper so they end up with
    the same column types (epoch_id and count as long, duration_ms as double).

    Args:
        csv_path: Path of the CSV file; its first row is used as the header.

    Returns:
        Spark DataFrame with the cast columns; all other columns keep the
        type inferred by Spark.
    """
    return (
        spark.read.csv(csv_path, header=True, inferSchema=True)
        .withColumn("epoch_id", F.col("epoch_id").cast("long"))
        .withColumn("count", F.col("count").cast("long"))
        .withColumn("duration_ms", F.col("duration_ms").cast("double"))
    )


metrics_append = _load_metrics("/home/jovyan/work/apps_mediciones/metrics_append.csv")
metrics_update = _load_metrics("/home/jovyan/work/apps_mediciones/metrics_update_earliest.csv")

metrics_append.show(5)
metrics_update.show(5)

+--------+-----+-----------+--------------------+
|epoch_id|count|duration_ms|           timestamp|
+--------+-----+-----------+--------------------+
|       0|    0|   869.6688|2025-11-30 16:36:...|
|       1|    1|   1086.778|2025-11-30 16:36:...|
|       2|  135|   300.3136|2025-11-30 16:36:...|
|       3|   60|   195.0003|2025-11-30 16:36:...|
|       4|   35|   168.8305|2025-11-30 16:36:...|
+--------+-----+-----------+--------------------+
only showing top 5 rows

+--------+------+-----------+--------------------+
|epoch_id| count|duration_ms|           timestamp|
+--------+------+-----------+--------------------+
|       0|146626|  6662.5408|2025-11-30 16:30:...|
|       1|    10|   220.6046|2025-11-30 16:30:...|
|       2|    45|   248.4224|2025-11-30 16:30:...|
|       3|    50|   205.3467|2025-11-30 16:30:...|
|       4|    44|   168.3689|2025-11-30 16:30:...|
+--------+------+-----------+--------------------+
only showing top 5 rows



In [28]:
# NOTE(review): `stats` is already defined in an earlier cell of this notebook;
# this definition replaces it when the cell runs. Consider keeping a single copy.
def stats(df, column: str = "duration_ms"):
    """Return the mean and the median of a numeric column of a Spark DataFrame.

    Args:
        df: Spark DataFrame holding the measurements.
        column: Name of the numeric column to summarise. Defaults to
            "duration_ms", the column used by the callers in this notebook.

    Returns:
        A ``(mean, median)`` tuple. The median is ``None`` when
        ``approxQuantile`` yields no value (e.g. an empty DataFrame) instead
        of raising ``IndexError``.
    """
    mean_val = df.select(F.mean(column).alias("mean")).first()["mean"]
    # A relative error of 0.0 requests the exact quantile rather than an approximation.
    quantiles = df.approxQuantile(column, [0.5], 0.0)
    median_val = quantiles[0] if quantiles else None
    return mean_val, median_val

# Summarise both runs; the append metrics are reported as "Optimizado" and the
# update metrics as "No optimizado".
mean_a, median_a = stats(metrics_append)
mean_u, median_u = stats(metrics_update)

for label, mean_ms, median_ms in (
    ("Optimizado", mean_a, median_a),
    ("No optimizado", mean_u, median_u),
):
    print(f"{label}: media: {mean_ms:.2f} ms, mediana: {median_ms:.2f} ms")


Optimizado: media: 70.40 ms, mediana: 56.65 ms
No optimizado: media: 82.97 ms, mediana: 60.78 ms


In [29]:
# Gap between the two means, expressed as a percentage of the "Optimizado" mean.
mejora_pct = (mean_u / mean_a - 1) * 100
print(f"Porcentaje de mejora: {mejora_pct}%")

Porcentaje de mejora: 17.857606852207986%
