In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Crear la sesión de Spark
spark = SparkSession.builder \
    .appName("AnimeQualityAnalysis") \
    .getOrCreate()

print("Sesión de Spark iniciada")

Sesión de Spark iniciada


In [17]:
# Ruta al archivo CSV
input_path = "../../tests/vshort-final_animedataset.csv"  # Cambia esta ruta según corresponda

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .load(input_path)

df.show(20)

+--------+--------+--------+-------+------+--------------------+----+------------+-----+---------+------+----------+--------------------+
|username|anime_id|my_score|user_id|gender|               title|type|      source|score|scored_by|  rank|popularity|               genre|
+--------+--------+--------+-------+------+--------------------+----+------------+-----+---------+------+----------+--------------------+
|karthiga|      21|       9|2255153|Female|           One Piece|  TV|       Manga| 8.54|   423868|  91.0|        35|Action, Adventure...|
|karthiga|      59|       7|2255153|Female|             Chobits|  TV|       Manga| 7.53|   175388|1546.0|       188|Sci-Fi, Comedy, D...|
|karthiga|      74|       7|2255153|Female|        Gakuen Alice|  TV|       Manga| 7.77|    33244| 941.0|      1291|Comedy, School, S...|
|karthiga|     120|       7|2255153|Female|       Fruits Basket|  TV|       Manga| 7.77|   167968| 939.0|       222|Slice of Life, Co...|
|karthiga|     178|       7|225515

In [18]:
# Filtrar filas donde 'source' y 'score' no sean nulos
df_clean = df.filter((col("source").isNotNull()) & (col("score").isNotNull()))

# Convertir 'score' a tipo float, en caso de que no se haya interpretado correctamente
df_clean = df_clean.withColumn("score", col("score").cast("float"))

# Verificar la limpieza
df_clean.show(5, truncate=False)

+--------+--------+--------+-------+------+-------------+----+------+-----+---------+------+----------+---------------------------------------------------------------+
|username|anime_id|my_score|user_id|gender|title        |type|source|score|scored_by|rank  |popularity|genre                                                          |
+--------+--------+--------+-------+------+-------------+----+------+-----+---------+------+----------+---------------------------------------------------------------+
|karthiga|21      |9       |2255153|Female|One Piece    |TV  |Manga |8.54 |423868   |91.0  |35        |Action, Adventure, Comedy, Super Power, Drama, Fantasy, Shounen|
|karthiga|59      |7       |2255153|Female|Chobits      |TV  |Manga |7.53 |175388   |1546.0|188       |Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen                  |
|karthiga|74      |7       |2255153|Female|Gakuen Alice |TV  |Manga |7.77 |33244    |941.0 |1291      |Comedy, School, Shoujo, Super Power                      

In [21]:

# Agrupar por 'source' y calcular el promedio de 'score'
result = df_clean.groupBy("source").agg(avg("score").alias("avg_score"))

# Ordenar los resultados de forma descendente según el promedio obtenido
result_sorted = result.orderBy(col("avg_score").desc())

# Mostrar los resultados finales
result_sorted.show(truncate=False)

+------------+------------------+
|source      |avg_score         |
+------------+------------------+
|Web manga   |8.149166584014893 |
|Manga       |7.773083333174387 |
|Light novel |7.7221848383671094|
|Novel       |7.701379290942488 |
|Original    |7.6814285599141625|
|Other       |7.588888857099745 |
|4-koma manga|7.370952333722796 |
|Game        |7.240833401679993 |
|Visual novel|7.223617066728308 |
|Music       |7.190000057220459 |
|Unknown     |6.778695707735808 |
+------------+------------------+



In [12]:
spark.stop()