In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg

# Crear la sesión de Spark
spark = SparkSession.builder \
    .appName("AnimeQualityAnalysis") \
    .getOrCreate()

print("Sesión de Spark iniciada")

Sesión de Spark iniciada


In [9]:
# Ruta al archivo CSV
# Ej. input_path = "../../tests/vshort-final_animedataset.csv"  # Cambia esta ruta según corresponda
input_path = "../../tests/final_animedataset.csv"  # Cambia esta ruta según corresponda

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .load(input_path)

df.show(20)

+--------+--------+--------+-------+------+--------------------+----+------------+-----+---------+------+----------+--------------------+
|username|anime_id|my_score|user_id|gender|               title|type|      source|score|scored_by|  rank|popularity|               genre|
+--------+--------+--------+-------+------+--------------------+----+------------+-----+---------+------+----------+--------------------+
|karthiga|      21|       9|2255153|Female|           One Piece|  TV|       Manga| 8.54|   423868|  91.0|        35|Action, Adventure...|
|karthiga|      59|       7|2255153|Female|             Chobits|  TV|       Manga| 7.53|   175388|1546.0|       188|Sci-Fi, Comedy, D...|
|karthiga|      74|       7|2255153|Female|        Gakuen Alice|  TV|       Manga| 7.77|    33244| 941.0|      1291|Comedy, School, S...|
|karthiga|     120|       7|2255153|Female|       Fruits Basket|  TV|       Manga| 7.77|   167968| 939.0|       222|Slice of Life, Co...|
|karthiga|     178|       7|225515

In [10]:
# Filtrar filas donde 'source' y 'score' no sean nulos
df_clean = df.filter((col("source").isNotNull()) & (col("score").isNotNull()))

# Convertir 'score' a tipo float, en caso de que no se haya interpretado correctamente
df_clean = df_clean.withColumn("score", col("score").cast("float"))

# Verificar la limpieza
df_clean.show(5, truncate=False)

+--------+--------+--------+-------+------+-------------+----+------+-----+---------+------+----------+---------------------------------------------------------------+
|username|anime_id|my_score|user_id|gender|title        |type|source|score|scored_by|rank  |popularity|genre                                                          |
+--------+--------+--------+-------+------+-------------+----+------+-----+---------+------+----------+---------------------------------------------------------------+
|karthiga|21      |9       |2255153|Female|One Piece    |TV  |Manga |8.54 |423868   |91.0  |35        |Action, Adventure, Comedy, Super Power, Drama, Fantasy, Shounen|
|karthiga|59      |7       |2255153|Female|Chobits      |TV  |Manga |7.53 |175388   |1546.0|188       |Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen                  |
|karthiga|74      |7       |2255153|Female|Gakuen Alice |TV  |Manga |7.77 |33244    |941.0 |1291      |Comedy, School, Shoujo, Super Power                      

In [11]:

# Agrupar por 'source' y calcular el promedio de 'score'
result = df_clean.groupBy("source").agg(avg("score").alias("avg_score"))

# Ordenar los resultados de forma descendente según el promedio obtenido
result_sorted = result.orderBy(col("source").asc())

# Mostrar los resultados finales
result_sorted.show(truncate=False)



+-------------+------------------+
|source       |avg_score         |
+-------------+------------------+
|4-koma manga |7.462260465025232 |
|Book         |7.336437238219163 |
|Card game    |6.870959485497469 |
|Digital manga|6.27403482157998  |
|Game         |7.033857693863417 |
|Light novel  |7.547666367656115 |
|Manga        |7.645934224868701 |
|Music        |7.302314167330708 |
|Novel        |7.7962847203267085|
|Original     |7.520261100201623 |
|Other        |6.985161060025731 |
|Picture book |6.936468187449178 |
|Radio        |6.311972300919272 |
|Unknown      |6.785835727731967 |
|Visual novel |7.19380956277404  |
|Web manga    |7.694903890362806 |
+-------------+------------------+



                                                                                

In [12]:
# Escribir el DataFrame en un archivo CSV con el nombre "result_by_source"
result_sorted.write.csv("result_by_source", header=True, mode="overwrite")

                                                                                

In [5]:
spark.stop()