In [11]:
from pyspark.sql import SparkSession

# Build the Spark session for this analysis, reusing an existing one if the
# kernel already has it.
spark = (
    SparkSession.builder
    .appName("AnimeQualityAnalysis")
    .getOrCreate()
)

# Confirmation message so the reader can see the session is ready.
print("Sesión de Spark iniciada")

25/04/03 02:26:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Sesión de Spark iniciada


In [12]:
# Path to the CSV file (adjust it to match your environment).
input_path = "../../tests/final_animedataset.csv"

# Load the CSV into a DataFrame. The first row is used as the header, and the
# double quote serves both as the quoting character and as the escape
# character for quotes embedded inside a field.
df = (
    spark.read.format("csv")
    .options(header="true", quote="\"", escape="\"")
    .load(input_path)
)

# Register the DataFrame as the temporary view "anime" so it can be queried
# with Spark SQL.
df.createOrReplaceTempView("anime")

# Preview the first rows, untruncated, to verify the load.
df.show(5, truncate=False)

+--------+--------+--------+-------+------+-------------+----+------+-----+---------+------+----------+---------------------------------------------------------------+
|username|anime_id|my_score|user_id|gender|title        |type|source|score|scored_by|rank  |popularity|genre                                                          |
+--------+--------+--------+-------+------+-------------+----+------+-----+---------+------+----------+---------------------------------------------------------------+
|karthiga|21      |9       |2255153|Female|One Piece    |TV  |Manga |8.54 |423868   |91.0  |35        |Action, Adventure, Comedy, Super Power, Drama, Fantasy, Shounen|
|karthiga|59      |7       |2255153|Female|Chobits      |TV  |Manga |7.53 |175388   |1546.0|188       |Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen                  |
|karthiga|74      |7       |2255153|Female|Gakuen Alice |TV  |Manga |7.77 |33244    |941.0 |1291      |Comedy, School, Shoujo, Super Power                      

In [13]:
# SQL query: keep only rows where both 'source' and 'score' are non-null,
# cast 'score' to FLOAT, group by 'source' and compute the average score per
# group. The result is sorted by 'source' in ascending (alphabetical) order,
# not by the average.
# NOTE(review): FLOAT is presumably single precision in Spark SQL, which may
# explain the long fractional tails in avg_score; consider DOUBLE — confirm
# that downstream consumers do not depend on the exact current values.
query = """
SELECT source, AVG(CAST(score AS FLOAT)) AS avg_score
FROM anime
WHERE source IS NOT NULL AND score IS NOT NULL
GROUP BY source
ORDER BY source ASC
"""

# Run the SQL query against the "anime" temporary view.
result = spark.sql(query)

# Display the result without truncating column values.
result.show(truncate=False)



+-------------+------------------+
|source       |avg_score         |
+-------------+------------------+
|4-koma manga |7.463999112496897 |
|Book         |7.339872149042031 |
|Card game    |6.874629466895309 |
|Digital manga|6.286162937030009 |
|Game         |7.0350680953706926|
|Light novel  |7.5476944013984655|
|Manga        |7.644926472263997 |
|Music        |7.3084495014743185|
|Novel        |7.798500644457453 |
|Original     |7.524421412116416 |
|Other        |6.990557157227344 |
|Picture book |6.947453134982346 |
|Radio        |6.307572355795087 |
|Unknown      |6.7880745587859685|
|Visual novel |7.197369146645407 |
|Web manga    |7.693994017141883 |
+-------------+------------------+



                                                                                

In [14]:
# Persist the aggregated result as CSV, with a header row, under the path
# "result_by_sql", replacing any output left there by a previous run.
(
    result.write
    .mode("overwrite")
    .option("header", True)
    .csv("result_by_sql")
)

                                                                                

In [15]:
# Stop the Spark session; `spark` cannot be used by later cells after this.
spark.stop()