In [44]:
"""
	Autores:
	- Astrid Alvarado
	- Junior Lara
	- Fredthery Castro
"""

# Asegurese de seleccionar el Kernel de Python 3, puede depender de su entorno.

from pyspark.sql import SparkSession

In [60]:
spark = SparkSession\
    .builder\
    .appName("Solver problema 3 - SQL")\
    .getOrCreate()

In [90]:
# ITEM 1: Ver el comportamiento de la popularidad en función de la distribución.
# Archivo necesiario: final_animedataset.csv
# Campos necesarios:
# 	- title
# 	- gender
# 	- popularity

# Direccionio del archivo
# Ej. file = "../../tests/vshort-final_animedataset.csv"
file = "../../tests/vshort-final_animedataset.csv"

final_animedataset = spark\
	.read\
	.format("csv")\
	.option("header", "true")\
	.load(file)

final_animedataset.show(20)

# Se genera la VIEW
final_animedataset.createOrReplaceTempView("final_animedataset")

+--------+--------+--------+-------+------+--------------------+----+------------+-----+---------+------+----------+--------------------+
|username|anime_id|my_score|user_id|gender|               title|type|      source|score|scored_by|  rank|popularity|               genre|
+--------+--------+--------+-------+------+--------------------+----+------------+-----+---------+------+----------+--------------------+
|karthiga|      21|       9|2255153|Female|           One Piece|  TV|       Manga| 8.54|   423868|  91.0|        35|Action, Adventure...|
|karthiga|      59|       7|2255153|Female|             Chobits|  TV|       Manga| 7.53|   175388|1546.0|       188|Sci-Fi, Comedy, D...|
|karthiga|      74|       7|2255153|Female|        Gakuen Alice|  TV|       Manga| 7.77|    33244| 941.0|      1291|Comedy, School, S...|
|karthiga|     120|       7|2255153|Female|       Fruits Basket|  TV|       Manga| 7.77|   167968| 939.0|       222|Slice of Life, Co...|
|karthiga|     178|       7|225515

In [91]:
# Calcular la cantidad de usuarios por género, la popularidad y los porcentajes
count_female_male_by_title = spark.sql("""
    SELECT 
        title,
        COUNT(CASE WHEN gender = 'Female' THEN 1 END) AS female_count,
        COUNT(CASE WHEN gender = 'Male' THEN 1 END) AS male_count,
        FIRST(popularity) AS popularity,
        ROUND(COUNT(CASE WHEN gender = 'Female' THEN 1 END) * 100.0 / 
              (COUNT(CASE WHEN gender = 'Female' THEN 1 END) + COUNT(CASE WHEN gender = 'Male' THEN 1 END)), 2) AS female_percentage,
        ROUND(COUNT(CASE WHEN gender = 'Male' THEN 1 END) * 100.0 / 
              (COUNT(CASE WHEN gender = 'Female' THEN 1 END) + COUNT(CASE WHEN gender = 'Male' THEN 1 END)), 2) AS male_percentage
    FROM 
        final_animedataset
    GROUP BY 
        title
""")

# Mostrar los resultados
count_female_male_by_title.show(20)

count_female_male_by_title.createOrReplaceGlobalTempView("count_female_male_by_title")

+--------------------+------------+----------+----------+-----------------+---------------+
|               title|female_count|male_count|popularity|female_percentage|male_percentage|
+--------------------+------------+----------+----------+-----------------+---------------+
|"""Bungaku Shoujo...|           1|         0|      1556|           100.00|           0.00|
|"Gyakuten Saiban:...|           1|         1|    5839.0|            50.00|          50.00|
|"Naruto Shippuude...|           1|         0|      1771|           100.00|           0.00|
|        .hack//Roots|           0|         1|      1391|             0.00|         100.00|
|         .hack//Sign|           0|         1|       650|             0.00|         100.00|
|.hack//Tasogare n...|           0|         1|      1474|             0.00|         100.00|
|              11eyes|           1|         0|       487|           100.00|           0.00|
|12-sai.: Chicchan...|           1|         0|      2789|           100.00|     

In [78]:
# ITEM 2: Ver la tendencia de géneros de anime en función de la distribución.
# Archivo necesiario: final_animedataset.csv
# Campos necesarios:
# 	- gender
# 	- genre

# Calcular la cantidad de usuarios por género y los porcentajes para cada género
count_female_male_by_genres = spark.sql("""
    SELECT 
        genre,
        COUNT(CASE WHEN gender = 'Female' THEN 1 END) AS female_count,
        COUNT(CASE WHEN gender = 'Male' THEN 1 END) AS male_count,
        ROUND(COUNT(CASE WHEN gender = 'Female' THEN 1 END) * 100.0 / 
              (COUNT(CASE WHEN gender = 'Female' THEN 1 END) + COUNT(CASE WHEN gender = 'Male' THEN 1 END)), 2) AS female_percentage,
        ROUND(COUNT(CASE WHEN gender = 'Male' THEN 1 END) * 100.0 / 
              (COUNT(CASE WHEN gender = 'Female' THEN 1 END) + COUNT(CASE WHEN gender = 'Male' THEN 1 END)), 2) AS male_percentage
    FROM (
        SELECT 
            gender,
            EXPLODE(SPLIT(genre, ', ')) AS genre
        FROM 
            final_animedataset
    ) exploded_genres
    GROUP BY 
        genre
""")

# Mostrar los resultados
count_female_male_by_genres.show(20)

+------------+------------+----------+-----------------+---------------+
|       genre|female_count|male_count|female_percentage|male_percentage|
+------------+------------+----------+-----------------+---------------+
|      Seinen|          82|        16|            83.67|          16.33|
|     Romance|         241|        58|            80.60|          19.40|
|    Dementia|          18|         7|            72.00|          28.00|
|       Magic|          60|        26|            69.77|          30.23|
|    Thriller|          40|        15|            72.73|          27.27|
|       Josei|          21|         4|            84.00|          16.00|
|   Adventure|         124|        78|            61.39|          38.61|
| Super Power|          64|        45|            58.72|          41.28|
|Martial Arts|          23|         6|            79.31|          20.69|
|      Sports|          29|         7|            80.56|          19.44|
|      Shoujo|          74|        11|            8

In [79]:
# ITEM 3: Ver si existe relación acerca de las preferencias de la distribución con respecto al origen (source) de los animes que consumen.
# Archivo necesiario: final_animedataset.csv
# Campos necesarios:
# 	- source
# 	- gender

# Calcular la cantidad de usuarios por género y los porcentajes para cada source
count_female_male_by_source = spark.sql("""
    SELECT 
        source,
        COUNT(CASE WHEN gender = 'Female' THEN 1 END) AS female_count,
        COUNT(CASE WHEN gender = 'Male' THEN 1 END) AS male_count,
        ROUND(COUNT(CASE WHEN gender = 'Female' THEN 1 END) * 100.0 / 
              (COUNT(CASE WHEN gender = 'Female' THEN 1 END) + COUNT(CASE WHEN gender = 'Male' THEN 1 END)), 2) AS female_percentage,
        ROUND(COUNT(CASE WHEN gender = 'Male' THEN 1 END) * 100.0 / 
              (COUNT(CASE WHEN gender = 'Female' THEN 1 END) + COUNT(CASE WHEN gender = 'Male' THEN 1 END)), 2) AS male_percentage
    FROM 
        final_animedataset
    GROUP BY 
        source
""")

# Mostrar los resultados
count_female_male_by_source.show(20)

+------------+------------+----------+-----------------+---------------+
|      source|female_count|male_count|female_percentage|male_percentage|
+------------+------------+----------+-----------------+---------------+
|          TV|           1|         1|            50.00|          50.00|
|Visual novel|          36|        11|            76.60|          23.40|
|    Original|         136|        81|            62.67|          37.33|
|       Novel|          43|        15|            74.14|          25.86|
|     Unknown|          23|         0|           100.00|           0.00|
|       Other|           7|         2|            77.78|          22.22|
|       Manga|         374|       106|            77.92|          22.08|
|4-koma manga|          20|         1|            95.24|           4.76|
|       Music|           1|         1|            50.00|          50.00|
|        Game|           7|         3|            70.00|          30.00|
|   Web manga|          11|         1|            9

In [96]:
# ITEM 4: Ver si es posible detectar un patrón referente a qué estudio es más propenso a sacar los tipos de animes que están en estas características.
# Archivo necesiario: anime-dataset-2023.csv
# Campos necesarios:
# - 

file = "../../tests/anime-dataset-2023.csv"

anime_dataset = spark\
    .read\
    .format("csv")\
    .option("header", "true")\
    .option("quote", "\"")\
    .option("escape", "\"")\
    .option("multiLine", "true")\
    .load(file)

anime_dataset.show()

# Se genera la VIEW
anime_dataset.createOrReplaceTempView("anime_dataset")

+--------+--------------------+--------------------+------------------------------+-----+--------------------+--------------------+-----+--------+--------------------+-----------+----------------+--------------------+--------------------+----------------+-----------+-------------+--------------------+------+----------+---------+---------+-------+--------------------+
|anime_id|                Name|        English name|                    Other name|Score|              Genres|            Synopsis| Type|Episodes|               Aired|  Premiered|          Status|           Producers|           Licensors|         Studios|     Source|     Duration|              Rating|  Rank|Popularity|Favorites|Scored By|Members|           Image URL|
+--------+--------------------+--------------------+------------------------------+-----+--------------------+--------------------+-----+--------+--------------------+-----------+----------------+--------------------+--------------------+----------------+-----

In [97]:
# Obtener los animes que ha hecho cada estudio
animes_por_estudio = spark.sql("""
    SELECT 
        Studios AS studio,
        COLLECT_SET(Name) AS animes,
        COLLECT_SET(Source) AS sources,
        COLLECT_SET(Genres) AS genres
    FROM anime_dataset
    WHERE Studios IS NOT NULL
    GROUP BY Studios
""")
animes_por_estudio.createOrReplaceTempView("animes_por_estudio")
animes_por_estudio.show()

+--------------------+--------------------+--------------------+--------------------+
|              studio|              animes|             sources|              genres|
+--------------------+--------------------+--------------------+--------------------+
|             10Gauge|[Tobidasu PriPara...|[Manga, Other, Ga...|[Comedy, Sci-Fi, ...|
|10Gauge, Studio D...|         [Heikousen]|          [Original]|           [Romance]|
|                 1IN|       [Renwu Dashi]|          [Original]|[Action, Adventur...|
|   2:10 AM Animation|[Zhu Tian Ji, Wan...|[Web novel, Manga...|[Action, Adventur...|
|       33 Collective|[Chara to Otamaja...|          [Original]|[Adventure, Fantasy]|
|              5 Inc.|       [Make My Day]|          [Original]|  [Horror, Suspense]|
|              6pucks|[Time Machine, Ch...|          [Original]|[Award Winning, D...|
|                7doc|[AOTU Xueyuan, AO...|         [Web manga]|[Action, Adventur...|
|                8bit|[Yama no Susume S...|[Visual nov

In [98]:
# Añadir columnas de puntaje masculino y femenino, reemplazando NULL con 0
animes_por_estudio_con_puntajes = spark.sql("""
    SELECT 
        a.studio,
        a.animes,
        a.sources,
        a.genres,
        COALESCE(ROUND(AVG(CASE WHEN f.gender = 'Male' THEN f.my_score END), 2), 0) AS male_score,
        COALESCE(ROUND(AVG(CASE WHEN f.gender = 'Female' THEN f.my_score END), 2), 0) AS female_score
    FROM animes_por_estudio a
    LEFT JOIN final_animedataset f
    ON ARRAY_CONTAINS(a.animes, f.title)
    GROUP BY a.studio, a.animes, a.sources, a.genres
""")

# Mostrar los resultados
animes_por_estudio_con_puntajes.show(truncate=False)

+----------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [59]:
spark.stop()