In [35]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode, count, month, udf, trim, sum as spark_sum
from pyspark.sql.types import StringType, IntegerType
from datetime import datetime

# Crear la sesión de Spark
spark = SparkSession.builder \
    .appName("Anime_Zodiac_Pyspark") \
    .getOrCreate()

print("Sesión de Spark iniciada")

Sesión de Spark iniciada


In [36]:
# Ruta al archivo CSV
# users_path = "../../tests/users-details-2023.csv"
users_path = "../../tests/users-details-2023.csv"


users_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .load(users_path)

users_df.show(20)

+------+---------------+------+--------------------+--------------------+--------------------+------------+----------+--------+---------+-------+-------+-------------+-------------+---------+----------------+
|Mal ID|       Username|Gender|            Birthday|            Location|              Joined|Days Watched|Mean Score|Watching|Completed|On Hold|Dropped|Plan to Watch|Total Entries|Rewatched|Episodes Watched|
+------+---------------+------+--------------------+--------------------+--------------------+------------+----------+--------+---------+-------+-------+-------------+-------------+---------+----------------+
|     1|          Xinil|  Male|1985-03-04T00:00:...|          California|2004-11-05T00:00:...|       142.3|      7.37|     1.0|    233.0|    8.0|   93.0|         64.0|        399.0|     60.0|          8458.0|
|     3|        Aokaado|  Male|                NULL|        Oslo, Norway|2004-11-11T00:00:...|        68.6|      7.34|    23.0|    137.0|   99.0|   44.0|         40

In [37]:
# Ruta al archivo CSV
# animes_path = "../../tests/anime-filtered.csv"
animes_path = "../../tests/anime-filtered.csv"

animes_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .load(animes_path)

animes_df.show(20)

+--------+--------------------+-----+--------------------+--------------------+------------------------------+--------------------+-----+--------+--------------------+-----------+--------------------+--------------------+----------------+-----------+---------------+--------------------+------+----------+-------+---------+--------+---------+-------+-------+
|anime_id|                Name|Score|              Genres|        English name|                 Japanese name|           sypnopsis| Type|Episodes|               Aired|  Premiered|           Producers|           Licensors|         Studios|     Source|       Duration|              Rating|Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|
+--------+--------------------+-----+--------------------+--------------------+------------------------------+--------------------+-----+--------+--------------------+-----------+--------------------+--------------------+----------------+-----------+---------------+----------------

In [38]:
# Función para determinar el signo zodiacal
def get_zodiac_sign(birthday):
    if not birthday:
        return "Unknown"
    try:
        date = datetime.fromisoformat(birthday)
        month, day = date.month, date.day
        zodiac = [
            ("Capricorn", (12, 22), (1, 19)),
            ("Aquarius", (1, 20), (2, 18)),
            ("Pisces", (2, 19), (3, 20)),
            ("Aries", (3, 21), (4, 19)),
            ("Taurus", (4, 20), (5, 20)),
            ("Gemini", (5, 21), (6, 20)),
            ("Cancer", (6, 21), (7, 22)),
            ("Leo", (7, 23), (8, 22)),
            ("Virgo", (8, 23), (9, 22)),
            ("Libra", (9, 23), (10, 22)),
            ("Scorpio", (10, 23), (11, 21)),
            ("Sagittarius", (11, 22), (12, 21)),
        ]
        for sign, start, end in zodiac:
            if (month == start[0] and day >= start[1]) or (month == end[0] and day <= end[1]):
                return sign
    except:
        return "2"
    return "3"

# Registrar la función como UDF en Spark
get_zodiac_sign_udf = udf(get_zodiac_sign, StringType())

In [39]:
# Asociar Signo Zodiacal con género, y contar la cantidad de veces que cada signo consume un género

# Agregar la columna del signo zodiacal
users_df = users_df.withColumn("ZodiacSign", get_zodiac_sign_udf(col("Birthday")))

# Relacionar usuarios con los géneros de anime consumidos
anime_users_df = users_df.join(animes_df, users_df["Mal ID"] == animes_df["anime_id"], "inner").select("ZodiacSign", "Genres")

# Explode de géneros (ya que un anime puede tener múltiples géneros separados por comas)
anime_users_df = anime_users_df.withColumn("Genre", explode(split(col("Genres"), ",")))

# Aplicar trim para eliminar espacios en blanco alrededor de los géneros
anime_users_df = anime_users_df.withColumn("Genre", trim(col("Genre")))
# Contar la cantidad de veces que cada signo consume un género
zodiac_genre_count = anime_users_df.groupBy("ZodiacSign", "Genre").agg(count("*").alias("Count"))

zodiac_genre_count = zodiac_genre_count.sort(col("ZodiacSign").asc(), col("Count").desc())

# Mostrar resultados
zodiac_genre_count.show(50, truncate=False)



+----------+-------------+-----+
|ZodiacSign|Genre        |Count|
+----------+-------------+-----+
|Aquarius  |Comedy       |137  |
|Aquarius  |Action       |93   |
|Aquarius  |Fantasy      |89   |
|Aquarius  |Adventure    |77   |
|Aquarius  |Sci-Fi       |58   |
|Aquarius  |Drama        |58   |
|Aquarius  |Shounen      |49   |
|Aquarius  |Slice of Life|48   |
|Aquarius  |Kids         |44   |
|Aquarius  |School       |41   |
|Aquarius  |Romance      |39   |
|Aquarius  |Hentai       |38   |
|Aquarius  |Supernatural |31   |
|Aquarius  |Music        |30   |
|Aquarius  |Ecchi        |28   |
|Aquarius  |Magic        |27   |
|Aquarius  |Historical   |25   |
|Aquarius  |Mecha        |23   |
|Aquarius  |Mystery      |19   |
|Aquarius  |Parody       |18   |
|Aquarius  |Sports       |17   |
|Aquarius  |Seinen       |17   |
|Aquarius  |Shoujo       |15   |
|Aquarius  |Martial Arts |15   |
|Aquarius  |Super Power  |14   |
|Aquarius  |Game         |12   |
|Aquarius  |Military     |11   |
|Aquarius 

                                                                                

In [40]:
# Escribir el DataFrame en un archivo CSV con el nombre "zodiac_genre_count"
zodiac_genre_count.write.csv("zodiac_genre_count", header=True, mode="overwrite")

                                                                                

In [41]:
# Definir una UDF para extraer el mes a partir del campo "Aired"
def extract_month(aired):
    try:
        # Se asume que el campo "Aired" tiene el formato: "Jan 10, 2020 to Feb 20, 2020"
        # Se toma la primera fecha (antes de "to") y se extrae el mes.
        first_date = aired.split("to")[0].strip()
        dt = datetime.strptime(first_date, "%b %d, %Y")
        return dt.month
    except Exception:
        return None

extract_month_udf = udf(extract_month, IntegerType())

In [42]:
# Agregar la columna "publication_month", parseo, y filtrar aquellos registros sin mes válido
anime_pub_month_df = animes_df.withColumn("publication_month", extract_month_udf(col("Aired"))) \
                   .filter(col("publication_month").isNotNull()) \
                   .select("anime_id", "publication_month")


anime_pub_month_df.show(20)



+--------+-----------------+
|anime_id|publication_month|
+--------+-----------------+
|       1|                4|
|       5|                9|
|       6|                4|
|       7|                7|
|       8|                9|
|      15|                4|
|      16|                4|
|      17|                9|
|      18|                4|
|      19|                4|
|      20|               10|
|      21|               10|
|      22|               10|
|      23|               10|
|      24|               10|
|      25|               10|
|      26|                4|
|      27|                4|
|      28|               10|
|      29|               10|
+--------+-----------------+
only showing top 20 rows



In [55]:
from pyspark.sql.functions import sum as spark_sum

# Realizar el cross join entre anime_pub_month_df y zodiac_genre_count
cross_join_df = zodiac_genre_count.crossJoin(anime_pub_month_df).orderBy("ZodiacSign", "publication_month")

# Agrupar por ZodiacSign y publication_month, y sumar los valores de Count
zodiac_month_count = cross_join_df.groupBy("ZodiacSign", "publication_month") \
    .agg(spark_sum("Count").alias("Counter")).sort(col("ZodiacSign").asc(), col("publication_month").asc())

# Mostrar los resultados
zodiac_month_count.show(50, truncate=False)
# Escribir el DataFrame en un archivo CSV con el nombre "zodiac_month_count"
zodiac_month_count.write.csv("zodiac_month_count", header=True, mode="overwrite")

                                                                                

+----------+-----------------+-------+
|ZodiacSign|publication_month|Counter|
+----------+-----------------+-------+
|Aquarius  |1                |1521888|
|Aquarius  |2                |842310 |
|Aquarius  |3                |1346550|
|Aquarius  |4                |2366490|
|Aquarius  |5                |711666 |
|Aquarius  |6                |829704 |
|Aquarius  |7                |1867980|
|Aquarius  |8                |1031400|
|Aquarius  |9                |895026 |
|Aquarius  |10               |2242722|
|Aquarius  |11               |821682 |
|Aquarius  |12               |1220490|
|Aries     |1                |1460800|
|Aries     |2                |808500 |
|Aries     |3                |1292500|
|Aries     |4                |2271500|
|Aries     |5                |683100 |
|Aries     |6                |796400 |
|Aries     |7                |1793000|
|Aries     |8                |990000 |
|Aries     |9                |859100 |
|Aries     |10               |2152700|
|Aries     |11           

                                                                                

In [None]:
# Detener la sesión de Spark
spark.stop()