In [50]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, explode, count, month, udf
from pyspark.sql.types import StringType
from datetime import datetime

# Crear la sesión de Spark
spark = SparkSession.builder \
    .appName("Anime_Zodiac_Pyspark") \
    .getOrCreate()

print("Sesión de Spark iniciada")

25/04/03 03:23:35 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


Sesión de Spark iniciada


In [None]:
# Ruta al archivo CSV
# users_path = "../../tests/users-details-2023.csv"
users_path = "../../tests/vshort-users-details-2023.csv"


users_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .load(users_path)

users_df.show(20)

+------+---------------+------+--------------------+--------------------+--------------------+------------+----------+--------+---------+-------+-------+-------------+-------------+---------+----------------+
|Mal ID|       Username|Gender|            Birthday|            Location|              Joined|Days Watched|Mean Score|Watching|Completed|On Hold|Dropped|Plan to Watch|Total Entries|Rewatched|Episodes Watched|
+------+---------------+------+--------------------+--------------------+--------------------+------------+----------+--------+---------+-------+-------+-------------+-------------+---------+----------------+
|     1|          Xinil|  Male|1985-03-04T00:00:...|          California|2004-11-05T00:00:...|       142.3|      7.37|     1.0|    233.0|    8.0|   93.0|         64.0|        399.0|     60.0|          8458.0|
|     3|        Aokaado|  Male|                NULL|        Oslo, Norway|2004-11-11T00:00:...|        68.6|      7.34|    23.0|    137.0|   99.0|   44.0|         40

In [None]:
# Ruta al archivo CSV
# animes_path = "../../tests/anime-filtered.csv"
animes_path = "../../tests/vshort-anime-filtered.csv"

animes_df = spark.read.format("csv") \
    .option("header", "true") \
    .option("quote", "\"") \
    .option("escape", "\"") \
    .load(animes_path)

animes_df.show(20)

+--------+--------------------+-----+--------------------+--------------------+------------------------------+--------------------+-----+--------+--------------------+-----------+--------------------+--------------------+----------------+-----------+---------------+--------------------+------+----------+-------+---------+--------+---------+-------+-------+
|anime_id|                Name|Score|              Genres|        English name|                 Japanese name|           sypnopsis| Type|Episodes|               Aired|  Premiered|           Producers|           Licensors|         Studios|     Source|       Duration|              Rating|Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|
+--------+--------------------+-----+--------------------+--------------------+------------------------------+--------------------+-----+--------+--------------------+-----------+--------------------+--------------------+----------------+-----------+---------------+----------------

In [53]:
# Función para determinar el signo zodiacal
def get_zodiac_sign(birthday):
    if not birthday:
        return "Unknown"
    try:
        date = datetime.fromisoformat(birthday)
        month, day = date.month, date.day
        zodiac = [
            ("Capricorn", (12, 22), (1, 19)),
            ("Aquarius", (1, 20), (2, 18)),
            ("Pisces", (2, 19), (3, 20)),
            ("Aries", (3, 21), (4, 19)),
            ("Taurus", (4, 20), (5, 20)),
            ("Gemini", (5, 21), (6, 20)),
            ("Cancer", (6, 21), (7, 22)),
            ("Leo", (7, 23), (8, 22)),
            ("Virgo", (8, 23), (9, 22)),
            ("Libra", (9, 23), (10, 22)),
            ("Scorpio", (10, 23), (11, 21)),
            ("Sagittarius", (11, 22), (12, 21)),
        ]
        for sign, start, end in zodiac:
            if (month == start[0] and day >= start[1]) or (month == end[0] and day <= end[1]):
                return sign
    except:
        return "2"
    return "3"

# Registrar la función como UDF en Spark
get_zodiac_sign_udf = udf(get_zodiac_sign, StringType())

In [54]:
# Agregar la columna del signo zodiacal
users_df = users_df.withColumn("ZodiacSign", get_zodiac_sign_udf(col("Birthday")))

# Relacionar usuarios con los géneros de anime consumidos
anime_users_df = users_df.join(animes_df, users_df["Mal ID"] == animes_df["anime_id"], "inner").select("ZodiacSign", "Genres")

# Explode de géneros (ya que un anime puede tener múltiples géneros separados por comas)
anime_users_df = anime_users_df.withColumn("Genre", explode(split(col("Genres"), ",")))

# Contar la cantidad de veces que cada signo consume un género
zodiac_genre_count = anime_users_df.groupBy("ZodiacSign", "Genre").agg(count("*").alias("Count"))

# Mostrar resultados
zodiac_genre_count.show(50, truncate=False)



+-----------+--------------+-----+
|ZodiacSign |Genre         |Count|
+-----------+--------------+-----+
|Pisces     | Magic        |13   |
|Cancer     | Sports       |11   |
|Gemini     |Sci-Fi        |15   |
|Capricorn  | Kids         |36   |
|Libra      | Yuri         |1    |
|Virgo      |Magic         |4    |
|Libra      | Josei        |1    |
|Libra      |Hentai        |36   |
|Pisces     | Dementia     |4    |
|Capricorn  | Vampire      |2    |
|Libra      | Kids         |27   |
|Cancer     | Kids         |32   |
|Virgo      |Seinen        |1    |
|Unknown    | School       |928  |
|Cancer     | School       |38   |
|Taurus     |Adventure     |32   |
|Cancer     | Demons       |12   |
|Leo        | Shounen Ai   |2    |
|Gemini     |Ecchi         |2    |
|Aquarius   | Dementia     |1    |
|Capricorn  | Shounen      |44   |
|Aries      | Sports       |13   |
|Cancer     | Martial Arts |11   |
|Aries      | Yuri         |2    |
|Unknown    | Cars         |34   |
|Unknown    |Game   

                                                                                

In [55]:
# Escribir el DataFrame en un archivo CSV con el nombre "zodiac_genre_count"
zodiac_genre_count.write.csv("zodiac_genre_count", header=True, mode="overwrite")

                                                                                

In [56]:
from pyspark.sql.functions import regexp_extract, month, to_date

# Extraer la primera fecha del campo "Aired"
animes_df = animes_df.withColumn("FirstAired", regexp_extract(col("Aired"), r"([A-Za-z]+) \d{1,2}, \d{4}", 1))

# Convertir el mes a número
month_mapping = {
    "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6,
    "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12
}
from pyspark.sql.functions import create_map, lit
from itertools import chain

month_expr = create_map([lit(x) for x in chain(*month_mapping.items())])
animes_df = animes_df.withColumn("MonthPublished", month_expr[col("FirstAired")])

# Continuar con el procesamiento
zodiac_month_df = users_df.join(animes_df, users_df["Mal ID"] == animes_df["anime_id"], "inner") \
                          .select("ZodiacSign", "MonthPublished")

zodiac_month_count = zodiac_month_df.groupBy("ZodiacSign", "MonthPublished").agg(count("*").alias("Count"))

zodiac_month_count.show(50, truncate=False)




+-----------+--------------+-----+
|ZodiacSign |MonthPublished|Count|
+-----------+--------------+-----+
|Unknown    |11            |430  |
|Virgo      |NULL          |42   |
|Scorpio    |4             |53   |
|Aries      |4             |37   |
|Unknown    |6             |438  |
|Aries      |11            |20   |
|Taurus     |12            |38   |
|Libra      |9             |24   |
|Aquarius   |4             |64   |
|Sagittarius|11            |17   |
|Capricorn  |5             |9    |
|Leo        |9             |19   |
|Virgo      |6             |19   |
|Libra      |3             |39   |
|Capricorn  |11            |28   |
|Aries      |10            |56   |
|Aquarius   |2             |25   |
|Libra      |2             |19   |
|Leo        |11            |14   |
|Scorpio    |2             |21   |
|Libra      |5             |19   |
|Capricorn  |10            |44   |
|Unknown    |7             |974  |
|Aquarius   |3             |21   |
|Gemini     |12            |28   |
|Scorpio    |12     

                                                                                

In [57]:
# Escribir el DataFrame en un archivo CSV con el nombre "zodiac_month_count"
zodiac_month_count.write.csv("zodiac_month_count", header=True, mode="overwrite")

                                                                                

In [58]:
# Detener la sesión de Spark
spark.stop()