In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, round
# Create (or reuse) the Spark session used by the whole notebook.
# Executor and driver resources are requested through the builder config.
spark = (
    SparkSession.builder
    .appName("SistemaRecomendacion_ALS")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "6")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Load the anime catalogue. The first row is the header and column types are
# inferred by Spark. Both the quote and the escape character are set to `"`.
df = (
    spark.read
    .option("delimiter", ",")
    .option("quote", '"')
    .option("escape", '"')
    .csv("data/anime.csv", header=True, inferSchema=True)
)


# Limpieza de los datos

In [2]:
# Preview the anime dataset (only the first 20 rows are printed).
df.show(n=20)

+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+--------------------+-----------+--------------------+--------------------+----------------+-----------+---------------+--------------------+------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+--------+--------+--------+--------+-------+-------+-------+-------+-------+
| ID|                Name|Score|              Genres|        English name|                 Japanese name| Type|Episodes|               Aired|  Premiered|           Producers|           Licensors|         Studios|     Source|       Duration|              Rating|Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10| Score-9| Score-8| Score-7| Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+----------------

In [3]:
# Count the rows whose Type is the literal string "Unknown"
# (this notebook treats that value as a missing type).
df.filter(df["Type"] == "Unknown").count()

37

In [4]:
# List the distinct anime types present in the data (Movie, TV, OVA, ...),
# printing up to 20 values without truncating them.
df.select("Type").distinct().show(n=20, truncate=False)

+-------+
|Type   |
+-------+
|TV     |
|Special|
|Unknown|
|OVA    |
|Music  |
|Movie  |
|ONA    |
+-------+



In [5]:
# Keep only the titles whose Type is "Movie" or "TV".
df_filtered = df.filter(df["Type"].isin("Movie", "TV"))

In [6]:

# Turn the per-score vote-count columns (Score-1 .. Score-10) into real numeric
# columns: the placeholder "Unknown" becomes 0.0 and every other value is cast
# explicitly to double.
#
# Without the cast, when(..., 0).otherwise(<string column>) yields a string
# column, so the later arithmetic and `== 0` comparisons only work through
# Spark's implicit casts. The cast sits in the otherwise() branch, so it is
# applied only to rows that are not "Unknown".
for stars in range(1, 11):
    votes_col = "Score-{}".format(stars)
    df_filtered = df_filtered.withColumn(
        votes_col,
        when(df_filtered[votes_col] == "Unknown", 0.0)
        .otherwise(df_filtered[votes_col].cast("double")),
    )
    

In [7]:
# Show rows whose overall Score is still "Unknown", so the Score-1 .. Score-10
# columns can be inspected after the cleanup step.
df_filtered.filter(df_filtered["Score"] == "Unknown").show(n=20)


+----+--------------------+-------+--------------------+--------------------+-------------------------------------+-----+--------+--------------------+-----------+--------------------+---------+--------------------+---------+---------------+--------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|  ID|                Name|  Score|              Genres|        English name|                        Japanese name| Type|Episodes|               Aired|  Premiered|           Producers|Licensors|             Studios|   Source|       Duration|              Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+----+--------------------+-------+--------------------+--------------------+-------------------------------------+-----+--------+----------

In [8]:
# Show a single anime (ID 1547) as a reference point, to compare against
# the same row after the Score column is recomputed.
df_filtered.filter(df_filtered["ID"] == "1547").show(n=20)

+----+----------------+-------+--------------------+------------+--------------+----+--------+--------------------+-----------+---------+---------+-----------------+------+---------------+------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|  ID|            Name|  Score|              Genres|English name| Japanese name|Type|Episodes|               Aired|  Premiered|Producers|Licensors|          Studios|Source|       Duration|      Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+----+----------------+-------+--------------------+------------+--------------+----+--------+--------------------+-----------+---------+---------+-----------------+------+---------------+------------+-------+----------+-------+---------+--------+---------+-----

In [9]:
# Fill in a missing overall Score with the vote-weighted mean
#   sum(k * votes_k) / sum(votes_k)   for k = 1..10,
# built from the Score-1 .. Score-10 vote-count columns. Rows that already
# have a Score keep it unchanged.
# NOTE(review): rows with zero votes in every bucket divide 0 by 0; a later cell
# filters on Score being null, so this presumably evaluates to null - confirm
# against the Spark configuration in use.
weighted_votes = df_filtered["Score-1"] * 1
total_votes = df_filtered["Score-1"]
for stars in range(2, 11):
    bucket = df_filtered["Score-{}".format(stars)]
    weighted_votes = weighted_votes + bucket * stars
    total_votes = total_votes + bucket

df_filtered = df_filtered.withColumn(
    "Score",
    when(df_filtered["Score"] == "Unknown", weighted_votes / total_votes)
    .otherwise(df_filtered["Score"]),
)

In [10]:
# Round Score to two decimal places.
# `round` here is pyspark.sql.functions.round (imported at the top of the
# notebook), which operates on columns - not Python's builtin round.
df_filtered = df_filtered.withColumn(
    "Score",
    round(df_filtered["Score"], 2),
)

In [11]:
# Show anime ID 1547 again to check that its Score was computed as expected.
df_filtered.filter(df_filtered["ID"] == "1547").show(n=20)

+----+----------------+-----+--------------------+------------+--------------+----+--------+--------------------+-----------+---------+---------+-----------------+------+---------------+------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|  ID|            Name|Score|              Genres|English name| Japanese name|Type|Episodes|               Aired|  Premiered|Producers|Licensors|          Studios|Source|       Duration|      Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+----+----------------+-----+--------------------+------------+--------------+----+--------+--------------------+-----------+---------+---------+-----------------+------+---------------+------------+-------+----------+-------+---------+--------+---------+-------+---

In [12]:
# Manual sanity check of the weighted mean score (left commented out)
# num = (8*10+5*9+6*8+20*7+28*6+31*5+6*4+6*3+2*2+10*1)/(8+5+6+20.0+28+31+6+6+2+10.0)
# num

In [13]:
# Preview the filtered dataset again (first 20 rows).
df_filtered.show(n=20)

+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+--------------------+-----------+--------------------+--------------------+----------------+-----------+---------------+--------------------+------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+--------+--------+--------+--------+-------+-------+-------+-------+-------+
| ID|                Name|Score|              Genres|        English name|                 Japanese name| Type|Episodes|               Aired|  Premiered|           Producers|           Licensors|         Studios|     Source|       Duration|              Rating|Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10| Score-9| Score-8| Score-7| Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+----------------

In [14]:
# Number of rows in the filtered dataset.
df_filtered.count()

8037

In [15]:
# Count the rows that have no votes at all, i.e. every one of the
# Score-1 .. Score-10 vote-count columns equals 0.
no_votes = df_filtered["Score-1"] == 0
for stars in range(2, 11):
    no_votes = no_votes & (df_filtered["Score-{}".format(stars)] == 0)

df_filtered.filter(no_votes).count()

188

In [16]:
# Display the rows that have no votes at all, i.e. every one of the
# Score-1 .. Score-10 vote-count columns equals 0 (first 20 rows).
no_votes = df_filtered["Score-1"] == 0
for stars in range(2, 11):
    no_votes = no_votes & (df_filtered["Score-{}".format(stars)] == 0)

df_filtered.filter(no_votes).show(n=20)

+-----+--------------------+-----+--------------------+--------------------+----------------------------------+-----+--------+--------------+-----------+--------------------+----------+------------+------------+--------+--------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|   ID|                Name|Score|              Genres|        English name|                     Japanese name| Type|Episodes|         Aired|  Premiered|           Producers| Licensors|     Studios|      Source|Duration|              Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+-----+--------------------+-----+--------------------+--------------------+----------------------------------+-----+--------+--------------+-----------+--------------------+--------

In [17]:
# Alternative view of the rows without ratings: those whose Score is null.
df_filtered.where(df_filtered["Score"].isNull()).show(n=20)

+-----+--------------------+-----+--------------------+--------------------+----------------------------------+-----+--------+--------------+-----------+--------------------+----------+------------+------------+--------+--------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|   ID|                Name|Score|              Genres|        English name|                     Japanese name| Type|Episodes|         Aired|  Premiered|           Producers| Licensors|     Studios|      Source|Duration|              Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+-----+--------------------+-----+--------------------+--------------------+----------------------------------+-----+--------+--------------+-----------+--------------------+--------

In [18]:
# Keep only the anime that have a rating, i.e. a non-null Score.
df_filtered = df_filtered.where(df_filtered["Score"].isNotNull())

## Entrenamiento del Algoritmo ALS

In [41]:
# Load the full ratings table (the CSV has a header row; types are inferred).
df_ratings = spark.read.csv(
    "data/rating_complete.csv",
    header=True,
    sep=",",
    encoding="UTF-8",
    inferSchema=True,
)

# Load the extra ratings file (described by the author as the ratings of user
# 66666). It has no header row, so Spark names the columns _c0, _c1, _c2;
# rename them to user_id / anime_id / rating.
df_ratings_ep = (
    spark.read.csv(
        "data/valoraciones_EP.csv",
        header=False,
        sep=",",
        encoding="UTF-8",
        inferSchema=True,
    )
    .withColumnRenamed("_c0", "user_id")
    .withColumnRenamed("_c1", "anime_id")
    .withColumnRenamed("_c2", "rating")
)

In [20]:
# Print the row count of the full ratings table, then of the extra ratings file.
print("El numero de valoraciones: {}".format(df_ratings.count()))
print("El numero de valoraciones: {}".format(df_ratings_ep.count()))

El numero de valoraciones: 57633278
El numero de valoraciones: 65


In [44]:
# Rename the anime table's "Rating" column to "Category_rating".
# NOTE(review): presumably done so it does not clash with the per-user "rating"
# column when the two tables are joined later - confirm.
df_filtered = df_filtered.withColumnRenamed(
    existing="Rating",
    new="Category_rating",
)

In [47]:
# Append the extra ratings to the full ratings table.
# union() matches columns by position, so both frames must list their columns
# in the same order.
df_ratings_completo = df_ratings.union(
    df_ratings_ep
)

In [48]:
# Preview the combined ratings (first 20 rows).
df_ratings_completo.show(n=20)

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      0|     430|   9.0|
|      0|    1004|   5.0|
|      0|    3010|   7.0|
|      0|     570|   7.0|
|      0|    2762|   9.0|
|      0|     431|   8.0|
|      0|     578|  10.0|
|      0|     433|   6.0|
|      0|    1571|  10.0|
|      0|     121|   9.0|
|      0|     356|   9.0|
|      0|    1250|   7.0|
|      0|    2913|   6.0|
|      0|    1689|   6.0|
|      0|      68|   6.0|
|      0|    1829|   7.0|
|      0|     600|   6.0|
|      0|    3418|   9.0|
|      0|     164|   8.0|
|      0|    1894|   7.0|
+-------+--------+------+
only showing top 20 rows



In [23]:
# Total number of ratings after the union.
df_ratings_completo.count()

57633343

In [49]:
# Inner-join every rating with its anime row (anime_id == ID), so each rating
# carries the anime's Type and movies can be told apart from TV series.
# Ratings whose anime is not in df_filtered are dropped by the inner join.
df_ratings_completo_movies = df_ratings_completo.join(
    df_filtered,
    on=df_ratings_completo["anime_id"] == df_filtered["ID"],
    how="inner",
)

In [50]:
# Preview the joined ratings + anime metadata (first 20 rows).
df_ratings_completo_movies.show(n=20)

+-------+--------+------+----+--------------------+-----+--------------------+--------------------+--------------------------------------+-----+--------+--------------------+-----------+--------------------+--------------------+--------------------+------------+---------------+--------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+--------+--------+--------+-------+-------+-------+-------+-------+-------+
|user_id|anime_id|rating|  ID|                Name|Score|              Genres|        English name|                         Japanese name| Type|Episodes|               Aired|  Premiered|           Producers|           Licensors|             Studios|      Source|       Duration|     Category_rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10| Score-9| Score-8| Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+-------+--------+------+----+--------------------+-----+---

In [51]:
# Ratings of movies only, reduced to the (user_id, anime_id, rating) triple.
df_ratings_movies = (
    df_ratings_completo_movies
    .where(df_ratings_completo_movies["Type"] == "Movie")
    .select("user_id", "anime_id", "rating")
)

In [52]:
# Preview the movie ratings (first 20 rows).
df_ratings_movies.show(n=20)

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      0|     430|   9.0|
|      0|     570|   7.0|
|      0|     431|   8.0|
|      0|     578|  10.0|
|      0|     433|   6.0|
|      0|    1689|   6.0|
|      0|    1829|   7.0|
|      0|     164|   8.0|
|      0|    1894|   7.0|
|      0|     415|  10.0|
|      0|    2236|  10.0|
|      0|     199|   8.0|
|      0|     459|   9.0|
|      1|   32281|  10.0|
|      1|   28851|   9.0|
|      1|   38826|   8.0|
|      1|     442|   7.0|
|      1|     936|   6.0|
|      1|    2144|   6.0|
|      1|   28755|   8.0|
+-------+--------+------+
only showing top 20 rows



In [53]:
# Ratings of TV series only, reduced to the (user_id, anime_id, rating) triple.
df_ratings_series = (
    df_ratings_completo_movies
    .where(df_ratings_completo_movies["Type"] == "TV")
    .select("user_id", "anime_id", "rating")
)

In [54]:
# Preview the TV-series ratings (first 20 rows).
df_ratings_series.show(n=20)

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      0|    3010|   7.0|
|      0|    2762|   9.0|
|      0|    1571|  10.0|
|      0|     121|   9.0|
|      0|     356|   9.0|
|      0|    1250|   7.0|
|      0|    2913|   6.0|
|      0|      68|   6.0|
|      0|     600|   6.0|
|      0|    3418|   9.0|
|      0|    2034|   8.0|
|      0|    2547|   7.0|
|      0|     169|   7.0|
|      0|     174|   4.0|
|      0|    2543|   7.0|
|      0|    4086|   6.0|
|      0|     419|   8.0|
|      1|   22535|   9.0|
|      1|   38000|   9.0|
|      1|   18679|   6.0|
+-------+--------+------+
only showing top 20 rows

