In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, round
# Inicializamos la sesion de Spark
spark = SparkSession.builder.appName("SistemaRecomendacion_ALS").config("spark.executor.memory", "8g").config("spark.executor.cores", "6").config("spark.driver.memory","8g").getOrCreate()

# Cargamos el dataset
df = spark.read.option("delimiter", ",").option("quote", '"').option("escape", '"').csv("data/anime.csv", header=True, inferSchema=True)


# Limpieza de los datos

In [2]:
# Visualizamos todo el dataset de animes
df.show()

+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+--------------------+-----------+--------------------+--------------------+----------------+-----------+---------------+--------------------+------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+--------+--------+--------+--------+-------+-------+-------+-------+-------+
| ID|                Name|Score|              Genres|        English name|                 Japanese name| Type|Episodes|               Aired|  Premiered|           Producers|           Licensors|         Studios|     Source|       Duration|              Rating|Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10| Score-9| Score-8| Score-7| Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+----------------

In [3]:
# Visualizamos las filas con valores nulos
df.where(df["Type"] == "Unknown").count()

37

In [4]:
# Visualizamos los tipos de animes que existen (Movie, TV, OVA, etc)
df.select("Type").distinct().show(20, False)

+-------+
|Type   |
+-------+
|TV     |
|Special|
|Unknown|
|OVA    |
|Music  |
|Movie  |
|ONA    |
+-------+



In [5]:
# Se filtra el dataset para que solo contenga los animes de tipo Movie y TV
df_filtered = df.filter((df["Type"] == "Movie") | (df["Type"] == "TV"))

In [6]:

# Convertimos los valores de las columnas de Score-1 a Score-10 a valores numericos cambiando los Unknown por 0
df_filtered = df_filtered.withColumn("Score-1", when(df_filtered["Score-1"] == "Unknown", 0).otherwise(df_filtered["Score-1"])) \
    .withColumn("Score-2", when(df_filtered["Score-2"] == "Unknown", 0).otherwise(df_filtered["Score-2"])) \
    .withColumn("Score-3", when(df_filtered["Score-3"] == "Unknown", 0).otherwise(df_filtered["Score-3"])) \
    .withColumn("Score-4", when(df_filtered["Score-4"] == "Unknown", 0).otherwise(df_filtered["Score-4"])) \
    .withColumn("Score-5", when(df_filtered["Score-5"] == "Unknown", 0).otherwise(df_filtered["Score-5"])) \
    .withColumn("Score-6", when(df_filtered["Score-6"] == "Unknown", 0).otherwise(df_filtered["Score-6"])) \
    .withColumn("Score-7", when(df_filtered["Score-7"] == "Unknown", 0).otherwise(df_filtered["Score-7"])) \
    .withColumn("Score-8", when(df_filtered["Score-8"] == "Unknown", 0).otherwise(df_filtered["Score-8"])) \
    .withColumn("Score-9", when(df_filtered["Score-9"] == "Unknown", 0).otherwise(df_filtered["Score-9"])) \
    .withColumn("Score-10", when(df_filtered["Score-10"] == "Unknown", 0).otherwise(df_filtered["Score-10"]))
    

In [7]:
# Visualizamos el dataset con Scores en Unknown y verificamos que los valores de Scores-1 al Scores-10 sean numericos
df_filtered.where(df_filtered["Score"] == "Unknown").show()


+----+--------------------+-------+--------------------+--------------------+-------------------------------------+-----+--------+--------------------+-----------+--------------------+---------+--------------------+---------+---------------+--------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|  ID|                Name|  Score|              Genres|        English name|                        Japanese name| Type|Episodes|               Aired|  Premiered|           Producers|Licensors|             Studios|   Source|       Duration|              Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+----+--------------------+-------+--------------------+--------------------+-------------------------------------+-----+--------+----------

In [8]:
## Se muestra uno de los animes para verificar un cambio en la siguinete ejeciución
df_filtered.where(df_filtered["ID"] == "1547").show()

+----+----------------+-------+--------------------+------------+--------------+----+--------+--------------------+-----------+---------+---------+-----------------+------+---------------+------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|  ID|            Name|  Score|              Genres|English name| Japanese name|Type|Episodes|               Aired|  Premiered|Producers|Licensors|          Studios|Source|       Duration|      Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+----+----------------+-------+--------------------+------------+--------------+----+--------+--------------------+-----------+---------+---------+-----------------+------+---------------+------------+-------+----------+-------+---------+--------+---------+-----

In [9]:
# Se calcula el Score promedio de cada anime si no tiene, y se guarda en la columna Score en otro caso mantiene el Score que tiene
df_filtered = df_filtered.withColumn("Score", when(df_filtered["Score"] == "Unknown", (df_filtered["Score-1"] *1 + df_filtered["Score-2"] *2 + df_filtered["Score-3"] *3 + df_filtered["Score-4"] *4 + df_filtered["Score-5"] *5 + df_filtered["Score-6"] *6 + df_filtered["Score-7"] *7 + df_filtered["Score-8"] *8 + df_filtered["Score-9"] *9 + df_filtered["Score-10"] *10) / (df_filtered["Score-1"] + df_filtered["Score-2"] + df_filtered["Score-3"] + df_filtered["Score-4"] + df_filtered["Score-5"] + df_filtered["Score-6"] + df_filtered["Score-7"] + df_filtered["Score-8"] + df_filtered["Score-9"] + df_filtered["Score-10"])).otherwise(df_filtered["Score"]))

In [10]:
# Redondeamos el Score a 2 decimales
df_filtered = df_filtered.withColumn("Score", round(df_filtered.Score, 2))

In [11]:
# Se muestra uno de los animes para verificar que el Score se haya calculado correctamente
df_filtered.where(df_filtered["ID"] == "1547").show()

+----+----------------+-----+--------------------+------------+--------------+----+--------+--------------------+-----------+---------+---------+-----------------+------+---------------+------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|  ID|            Name|Score|              Genres|English name| Japanese name|Type|Episodes|               Aired|  Premiered|Producers|Licensors|          Studios|Source|       Duration|      Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+----+----------------+-----+--------------------+------------+--------------+----+--------+--------------------+-----------+---------+---------+-----------------+------+---------------+------------+-------+----------+-------+---------+--------+---------+-------+---

In [12]:
# Comprobación de la media de los scores
# num = (8*10+5*9+6*8+20*7+28*6+31*5+6*4+6*3+2*2+10*1)/(8+5+6+20.0+28+31+6+6+2+10.0)
# num

In [13]:
# Se visualiza el dataset filtrado nuevamente
df_filtered.show()

+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+--------------------+-----------+--------------------+--------------------+----------------+-----------+---------------+--------------------+------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+--------+--------+--------+--------+-------+-------+-------+-------+-------+
| ID|                Name|Score|              Genres|        English name|                 Japanese name| Type|Episodes|               Aired|  Premiered|           Producers|           Licensors|         Studios|     Source|       Duration|              Rating|Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10| Score-9| Score-8| Score-7| Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+----------------

In [14]:
# se visualiza el numero de filas del dataset filtrado
df_filtered.count()

8037

In [15]:
# Visualizamos el numero de filas sin valoraciones
df_filtered.where((df_filtered["Score-1"] == 0) & (df_filtered["Score-2"] == 0) & (df_filtered["Score-3"] == 0) & (df_filtered["Score-4"] == 0) & (df_filtered["Score-5"] == 0) & (df_filtered["Score-6"] == 0) & (df_filtered["Score-7"] == 0) & (df_filtered["Score-8"] == 0) & (df_filtered["Score-9"] == 0) & (df_filtered["Score-10"] == 0)).count()

188

In [16]:
# Visualizamos las filas sin valoraciones
df_filtered.where((df_filtered["Score-1"] == 0) & (df_filtered["Score-2"] == 0) & (df_filtered["Score-3"] == 0) & (df_filtered["Score-4"] == 0) & (df_filtered["Score-5"] == 0) & (df_filtered["Score-6"] == 0) & (df_filtered["Score-7"] == 0) & (df_filtered["Score-8"] == 0) & (df_filtered["Score-9"] == 0) & (df_filtered["Score-10"] == 0)).show()

+-----+--------------------+-----+--------------------+--------------------+----------------------------------+-----+--------+--------------+-----------+--------------------+----------+------------+------------+--------+--------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|   ID|                Name|Score|              Genres|        English name|                     Japanese name| Type|Episodes|         Aired|  Premiered|           Producers| Licensors|     Studios|      Source|Duration|              Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+-----+--------------------+-----+--------------------+--------------------+----------------------------------+-----+--------+--------------+-----------+--------------------+--------

In [17]:
# Visualizamos las filas sin valoraciones de otra manera
df_filtered.filter(df_filtered.Score.isNull()).show()

+-----+--------------------+-----+--------------------+--------------------+----------------------------------+-----+--------+--------------+-----------+--------------------+----------+------------+------------+--------+--------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|   ID|                Name|Score|              Genres|        English name|                     Japanese name| Type|Episodes|         Aired|  Premiered|           Producers| Licensors|     Studios|      Source|Duration|              Rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+-----+--------------------+-----+--------------------+--------------------+----------------------------------+-----+--------+--------------+-----------+--------------------+--------

In [18]:
# Se filtra por los animes que tienen valoraciones
df_filtered = df_filtered.filter(df_filtered.Score.isNotNull())

## Entrenamiento del Algoritmo ALS

In [19]:
# Se cargan los valores del csv de ratings
df_ratings = spark.read.csv("data/rating_complete.csv", header=True, sep=",", encoding="UTF-8", inferSchema=True)

# Se cargan los valores de ratings del usuario 66666
df_ratings_ep = spark.read.csv("data/valoraciones_EP.csv", header=False, sep=",", encoding="UTF-8", inferSchema=True)
# Como el csv de datos no tiene header, se reemplaza el nombre por defecto por los que se quieren
df_ratings_ep = df_ratings_ep.withColumnRenamed("_c0", "user_id")
df_ratings_ep = df_ratings_ep.withColumnRenamed("_c1", "anime_id")
df_ratings_ep = df_ratings_ep.withColumnRenamed("_c2", "rating")

In [20]:
print("El numero de valoraciones: " + str(df_ratings.count()))
print("El numero de valoraciones: " + str(df_ratings_ep.count()))

El numero de valoraciones: 57633278
El numero de valoraciones: 65


In [21]:
df_filtered = df_filtered.withColumnRenamed("Rating","Category_rating")

In [22]:
# Se realiza un union de los dos dataframes de valoraciones
df_ratings_completo = df_ratings.union(df_ratings_ep)

In [23]:
df_ratings_completo.show()

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      0|     430|   9.0|
|      0|    1004|   5.0|
|      0|    3010|   7.0|
|      0|     570|   7.0|
|      0|    2762|   9.0|
|      0|     431|   8.0|
|      0|     578|  10.0|
|      0|     433|   6.0|
|      0|    1571|  10.0|
|      0|     121|   9.0|
|      0|     356|   9.0|
|      0|    1250|   7.0|
|      0|    2913|   6.0|
|      0|    1689|   6.0|
|      0|      68|   6.0|
|      0|    1829|   7.0|
|      0|     600|   6.0|
|      0|    3418|   9.0|
|      0|     164|   8.0|
|      0|    1894|   7.0|
+-------+--------+------+
only showing top 20 rows



In [24]:
df_ratings_completo.count()

57633343

In [25]:
# Se realiza un join con el dataframe de animes, para conocer cuales son peliculas y cuales son series
df_ratings_completo_movies = df_ratings_completo.join(df_filtered, df_ratings_completo["anime_id"] == df_filtered["ID"], "inner")

In [26]:
# Se obtienen las peliculas valoradas por los usuarios
df_ratings_movies = df_ratings_completo_movies.filter(df_ratings_completo_movies["Type"] == "Movie").select("user_id", "anime_id", "rating")

In [27]:
# Se obtienen las series valoradas por los usuarios
df_ratings_series = df_ratings_completo_movies.filter(df_ratings_completo_movies["Type"] == "TV").select("user_id", "anime_id", "rating")

In [28]:
(training_movies, test_movies) = df_ratings_movies.randomSplit([0.8, 0.2])
(training_series, test_series) = df_ratings_series.randomSplit([0.8, 0.2])

In [29]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [30]:
als_movie = ALS(rank=5, maxIter=5, regParam=0.01, userCol="user_id", itemCol="anime_id", ratingCol="rating", coldStartStrategy="drop")
model_movie = als_movie.fit(training_movies)

In [31]:
als_series = ALS(rank=5, maxIter=5, regParam=0.01, userCol="user_id", itemCol="anime_id", ratingCol="rating", coldStartStrategy="drop")
model_series = als_series.fit(training_series)

In [32]:
predictions_movies = model_movie.transform(test_movies)
predictions_series = model_series.transform(test_series)

In [33]:
predictions_movies.show()

+-------+--------+------+----------+
|user_id|anime_id|rating|prediction|
+-------+--------+------+----------+
|     27|    2236|   6.0|  8.113375|
|     27|    2890|   6.0| 7.6938314|
|     27|   12477|   5.0|  7.890316|
|     27|   21647|   6.0| 7.9834337|
|     27|   27411|   6.0| 6.7283673|
|     27|   31765|   8.0|  7.751047|
|     28|     461|   7.0|  8.360939|
|     28|     895|   6.0| 7.7251897|
|     28|     899|   8.0|  7.568357|
|     28|     900|   8.0|  7.522044|
|     28|     936|   8.0| 7.6841674|
|     28|    1121|   7.0|  7.658155|
|     28|    2107|  10.0|   8.10916|
|     28|    2386|   9.0|  8.516698|
|     28|    4026|   9.0|  7.538605|
|     34|    2144|   6.0|  7.767227|
|     34|    4835|  10.0|  8.367336|
|     44|     164|   8.0|   7.81253|
|     44|     437|   8.0|  7.997062|
|     44|    1430|   8.0| 7.0598006|
+-------+--------+------+----------+
only showing top 20 rows



In [34]:
predictions_series.show()

+-------+--------+------+----------+
|user_id|anime_id|rating|prediction|
+-------+--------+------+----------+
|     27|     154|   9.0| 7.3307667|
|     27|     276|   9.0|  5.869542|
|     27|     277|   9.0|  5.859152|
|     27|    5114|   8.0|  8.589972|
|     27|   18897|  10.0|   7.80424|
|     27|   21881|   8.0|  7.833205|
|     27|   33487|   7.0| 7.1571217|
|     27|   34382|   6.0|  6.862756|
|     27|   34449|   5.0| 4.9212112|
|     27|   34822|  10.0| 7.6988497|
|     27|   35629|   1.0|  4.986211|
|     27|   36124|   8.0|  6.495854|
|     27|   36511|  10.0|  7.350711|
|     28|     223|  10.0|   8.50159|
|     28|     225|   9.0|  9.469904|
|     28|     550|   9.0| 8.0676155|
|     28|     860|   8.0| 8.0588455|
|     28|    1535|  10.0|   8.95359|
|     28|    2363|   7.0| 7.5903435|
|     28|    4214|  10.0|  9.405077|
+-------+--------+------+----------+
only showing top 20 rows



In [35]:
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse_movies = evaluator.evaluate(predictions_movies)
rmse_series = evaluator.evaluate(predictions_series)
print("Root-mean-square error movies = " + str(rmse_movies))
print("Root-mean-square error series = " + str(rmse_series))

Root-mean-square error movies = 1.264842022309026
Root-mean-square error series = 1.1916305467312633


In [36]:
user_ep = df_ratings_movies.select(als_movie.getUserCol()).where(df_ratings_movies["user_id"] == 66666).distinct()

In [37]:
recomendation_userEp_movies = model_movie.recommendForUserSubset(user_ep, 5)
recomendation_userEp_series = model_series.recommendForUserSubset(user_ep, 5)

In [38]:
recomendation_userEp_movies.show(20, False)
recomendation_userEp_series.show(20, False)

+-------+---------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                    |
+-------+---------------------------------------------------------------------------------------------------+
|66666  |[{30134, 47.062183}, {34706, 42.979866}, {36701, 40.626667}, {17100, 39.06162}, {29886, 36.493965}]|
+-------+---------------------------------------------------------------------------------------------------+

+-------+---------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                    |
+-------+---------------------------------------------------------------------------------------------------+
|66666  |[{35108, 43.560303}, {39130, 40.396114}, {38537, 35.05435}, {34665, 34.258717}, {37359, 34.150665}]|
+-------+

In [39]:
recomendation_userEp_movies.printSchema()
recomendation_userEp_series.printSchema()

root
 |-- user_id: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- anime_id: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)

root
 |-- user_id: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- anime_id: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [40]:
from pyspark.sql.functions import explode

In [41]:
# Separamos las recomendaciones de peliculas en columnas
df_recomendations_movies_userEp = recomendation_userEp_movies.select("user_id", explode("recommendations").alias("recommendations"))
# Separamos las recomendaciones de series en columnas
df_recomendations_series_userEp = recomendation_userEp_series.select("user_id", explode("recommendations").alias("recommendations"))

In [42]:
df_recomendations_movies_userEp = df_recomendations_movies_userEp.select("user_id", "recommendations.anime_id", "recommendations.rating")
df_recomendations_series_userEp = df_recomendations_series_userEp.select("user_id", "recommendations.anime_id", "recommendations.rating")

In [43]:
df_recomendations_movies_userEp.show(truncate=False)
df_recomendations_series_userEp.show(truncate=False)

+-------+--------+---------+
|user_id|anime_id|rating   |
+-------+--------+---------+
|66666  |30134   |47.062183|
|66666  |34706   |42.979866|
|66666  |36701   |40.626667|
|66666  |17100   |39.06162 |
|66666  |29886   |36.493965|
+-------+--------+---------+

+-------+--------+---------+
|user_id|anime_id|rating   |
+-------+--------+---------+
|66666  |35108   |43.560303|
|66666  |39130   |40.396114|
|66666  |38537   |35.05435 |
|66666  |34665   |34.258717|
|66666  |37359   |34.150665|
+-------+--------+---------+



In [44]:
a = df_recomendations_movies_userEp.join(df_filtered, df_recomendations_movies_userEp["anime_id"] == df_filtered["ID"], "inner")

In [45]:
a.show()

+-------+--------+---------+-----+--------------------+-----+------------------+--------------------+------------------+-----+--------+------------+---------+---------+---------+--------------------+--------+-------------+--------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|user_id|anime_id|   rating|   ID|                Name|Score|            Genres|        English name|     Japanese name| Type|Episodes|       Aired|Premiered|Producers|Licensors|             Studios|  Source|     Duration|     Category_rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+-------+--------+---------+-----+--------------------+-----+------------------+--------------------+------------------+-----+--------+------------+---------+---------+---------+

In [46]:
b = df_recomendations_series_userEp.join(df_filtered, df_recomendations_series_userEp["anime_id"] == df_filtered["ID"], "inner")

In [47]:
b.show()

+-------+--------+---------+-----+--------------------+-----+--------------------+------------------+-----------------------+----+--------+--------------------+-----------+--------------------+---------+--------------+--------+---------------+---------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|user_id|anime_id|   rating|   ID|                Name|Score|              Genres|      English name|          Japanese name|Type|Episodes|               Aired|  Premiered|           Producers|Licensors|       Studios|  Source|       Duration|Category_rating| Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+-------+--------+---------+-----+--------------------+-----+--------------------+------------------+-----------------------+----+--------+-------