In [1]:
import findspark
findspark.init()
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col

In [2]:
# Build (or reuse) the notebook's Spark session with explicit executor and driver resources.
spark = (
    SparkSession.builder
    .appName("SPARK_ML_SistemaRecomendacion")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "6")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [3]:
# Load the anime catalogue; the file has a header row and column types are inferred.
df_animes = spark.read.options(
    header=True,
    sep=",",
    encoding="UTF-8",
    inferSchema=True,
).csv("data/anime.csv")

In [12]:
# Keep only the catalogue entries whose Type is exactly "Movie".
df_peliculas = df_animes.where(col("Type") == "Movie")

In [13]:
# Number of rows in the movie subset; displayed as the cell's output.
df_peliculas.count()

3039

In [14]:
# Keep only the catalogue entries whose Type is exactly "TV".
df_series = df_animes.where(col("Type") == "TV")

In [15]:
# Number of rows in the TV-series subset; displayed as the cell's output.
df_series.count()

4994

In [16]:
# Main ratings file: it has a header row, so column names come from the CSV itself.
df_ratings = spark.read.csv(
    "data/rating_complete.csv",
    header=True,
    sep=",",
    encoding="UTF-8",
    inferSchema=True,
)

# Additional ratings file without a header: the auto-generated column names
# (_c0, _c1, _c2) are renamed to user_id / anime_id / rating.
df_ratings_ep = (
    spark.read.csv(
        "data/valoraciones_EP.csv",
        header=False,
        sep=",",
        encoding="UTF-8",
        inferSchema=True,
    )
    .withColumnRenamed("_c0", "user_id")
    .withColumnRenamed("_c1", "anime_id")
    .withColumnRenamed("_c2", "rating")
)

In [17]:
# Combine both rating sources into a single DataFrame.
# unionByName aligns columns by name instead of by position, so a different
# column order in either file cannot silently put ids into the rating column
# (plain union() is purely positional).
df_ratings_completo = df_ratings.unionByName(df_ratings_ep)

In [18]:
# Preview the first 20 combined ratings (same as show()'s defaults, spelled out).
df_ratings_completo.show(n=20, truncate=True)

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      0|     430|   9.0|
|      0|    1004|   5.0|
|      0|    3010|   7.0|
|      0|     570|   7.0|
|      0|    2762|   9.0|
|      0|     431|   8.0|
|      0|     578|  10.0|
|      0|     433|   6.0|
|      0|    1571|  10.0|
|      0|     121|   9.0|
|      0|     356|   9.0|
|      0|    1250|   7.0|
|      0|    2913|   6.0|
|      0|    1689|   6.0|
|      0|      68|   6.0|
|      0|    1829|   7.0|
|      0|     600|   6.0|
|      0|    3418|   9.0|
|      0|     164|   8.0|
|      0|    1894|   7.0|
+-------+--------+------+
only showing top 20 rows



In [None]:
# Attach catalogue metadata to every rating by matching anime_id against the catalogue ID.
# NOTE(review): the variable name says "movies" but the join uses the full catalogue
# (df_animes), not the movie-only subset — confirm which one is intended.
df_ratings_completo_movies = df_ratings_completo.join(
    other=df_animes,
    on=df_ratings_completo["anime_id"] == df_animes["ID"],
    how="inner",
)

In [6]:
# 80/20 train/test split. A fixed seed makes the split (and therefore the
# reported RMSE) repeatable across kernel restarts.
training, test = df_ratings_completo.randomSplit([0.8, 0.2], seed=42)

In [7]:
# ALS collaborative-filtering estimator over (user_id, anime_id, rating).
# coldStartStrategy="drop" removes rows with NaN predictions (users/items unseen
# in training) so the evaluation metric is not NaN.
# seed is fixed so factor initialisation, and hence the model, is reproducible.
# NOTE(review): the saved outputs show predicted scores above 20 while the sampled
# input ratings top out at 10 — regParam/rank may need tuning; confirm.
als = ALS(
    rank=10,
    maxIter=5,
    regParam=0.01,
    userCol="user_id",
    itemCol="anime_id",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

In [8]:
# Train the ALS model on the training split.
model = als.fit(dataset=training)

In [9]:
# Score the held-out split; adds a "prediction" column to each test row.
predictions = model.transform(dataset=test)

In [10]:
# RMSE evaluator comparing the true "rating" column with the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

In [11]:
# Compute the RMSE on the test predictions and report it.
rmse = evaluator.evaluate(dataset=predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 1.1653396874384212


In [12]:
# Top-10 item recommendations for every user known to the model.
userRecs = model.recommendForAllUsers(numItems=10)


In [13]:
# Top-10 user recommendations for every item known to the model.
movieRecs = model.recommendForAllItems(numUsers=10)

In [14]:
# Take three distinct user ids (no ordering is applied before limit) and get their top-10 items.
users = (
    df_ratings_completo
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = model.recommendForUserSubset(dataset=users, numItems=10)

In [15]:
# Take three distinct item ids (no ordering is applied before limit) and get their top-10 users.
movies = (
    df_ratings_completo
    .select(als.getItemCol())
    .distinct()
    .limit(3)
)
movieSubSetRecs = model.recommendForItemSubset(dataset=movies, numUsers=10)

In [16]:
# Print a heading, then the first 20 users' recommendation arrays without truncation.
print("User Recommendations")
userRecs.show(n=20, truncate=False)

User Recommendations
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                                         |
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|27     |[{30997, 26.546774}, {35035, 24.273293}, {35058, 23.806496}, {35054, 23.806496}, {28561, 23.509024}, {35055, 21.14663}, {38094, 20.67978}, {35476, 19.84515}, {39083, 19.141048}, {20703, 18.951517}]   |
|28     |[{35035, 24.478012}, {35074, 21.513226}, {24981, 21.259594}, {26293, 20.854996}, {16838, 20.688904}, {19445, 20.420752}, {3505

In [17]:
# Inspect the structure of the recommendations column before flattening it.
userRecs.printSchema()

root
 |-- user_id: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- anime_id: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [18]:
# Recommendations row for the single user with id 666666.
usuario6666 = userRecs.where(col("user_id") == 666666)

In [19]:
# Display that user's recommendations in full width.
usuario6666.show(n=20, truncate=False)

+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                                       |
+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|666666 |[{30997, 28.92321}, {35035, 26.584492}, {29713, 26.177824}, {35058, 26.105375}, {35054, 26.105375}, {28561, 24.957449}, {19467, 24.732687}, {35055, 24.61034}, {26293, 21.248882}, {31463, 20.518385}]|
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------

In [20]:
# One output row per element of the recommendations array, kept as a struct column.
exploded_df = usuario6666.select(
    col("user_id"),
    explode(col("recommendations")).alias("recommendation"),
)

In [21]:
# Flatten the recommendation struct into plain anime_id / rating columns.
result_df = exploded_df.select(
    col("user_id"),
    col("recommendation").getField("anime_id").alias("anime_id"),
    col("recommendation").getField("rating").alias("rating"),
)

In [22]:
# Show the flattened recommendations (up to 20 rows, untruncated).
result_df.show(n=20, truncate=False)

+-------+--------+---------+
|user_id|anime_id|rating   |
+-------+--------+---------+
|666666 |30997   |28.92321 |
|666666 |35035   |26.584492|
|666666 |29713   |26.177824|
|666666 |35058   |26.105375|
|666666 |35054   |26.105375|
|666666 |28561   |24.957449|
|666666 |19467   |24.732687|
|666666 |35055   |24.61034 |
|666666 |26293   |21.248882|
|666666 |31463   |20.518385|
+-------+--------+---------+



In [23]:
# Enrich the user's recommended ids with the catalogue metadata for each title.
# NOTE(review): this joins the full catalogue (df_animes), not only movies,
# despite the variable name — confirm intent.
df_peliculas_user = df_animes.join(
    other=result_df,
    on=df_animes["ID"] == result_df["anime_id"],
    how="inner",
)

In [24]:
# Display the recommended titles with their catalogue details, untruncated.
df_peliculas_user.show(n=20, truncate=False)

+-----+------------------------------------------------------------------+-------+---------------------+--------------------+----------------------------------------------------+-----+--------+----------------------------+-----------+-----------+---------+-------------------------------+--------+---------------+-------------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+---------+
|ID   |Name                                                              |Score  |Genres               |English name        |Japanese name                                       |Type |Episodes|Aired                       |Premiered  |Producers  |Licensors|Studios                        |Source  |Duration       |Rating                   |Ranked |Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|

In [25]:
# Print a heading, then the first 20 items' recommended-user arrays without truncation.
print("Movies Recommendations")
movieRecs.show(n=20, truncate=False)

Movies Recommendations
+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|anime_id|recommendations                                                                                                                                                                                                  |
+--------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|26      |[{70660, 17.183125}, {322894, 16.615702}, {162019, 16.261873}, {130450, 16.11904}, {191137, 15.037886}, {300865, 14.9564705}, {201497, 14.652828}, {220771, 14.62065}, {96817, 13.646431}, {274977, 13.623729}]  |
|27      |[{237363, 13.200034}, {88286, 12.843759}, {257137, 12.249708}, {261790, 11.480342},

In [34]:
# Inspect the inferred column types of the anime catalogue.
df_animes.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- English name: string (nullable = true)
 |-- Japanese name: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Episodes: string (nullable = true)
 |-- Aired: string (nullable = true)
 |-- Premiered: string (nullable = true)
 |-- Producers: string (nullable = true)
 |-- Licensors: string (nullable = true)
 |-- Studios: string (nullable = true)
 |-- Source: string (nullable = true)
 |-- Duration: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Ranked: string (nullable = true)
 |-- Popularity: string (nullable = true)
 |-- Members: double (nullable = true)
 |-- Favorites: integer (nullable = true)
 |-- Watching: integer (nullable = true)
 |-- Completed: integer (nullable = true)
 |-- On-Hold: integer (nullable = true)
 |-- Dropped: integer (nullable = true)
 |-- Plan to Watch: integer (nullable = true)

In [53]:
# Preview the "Score-2" column with the literal "Unknown" swapped for "0".
# NOTE(review): replace() with string arguments only affects string-typed columns — confirm
# the type of "Score-2" before relying on this.
(
    df_animes
    .select(df_animes["Score-2"])
    .replace(to_replace="Unknown", value="0")
    .show(n=20, truncate=False)
)

+-------+
|Score-2|
+-------+
|741.0  |
|109.0  |
|316.0  |
|164.0  |
|50.0   |
|140.0  |
|278.0  |
|31.0   |
|48.0   |
|593.0  |
|3582.0 |
|2616.0 |
|210.0  |
|32.0   |
|441.0  |
|350.0  |
|411.0  |
|456.0  |
|56.0   |
|13.0   |
+-------+
only showing top 20 rows



In [57]:
# Catalogue rows whose Score is the literal string "Unknown".
df_animes_unknown = df_animes.where(col("Score") == "Unknown")

In [60]:
# "Score-2" values for the unknown-score titles, with the literal "Unknown" swapped for "0".
a = (
    df_animes_unknown
    .select(df_animes_unknown["Score-2"])
    .replace(to_replace="Unknown", value="0")
)

+----+-----------------------------------------------------------------+-------+-------------------------------------------+------------------------------------------------------------------------------+-----------------------------------------------------------+-----+--------+----------------------------+-----------+--------------------------------------------------------------------------+---------+--------------------------+---------+---------------+------------------------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+
|ID  |Name                                                             |Score  |Genres                                     |English name                                                                  |Japanese name                                              |Type |Episodes|Aired                       |Premiered  |Producers                