In [30]:
import findspark
findspark.init()
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col

In [2]:
# Create (or reuse) the Spark session for this notebook, setting executor and
# driver resources explicitly.
spark = (
    SparkSession.builder
    .appName("SPARK_ML_SistemaRecomendacion")
    .config("spark.executor.memory", "8g")
    .config("spark.executor.cores", "6")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [3]:
# Anime catalogue: comma-separated UTF-8 CSV with a header row; column types
# are inferred by Spark.
df_animes = spark.read.csv(
    "data/anime.csv",
    sep=",",
    encoding="UTF-8",
    header=True,
    inferSchema=True,
)

In [4]:
# Main ratings file (it has a header row, so column names come from the file).
df_ratings = spark.read.csv(
    "data/rating_complete.csv",
    sep=",",
    encoding="UTF-8",
    header=True,
    inferSchema=True,
)

# Additional ratings file without a header row: its positional columns
# (_c0, _c1, _c2) are renamed to the names used by the rest of the notebook.
df_ratings_ep = (
    spark.read.csv(
        "data/valoraciones_EP.csv",
        sep=",",
        encoding="UTF-8",
        header=False,
        inferSchema=True,
    )
    .withColumnRenamed("_c0", "user_id")
    .withColumnRenamed("_c1", "anime_id")
    .withColumnRenamed("_c2", "rating")
)

In [5]:
# Combine both rating sources into one DataFrame (the result is already a
# DataFrame). Columns are matched by name rather than by position, so a
# different column order in either source cannot silently mix user ids,
# anime ids and ratings.
df_ratings_completo = df_ratings.unionByName(df_ratings_ep)

In [6]:
# 80/20 train/test split. The sampling seed is fixed so the split, and the
# metrics computed from it, are repeatable for the same input data instead of
# changing on every execution.
(training, test) = df_ratings_completo.randomSplit([0.8, 0.2], seed=42)

In [7]:
# ALS collaborative-filtering estimator over (user_id, anime_id, rating).
# coldStartStrategy="drop" is kept from the original configuration; see the
# ALS documentation for how it treats users/items unseen during training.
# The seed is set explicitly so the factor initialisation does not depend on
# the library's default seed and results are comparable between runs.
als = ALS(
    rank=10,
    maxIter=5,
    regParam=0.01,
    userCol="user_id",
    itemCol="anime_id",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

In [8]:
# Fit the ALS estimator on the training split; `model` is the fitted result
# used for predictions and recommendations in the following cells.
model = als.fit(training)

In [9]:
# Apply the fitted model to the held-out test split.
predictions = model.transform(test)

In [10]:
# RMSE evaluator comparing the "rating" label with the "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

In [11]:
# Compute the RMSE of the predictions and report it.
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

Root-mean-square error = 1.155547337325048


In [13]:
# Top-10 item recommendations for every user known to the model.
userRecs = model.recommendForAllUsers(numItems=10)

In [14]:
# Top-10 user recommendations for every item known to the model.
movieRecs = model.recommendForAllItems(numUsers=10)

In [15]:
# Take three distinct user ids from the full ratings set and request the
# top-10 recommendations for just those users.
users = (
    df_ratings_completo
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = model.recommendForUserSubset(dataset=users, numItems=10)

In [16]:
# Take three distinct item ids from the full ratings set and request the
# top-10 user recommendations for just those items.
movies = (
    df_ratings_completo
    .select(als.getItemCol())
    .distinct()
    .limit(3)
)
movieSubSetRecs = model.recommendForItemSubset(dataset=movies, numUsers=10)

In [18]:
# Display the first 20 per-user recommendation rows without truncating cells.
print("User Recommendations")
userRecs.show(n=20, truncate=False)

User Recommendations
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                                         |
+-------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|27     |[{34711, 24.771227}, {7803, 24.742477}, {12771, 21.447422}, {28561, 19.255156}, {34182, 19.13438}, {35140, 18.315731}, {16572, 17.964424}, {9949, 17.235052}, {35710, 17.086786}, {16824, 16.915}]      |
|28     |[{38868, 25.551296}, {20003, 23.611732}, {25589, 22.931404}, {39374, 22.106148}, {36549, 22.048027}, {23867, 21.956226}, {2970

In [21]:
# Print the schema of the per-user recommendations DataFrame to see how the
# nested `recommendations` column is structured.
userRecs.printSchema()

root
 |-- user_id: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- anime_id: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [22]:
# Keep only the recommendations row for user_id 666666 (note: six digits,
# although the variable name is shorter).
usuario6666 = userRecs.where(userRecs.user_id == 666666)

In [24]:
# Display the selected user's recommendations without truncating cells.
usuario6666.show(n=20, truncate=False)

+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user_id|recommendations                                                                                                                                                                                       |
+-------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|666666 |[{34340, 20.056782}, {28561, 20.029015}, {35055, 19.9583}, {37749, 18.873266}, {16572, 18.693483}, {35035, 17.869463}, {35058, 17.716349}, {16824, 17.223783}, {17104, 16.734074}, {30997, 16.367155}]|
+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------

In [31]:
# Turn the `recommendations` array into one row per recommended item.
exploded_df = usuario6666.select(
    "user_id",
    explode("recommendations").alias("recommendation"),
)

In [32]:
# Flatten the `recommendation` struct into plain `anime_id` and `rating`
# columns next to `user_id`.
result_df = exploded_df.select(
    "user_id",
    col("recommendation").getField("anime_id").alias("anime_id"),
    col("recommendation").getField("rating").alias("rating"),
)

In [33]:
# Display the flattened recommendations without truncating cell contents.
result_df.show(truncate=False)

+-------+--------+---------+
|user_id|anime_id|rating   |
+-------+--------+---------+
|666666 |34340   |20.056782|
|666666 |28561   |20.029015|
|666666 |35055   |19.9583  |
|666666 |37749   |18.873266|
|666666 |16572   |18.693483|
|666666 |35035   |17.869463|
|666666 |35058   |17.716349|
|666666 |16824   |17.223783|
|666666 |17104   |16.734074|
|666666 |30997   |16.367155|
+-------+--------+---------+



In [35]:
# Attach the catalogue details of each recommended anime by matching the
# catalogue `ID` with the recommended `anime_id`.
# Note: the result keeps both the catalogue's `Rating` column and the
# predicted `rating` column.
df_peliculas_user = df_animes.join(
    other=result_df,
    on=df_animes["ID"] == result_df["anime_id"],
    how="inner",
)

In [36]:
# Display the recommended titles with their catalogue details, untruncated.
df_peliculas_user.show(n=20, truncate=False)

+-----+------------------------------------------------------------------+-------+-----------------------+---------------------+----------------------------------------------------+-----+--------+---------------------------+---------+---------+---------+-------+--------+---------------+-------------+-------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+-------+-------+-------+-------+-------+-------+-------+-------+-------+-------+--------+---------+
|ID   |Name                                                              |Score  |Genres                 |English name         |Japanese name                                       |Type |Episodes|Aired                      |Premiered|Producers|Licensors|Studios|Source  |Duration       |Rating       |Ranked |Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10|Score-9|Score-8|Score-7|Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|user_id|anime_id|rating   |
+-----+-

In [19]:
# Display the first 20 per-item recommendation rows without truncating cells.
print("Movies Recommendations")
movieRecs.show(n=20, truncate=False)

Movies Recommendations
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|anime_id|recommendations                                                                                                                                                                                                    |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|26      |[{322894, 19.12891}, {180116, 16.425825}, {353208, 16.263668}, {201448, 16.033533}, {223194, 15.86365}, {39619, 15.768026}, {113769, 15.758004}, {179983, 15.489319}, {312519, 15.4634495}, {334980, 15.387161}]   |
|27      |[{203701, 13.332826}, {18152, 12.957509}, {264264, 12.582412}, {220771, 12.

In [12]:
# spark.stop()  # Uncomment to shut down the Spark session once the analysis is finished.