In [1]:
import findspark
findspark.init()
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark import *
from pyspark.sql import SparkSession

In [2]:
# Build the notebook's SparkSession; getOrCreate() reuses a session that is
# already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("SPARK_ML_SistemaRecomendacion")
    .getOrCreate()
)

In [None]:
# lines = spark.read.text("ml-latest-small/ratings.csv").rdd
# parts = lines.map(lambda row: row.value.split(";"))
# ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]), timestamp=int(p[3])))

In [3]:
# Header row of anime.csv:
# ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,Producers,Licensors,Studios,Source,Duration,Rating,Ranked,Popularity,Members,Favorites,Watching,Completed,On-Hold,Dropped,Plan to Watch,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
df = spark.read.csv("anime.csv", sep=",", header=True)
# Disabled leftovers, kept for reference:
#ratingsRDD = df.rdd.map(lambda row: Row(userId=int(row['ID']), animeId=int(row['anime_id']), rating=int(row['rating'])))
#df.printSchema()
# Print the first rows as a quick check that the file loaded.
df.show()

+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+--------------------+-----------+--------------------+--------------------+----------------+-----------+---------------+--------------------+------+----------+-------+---------+--------+---------+-------+-------+-------------+--------+--------+--------+--------+--------+-------+-------+-------+-------+-------+
| ID|                Name|Score|              Genres|        English name|                 Japanese name| Type|Episodes|               Aired|  Premiered|           Producers|           Licensors|         Studios|     Source|       Duration|              Rating|Ranked|Popularity|Members|Favorites|Watching|Completed|On-Hold|Dropped|Plan to Watch|Score-10| Score-9| Score-8| Score-7| Score-6|Score-5|Score-4|Score-3|Score-2|Score-1|
+---+--------------------+-----+--------------------+--------------------+------------------------------+-----+--------+----------------

In [4]:
# Load the per-user ratings file (comma-separated, first line is the header).
df = spark.read.csv("rating_complete.csv", sep=",", header=True)
# Inspect what was loaded: schema first, then the first rows.
df.printSchema()
df.show()
# Convert each record into a Row with integer fields, renaming the CSV columns
# user_id / anime_id to the userId / animeId field names.
ratingsRDD = df.rdd.map(
    lambda rec: Row(
        userId=int(rec['user_id']),
        animeId=int(rec['anime_id']),
        rating=int(rec['rating']),
    )
)

root
 |-- user_id: string (nullable = true)
 |-- anime_id: string (nullable = true)
 |-- rating: string (nullable = true)

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      0|     430|     9|
|      0|    1004|     5|
|      0|    3010|     7|
|      0|     570|     7|
|      0|    2762|     9|
|      0|     431|     8|
|      0|     578|    10|
|      0|     433|     6|
|      0|    1571|    10|
|      0|     121|     9|
|      0|     356|     9|
|      0|    1250|     7|
|      0|    2913|     6|
|      0|    1689|     6|
|      0|      68|     6|
|      0|    1829|     7|
|      0|     600|     6|
|      0|    3418|     9|
|      0|     164|     8|
|      0|    1894|     7|
+-------+--------+------+
only showing top 20 rows



In [None]:
# Turn the RDD of rating Rows into a DataFrame; no schema is passed, so Spark
# infers the column types from the Row contents.
ratings = spark.createDataFrame(data=ratingsRDD)

In [None]:
# Hold out 20% of the ratings for evaluation. The fixed seed makes the split
# (and therefore the reported RMSE) repeatable across runs.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)
# Train the model. The item column has to be "animeId": that is the field name
# the rating Rows are built with, so "movieId" would not resolve against this
# DataFrame. The cold-start strategy 'drop' discards NaN predictions during
# evaluation so they cannot turn the metric into NaN.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="animeId", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)
# Evaluate the model with RMSE on the held-out split
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

In [None]:
# --- User side -------------------------------------------------------------
# Top 10 item recommendations for every user
userRecs = model.recommendForAllUsers(10)
# Top 10 item recommendations for a subset of up to 3 distinct users
users = (
    ratings
    .select(als.getUserCol())
    .distinct()
    .limit(3)
)
userSubsetRecs = model.recommendForUserSubset(users, 10)

# --- Item side -------------------------------------------------------------
# Top 10 user recommendations for every item
movieRecs = model.recommendForAllItems(10)
# Top 10 user recommendations for a subset of up to 3 distinct items
movies = (
    ratings
    .select(als.getItemCol())
    .distinct()
    .limit(3)
)
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

In [None]:
from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import split

# Obtain a SparkSession (getOrCreate hands back the active one if it exists)
spark = (
    SparkSession.builder
    .appName("EjemploCSV")
    .getOrCreate()
)

# Read the CSV with a header row, using ";" as the field separator
df = spark.read.csv("ml-latest-small/ratings.csv", sep=";", header=True)

# Inspect the schema and the first records as loaded
df.printSchema()
df.show()

# Break the single comma-joined column into an array column
df_split = df.withColumn("split_col", split(df["userId,movieId,rating,timestamp"], ","))

# Promote each array element to its own typed column:
# userId, movieId, rating, timestamp
df_split = df_split.withColumn("userId", df_split.split_col[0].cast("int"))
df_split = df_split.withColumn("movieId", df_split.split_col[1].cast("int"))
df_split = df_split.withColumn("rating", df_split.split_col[2].cast("float"))
df_split = df_split.withColumn("timestamp", df_split.split_col[3].cast("int"))

# Build an RDD of Rows that keeps only the four typed columns
ratingsRDD = df_split.rdd.map(
    lambda r: Row(
        userId=r.userId,
        movieId=r.movieId,
        rating=r.rating,
        timestamp=r.timestamp,
    )
)

# Build the ratings DataFrame from that RDD
ratings = spark.createDataFrame(ratingsRDD)

# Inspect the schema and the first records of the new DataFrame
ratings.printSchema()
ratings.show()