In [0]:
# Look up the ratings file in DBFS and show its listing (path, name, size)
ratings_file_info = dbutils.fs.ls("FileStore/tables/movielens-1.txt")
display(ratings_file_info)

path,name,size
dbfs:/FileStore/tables/movielens-1.txt,movielens-1.txt,29359


In [0]:
# Load the raw ratings file as plain text: the result has a single string
# column named "value", one row per input line ("userId:movieId:rating:timestamp").
file = 'dbfs:/FileStore/tables/movielens-1.txt'
# NOTE: the previous .format("txt") / "inferSchema" / "header" settings were
# dropped. .text() selects the text data source itself, and that source always
# produces one string column, so those settings had no effect on the result.
df = spark.read.text(file)
display(df.limit(20))

value
0:2:3:1424380312
0:3:1:1424380312
0:5:2:1424380312
0:9:4:1424380312
0:11:1:1424380312
0:12:2:1424380312
0:15:1:1424380312
0:17:1:1424380312
0:19:1:1424380312
0:21:1:1424380312


In [0]:
# Break each "a:b:c:d" line into four named columns
import pyspark.sql.functions as F
split_col = F.split(df.value, ':')
column_names = ['userId', 'movieId', 'rating', 'timestamp']

# Project the pieces by position; "value" is simply left out of the projection,
# which removes it from the resulting DataFrame
df = df.select(
    [split_col.getItem(position).alias(label) for position, label in enumerate(column_names)]
)
display(df.limit(10))

userId,movieId,rating,timestamp
0,2,3,1424380312
0,3,1,1424380312
0,5,2,1424380312
0,9,4,1424380312
0,11,1,1424380312
0,12,2,1424380312
0,15,1,1424380312
0,17,1,1424380312
0,19,1,1424380312
0,21,1,1424380312


In [0]:
# Cast every parsed field from string to integer
from pyspark.sql.types import IntegerType
for field_name in ("userId", "movieId", "rating", "timestamp"):
    df = df.withColumn(field_name, df[field_name].cast(IntegerType()))

# Print the schema to confirm the new column types
df.printSchema()

In [0]:
# Train an ALS collaborative-filtering model and measure its RMSE on held-out data

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
# NOTE(review): TrainValidationSplit and ParamGridBuilder are not used in this cell;
# kept in case a later cell relies on them.
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# Fixed seed so the train/test split and the ALS factor initialisation are
# repeatable between runs (without it the reported RMSE changes on every run).
SEED = 42

# Divide the dataset into training (80%) and test (20%) sets
(training, test) = df.randomSplit([0.8, 0.2], seed=SEED)

# Configure ALS: number of iterations, regularization parameter (regParam) and
# the columns to use. coldStartStrategy="drop" discards predictions for users or
# movies that did not appear in the training set, which would otherwise be NaN
# and turn the evaluation metric into NaN as well.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=SEED)

# Fit the model on the training set
model = als.fit(training)

# Apply the model to the test set and score the predictions with RMSE
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
# The metric is the *root* mean squared error, so the label says so
# (the previous text read "mean squared error").
print("Raiz do erro quadrático médio (RMSE) = " + str(rmse))


In [0]:
# Generate the top 10 movie recommendations for each user
userRec = model.recommendForAllUsers(numItems=10)
display(userRec)

In [0]:
# The reverse view: for each movie, the top 10 users it would be recommended to
movieRecs = model.recommendForAllItems(numUsers=10)
display(movieRecs)

In [0]:
# Per user, keep only the ids of the recommended movies
UserRecsOnlyItemId = userRec.select(userRec.userId, userRec.recommendations.movieId)
display(UserRecsOnlyItemId)