In [0]:
# List the uploaded ratings file in DBFS to confirm it exists before loading it
display(
    dbutils.fs.ls("FileStore/tables/movielens-1.txt")
)

path,name,size
dbfs:/FileStore/tables/movielens-1.txt,movielens-1.txt,29359


In [0]:
# Load the raw ratings file. The text reader yields one row per input line in
# a single string column named "value"; later cells split that column.
# Note: .text() always selects the built-in text source, so the previous
# format("txt") call was overridden, and inferSchema/header are not options
# of the text reader -- they were dropped because they had no effect.
file = 'dbfs:/FileStore/tables/movielens-1.txt'
df = spark.read.text(file)
display(df)

value
0:2:3:1424380312
0:3:1:1424380312
0:5:2:1424380312
0:9:4:1424380312
0:11:1:1424380312
0:12:2:1424380312
0:15:1:1424380312
0:17:1:1424380312
0:19:1:1424380312
0:21:1:1424380312


In [0]:
# Break each colon-delimited "value" line into one column per field
import pyspark.sql.functions as F

split_col = F.split(df.value, ':')

# The position of each name in this tuple is the index of the field in the split array
for position, field in enumerate(('userId', 'movieId', 'rating', 'timestamp')):
    df = df.withColumn(field, split_col.getItem(position))

df.show()

In [0]:
# Discard the raw "value" column and preview the first ten rows
df = df.drop('value')
df.show(n=10)

In [0]:
# Cast every parsed field to a Spark integer column, then print the schema to verify
from pyspark.sql.types import IntegerType

for field in ("userId", "movieId", "rating", "timestamp"):
    df = df.withColumn(field, df[field].cast(IntegerType()))

df.printSchema()

In [0]:
#LETS GET TO WORK
# Tune an ALS recommender over a small hyper-parameter grid, then report the
# RMSE on the held-out test set together with the winning parameters.

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# 80/20 train/test split; the fixed seed keeps the split stable across re-runs
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

# ALS estimator. coldStartStrategy="drop" removes rows whose prediction is NaN
# (users/items unseen during fitting) so they do not poison the RMSE.
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", coldStartStrategy="drop", nonnegative=True)

# Hyper-parameter grid: 3 ranks x 3 iteration counts x 3 regularization values = 27 candidates
param_grid = ParamGridBuilder() \
  .addGrid(als.rank, [12, 13, 14])\
  .addGrid(als.maxIter, [18, 19, 20])\
  .addGrid(als.regParam, [.17, .18, .19])\
  .build()

# Score candidates by root-mean-square error between "rating" and "prediction"
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

# TrainValidationSplit evaluates each candidate once on a single
# train/validation split of the data it is given (this is not k-fold cross validation)
tvs = TrainValidationSplit(
  estimator=als,
  estimatorParamMaps=param_grid,
  evaluator=evaluator
  )

# Fit every candidate in the grid on the training data
model = tvs.fit(training)

# Keep the candidate that scored best on the validation split
best_model = model.bestModel

# Generate predictions for the held-out test set and evaluate them with RMSE
predictions = best_model.transform(test)
rmse = evaluator.evaluate(predictions)

# Print evaluation metric and the best model's parameters.
# The values are passed as arguments to print(); the previous
# `print("label"), value` form only printed the label and discarded the value.
print("RMSE = " + str(rmse))
print("**Best Model**")
print("  Rank:", best_model.rank)
# maxIter / regParam are read from the estimator that produced the best model,
# reached through the underlying JVM object.
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())
print("  RegParam:", best_model._java_obj.parent().getRegParam())


In [0]:
#LETS GET TO WORK (ALTERNATIVE)
# Fit a single ALS model with fixed hyper-parameters (no tuning) and report
# its RMSE on the held-out test set.

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder

# 80/20 train/test split; the fixed seed keeps the split stable across re-runs
(training, test) = df.randomSplit([0.8, 0.2], seed=42)

# ALS estimator with a fixed iteration count and regularization parameter.
# coldStartStrategy="drop" removes rows whose prediction is NaN so the
# evaluation metric stays defined.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop")

# Fit the model on the training set
model = als.fit(training)

# Predict ratings for the test set and score them with RMSE
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
# The metric is the *root* mean squared error, so the label says so explicitly
print("Raiz do erro quadrático médio (RMSE) = " + str(rmse))


In [0]:
# For every user known to the model, generate the top 10 movie recommendations
userRec = model.recommendForAllUsers(numItems=10)
display(userRec)

userId,recommendations
20,"List(List(75, 3.9509072), List(52, 3.9013956), List(77, 3.803977), List(22, 3.7889962), List(28, 3.7008765), List(94, 3.5079668), List(54, 3.3352475), List(59, 3.1909952), List(51, 3.161779), List(98, 3.1604145))"
10,"List(List(96, 4.337921), List(85, 3.7044408), List(47, 3.7040396), List(40, 3.6210067), List(42, 2.9890547), List(49, 2.958482), List(33, 2.958309), List(25, 2.9552927), List(0, 2.9234056), List(73, 2.9011972))"
0,"List(List(92, 3.8115501), List(24, 3.6800606), List(9, 3.5738146), List(2, 3.0202003), List(66, 2.8711495), List(29, 2.8532207), List(26, 2.8190238), List(91, 2.653624), List(81, 2.5155435), List(57, 2.4982367))"
1,"List(List(54, 3.7965765), List(68, 3.5957983), List(52, 3.271205), List(77, 3.07231), List(85, 3.0042002), List(28, 2.9585998), List(83, 2.8508224), List(9, 2.8322377), List(24, 2.6308136), List(90, 2.5587811))"
21,"List(List(29, 5.0374694), List(53, 4.867947), List(74, 4.040547), List(87, 3.9519641), List(2, 3.9505491), List(25, 3.7469182), List(8, 3.6085196), List(70, 3.5213683), List(41, 3.4941957), List(60, 3.3594854))"
11,"List(List(18, 5.1097913), List(23, 5.0202756), List(30, 5.013684), List(48, 4.916162), List(27, 4.8993835), List(79, 4.8141747), List(33, 4.753074), List(46, 4.1959147), List(19, 3.9580452), List(90, 3.9447293))"
12,"List(List(49, 5.5688934), List(46, 5.2365875), List(27, 5.1435695), List(55, 5.061962), List(17, 4.901826), List(64, 4.8903074), List(90, 4.7350426), List(48, 4.524438), List(65, 4.512791), List(35, 4.3411813))"
22,"List(List(51, 5.297673), List(59, 5.2671647), List(88, 5.257532), List(75, 5.0302954), List(30, 4.8636727), List(74, 4.78579), List(28, 4.659964), List(54, 4.604916), List(77, 4.5519075), List(94, 4.4207435))"
2,"List(List(63, 5.771246), List(41, 5.2815275), List(72, 5.185494), List(81, 4.9911027), List(93, 4.9869423), List(8, 4.971905), List(39, 4.6710296), List(49, 4.615), List(70, 4.5294523), List(89, 4.25675))"
13,"List(List(70, 3.1847553), List(53, 3.126486), List(74, 3.106634), List(29, 2.9596796), List(41, 2.8666875), List(92, 2.752764), List(83, 2.701297), List(2, 2.601574), List(66, 2.5715787), List(8, 2.542613))"


In [0]:
# The reverse view: for every movie known to the model, the top 10 users to recommend it to
movieRecs = model.recommendForAllItems(numUsers=10)
display(movieRecs)

movieId,recommendations
20,"List(List(17, 4.8208), List(12, 4.066423), List(23, 3.6724787), List(29, 3.635951), List(5, 3.0632234), List(9, 2.8250465), List(19, 1.6962833), List(2, 1.4329131), List(27, 1.2439922), List(25, 1.1313344))"
40,"List(List(26, 4.2476554), List(2, 3.9566498), List(10, 3.6210067), List(6, 3.025911), List(4, 2.5330994), List(21, 2.463764), List(7, 1.9928433), List(9, 1.9161756), List(8, 1.8857938), List(0, 1.7067448))"
10,"List(List(12, 4.1891165), List(17, 3.957125), List(23, 3.9570475), List(9, 2.4992588), List(29, 2.497035), List(5, 2.1928241), List(19, 2.039927), List(16, 2.0131721), List(24, 1.9175384), List(20, 1.4007881))"
50,"List(List(23, 4.088081), List(12, 4.01974), List(5, 3.0560174), List(9, 3.00414), List(29, 2.3653464), List(17, 2.04303), List(1, 2.0242262), List(15, 1.9418793), List(26, 1.8702526), List(19, 1.7121037))"
80,"List(List(3, 3.871289), List(26, 3.3795276), List(7, 3.2701046), List(18, 3.1977975), List(11, 3.1320367), List(22, 2.9498472), List(21, 1.1667519), List(24, 1.02081), List(29, 0.97589815), List(25, 0.8875932))"
70,"List(List(2, 4.5294523), List(4, 3.7989085), List(21, 3.5213683), List(8, 3.2715309), List(24, 3.2193615), List(13, 3.1847553), List(14, 2.996982), List(9, 2.9807563), List(22, 2.8816164), List(26, 2.4247646))"
60,"List(List(21, 3.3594854), List(2, 3.2580798), List(7, 3.0877233), List(3, 2.9953575), List(8, 2.9074593), List(4, 2.709478), List(22, 2.6043637), List(18, 2.3888192), List(26, 2.376433), List(14, 2.1295214))"
90,"List(List(17, 5.0669117), List(24, 5.04056), List(16, 4.9374747), List(12, 4.7350426), List(5, 4.557121), List(29, 4.498662), List(23, 4.0220733), List(11, 3.9447293), List(19, 3.7233286), List(9, 2.8936706))"
30,"List(List(11, 5.013684), List(26, 4.96661), List(22, 4.8636727), List(24, 3.9387708), List(18, 3.7232497), List(23, 3.5881135), List(3, 3.0557487), List(29, 2.9373424), List(9, 2.8630693), List(12, 2.4119961))"
0,"List(List(10, 2.9234056), List(28, 2.8925502), List(7, 2.8020067), List(8, 2.7722769), List(14, 2.5515895), List(6, 2.3929834), List(25, 2.3747702), List(0, 1.8609223), List(12, 1.6765999), List(16, 1.5512817))"


In [0]:
# Keep only the recommended movie ids for each user (prediction scores are left out)
user_id_col = userRec.userId
movie_ids_col = userRec.recommendations.getField('movieId')
UserRecsOnlyItemId = userRec.select(user_id_col, movie_ids_col)
display(UserRecsOnlyItemId)

userId,recommendations.movieId
20,"List(75, 52, 77, 22, 28, 94, 54, 59, 51, 98)"
10,"List(96, 85, 47, 40, 42, 49, 33, 25, 0, 73)"
0,"List(92, 24, 9, 2, 66, 29, 26, 91, 81, 57)"
1,"List(54, 68, 52, 77, 85, 28, 83, 9, 24, 90)"
21,"List(29, 53, 74, 87, 2, 25, 8, 70, 41, 60)"
11,"List(18, 23, 30, 48, 27, 79, 33, 46, 19, 90)"
12,"List(49, 46, 27, 55, 17, 64, 90, 48, 65, 35)"
22,"List(51, 59, 88, 75, 30, 74, 28, 54, 77, 94)"
2,"List(63, 41, 72, 81, 93, 8, 39, 49, 70, 89)"
13,"List(70, 53, 74, 29, 41, 92, 83, 2, 66, 8)"
