### This is the biggest version of 100k ratings one containing `20M ratings`. It is also taken from the movielens platform. You can find it under older versions section

You can download the dataset from [here](https://grouplens.org/datasets/movielens/?utm_source=chatgpt.com)

In [0]:
df = spark.read.table("dai.phase2.movie_ratings")
display(df.head(10))

userId,movieId,rating,timestamp
122289,3213,4.0,1286658715
122289,3235,3.0,1287251632
122289,3253,4.0,1286592571
122289,3254,4.0,1286648580
122289,3275,3.5,1286589772
122289,3364,3.5,1328060352
122289,3384,3.5,1327271097
122289,3388,1.0,1286651676
122289,3409,1.5,1286649072
122289,3421,4.5,1286589750


In [0]:
df.printSchema()
print(f"Shape: ({df.count()},{len(df.columns)})")

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

Shape: (20000263,4)


In [0]:
ratings = df.drop("timestamp")
display(ratings.head(10))

userId,movieId,rating
122289,3213,4.0
122289,3235,3.0
122289,3253,4.0
122289,3254,4.0
122289,3275,3.5
122289,3364,3.5
122289,3384,3.5
122289,3388,1.0
122289,3409,1.5
122289,3421,4.5


In [0]:
display(ratings.select("rating").distinct())

rating
0.5
1.5
5.0
2.0
4.0
2.5
3.0
4.5
3.5
1.0


### Training the ALS(Alternating Least Square) Model:

> #### Steps to train ALS Model:
> 1. Split the data and train and test sets
> 2. Create ALS Model
> 3. Train the Model
> 4. Make Predictions
> 5. Evaluate

---

In [0]:
# Creating the train and test data set
train, test = ratings.randomSplit([0.7, 0.3], seed=42)

In [0]:
train.count(),test.count()

(13999161, 6001102)

In [0]:
from pyspark.ml.recommendation import ALS

als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=43,              # number of latent factors
    maxIter=10,           # number of iterations
    regParam=0.02,         # regularization
    implicitPrefs=False,  # because these are real ratings
    coldStartStrategy="drop"  # If a user appears in test not in train, this will drop those types of rows to prevent predicitng NaN
)

In [0]:
# Training the model
model = als.fit(train)

In [0]:
predictions = model.transform(test)
display(predictions.head(10))

userId,movieId,rating,prediction
1021,904,5.0,4.401959896087647
1021,908,5.0,4.50413990020752
1021,1193,5.0,5.0290117263793945
1021,2380,1.0,2.290351390838623
1021,2424,4.0,4.04727029800415
1021,2762,5.0,5.102728366851807
1021,2963,3.0,1.8556783199310305
1021,3113,4.0,3.333718061447144
1021,3175,4.0,3.1299495697021484
2737,290,5.0,4.903825759887695


In [0]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction"
)

rmse = evaluator.evaluate(predictions)
print("RMSE =", rmse)

RMSE = 0.8261464717577105
