In [1]:
import pyspark

In [39]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col,split,count,desc


# create a Spark session
spark = SparkSession.builder.appName("Movie recommendation").getOrCreate()

# read the movie and ratings data into Spark DataFrames and cache them
read_movie_data = spark.read.csv("/Users/Krishnaveni/Desktop/ml-latest-small/movies.csv",header=True, inferSchema=True).cache()
read_ratings_data = spark.read.csv("/Users/Krishnaveni/Desktop/ml-latest-small/ratings.csv",header=True, inferSchema=True).cache()

# join the movie and ratings data based on movieId, and select only the relevant columns
join_dataframes = read_movie_data.join(read_ratings_data, 'movieId').distinct()
join_dataframes = join_dataframes.select("userId", "title", "genres", "rating")

# extract the genres from the filtered DataFrame, count their occurrences, and sort by count in descending order
filter_rating = join_dataframes.filter((join_dataframes["rating"] > 4) & (join_dataframes["userId"] == 51))
print("Filtered data of user 51 and the rating is greater than 4")
filter_rating.show()


#we need the count of each genre to get more appropriate result.
words = filter_rating.select(explode(split("genres", "\|")).alias("genre"))
words_count = words.groupBy("genre").agg(count("*").alias("count")).orderBy("count", ascending=False)
final_result = words_count.select("genre")
#words_count.show()

print("User 51 top 3 picks: ")
top_3_picks = final_result.show(3)
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

# get the top 3 genres of the user 51
top_3_genres = [row.genre for row in final_result.limit(3).collect()]

# filter the movies that have at least one of the user's top 3 genres
movies_filtered = read_movie_data.filter(read_movie_data.genres.contains(top_3_genres[0]) \
                                          | read_movie_data.genres.contains(top_3_genres[1]) \
                                          | read_movie_data.genres.contains(top_3_genres[2]))

# join with ratings data to get the average rating for each movie
ratings_avg = join_dataframes.groupBy("title").avg("rating").orderBy(desc("avg(rating)"))

# join with filtered movies to get the final recommendations
recommendations = ratings_avg.join(movies_filtered, "title").select("title", "genres", "avg(rating)").orderBy(desc("avg(rating)"))

print("Recommendations for user 51 based on their top 3 genres:")
recommendations.show(10)



23/04/25 12:20:49 WARN CacheManager: Asked to cache already cached data.
23/04/25 12:20:50 WARN CacheManager: Asked to cache already cached data.


Filtered data of user 51 and the rating is greater than 4
+------+--------------------+--------------------+------+
|userId|               title|              genres|rating|
+------+--------------------+--------------------+------+
|    51|Field of Dreams (...|Children|Drama|Fa...|   5.0|
|    51|    Labyrinth (1986)|Adventure|Fantasy...|   5.0|
|    51|Much Ado About No...|      Comedy|Romance|   5.0|
|    51|Man Who Knew Too ...|Adventure|Drama|M...|   5.0|
|    51|Butch Cassidy and...|      Action|Western|   5.0|
|    51|Brokeback Mountai...|       Drama|Romance|   5.0|
|    51|Misérables, Les (...|Crime|Drama|Roman...|   5.0|
|    51|39 Steps, The (1935)|Drama|Mystery|Thr...|   5.0|
|    51|Love and Death (1...|              Comedy|   5.0|
|    51|History of the Wo...|      Comedy|Musical|   5.0|
|    51| Blood Simple (1984)|Crime|Drama|Film-...|   5.0|
|    51|Cool Hand Luke (1...|               Drama|   5.0|
|    51|       Pecker (1998)|        Comedy|Drama|   4.5|
|    51|      



+--------------------+--------------------+-----------+
|               title|              genres|avg(rating)|
+--------------------+--------------------+-----------+
|  The Big Bus (1976)|       Action|Comedy|        5.0|
|Formula of Love (...|              Comedy|        5.0|
| What Love Is (2007)|      Comedy|Romance|        5.0|
|Man and a Woman, ...|       Drama|Romance|        5.0|
|Sandpiper, The (1...|       Drama|Romance|        5.0|
|Bill Hicks: Revel...|              Comedy|        5.0|
|A Flintstones Chr...|Animation|Childre...|        5.0|
|Dylan Moran: Mons...|  Comedy|Documentary|        5.0|
|    Max Manus (2008)|    Action|Drama|War|        5.0|
|Chinese Puzzle (C...|      Comedy|Romance|        5.0|
+--------------------+--------------------+-----------+
only showing top 10 rows



                                                                                