In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, regexp_replace
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType, StringType

# stop any existing Spark session, if Spark is already running, creating a new session might fail ***
try:
    spark.stop()
except Exception:
    pass

# create session with adjusted memory settings based on your cluster
# .config("spark.local.dir", r"E:\Apache Spark\spark-temp"): change the spark local dir, as the c disk memory is not enough, may cause Py4JJavaError exception ***
spark = SparkSession.builder.appName("MovieRecommender_Movielens") \
    .config("spark.executor.memory", "32g") \
    .config("spark.driver.memory", "16g") \
    .config("spark.local.dir", r"E:\Apache Spark\spark-temp") \
    .getOrCreate()

In [2]:
spark

### Analyze Movie CSV

In [3]:
movies = spark.read \
    .format("csv") \
    .option("sep", ",") \
    .option("header", "true") \
    .option("encoding", "utf-8") \
    .option("mode", "DROPMALFORMED") \
    .load("../data/MovieLens 20M Dataset/movie.csv")

In [4]:
movies = movies.withColumnRenamed('movieId', 'movie_id')

In [5]:
movies.show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Adventure|Animati...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|Comedy|Drama|Romance|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|  Adventure|Children|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
|      11|American Presiden...|Comedy|Drama|Romance|
|      12|Dracula: Dead and...|       Comedy|Horror|
|      13|        Balto (1995)|Adventure|Animati...|
|      14|        Nixon (1995)|               Drama|
|      15|Cutthroat Island ...|Action|Adventure|...|
|      16|       Casino (1995)|         Crime|

In [6]:
movies_num_rows = movies.count()
movies_num_cols = len(movies.columns)
print(movies_num_rows, movies_num_cols)

27278 3


### Analyze Ratings CSV

In [7]:
ratings = spark.read \
    .format("csv") \
    .option("sep", ",") \
    .option("header", "true") \
    .option("encoding", "utf-8") \
    .option("mode", "DROPMALFORMED") \
    .load("../data/MovieLens 20M Dataset/rating.csv")

In [8]:
rename_dict = {
    "userId": "user_id",
    "movieId": "movie_id"
}


for old_name, new_name in rename_dict.items():
    ratings = ratings.withColumnRenamed(old_name, new_name)

In [9]:
ratings.show()

+-------+--------+------+-------------------+
|user_id|movie_id|rating|          timestamp|
+-------+--------+------+-------------------+
|      1|       2|   3.5|2005-04-02 23:53:47|
|      1|      29|   3.5|2005-04-02 23:31:16|
|      1|      32|   3.5|2005-04-02 23:33:39|
|      1|      47|   3.5|2005-04-02 23:32:07|
|      1|      50|   3.5|2005-04-02 23:29:40|
|      1|     112|   3.5|2004-09-10 03:09:00|
|      1|     151|     4|2004-09-10 03:08:54|
|      1|     223|     4|2005-04-02 23:46:13|
|      1|     253|     4|2005-04-02 23:35:40|
|      1|     260|     4|2005-04-02 23:33:46|
|      1|     293|     4|2005-04-02 23:31:43|
|      1|     296|     4|2005-04-02 23:32:47|
|      1|     318|     4|2005-04-02 23:33:18|
|      1|     337|   3.5|2004-09-10 03:08:29|
|      1|     367|   3.5|2005-04-02 23:53:00|
|      1|     541|     4|2005-04-02 23:30:03|
|      1|     589|   3.5|2005-04-02 23:45:57|
|      1|     593|   3.5|2005-04-02 23:31:01|
|      1|     653|     3|2004-09-1

In [10]:
ratings = ratings.sample(fraction=0.5, seed=42) # remove half of the data in rating.csv

In [11]:
print(ratings.count())

9997614


In [12]:
from pyspark.sql.functions import col  # the count is every user's rating counting

rating_count = ratings.groupBy("user_id") \
       .count() \
       .orderBy(col("count"), ascending=False)

rating_count.show()

+-------+-----+
|user_id|count|
+-------+-----+
| 118205| 4704|
|   8405| 3734|
|  82418| 2821|
| 121535| 2768|
|  74142| 2741|
| 125794| 2673|
|  34576| 2660|
| 131904| 2649|
|  83090| 2568|
|  59477| 2458|
| 130767| 2445|
|  79159| 2417|
|   8963| 2282|
|  15617| 2140|
|  92011| 2125|
|  88820| 2079|
|  46470| 2060|
|  20132| 2044|
|  71975| 2042|
| 120575| 1981|
+-------+-----+
only showing top 20 rows



In [13]:
rating_count_nums_row = rating_count.count()
print(rating_count_nums_row)

138388


In [14]:
rating_count = rating_count.filter(rating_count['count'] > 200)

In [15]:
rating_count_nums_row = rating_count.count()
print(rating_count_nums_row)

10639


In [16]:
user_ids = rating_count.select("user_id")

user_id_list = [row['user_id'] for row in user_ids.collect()]  # get the user id list

In [17]:
user_id_list

['118205',
 '8405',
 '82418',
 '121535',
 '74142',
 '125794',
 '34576',
 '131904',
 '83090',
 '59477',
 '130767',
 '79159',
 '8963',
 '15617',
 '92011',
 '88820',
 '46470',
 '20132',
 '71975',
 '120575',
 '63147',
 '9544',
 '130459',
 '18611',
 '68026',
 '91193',
 '111549',
 '31122',
 '125978',
 '51703',
 '41267',
 '18138',
 '35128',
 '70201',
 '92269',
 '105580',
 '136268',
 '54465',
 '14705',
 '114406',
 '12131',
 '24688',
 '53346',
 '67346',
 '131347',
 '26867',
 '119048',
 '107326',
 '129583',
 '131894',
 '27469',
 '123606',
 '86529',
 '7201',
 '22901',
 '91867',
 '61168',
 '107640',
 '24219',
 '68063',
 '97853',
 '128309',
 '32344',
 '76630',
 '62812',
 '103223',
 '79531',
 '51558',
 '92956',
 '128258',
 '80092',
 '43194',
 '59414',
 '113668',
 '72008',
 '106441',
 '52260',
 '50297',
 '118754',
 '119531',
 '116317',
 '33736',
 '52009',
 '122995',
 '116189',
 '66763',
 '117144',
 '31404',
 '135425',
 '137202',
 '64843',
 '42204',
 '131961',
 '3907',
 '95301',
 '42929',
 '2261',
 '5

In [18]:
# filters the ratings DataFrame to include only rows with user IDs that are in y—that is, 
# only ratings by users with more than 200 ratings.
ratings = ratings.filter(ratings.user_id.isin(user_id_list))

In [19]:
ratings.show()

+-------+--------+------+-------------------+
|user_id|movie_id|rating|          timestamp|
+-------+--------+------+-------------------+
|     11|       1|   4.5|2009-01-02 01:13:41|
|     11|      10|   2.5|2009-01-02 01:15:59|
|     11|      19|   3.5|2009-01-01 04:21:44|
|     11|      65|     2|2009-01-02 00:37:29|
|     11|     150|     5|2009-01-01 04:49:03|
|     11|     153|   3.5|2009-01-02 01:15:14|
|     11|     158|     4|2009-01-01 23:52:53|
|     11|     160|     4|2009-01-01 05:27:01|
|     11|     173|     5|2009-01-01 05:49:14|
|     11|     185|     4|2009-01-02 01:16:55|
|     11|     208|   4.5|2009-01-01 05:31:10|
|     11|     260|     5|2009-01-01 05:26:00|
|     11|     316|   3.5|2009-01-01 05:41:36|
|     11|     318|     5|2009-01-01 22:56:11|
|     11|     377|     4|2009-01-02 01:14:40|
|     11|     384|   3.5|2009-01-01 04:59:27|
|     11|     441|   1.5|2009-01-01 23:52:42|
|     11|     442|   4.5|2009-01-01 05:33:22|
|     11|     480|     5|2009-01-0

In [20]:
print(ratings.count())

3994581


### Integrate Movies and Ratings Data and Analyze them

In [43]:
# join the ratings with the movies
ratings_with_movies = ratings.join(movies, on="movie_id", how="inner")

In [44]:
ratings_with_movies.show()

+--------+-------+------+-------------------+--------------------+--------------------+
|movie_id|user_id|rating|          timestamp|               title|              genres|
+--------+-------+------+-------------------+--------------------+--------------------+
|       1|     11|   4.5|2009-01-02 01:13:41|    Toy Story (1995)|Adventure|Animati...|
|      10|     11|   2.5|2009-01-02 01:15:59|    GoldenEye (1995)|Action|Adventure|...|
|      19|     11|   3.5|2009-01-01 04:21:44|Ace Ventura: When...|              Comedy|
|      65|     11|     2|2009-01-02 00:37:29|     Bio-Dome (1996)|              Comedy|
|     150|     11|     5|2009-01-01 04:49:03|    Apollo 13 (1995)|Adventure|Drama|IMAX|
|     153|     11|   3.5|2009-01-02 01:15:14|Batman Forever (1...|Action|Adventure|...|
|     158|     11|     4|2009-01-01 23:52:53|       Casper (1995)|  Adventure|Children|
|     160|     11|     4|2009-01-01 05:27:01|        Congo (1995)|Action|Adventure|...|
|     173|     11|     5|2009-01

In [45]:
ratings_with_movies = ratings_with_movies.dropDuplicates(['movie_id', 'user_id']) # this is quite important, each user can rate the same movie multiple times

In [46]:
# The agg() function in this code is used to apply one or more aggregate functions to a DataFrame after grouping it by a specific column (in this case, "title").
from pyspark.sql.functions import count

rating_count_every_title = ratings_with_movies.groupBy('title').agg(count('rating').alias('rating_count'))

In [47]:
rating_count_every_title.show()

+--------------------+------------+
|               title|rating_count|
+--------------------+------------+
|Bag of Hammers, A...|           5|
|22 Jump Street (2...|         137|
|When We Were King...|         709|
| If Lucy Fell (1996)|         176|
|       Psycho (1960)|        3111|
|   Annie Hall (1977)|        2205|
|Gertie the Dinosa...|           2|
|Men in Black (a.k...|        4646|
|Odd Couple II, Th...|         124|
|In the Heat of th...|         824|
| Three Wishes (1995)|          60|
|Seven Beauties (P...|         128|
|    Elizabeth (1998)|        1641|
|First Blood (Ramb...|        1547|
|Heavenly Creature...|        1185|
|Smiley's People (...|          18|
|My Father the Her...|          26|
|Blind Shaft (Mang...|          10|
|Starship Troopers...|          96|
|Problem Child (1990)|         560|
+--------------------+------------+
only showing top 20 rows



In [48]:
ratings_with_movies = ratings_with_movies.join(rating_count_every_title, on='title', how='inner')

In [49]:
ratings_with_movies = ratings_with_movies.filter(ratings_with_movies['rating_count'] >= 50)

In [50]:
ratings_with_movies.show()

+--------------------+--------+-------+------+-------------------+-------------+------------+
|               title|movie_id|user_id|rating|          timestamp|       genres|rating_count|
+--------------------+--------+-------+------+-------------------+-------------+------------+
|'Til There Was Yo...|     779|   1095|     2|1999-10-20 10:12:13|Drama|Romance|         101|
|'Til There Was Yo...|     779| 128527|     3|2000-12-08 04:51:01|Drama|Romance|         101|
|'Til There Was Yo...|     779| 133310|     3|2001-08-26 21:21:54|Drama|Romance|         101|
|'Til There Was Yo...|     779|  49999|     4|2003-03-17 01:01:39|Drama|Romance|         101|
|'Til There Was Yo...|     779|  83242|   0.5|2005-01-25 19:53:28|Drama|Romance|         101|
|'Til There Was Yo...|     779|  92433|     1|2001-09-14 02:39:09|Drama|Romance|         101|
|'Til There Was Yo...|     779| 109291|   3.5|2004-01-07 16:17:11|Drama|Romance|         101|
|'Til There Was Yo...|     779|  49212|     3|2001-05-07 03:

### Convert the final analyzed Datas into Pivot Table

In [None]:
# Get the pivot table of final data "ratings"
from pyspark.sql.functions import avg

# Pivot the table with user_id as columns, title as rows, and average rating as values
movie_pivot = ratings_with_movies.groupBy("title") \
                         .pivot("user_id") \
                         .agg(avg("rating")) \
                         .fillna(0)  # Optional: Replace nulls with 0


In [None]:
movie_pivot.show()