In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, regexp_replace
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType, StringType

# stop any existing Spark session, if Spark is already running, creating a new session might fail ***
try:
    spark.stop()
except Exception:
    pass

# create session with adjusted memory settings based on your cluster
# .config("spark.local.dir", r"E:\Apache Spark\spark-temp"): change the spark local dir, as the c disk memory is not enough, may cause Py4JJavaError exception ***
spark = SparkSession.builder.appName("MovieRecommender_Movielens") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.local.dir", r"E:\Apache Spark\spark-temp") \
    .getOrCreate()

In [2]:
spark

### Analyze Movie CSV

In [3]:
movies = spark.read \
    .format("csv") \
    .option("sep", ",") \
    .option("header", "true") \
    .option("encoding", "utf-8") \
    .option("mode", "DROPMALFORMED") \
    .load("../data/MovieLens 20M Dataset/movie.csv")

In [4]:
movies = movies.withColumnRenamed('movieId', 'movie_id')

In [5]:
movies.show()

+--------+--------------------+--------------------+
|movie_id|               title|              genres|
+--------+--------------------+--------------------+
|       1|    Toy Story (1995)|Adventure|Animati...|
|       2|      Jumanji (1995)|Adventure|Childre...|
|       3|Grumpier Old Men ...|      Comedy|Romance|
|       4|Waiting to Exhale...|Comedy|Drama|Romance|
|       5|Father of the Bri...|              Comedy|
|       6|         Heat (1995)|Action|Crime|Thri...|
|       7|      Sabrina (1995)|      Comedy|Romance|
|       8| Tom and Huck (1995)|  Adventure|Children|
|       9| Sudden Death (1995)|              Action|
|      10|    GoldenEye (1995)|Action|Adventure|...|
|      11|American Presiden...|Comedy|Drama|Romance|
|      12|Dracula: Dead and...|       Comedy|Horror|
|      13|        Balto (1995)|Adventure|Animati...|
|      14|        Nixon (1995)|               Drama|
|      15|Cutthroat Island ...|Action|Adventure|...|
|      16|       Casino (1995)|         Crime|

In [6]:
movies_num_rows = movies.count()
movies_num_cols = len(movies.columns)
print(movies_num_rows, movies_num_cols)

27278 3


### Analyze Ratings CSV

In [7]:
ratings = spark.read \
    .format("csv") \
    .option("sep", ",") \
    .option("header", "true") \
    .option("encoding", "utf-8") \
    .option("mode", "DROPMALFORMED") \
    .load("../data/MovieLens 20M Dataset/rating.csv")

In [8]:
rename_dict = {
    "userId": "user_id",
    "movieId": "movie_id"
}


for old_name, new_name in rename_dict.items():
    ratings = ratings.withColumnRenamed(old_name, new_name)

In [9]:
ratings.show()

+-------+--------+------+-------------------+
|user_id|movie_id|rating|          timestamp|
+-------+--------+------+-------------------+
|      1|       2|   3.5|2005-04-02 23:53:47|
|      1|      29|   3.5|2005-04-02 23:31:16|
|      1|      32|   3.5|2005-04-02 23:33:39|
|      1|      47|   3.5|2005-04-02 23:32:07|
|      1|      50|   3.5|2005-04-02 23:29:40|
|      1|     112|   3.5|2004-09-10 03:09:00|
|      1|     151|     4|2004-09-10 03:08:54|
|      1|     223|     4|2005-04-02 23:46:13|
|      1|     253|     4|2005-04-02 23:35:40|
|      1|     260|     4|2005-04-02 23:33:46|
|      1|     293|     4|2005-04-02 23:31:43|
|      1|     296|     4|2005-04-02 23:32:47|
|      1|     318|     4|2005-04-02 23:33:18|
|      1|     337|   3.5|2004-09-10 03:08:29|
|      1|     367|   3.5|2005-04-02 23:53:00|
|      1|     541|     4|2005-04-02 23:30:03|
|      1|     589|   3.5|2005-04-02 23:45:57|
|      1|     593|   3.5|2005-04-02 23:31:01|
|      1|     653|     3|2004-09-1

In [None]:
from pyspark.sql.functions import col  # # the count is every user's rating counting

rating_count = ratings.groupBy("user_id") \
       .count() \
       .orderBy(col("count"), ascending=False)

rating_count.show()

In [None]:
rating_count_nums_row = rating_count.count()
print(rating_count_nums_row)

138493


In [None]:
rating_count = rating_count.filter(rating_count['count'] > 200)

In [None]:
rating_count_nums_row = rating_count.count()
print(rating_count_nums_row)

26599


In [None]:
user_ids = rating_count.select("user_id")

user_id_list = [row['user_id'] for row in user_ids.collect()]  # get the user id list

In [None]:
user_id_list

['118205',
 '8405',
 '82418',
 '121535',
 '125794',
 '74142',
 '34576',
 '131904',
 '83090',
 '59477',
 '130767',
 '79159',
 '8963',
 '15617',
 '92011',
 '71975',
 '20132',
 '46470',
 '88820',
 '63147',
 '130459',
 '120575',
 '9544',
 '31122',
 '18611',
 '125978',
 '18138',
 '91193',
 '111549',
 '68026',
 '41267',
 '51703',
 '92269',
 '70201',
 '35128',
 '105580',
 '14705',
 '54465',
 '114406',
 '136268',
 '12131',
 '53346',
 '24688',
 '107326',
 '131347',
 '26867',
 '27469',
 '119048',
 '123606',
 '67346',
 '86529',
 '22901',
 '129583',
 '131894',
 '91867',
 '7201',
 '24219',
 '62812',
 '61168',
 '51558',
 '68063',
 '97853',
 '32344',
 '80092',
 '103223',
 '107640',
 '128258',
 '79531',
 '128309',
 '92956',
 '118754',
 '76630',
 '106441',
 '59414',
 '113668',
 '122995',
 '116189',
 '50297',
 '52260',
 '72008',
 '33736',
 '52009',
 '43194',
 '117144',
 '3907',
 '137202',
 '27053',
 '31404',
 '42929',
 '119531',
 '135425',
 '66763',
 '116317',
 '64843',
 '131961',
 '2261',
 '42204',
 '9

In [None]:
# filters the ratings DataFrame to include only rows with user IDs that are in y—that is, 
# only ratings by users with more than 200 ratings.
ratings = ratings.filter(ratings.user_id.isin(user_id_list))

In [None]:
ratings.show()

+-------+--------+------+-------------------+
|user_id|movie_id|rating|          timestamp|
+-------+--------+------+-------------------+
|      7|       3|     3|2002-01-16 19:14:23|
|      7|       7|     3|2002-01-16 19:10:20|
|      7|      11|     4|2002-01-16 19:04:49|
|      7|      15|     2|2002-01-16 19:18:52|
|      7|      16|     3|2002-01-16 18:22:58|
|      7|      17|     2|2002-01-16 19:01:16|
|      7|      24|     3|2002-01-16 18:53:58|
|      7|     105|     2|2002-01-16 19:12:46|
|      7|     122|     2|2002-01-16 19:16:09|
|      7|     151|     3|2002-01-16 19:06:05|
|      7|     252|     2|2002-01-16 19:11:03|
|      7|     260|     5|2002-01-16 18:44:19|
|      7|     271|     3|2002-01-16 18:36:57|
|      7|     276|     2|2002-01-16 19:17:35|
|      7|     316|     3|2002-01-16 18:50:19|
|      7|     339|     3|2002-01-16 19:08:10|
|      7|     348|     4|2002-01-16 18:34:06|
|      7|     351|     3|2002-01-16 19:17:17|
|      7|     355|     3|2002-01-1

### Integrate Movies and Ratings Data and Analyze them

In [None]:
# join the ratings with the movies
ratings_with_movies = movies.join(ratings, on="movie_id", how="inner")

In [None]:
ratings_with_movies.show()

+--------+--------------------+--------------------+-------+------+-------------------+
|movie_id|               title|              genres|user_id|rating|          timestamp|
+--------+--------------------+--------------------+-------+------+-------------------+
|       3|Grumpier Old Men ...|      Comedy|Romance|      7|     3|2002-01-16 19:14:23|
|       7|      Sabrina (1995)|      Comedy|Romance|      7|     3|2002-01-16 19:10:20|
|      11|American Presiden...|Comedy|Drama|Romance|      7|     4|2002-01-16 19:04:49|
|      15|Cutthroat Island ...|Action|Adventure|...|      7|     2|2002-01-16 19:18:52|
|      16|       Casino (1995)|         Crime|Drama|      7|     3|2002-01-16 18:22:58|
|      17|Sense and Sensibi...|       Drama|Romance|      7|     2|2002-01-16 19:01:16|
|      24|       Powder (1995)|        Drama|Sci-Fi|      7|     3|2002-01-16 18:53:58|
|     105|Bridges of Madiso...|       Drama|Romance|      7|     2|2002-01-16 19:12:46|
|     122|    Boomerang (1992)| 

In [None]:
# The agg() function in this code is used to apply one or more aggregate functions to a DataFrame after grouping it by a specific column (in this case, "title").
from pyspark.sql.functions import count

rating_count_every_title = ratings_with_movies.groupBy('title').agg(count('rating').alias('rating_count'))

In [None]:
rating_count_every_title.show() # why run so long???

In [None]:
ratings_with_movies = ratings_with_movies.join(rating_count_every_title, on = 'title', how = 'inner')

In [None]:
ratings_with_movies = ratings_with_movies.filter(ratings_with_movies['rating_count'] >= 50)

In [None]:
ratings_with_movies.show()