In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, regexp_replace
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType, StringType

# stop any existing Spark session, if Spark is already running, creating a new session might fail ***
try:
    spark.stop()
except Exception:
    pass

# create session with adjusted memory settings based on your cluster
# .config("spark.local.dir", r"E:\Apache Spark\spark-temp"): change the spark local dir, as the c disk memory is not enough, may cause Py4JJavaError exception ***
spark = SparkSession.builder.appName("MovieRecommender") \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.local.dir", r"E:\Apache Spark\spark-temp") \
    .getOrCreate()

In [3]:
spark

In [4]:
movies = spark.read \
    .format("csv") \
    .option("sep", ",") \
    .option("header", "true") \
    .option("encoding", "utf-8") \
    .option("mode", "DROPMALFORMED") \
    .load("data/TMDB 6000 Movie Dataset with Ratings/tmdb_6000_movie_dataset.csv")

In [5]:
movies.select("genres").show()

+--------------+
|        genres|
+--------------+
| "[{""id"": 28|
| "[{""id"": 12|
| "[{""id"": 28|
| "[{""id"": 28|
| "[{""id"": 28|
| "[{""id"": 14|
| "[{""id"": 16|
| "[{""id"": 28|
| "[{""id"": 12|
| "[{""id"": 28|
| "[{""id"": 12|
| "[{""id"": 12|
| "[{""id"": 12|
| "[{""id"": 28|
| "[{""id"": 28|
| "[{""id"": 12|
|"[{""id"": 878|
| "[{""id"": 12|
| "[{""id"": 28|
| "[{""id"": 28|
+--------------+
only showing top 20 rows



In [6]:
movies.show()

+----+---------+--------------------+--------------------+------+--------------------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+----------+-------+--------------------+--------+--------------------+--------------------+------------+----------+
| _c0|   budget|              genres|            homepage|tmdbId|            keywords|original_language|      original_title|            overview|popularity|production_companies|production_countries|release_date|   revenue|runtime|    spoken_languages|  status|             tagline|               title|vote_average|vote_count|
+----+---------+--------------------+--------------------+------+--------------------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+------------+----------+-------+--------------------+--------+--------------------+--------------------+------------+----------+
|4068|        0|

In [7]:
movies = movies.select('_c0', 'genres', 'title', 'vote_average', 'vote_count', 'original_language', 'popularity', 'release_date', 'tagline')

In [8]:
movies.show()

+---+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|_c0|        genres|               title|        vote_average|          vote_count|   original_language|          popularity|        release_date|             tagline|
+---+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|  0| "[{""id"": 28|        ""id"": 289}| {""name"": ""Twe...|        ""id"": 306}|         {""id"": 14| ""name"": ""Scie...|[{'id': 1463, 'na...|"[{""name"": ""In...|
|  1| "[{""id"": 12|     'name': 'ship'}|         {'id': 5740| 'name': 'alliance'}|         {""id"": 28|                 285|  'name': 'traitor'}|         {'id': 3799|
|  2| "[{""id"": 28|      ""id"": 10761}|  {""name"": ""B24""|    ""id"": 69434}]"|         {""id"": 80|              206647|             Spectre| {""name"": ""

In [9]:
movies_num_rows = movies.count()
movies_num_cols = len(movies.columns)
print(movies_num_rows, movies_num_cols)

5801 9
