In [4]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("ImportCSV") \
    .getOrCreate()

Imported the preprocessed dataset and renamed the headers

In [3]:
headers = ['id', 'title', 'tagline', 'release_date', 'genres', 'belongs_to_collection',
                     'original_language', 'budget_musd', 'revenue_musd', 'production_companies',
                     'production_countries', 'vote_count', 'vote_average', 'popularity', 'runtime',
                     'overview', 'spoken_languages', 'poster_path', 'cast', 'cast_size', 'director', 'crew_size']

df = spark.read.csv("/content/Final_preprocessed_data.csv", header=False, inferSchema=True)

for old_name, new_name in zip(df.columns, headers):
    df = df.withColumnRenamed(old_name, new_name)

df.show()

+------+--------------------+--------------------+------------+--------------------+---------------------+-----------------+-----------+------------+--------------------+--------------------+----------+------------+----------+-------+--------------------+--------------------+--------------------+--------------------+---------+---------------+---------+
|    id|               title|             tagline|release_date|              genres|belongs_to_collection|original_language|budget_musd|revenue_musd|production_companies|production_countries|vote_count|vote_average|popularity|runtime|            overview|    spoken_languages|         poster_path|                cast|cast_size|       director|crew_size|
+------+--------------------+--------------------+------------+--------------------+---------------------+-----------------+-----------+------------+--------------------+--------------------+----------+------------+----------+-------+--------------------+--------------------+--------------

# Exploratory Data Analysis

Identifying the Best and Worst Performing Movies based on key metrics

Creating a UDF to streamline ranking operations

In [5]:
def rank_movies(df, column, ascending=False, filter_condition=None, n=10):
    """
    Rank movies based on a column with optional filtering
    """
    if filter_condition:
        filtered_data = df.filter(filter_condition)
    else:
        filtered_data= df

    if ascending:
        ranked_data = filtered_data.orderBy(col(column).asc())
    else:
        ranked_data = filtered_data.orderBy(col(column).desc())

    return ranked_data.limit(n)

Top Rated Movies based on Key Performance Indicators (KPIs)

In [8]:
# Displaying top 10 ranked movies based on Highest Revenue
# The UDF was called to perform this action

from pyspark.sql.functions import col
highest_revenue = rank_movies(df, "revenue_musd")
highest_revenue.select("title", "revenue_musd").show(10)

+--------------------+------------+
|               title|revenue_musd|
+--------------------+------------+
|              Avatar| 2923.706026|
|   Avengers: Endgame|   2799.4391|
|Star Wars: The Fo...| 2068.223624|
|Avengers: Infinit...| 2052.415039|
|      Jurassic World| 1671.537444|
|       The Lion King| 1662.020819|
|        The Avengers| 1518.815515|
|           Furious 7|      1515.4|
|           Frozen II| 1453.683476|
|Avengers: Age of ...| 1405.403694|
+--------------------+------------+



In [9]:
# Ranking movies based on Highest Budget
highest_budget = rank_movies(df, "budget_musd", ascending=False)
highest_budget.select("title", "budget_musd").show(10)

+--------------------+-----------+
|               title|budget_musd|
+--------------------+-----------+
|Avengers: Age of ...|        365|
|   Avengers: Endgame|        356|
|Avengers: Infinit...|        300|
|       The Lion King|        260|
|Star Wars: The Fo...|        245|
|              Avatar|        237|
|        The Avengers|        220|
|       Black Panther|        200|
|       Incredibles 2|        200|
|Star Wars: The La...|        200|
+--------------------+-----------+



In [10]:
# Ranked movies based on Highest Profit by deducting the budget amount from the Revenue generated.
df = df.withColumn("profit_musd", col("revenue_musd") - col("budget_musd"))
highest_profit = rank_movies(df, "profit_musd")
highest_profit.select("title", "profit_musd").show()

+--------------------+------------------+
|               title|       profit_musd|
+--------------------+------------------+
|              Avatar|       2686.706026|
|   Avengers: Endgame|         2443.4391|
|Star Wars: The Fo...|1823.2236240000002|
|Avengers: Infinit...|       1752.415039|
|      Jurassic World|       1521.537444|
|       The Lion King|       1402.020819|
|           Furious 7|            1325.4|
|           Frozen II|       1303.683476|
|        The Avengers|       1298.815515|
|Harry Potter and ...|       1216.511219|
+--------------------+------------------+



**Low Profit Movies:** The *rank_movies* function created above, orders the profit column in descending order, then shows the last 10 movie titles at the bottom.

In [11]:
df = df.withColumn("profit_musd", col("revenue_musd") - col("budget_musd"))
lowest_profit = rank_movies(df, "profit_musd", ascending=True)
lowest_profit.select("title", "profit_musd").show()

+--------------------+-----------+
|               title|profit_musd|
+--------------------+-----------+
|Avengers: Age of ...|1040.403694|
|       Incredibles 2|1042.805359|
|              Frozen|1124.219009|
|Star Wars: The La...| 1132.69883|
|Jurassic World: F...|1140.466296|
|       Black Panther|1149.926083|
|Harry Potter and ...|1216.511219|
|        The Avengers|1298.815515|
|           Frozen II|1303.683476|
|           Furious 7|     1325.4|
+--------------------+-----------+



ROI for each movie title is calculated, a filter is applied to the budget_musd column to calculate the ROI for movies whose budget are less or equal to 10M.

In [12]:
# Highest ROI
df.filter(col("budget_musd") >= 10).select(col("title"), (col("revenue_musd") / col("budget_musd")).alias("ROI")).orderBy(col("ROI").desc()).show(5)

+--------------------+-----------------+
|               title|              ROI|
+--------------------+-----------------+
|              Avatar|12.33631234599156|
|      Jurassic World|      11.14358296|
|Harry Potter and ...|     10.732089752|
|           Frozen II|9.691223173333332|
|              Frozen|8.494793393333333|
+--------------------+-----------------+
only showing top 5 rows



Same thing in the 'Highest ROI' code block is applied here, but the ROI column was ordered in ascending order in order to display only the ROI of movies with Budget ≥ 10M.

In [13]:
# Lowest ROI
df.filter(col("budget_musd") >= 10).select(col("title"), (col("revenue_musd") / col("budget_musd")).alias("ROI")).orderBy(col("ROI").asc()).show(5)

+--------------------+------------------+
|               title|               ROI|
+--------------------+------------------+
|Avengers: Age of ...| 3.850421079452055|
|       Incredibles 2|       6.214026795|
|       The Lion King|6.3923877653846155|
|Star Wars: The La...|        6.66349415|
|       Black Panther|       6.749630415|
+--------------------+------------------+
only showing top 5 rows



**Most Voted Movies**

In [14]:
# Most Voted Movies
most_voted = rank_movies(df, "vote_count")
most_voted.select("title", "vote_count").show()

+--------------------+----------+
|               title|vote_count|
+--------------------+----------+
|              Avatar|     32166|
|        The Avengers|     31639|
|Avengers: Infinit...|     30442|
|   Avengers: Endgame|     26258|
|Avengers: Age of ...|     23374|
|       Black Panther|     22516|
|Harry Potter and ...|     20978|
|      Jurassic World|     20652|
|Star Wars: The Fo...|     19699|
|              Frozen|     16819|
+--------------------+----------+



In [15]:
# Highest Rated Movies
highest_rated = rank_movies(df, "vote_average", ascending=False)
highest_rated.select("title", "vote_average").show()

+--------------------+------------+
|               title|vote_average|
+--------------------+------------+
|   Avengers: Endgame|       8.237|
|Avengers: Infinit...|       8.235|
|Harry Potter and ...|       8.087|
|        The Avengers|       7.741|
|              Avatar|       7.588|
|       Incredibles 2|       7.455|
|       Black Panther|       7.373|
|Avengers: Age of ...|       7.271|
|Star Wars: The Fo...|       7.261|
|           Frozen II|       7.249|
+--------------------+------------+



In [16]:
# Lowest Rated movies
lowest_rated = rank_movies(df, "vote_average", ascending=True)
lowest_rated.select("title", "vote_average").show()

+--------------------+------------+
|               title|vote_average|
+--------------------+------------+
|Jurassic World: F...|       6.538|
|      Jurassic World|       6.693|
|Star Wars: The La...|       6.778|
|       The Lion King|        7.11|
|           Furious 7|       7.226|
|              Frozen|       7.247|
|           Frozen II|       7.249|
|Star Wars: The Fo...|       7.261|
|Avengers: Age of ...|       7.271|
|       Black Panther|       7.373|
+--------------------+------------+



In [17]:
# Most Popular Movies
most_popular = rank_movies(df, "popularity")
most_popular.select("title", "popularity").show()

+--------------------+----------+
|               title|popularity|
+--------------------+----------+
|Avengers: Infinit...|  110.6769|
|       The Lion King|   79.3952|
|Star Wars: The La...|   74.7713|
|       Black Panther|   72.9206|
|   Avengers: Endgame|   63.2407|
|Jurassic World: F...|    57.162|
|        The Avengers|   36.0158|
|              Avatar|   27.7054|
|Avengers: Age of ...|   24.9273|
|              Frozen|   20.8509|
+--------------------+----------+





---



# Advanced Movies Filtering and Search Queries

Filtering the dataset based on specific queries

* Search 1

In [18]:
sci_fi_action_bruce = df.filter(col("genres").like("Science Fiction") &
                                  col("genres").like("%Action%") &
                                  col("cast").like("%Bruce Willis%")
                                  ).orderBy(col("vote_average").desc())
sci_fi_action_bruce.select("title", "vote_average", "genres", "cast").show(truncate=False)

+-----+------------+------+----+
|title|vote_average|genres|cast|
+-----+------------+------+----+
+-----+------------+------+----+



* Search 2

In [19]:
uma_quentin_movies = df.filter((col("cast").like("Uma Thurman")) & (col("director") == "Quentin Tarantino")).orderBy(col("runtime").asc())
uma_quentin_movies.select("title", "runtime").show()

+-----+-------+
|title|runtime|
+-----+-------+
+-----+-------+



Franchise vs. Standalone Movie Performance

* Mean Revenue

In [20]:
# Comparing movie franchises in terms of Mean Revenue
from pyspark.sql.functions import mean, col, regexp_replace, count

# Calculating the mean revenue for movies in franchises
franchise_revenue = df.filter(col("belongs_to_collection").isNotNull()).select(mean("revenue_musd")).collect()[0][0]

# Calculating the mean revenue for standalone movies
standalone_revenue = df.filter(col("belongs_to_collection").isNull()).select(mean("revenue_musd")).collect()[0][0]

print(f"Mean revenue for movies in franchises: {franchise_revenue}")
print(f"Mean revenue for standalone movies: {standalone_revenue}")

Mean revenue for movies in franchises: 1682.6419708124995
Mean revenue for standalone movies: None


* Median ROI

In [21]:
from pyspark.sql.functions import when, lit, median

df = df.withColumn("ROI", when(col("budget_musd") > 0, (col("revenue_musd") - col("budget_musd")) / col("budget_musd")).otherwise(None))

df = df.withColumn("movie_type", when(col("belongs_to_collection").isNotNull(), lit("Franchise")).otherwise(lit("Standalone")))

median_roi = df.groupBy("movie_type").agg(median("ROI").alias("Median ROI"))
median_roi.show()

+----------+-----------------+
|movie_type|       Median ROI|
+----------+-----------------+
| Franchise|6.786109124058163|
+----------+-----------------+



* Mean Budget Raised

In [22]:
franchise_budget = df.filter(col("belongs_to_collection").isNotNull()).select(mean("budget_musd")).collect()[0][0]

standalone_budget = df.filter(col("belongs_to_collection").isNull()).select(mean("budget_musd")).collect()[0][0]

print(f"Mean budget for movies in franchises: {franchise_budget}")
print(f"Mean budget for standalone movies: {standalone_budget}")

Mean budget for movies in franchises: 219.875
Mean budget for standalone movies: None


* Mean Popularity

In [23]:
franchise_popularity = df.filter(col("belongs_to_collection").isNotNull()).select(mean("popularity")).collect()[0][0]

standalone_popularity = df.filter(col("belongs_to_collection").isNull()).select(mean("popularity")).collect()[0][0]

print(f"Mean popularity for movies in franchises: {franchise_popularity}")
print(f"Mean popularity for standalone movies: {standalone_popularity}")

Mean popularity for movies in franchises: 41.2870125
Mean popularity for standalone movies: None


* Mean Rating

In [24]:
franchise_rating = df.filter(col("belongs_to_collection").isNotNull()).select(mean("vote_average")).collect()[0][0]

standalone_rating = df.filter(col("belongs_to_collection").isNull()).select(mean("vote_average")).collect()[0][0]

print(f"Mean rating for movies in franchises: {franchise_rating}")
print(f"Mean rating for standalone movies: {standalone_rating}")

Mean rating for movies in franchises: 7.3805625
Mean rating for standalone movies: None




---



# Most Successful Franchises and Directors

* Most Successful Movie Franchses based on Total number of movies in franchise

In [25]:
exploded_df = df.withColumn("collection_name", regexp_replace(col("belongs_to_collection"), "[^a-zA-Z0-9\\s]", "")).select("title", "collection_name")
franchise_counts = exploded_df.groupBy("collection_name").agg(count("*").alias("movie_count"))

most_successful_franchises = franchise_counts.orderBy(col("movie_count").desc())
most_successful_franchises.show()

+--------------------+-----------+
|     collection_name|movie_count|
+--------------------+-----------+
|The Avengers Coll...|          4|
|   Frozen Collection|          2|
|Jurassic Park Col...|          2|
|Star Wars Collection|          2|
|The Fast and the ...|          1|
|Harry Potter Coll...|          1|
|Black Panther Col...|          1|
|The Lion King Reb...|          1|
|   Avatar Collection|          1|
|The Incredibles C...|          1|
+--------------------+-----------+



* Most Successful Movie Franchses based on Total and mean Budget

In [27]:
from pyspark.sql.functions import col, sum, mean

franchise_budget = df.groupBy("belongs_to_collection").agg(
    sum("budget_musd").alias("Total Budget"), mean("budget_musd").alias("Mean Budget"))
franchise_budget = franchise_budget.orderBy(col("Total Budget").desc())
franchise_budget.show(10)

+---------------------+------------+-----------+
|belongs_to_collection|Total Budget|Mean Budget|
+---------------------+------------+-----------+
| The Avengers Coll...|        1241|     310.25|
| Star Wars Collection|         445|      222.5|
| Jurassic Park Col...|         320|      160.0|
|    Frozen Collection|         300|      150.0|
| The Lion King (Re...|         260|      260.0|
|    Avatar Collection|         237|      237.0|
| Black Panther Col...|         200|      200.0|
| The Incredibles C...|         200|      200.0|
| The Fast and the ...|         190|      190.0|
| Harry Potter Coll...|         125|      125.0|
+---------------------+------------+-----------+



* Total and Mean Revenue

In [29]:
franchise_revenue = df.groupBy("belongs_to_collection").agg(
    sum("revenue_musd").alias("Total Revenue"), mean("revenue_musd").alias("Mean Revenue"))
franchise_revenue = franchise_revenue.orderBy(col("Total Revenue").desc())
franchise_revenue.show(10)

+---------------------+------------------+------------------+
|belongs_to_collection|     Total Revenue|      Mean Revenue|
+---------------------+------------------+------------------+
| The Avengers Coll...|       7776.073348|       1944.018337|
| Star Wars Collection|3400.9224540000005|1700.4612270000002|
| Jurassic Park Col...|        2982.00374|        1491.00187|
|    Avatar Collection|       2923.706026|       2923.706026|
|    Frozen Collection|2727.9024849999996|1363.9512424999998|
| The Lion King (Re...|       1662.020819|       1662.020819|
| The Fast and the ...|            1515.4|            1515.4|
| Black Panther Col...|       1349.926083|       1349.926083|
| Harry Potter Coll...|       1341.511219|       1341.511219|
| The Incredibles C...|       1242.805359|       1242.805359|
+---------------------+------------------+------------------+



* Mean Rating

In [30]:
mean_rating = df.groupBy("movie_type").agg(mean("vote_average").alias("Mean Rating"))
mean_rating.show()

+----------+-----------+
|movie_type|Mean Rating|
+----------+-----------+
| Franchise|  7.3805625|
+----------+-----------+



Most Successful Directors based on Total Number of Movies Directed

In [31]:
director_movie_counts = df.groupBy("director").agg(count("*").alias("movie_count"))
most_successful_directors = director_movie_counts.orderBy(col("movie_count").desc())
most_successful_directors.show()

+---------------+-----------+
|       director|movie_count|
+---------------+-----------+
|      Joe Russo|          2|
|    Joss Whedon|          2|
|Colin Trevorrow|          1|
|   Ryan Coogler|          1|
|    Jon Favreau|          1|
|    J.A. Bayona|          1|
|    David Yates|          1|
|    J.J. Abrams|          1|
|     Chris Buck|          1|
|      James Wan|          1|
|   Jennifer Lee|          1|
|      Brad Bird|          1|
|  James Cameron|          1|
|   Rian Johnson|          1|
+---------------+-----------+



* Most Successful Directors based on Total Revenue

In [32]:
director_revenue = df.groupBy("director").agg(sum("revenue_musd").alias("Total Revenue"))
most_successful_directors_revenue = director_revenue.orderBy(col("Total Revenue").desc())
most_successful_directors_revenue.show()

+---------------+-------------+
|       director|Total Revenue|
+---------------+-------------+
|      Joe Russo|  4851.854139|
|    Joss Whedon|  2924.219209|
|  James Cameron|  2923.706026|
|    J.J. Abrams|  2068.223624|
|Colin Trevorrow|  1671.537444|
|    Jon Favreau|  1662.020819|
|      James Wan|       1515.4|
|   Jennifer Lee|  1453.683476|
|   Ryan Coogler|  1349.926083|
|    David Yates|  1341.511219|
|   Rian Johnson|   1332.69883|
|    J.A. Bayona|  1310.466296|
|     Chris Buck|  1274.219009|
|      Brad Bird|  1242.805359|
+---------------+-------------+



*  Meam Rating

In [33]:
director_rating = df.groupBy("director").agg(mean("vote_average").alias("Mean Rating"))
most_successful_directors_rating = director_rating.orderBy(col("Mean Rating").desc())
most_successful_directors_rating.show()

+---------------+-----------+
|       director|Mean Rating|
+---------------+-----------+
|      Joe Russo|      8.236|
|    David Yates|      8.087|
|  James Cameron|      7.588|
|    Joss Whedon|      7.506|
|      Brad Bird|      7.455|
|   Ryan Coogler|      7.373|
|    J.J. Abrams|      7.261|
|   Jennifer Lee|      7.249|
|     Chris Buck|      7.247|
|      James Wan|      7.226|
|    Jon Favreau|       7.11|
|   Rian Johnson|      6.778|
|Colin Trevorrow|      6.693|
|    J.A. Bayona|      6.538|
+---------------+-----------+





---

