## 1. Initialize Spark Session

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import requests
import os
from dotenv import load_dotenv

In [15]:
# Configure Spark with increased memory

spark = SparkSession.builder.appName("TMDB analysis").getOrCreate()


## 2. Data Fetching (Adapted for Spark)


In [19]:
def fetch_movie_data_spark(movie_ids):
    """Fetch data and return as Spark DataFrame"""
    load_dotenv()
    api_key = os.getenv('api_key')
    if not api_key:
        raise ValueError("API_KEY environment variable not set")
    
    base_url = "https://api.themoviedb.org/3/movie/{}?api_key={}&append_to_response=credits"
    
    # Fetch data using Python requests (single-node operation)
    movies = []
    for mid in movie_ids:
        try:
            response = requests.get(base_url.format(mid, api_key), timeout=5)
            movies.append(response.json())
        except Exception as e:
            print(f"Failed to fetch movie {mid}: {e}, {response.status_code}, {response.json()}")
    
    # Convert to Spark DataFrame
    return spark.createDataFrame(movies)

movie_ids = [0, 299534, 19995, 140607, 299536, 597, 135397, 
             420818, 24428, 168259, 99861, 284054, 12445,
             181808, 330457, 351286, 109445, 321612, 260513]

raw_df = fetch_movie_data_spark(movie_ids)
raw_df.cache()  # Cache for multiple operations
print(f"Fetched {raw_df.count()} movies")

Fetched 19 movies


In [17]:
raw_df.show()

+-----------+--------------------+-------+-----+--------------------+---------------------+---------+--------------------+--------------------+--------------------+------+---------+--------------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+----------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+
|status_code|      status_message|success|adult|       backdrop_path|belongs_to_collection|   budget|             credits|              genres|            homepage|    id|  imdb_id|origin_country|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|   revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|
+-----------+--------------------+-------+-----+--------------------+-

In [18]:
raw_df.select('title').show()

+--------------------+
|               title|
+--------------------+
|                NULL|
|   Avengers: Endgame|
|              Avatar|
|Star Wars: The Fo...|
|Avengers: Infinit...|
|             Titanic|
|      Jurassic World|
|       The Lion King|
|        The Avengers|
|           Furious 7|
|Avengers: Age of ...|
|       Black Panther|
|Harry Potter and ...|
|Star Wars: The La...|
|           Frozen II|
|Jurassic World: F...|
|              Frozen|
|Beauty and the Beast|
|       Incredibles 2|
+--------------------+

