# ⚙️ TMDB Data Extractor Notebook Overview
This notebook extracts and processes TMDB data from JSON files, keeping only relevant fields.

## 📂 Steps in This Notebook

⓵ **Extract TMDB Data from JSON** → Keeps these keys:  
   `budget`, `genres`, `id`, `imdb_id`, `origin_country`, `original_language`, `original_title`,  
   `production_companies`, `production_countries`, `revenue`, `title`, `credits`, `keywords`, `actors_characters`, `belongs_to_collection`.

⓶ **Parse JSON Files** → Functions to extract:
   - **Genres, keywords, producers, credits, cast & characters**.

⓷ **Iterate Through Files** → Reads all JSON files in `subgenre_titles_data/` and extracts structured data.

⓸ **Save to CSV** → Exports final parsed data to `parsed_json_data.csv`.

---

### EXTRACT TMDB DATA FROM JSON FILES

In [2]:
TMDB_folder="../data/IMDb-subgenre_titles_data"
import json

# Load and inspect a sample TMDB JSON file
sample_tconst = "tt0065377"  # Change to any existing tconst in your dataset
file_path = f"{TMDB_folder}/{sample_tconst}.json"  # Update with your folder path

# Open and read the JSON file
with open(file_path, "r", encoding="utf-8") as f:
    tmdb_data = json.load(f)

# Print full JSON structure (or just keys for a quick view)
#print(json.dumps(tmdb_data, indent=4))  # Pretty-print entire JSON
print(tmdb_data.keys())  # Uncomment to print only top-level keys
keep_keys = ['budget', 'genres', 'id', 'imdb_id', 'origin_country', 'original_language', 'original_title',
             'production_companies', 'production_countries', 'revenue', 'title','credits', 'keywords', 
             'actors_characters', 'belongs_to_collection']

dict_keys(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'origin_country', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'release_dates', 'credits', 'keywords', 'actors_characters'])


In [9]:
import os
sample_tconst = "tt0065377"
TMDB_JSON_FOLDER = "subgenre_titles_data"
file_path = os.path.join(TMDB_JSON_FOLDER, f"{sample_tconst}.json")
print(os.path.exists(file_path))  # Should return True

False


In [10]:
# parse data
TMDB_JSON_FOLDER = TMDB_folder

import os
import json
import pandas as pd

# Path where TMDB JSON files are stored
# TMDB Fields to extract
TOP_LEVEL_KEYS = [
    'budget', 'id', 'imdb_id', 'origin_country',
    'original_language', 'original_title', 'revenue', 'title'
]

def extract_keywords(data):
    """Extract 'name' values from the 'keywords' field."""
    return [kw['name'] for kw in data.get('keywords', {}).get('keywords', [])]

def extract_genres(data):
    """Extract genre names from the 'genres' field."""
    return [genre['name'] for genre in data.get('genres', [])]

def extract_producers(data):
    """Extract names of production companies from the TMDB JSON."""
    return [company['name'] for company in data.get('production_companies', [])]

def extract_credits(data):
    """Extract relevant cast and crew members from 'credits'."""
    extracted_data = {
        'cast': [],
        'crew': []
    }

    if 'credits' in data:
        # Extract cast details with additional fields
        if 'cast' in data['credits']:
            extracted_data['cast'] = [
                {
                    'name': member.get('name', 'Not Found'),
                    'order': member.get('order', 'Not Found'),
                    'character': member.get('character', 'Not Found'),
                    'gender': member.get('gender', 'Not Found'),
                    'id': member.get('id', 'Not Found'),
                    'popularity': member.get('popularity', 'Not Found'),
                    'credit_id': member.get('credit_id', 'Not Found'),
                    'adult': member.get('adult', 'Not Found')
                }
                for member in data['credits']['cast']
            ]

        # Extract specific crew members with additional fields
        if 'crew' in data['credits']:
            extracted_data['crew'] = [
                {
                    'name': member.get('name', 'Not Found'),
                    'job': member.get('job', 'Not Found'),
                    'gender': member.get('gender', 'Not Found'),
                    'id': member.get('id', 'Not Found'),
                    'popularity': member.get('popularity', 'Not Found'),
                    'credit_id': member.get('credit_id', 'Not Found')
                }
                for member in data['credits']['crew']
                if member.get('job') in ['Director', 'Producer', 'Director of Photography']
            ]

    return extracted_data

def parse_tmdb_json(tconst):
    """Parse a single TMDB JSON file and extract necessary fields."""
    file_path = os.path.join(TMDB_JSON_FOLDER, f"{tconst}.json")

    if not os.path.exists(file_path):
        return {
            **{key: "Not Found" for key in TOP_LEVEL_KEYS},
            "keywords": "Not Found",
            "genres": "Not Found",
            "production_companies": "Not Found",
            "cast": "Not Found",
            "crew": "Not Found",
            "franchise": "Not Found"
        }

    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Extract data
    parsed_data = {key: data.get(key, "Not Found") for key in TOP_LEVEL_KEYS}
    parsed_data["keywords"] = extract_keywords(data)
    parsed_data["production_companies"] = extract_producers(data)
    credits_data = extract_credits(data)
    parsed_data["genres"] = extract_genres(data)
    parsed_data["cast"] = credits_data["cast"]
    parsed_data["crew"] = credits_data["crew"]

    # ✅ Add franchise flag
    parsed_data["franchise"] = data.get("belongs_to_collection") is not None

    return parsed_data


# Example usage
parse_tmdb_json(sample_tconst)


{'budget': 10000000,
 'id': 10671,
 'imdb_id': 'tt0065377',
 'origin_country': ['US'],
 'original_language': 'en',
 'original_title': 'Airport',
 'revenue': 100489151,
 'title': 'Airport',
 'keywords': ['airplane',
  'airport',
  'based on novel or book',
  'panic',
  'bomb',
  'stowaway',
  'snowstorm',
  'landing',
  'deception',
  'blizzard',
  'disaster movie',
  'suicide bomber',
  'snowplow'],
 'production_companies': ['Ross Hunter Productions'],
 'genres': ['Action', 'Drama', 'Thriller'],
 'cast': [{'name': 'Burt Lancaster',
   'order': 0,
   'character': 'Mel Bakersfeld',
   'gender': 2,
   'id': 13784,
   'popularity': 0.059,
   'credit_id': '52fe439f9251416c75017aa5',
   'adult': False},
  {'name': 'Dana Wynter',
   'order': 1,
   'character': 'Cindy Bakersfeld',
   'gender': 1,
   'id': 69810,
   'popularity': 0.037,
   'credit_id': '52fe439f9251416c75017af9',
   'adult': False},
  {'name': 'Dean Martin',
   'order': 2,
   'character': 'Vernon Demerest',
   'gender': 2,
   '

### ITERATE THROUGH FILES

In [11]:
import os
import pandas as pd

def parse_all_files(folder_path):
    """Parse all TMDB JSON files in a folder and return a DataFrame."""
    tmdb_data_list = []
    json_files = [f for f in os.listdir(folder_path) if f.endswith(".json")]

    for i, filename in enumerate(json_files):
        tconst = filename.replace(".json", "")  # Extract IMDb ID from filename
        
        if i % 500 == 0:
            print(f"Processing {i}/{len(json_files)}...")

        tmdb_data = parse_tmdb_json(tconst)  # Use the parsing function
        tmdb_data["tconst"] = tconst  # Store the tconst for merging
        tmdb_data_list.append(tmdb_data)

    # Convert extracted TMDB data to DataFrame
    df_tmdb = pd.DataFrame(tmdb_data_list)
    df_tmdb = df_tmdb.drop(columns=["imdb_id", "id"])


    return df_tmdb

# Run parsing on all JSON files in the folder
df_tmdb = parse_all_files(TMDB_JSON_FOLDER)

Processing 0/11072...
Processing 500/11072...
Processing 1000/11072...
Processing 1500/11072...
Processing 2000/11072...
Processing 2500/11072...
Processing 3000/11072...
Processing 3500/11072...
Processing 4000/11072...
Processing 4500/11072...
Processing 5000/11072...
Processing 5500/11072...
Processing 6000/11072...
Processing 6500/11072...
Processing 7000/11072...
Processing 7500/11072...
Processing 8000/11072...
Processing 8500/11072...
Processing 9000/11072...
Processing 9500/11072...
Processing 10000/11072...
Processing 10500/11072...
Processing 11000/11072...


### SAVE AS JSON

In [12]:
df_tmdb.to_csv("../data/cache/parsed_json_data.csv", index=False)

df_tmdb.head()

Unnamed: 0,budget,origin_country,original_language,original_title,revenue,title,keywords,production_companies,genres,cast,crew,franchise,tconst
0,48000000,[US],en,Kate & Leopold,76019048,Kate & Leopold,"[new york city, time travel, duke, fish out of...","[Konrad Pictures, Miramax]","[Romance, Comedy, Fantasy]","[{'name': 'Meg Ryan', 'order': 0, 'character':...","[{'name': 'James Mangold', 'job': 'Director', ...",False,tt0035423
1,0,[FR],fr,Le Boucher,0,The Butcher,"[vietnam veteran, province, butcher, murder, f...","[Les Films La Boétie, Euro International Films]","[Thriller, Crime, Drama]","[{'name': 'Stéphane Audran', 'order': 0, 'char...","[{'name': 'Claude Chabrol', 'job': 'Director',...",False,tt0064106
2,0,[US],en,The Honeymoon Killers,0,The Honeymoon Killers,"[nurse, widow, alabama, lonely hearts ad, base...","[Roxanne Company, American International Pictu...","[Crime, Drama, Romance, Thriller]","[{'name': 'Shirley Stoler', 'order': 0, 'chara...","[{'name': 'Leonard Kastle', 'job': 'Director',...",False,tt0064437
3,0,[GB],en,I Start Counting,0,I Start Counting,"[exploitation, stalker, serial killer, crying,...","[Triumvirate Films, United Artists]","[Thriller, Drama]","[{'name': 'Jenny Agutter', 'order': 0, 'charac...","[{'name': 'David Greene', 'job': 'Director', '...",False,tt0064462
4,0,[XC],cs,Kladivo na čarodějnice,0,Witchhammer,"[witch, based on novel or book, witch burning,...",[Filmové studio Barrandov],"[Drama, Thriller]","[{'name': 'Elo Romančík', 'order': 0, 'charact...","[{'name': 'Otakar Vávra', 'job': 'Director', '...",False,tt0064546
