In [1]:
import json
import requests
import pandas as pd
import os 
from bs4 import BeautifulSoup

In [2]:
def _extract_span_texts(soup, cell_class: str) -> list:
    """Return the cleaned text of every <span> inside the div with class ``cell_class``.

    Commas are removed and surrounding whitespace is stripped from each span's
    text. An empty list is returned when no such div exists in ``soup``.
    """
    cell_div = soup.find("div", class_=cell_class)
    if cell_div is None:
        return []
    return [span.text.replace(',', '').strip() for span in cell_div.find_all('span')]


def get_titles_info(title_id: str):
    """Scrape the public Netflix title page for ``title_id``.

    Args:
        title_id: Netflix title id, interpolated into
            ``https://www.netflix.com/title/{title_id}``.

    Returns:
        dict with the keys ``title`` and ``overview`` (strings, ``""`` when the
        element is not found) and ``genres``, ``cast``, ``content_is``,
        ``subtitles`` and ``audio`` (lists of strings, ``[]`` when the
        corresponding cell is not found).

    Note:
        The HTTP status of the response is not checked; whatever body comes
        back is parsed, and missing elements simply leave the defaults in place.
    """
    request_url = f"https://www.netflix.com/title/{title_id}"
    response = requests.get(request_url)

    soup = BeautifulSoup(response.text, 'html.parser')

    title_h1 = soup.find("h1", class_="title-title")
    overview_div = soup.find("div", class_="title-info-synopsis")

    # Each list field is looked up independently. In particular "audio" is
    # guarded by its own cell being present (not by the subtitles cell), so a
    # page with only one of the two cells neither crashes nor loses data.
    return {
        "title": title_h1.text if title_h1 is not None else "",
        "overview": overview_div.text if overview_div is not None else "",
        "genres": _extract_span_texts(soup, "more-details-cell cell-genres"),
        "cast": _extract_span_texts(soup, "more-details-cell cell-cast"),
        "content_is": _extract_span_texts(soup, "more-details-cell cell-mood-tag"),
        "subtitles": _extract_span_texts(soup, "more-details-cell cell-subtitle"),
        "audio": _extract_span_texts(soup, "more-details-cell cell-audio"),
    }

In [3]:
def request_netflix_titles(
        params,
        cookies, 
        headers,
        genre_id = "34399",
        from_ix = 0,
        to_ix = 100,
    ):
    """POST a pathEvaluator query for one alphabetical slice of a genre.

    Two paths are requested for the ``from_ix``..``to_ix`` range of the
    genre's "az" listing: the ``itemSummary`` of each entry and a set of
    ``reference`` fields (availability, episodeCount, inRemindMeList, queue,
    summary).

    Args:
        params: query-string parameters forwarded to ``requests.post``.
        cookies: cookies forwarded to ``requests.post``.
        headers: HTTP headers forwarded to ``requests.post``.
        genre_id: genre id interpolated into both query paths.
        from_ix: first index of the requested range.
        to_ix: last index of the requested range.

    Returns:
        The ``requests`` response object, unmodified. The status code is
        printed but not checked here; callers decide what to do with it.
    """
    endpoint_url = 'https://www.netflix.com/nq/website/memberapi/vc63e5850/pathEvaluator'

    # Form payload: both paths are sent under the single "path" field.
    form_payload = {
        'path': [
            f'["genres",{genre_id},"az",{{"from":{from_ix},"to":{to_ix}}},"itemSummary"]',
            f'["genres",{genre_id},"az",{{"from":{from_ix},"to":{to_ix}}},"reference",["availability","episodeCount","inRemindMeList","queue","summary"]]',
        ]
    }

    response = requests.post(
        endpoint_url,
        params=params,
        cookies=cookies,
        headers=headers,
        data=form_payload,
    )
    print(f"fetch status code --> {response.status_code}")
    return response



In [4]:
def process_titles_data(genre_id, summary_data):
    """Flatten a pathEvaluator response into one record per title.

    Walks ``summary_data["jsonGraph"]["genres"][genre_id]["az"]`` in order.
    For every entry the API summary fields are combined with the fields
    scraped from the title page by ``get_titles_info``.

    Args:
        genre_id: key of the genre inside ``summary_data["jsonGraph"]["genres"]``.
        summary_data: decoded JSON body of the pathEvaluator response.

    Returns:
        list of dicts, one per processed title. Processing stops at the first
        entry whose ``itemSummary`` has no (truthy) ``value``.

    Raises:
        KeyError: if the response lacks the expected nesting or a title
            summary lacks one of the required fields.
    """
    title_info_list = []

    az_entries = summary_data["jsonGraph"]["genres"][genre_id]['az']
    for ix, item_summ in az_entries.items():
        item_summ_val = item_summ["itemSummary"].get("value")
        if not item_summ_val:
            print("No data is present, stopping here")
            break

        title_id = str(item_summ_val["id"])
        print(f"processing title {int(ix)+1}: id -> {item_summ_val['id']}")

        misc_title_info_dict = get_titles_info(title_id)

        rating = item_summ_val["maturity"]["rating"]
        title_info_dict = {
            "id": title_id,
            "title": item_summ_val["title"],
            "type": item_summ_val["type"],
            "release_year": item_summ_val["releaseYear"],
            "runtime": item_summ_val.get("infoDensityRuntime"),
            "backdrop_path": item_summ_val["boxArt"]["url"],
            "made_by_netflix": item_summ_val["isOriginal"],
            "maturity_rating": rating["value"],
            "maturity_warning": rating["maturityDescription"],
            "maturity_rating_reason": rating["specificRatingReason"].split(', '),
            "maturity_rating_board": rating["board"],
        }

        # Merge the scraped fields on top of the API fields. The scraper
        # returns empty defaults ("" / []) when an element is missing from the
        # page, so an empty scraped value must not wipe out a field the API
        # already supplied (notably "title").
        for key, scraped_value in misc_title_info_dict.items():
            if key in title_info_dict and not scraped_value:
                continue
            title_info_dict[key] = scraped_value

        title_info_list.append(title_info_dict)

    return title_info_list


In [7]:
def main():
    """Fetch one slice of a genre's titles, scrape each title and save the result.

    The Netflix session cookies are read from the ``NETFLIX_ID`` and
    ``NETFLIX_SECURE_ID`` environment variables rather than being stored in
    the notebook, so that credentials are never committed with the code.

    On success the records are written to
    ``csv_files/netflix_movies_from_{from_ix}to{to_ix}.csv`` and to a ``.json``
    file with the same stem.

    Returns:
        The saved ``pandas.DataFrame``, or ``None`` when the credentials are
        missing, the request did not return HTTP 200, or there were no records.
    """
    from_ix = 1001
    to_ix = 2000
    genre_id = "34399"

    # Session cookies are secrets: take them from the environment.
    netflix_id = os.environ.get("NETFLIX_ID")
    secure_netflix_id = os.environ.get("NETFLIX_SECURE_ID")
    if not netflix_id or not secure_netflix_id:
        print("missing Netflix session cookies. Set the NETFLIX_ID and NETFLIX_SECURE_ID environment variables")
        return None

    params = {
        'webp': 'true',
        'drmSystem': 'widevine',
        'isVolatileBillboardsEnabled': 'true',
        'routeAPIRequestsThroughFTL': 'false',
        'hasVideoMerchInBob': 'true',
        'hasVideoMerchInJaw': 'true',
        'falcor_server': '0.1.0',
        'withSize': 'true',
        'materialize': 'true',
        'original_path': '/shakti/mre/pathEvaluator',
    }

    cookies = {
        'SecureNetflixId': secure_netflix_id,
        'NetflixId': netflix_id,
        'profilesNewSession': '0'
    }

    headers = {
        'authority': 'www.netflix.com',
        # "*/*" is the HTTP wildcard media range; a bare "/" is not a valid value.
        'accept': '*/*',
        'accept-language': 'en-GB,en;q=0.7',
        'content-type': 'application/x-www-form-urlencoded',
    }

    # Pass genre_id explicitly so the request and the response parsing below
    # always refer to the same genre.
    response = request_netflix_titles(
        params=params,
        cookies=cookies,
        headers=headers,
        genre_id=genre_id,
        from_ix=from_ix,
        to_ix=to_ix,
    )

    if response.status_code != 200:
        print("failed to get data. Input correct Netflix Id or check on with correct genre_id and index")
        return None

    summary_data = response.json()
    title_info_list = process_titles_data(genre_id, summary_data)
    df = pd.DataFrame.from_records(title_info_list)

    if df.empty:
        print("nothing to save...")
        return None

    csv_foldername = "csv_files"
    os.makedirs(csv_foldername, exist_ok=True)

    csv_filename = os.path.join(
        csv_foldername,
        f"netflix_movies_from_{from_ix}to{to_ix}.csv"
    )
    # Swap only the file extension; a plain str.replace would also touch a
    # ".csv" occurring elsewhere in the path.
    json_filename = os.path.splitext(csv_filename)[0] + ".json"

    with open(json_filename, 'w', encoding="utf-8") as fi:
        json.dump(title_info_list, fi)

    df.to_csv(csv_filename, index=False)
    print(f"CSV file: {csv_filename} saved successfully!")
    return df

In [None]:
# Run the scrape. `df` is the DataFrame that main() saved to disk, or None
# when main() had nothing to save or the request failed.
df = main()

fetch status code --> 200
processing title 1002: id -> 80082739
processing title 1003: id -> 80131194
processing title 1004: id -> 80200957
processing title 1005: id -> 80013872
processing title 1006: id -> 81713120
processing title 1007: id -> 81023618
processing title 1008: id -> 81329041
processing title 1009: id -> 81591165
processing title 1010: id -> 70295741
processing title 1011: id -> 81050375
processing title 1012: id -> 81026327
processing title 1013: id -> 81370442
processing title 1014: id -> 81678777
processing title 1015: id -> 60033290
processing title 1016: id -> 81341143
processing title 1017: id -> 80117458
processing title 1018: id -> 80067522
processing title 1019: id -> 81599072
processing title 1020: id -> 80240904
processing title 1021: id -> 80099083
processing title 1022: id -> 81405851
processing title 1023: id -> 81270771
processing title 1024: id -> 80234491
processing title 1025: id -> 81450779
processing title 1026: id -> 81154456
processing title 1027: i