In [1]:
import json
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# Search-results URL that every scraping pass starts from, written one query
# parameter per line so the individual filters are easy to read and edit.
parent_url = (
    "https://www.imdb.com/search/title/"
    "?title_type=feature,tv_movie"
    "&release_date=2024-01-01,2024-12-31"
    "&country_of_origin=IN"
    "&primary_language=hi"
    "&adult=include"
    "&sort=release_date,asc"
)

In [3]:
def driver_initialize():
    """Start a Chrome WebDriver session, maximize its window and return it."""
    browser = webdriver.Chrome()
    browser.maximize_window()
    return browser

In [4]:
# Start the shared browser session. get_soup() and button_click() read this
# module-level `driver` directly rather than taking it as a parameter.
driver = driver_initialize()

In [5]:
def get_soup(timeout=30):
    """Download and parse the page the browser is currently showing.

    Only the URL is taken from the module-level Selenium ``driver``; the HTML
    is fetched separately with ``requests`` using a desktop-Chrome User-Agent
    header, so the result is the server's response for that URL rather than
    the browser's live DOM.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection,
            which would freeze the whole notebook run.

    Returns:
        A ``BeautifulSoup`` document built with the ``html.parser`` backend.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out (the exception is not handled here).
    """
    current_url = driver.current_url
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'}
    response = requests.get(current_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [6]:
# Open the search-results listing in the browser.
driver.get(parent_url)
# Parsed copy of the listing page; get_soup() re-downloads the browser's
# current URL, so this is a one-off snapshot taken at this point.
main_soup = get_soup()
# Last whitespace-separated token of the first "div.fwjHEn" element's text,
# kept as a string. Raises IndexError if no element matches the selector.
# NOTE(review): presumably the result-count banner, and the class name looks
# auto-generated — confirm it still matches. The value is not referenced by
# any other cell visible here.
total_movies = main_soup.select("div.fwjHEn")[0].get_text(strip=True).split()[-1]

In [7]:
def button_click(button_name):
    """Click the element located by an XPath expression.

    Waits up to 10 seconds for the element to become clickable in the
    module-level ``driver`` and then clicks it through JavaScript.

    Args:
        button_name: XPath expression locating the element to click.

    Returns:
        True if the click was issued, False if locating or clicking the
        element failed. Existing callers that ignore the result are
        unaffected.
    """
    try:
        button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, button_name))
        )

        driver.execute_script("arguments[0].click()", button)

    except Exception as e:
        # Best-effort by design: report which locator failed and why, pause
        # briefly, and let the caller decide how to proceed.
        print(f"Element not found or clickable! ({type(e).__name__} for {button_name})")
        time.sleep(2)
        return False

    return True

In [8]:
def get_attributes(page_soup, element):
    """Return the stripped text of the first element matching a CSS selector.

    Args:
        page_soup: Parsed document exposing ``select(selector)``.
        element: CSS selector to look up.

    Returns:
        The text of the first match, or the string "NA" when nothing matches
        or the lookup fails with an ordinary exception.
    """
    try:
        attribute = page_soup.select(element)[0].get_text(strip=True)
    except Exception:
        # Keep the best-effort "NA" fallback, but unlike a bare ``except``
        # let KeyboardInterrupt/SystemExit through so a long scraping run
        # can still be interrupted.
        attribute = "NA"

    return attribute

In [9]:
def get_attribute_list(page_soup,element):
    """Return the distinct stripped texts of all elements matching a selector.

    Duplicates are removed while keeping the order in which the texts first
    appear in the document, so repeated runs over the same page give the same
    list. (De-duplicating through a ``set`` yields an arbitrary order that
    can differ between interpreter runs.)

    Args:
        page_soup: Parsed document exposing ``select(selector)``.
        element: CSS selector to look up.

    Returns:
        A list of unique strings in first-seen order; an empty list when
        nothing matches.
    """
    texts = (link.get_text(strip=True) for link in page_soup.select(element))
    # dict keys are unique and remember insertion order.
    return list(dict.fromkeys(texts))


In [10]:
def get_one_attribute(page_soup,element):
    """Return the stripped text of the single element matching a CSS selector.

    Args:
        page_soup: Parsed document exposing ``select_one(selector)``.
        element: CSS selector to look up.

    Returns:
        The matched element's text, or the string "NA" when nothing matches.
    """
    node = page_soup.select_one(element)
    # select_one signals "no match" by returning None, not by raising, so
    # the missing case has to be checked explicitly.
    if node is None:
        return "NA"

    return node.get_text(strip=True)

In [11]:
def get_script_attribute(page_soup,element):
    """Read one top-level key from the page's JSON-LD ``<script>`` block.

    Looks up the first ``<script type="application/ld+json">`` element,
    parses its content as JSON and returns the value stored under ``element``.

    Args:
        page_soup: Parsed document exposing ``find(name, **attrs)``.
        element: Key to read from the decoded JSON object.

    Returns:
        The value stored under ``element``, or the string "NA" when the
        script tag is missing, its content is empty or not valid JSON, the
        decoded document is not a JSON object, or the key is absent.
    """
    script_tag = page_soup.find('script', type='application/ld+json')
    if script_tag is None:
        return "NA"

    try:
        json_data = json.loads(script_tag.string)
    except (TypeError, ValueError):
        # TypeError: the tag has no single string child (``.string`` is None).
        # ValueError: the text is not valid JSON (JSONDecodeError subclasses it).
        return "NA"

    # A JSON-LD block may also decode to a list, which has no ``.get``.
    if not isinstance(json_data, dict):
        return "NA"

    return json_data.get(element, "NA")

In [12]:
# Scrape the detail page of the first nine listing entries into movie_list
# (one dict per movie). Entries whose listing tile contains the
# ".ipc-media--fallback" placeholder are skipped.
movie_list = []

# XPath of the listing's "see more" control.
see_more_button = "(//span[@class='ipc-see-more__text'])"

# main_soup does not change while the loop runs, so select the entries once
# instead of re-querying the document on every iteration.
thumbnails = main_soup.select("li.ipc-metadata-list-summary-item")

# Stop at the entries that were actually parsed rather than raising
# IndexError when the listing holds fewer than nine.
last_position = min(9, len(thumbnails))

for i in range(1, last_position + 1):
    if (i % 50 == 0):
        # NOTE(review): never true while at most nine entries are visited.
        # Presumably meant to load the next batch of results — confirm before
        # raising the limit, because main_soup is a one-off snapshot and the
        # listing is reloaded after every movie.
        button_click(see_more_button)

    thumbnail = thumbnails[i-1]

    if thumbnail.select_one(".ipc-media--fallback") is not None:
        print("Thumbnail not found!")
        continue

    # XPath positions are 1-based, so entry i is addressed with [i].
    movie_poster = f"(//a[@class='ipc-title-link-wrapper'])[{i}]"

    listing_url = driver.current_url
    button_click(movie_poster)

    # get_soup() downloads whatever URL the browser reports. If the click
    # failed or the navigation has not happened yet, that is still the
    # listing page and every field would be scraped from the wrong document,
    # so wait for the URL to change and skip the entry if it never does.
    try:
        WebDriverWait(driver, 10).until(EC.url_changes(listing_url))
    except TimeoutException:
        print(f"Movie page did not open for entry {i}; skipping.")
        continue

    sub_page_soup = get_soup()

    title = get_attributes(sub_page_soup, "span.hero__primary-text")

    # Read from the page's JSON-LD block rather than from visible markup.
    duration = get_script_attribute(sub_page_soup, "duration")

    imdb_rating = get_attributes(sub_page_soup, "span.imUuxf")

    directors = get_attribute_list(sub_page_soup, 'a[href*="ref_=tt_cst_dr_"]')

    writers = get_attribute_list(sub_page_soup, 'a[href*="ref_=tt_ov_wr_"]')

    cast = get_attribute_list(sub_page_soup, 'a.kVdWAO')

    genres = get_attribute_list(sub_page_soup, 'a[href*="ref_=tt_ov_in_"]')

    release_date = get_one_attribute(sub_page_soup, 'a.ipc-metadata-list-item__list-content-item[href*="ref_=tt_dt_rdat"]')

    languages = get_attribute_list(sub_page_soup, 'a[href*="ref_=tt_dt_ln"]')

    production_companies = get_attribute_list(sub_page_soup, 'a[href*="ref_=tt_dt_cmpy_"]')

    movie_details = {"title":title,
                    "duration":duration,
                    "imdb_rating":imdb_rating,
                    "directors":directors,
                    "writers":writers,
                    "cast":cast,
                    "genres":genres,
                    "release_date":release_date,
                    "languages":languages,
                    "production_companies":production_companies}

    movie_list.append(movie_details)

    # Short pause between movies, then return to the listing so the next
    # positional XPath is evaluated against the results page again.
    time.sleep(2)
    driver.get(parent_url)


Thumbnail not found!


In [19]:
# Build one row per scraped movie and write the table to disk. The bare
# `movie_list` expression that used to open this cell was removed: only a
# cell's last expression is displayed, so it had no effect.
movies_dataframe = pd.DataFrame(movie_list)
# index=False keeps the DataFrame's row index out of the CSV file.
movies_dataframe.to_csv("imdb_movies.csv", index=False)

In [20]:
# Shut down the WebDriver session held in the module-level `driver`.
driver.quit()