In [1]:
import os
import random
import time

from playwright.async_api import async_playwright, expect

In [2]:
# Start the Playwright driver. No browser is launched yet; the returned handle
# exposes the browser types used below (e.g. `playwright.chromium`).
playwright = await async_playwright().start()

In [3]:
browser = await playwright.chromium.launch(headless=False)
page = await browser.new_page()
await browser.close()

In [4]:
async def open_browser(headless=False):
    """
    Start Playwright, launch Chromium and open a new page.

    Args:
        headless: Passed to ``chromium.launch``; ``False`` (the default)
            shows the browser window.

    Returns:
        A ``(browser, page)`` tuple.

    Raises:
        Exception: Any error raised while launching the browser or opening
            the page is re-raised after the resources started here have been
            released.

    Note:
        The Playwright handle created here is not returned, so callers can
        only close the browser, not stop the underlying driver.
    """
    # Start playwright
    playwright = await async_playwright().start()

    browser = None
    try:
        # Open chromium (chrome) browser, can use firefox or others
        browser = await playwright.chromium.launch(headless=headless)

        # Create a new browser window
        page = await browser.new_page()
    except Exception:
        # Nothing is handed back to the caller on failure, so release whatever
        # was started here before propagating the error.
        try:
            if browser is not None:
                await browser.close()
        finally:
            await playwright.stop()
        raise

    return browser, page

In [5]:
# Launch a visible Chromium window (open_browser defaults to headless=False);
# `driver` is the browser handle and `page` is the page used for navigation below.
driver, page = await open_browser()

In [6]:
# visit a URL
url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=1960-01-01,2025-07-31&genres=!short,!documentary&has=plot&primary_language=hi&sort=release_date,asc'
await page.goto(url)

<Response url='https://www.imdb.com/search/title/?title_type=feature&release_date=1960-01-01,2025-07-31&genres=!short,!documentary&has=plot&primary_language=hi&sort=release_date,asc' request=<Request url='https://www.imdb.com/search/title/?title_type=feature&release_date=1960-01-01,2025-07-31&genres=!short,!documentary&has=plot&primary_language=hi&sort=release_date,asc' method='GET'>>

In [7]:
while True:
            try:
                next_button = page.locator('//button[contains(@class, "ipc-see-more__button")]')
                await next_button.click()
                await page.wait_for_timeout(2000)
            except:
                print("No more '50 more' button or error occurred.")
                break

No more '50 more' button or error occurred.


In [8]:
html = await page.content()
with open("imdb_full_page.html", "w", encoding="utf-8") as f:
    f.write(html)

In [9]:
# The page HTML has already been written to disk, so close the browser.
await driver.close()

In [10]:
from bs4 import BeautifulSoup

# Re-read the HTML snapshot from disk and parse it with Python's built-in HTML parser
with open("imdb_full_page.html", encoding="utf-8") as snapshot:
    soup = BeautifulSoup(snapshot, "html.parser")

# Select the film tiles; the "sc-..."/"jziSZL" class names look auto-generated
# and may change between site builds (class may vary)
film_tiles = soup.select('div.sc-995e3276-1.jziSZL.dli-parent')

# Report how many tiles matched the selector
print("Number of film tiles found:", len(film_tiles))

Number of film tiles found: 8342


In [11]:
def _node_text(node, default):
    """Return the stripped text of a parsed node, or ``default`` when the node is missing."""
    return node.text.strip() if node is not None else default


def _strip_rank(heading):
    """Drop a leading "<rank>. " prefix (e.g. "1. Kanoon" -> "Kanoon").

    Headings without a numeric rank prefix are returned unchanged, so a title
    such as "Mr. India" is not truncated.
    """
    rank, sep, name = heading.partition('. ')
    if sep and rank.replace(',', '').isdigit():
        return name.strip()
    return heading


# Every film tile carries the 'dli-parent' class.
items = soup.find_all(class_='dli-parent')

# One dict per film with the keys 'title', 'year' and 'plot'.
rows = []

for item in items:
    # Each selector is evaluated once; a missing node falls back to a
    # placeholder instead of raising AttributeError on `.text`.
    heading = _node_text(item.select_one('h3.ipc-title__text'), "No title")
    rows.append({
        'title': _strip_rank(heading),
        'year': _node_text(item.select_one('span.sc-86fea7d1-8'), "No year"),
        'plot': _node_text(item.select_one('div.ipc-html-content-inner-div'), "No plot"),
    })


In [12]:
import pandas as pd

In [13]:
# Build a DataFrame with one row per film from the list of
# {'title', 'year', 'plot'} dicts collected above.
df = pd.json_normalize(rows)
# Preview the first rows as the cell output.
df.head()

Unnamed: 0,title,year,plot
0,Kanoon,1960,A lawyer holds the eyewitness evidence to catc...
1,Bewaqoof,1960,"In his quest for a heir, Rai Bahadur has an af..."
2,Honeymoon,1960,Shanta and Prema are fast friends. Both are ed...
3,Lal Quila,1960,A fictional account of the real life struggle ...
4,Maa Baap,1960,Sohan and Raju are two brothers. Raju works fo...


In [14]:
# Export the table to an Excel workbook; index=False leaves out the row-index column.
df.to_excel("imdb_records.xlsx", index=False)

In [15]:
# Print the raw heading, year and plot of every film tile for a visual check.
items = soup.find_all(class_='dli-parent')

for item in items:
    # Look each node up once; any of them may be missing from a tile.
    heading = item.select_one('h3.ipc-title__text')
    year = item.select_one('span.sc-86fea7d1-8')
    plot = item.select_one('div.ipc-html-content-inner-div')

    print("---")
    # Fall back to a placeholder rather than raising AttributeError on `.text`.
    print(heading.text.strip() if heading is not None else "No title")
    print(year.text.strip() if year is not None else "No year")
    print(plot.text.strip() if plot is not None else "No plot")

---
1. Kanoon
1960
A lawyer holds the eyewitness evidence to catch a killer, but the identified criminal is the lawyer's own mentor, prospective father-in-law, and also the judge who presides over the case.
---
2. Bewaqoof
1960
In his quest for a heir, Rai Bahadur has an affair with a prostitute Meher. When Meher becomes pregnant he promises to take the child and educate him. But fate has other plans, for his wife Meena also becomes pregnant at the same time, and shortly thereafter both women give birth to two boys. Meher threatens to expose him if he does not take her son, and leave Meena's son in an orphanage. However, both children end up in his palatial home. He belittles Meher's son and finally accuses him of stealing. Fed up of these accusations, Meena leaves the house with Kishore, and ironically ends up with Meher. In the meantime, Rai's son has grown up to be an arrogant, and a champion boxer. And it is in the boxing ring that both half-brothers will meet again -- for a match 

In [14]:
# OLD CODE FOR REFERENCE: earlier approach that read the film tiles through Playwright locators (kept commented out)
#creating the next button
#next_button = page.locator('//button[contains(@class, "ipc-see-more__button")]')
#await next_button.click()


#parsing the films
#xpath_films = '//div[contains(@class, "sc-995e3276-1 jziSZL dli-parent")]'
#film_tiles = await page.locator(xpath_films).all()
#len(film_tiles)

#fi = film_tiles[0]
#await fi.text_content()


#data = []

#xpath_title = '//h3[contains(@class, "ipc-title__text")]'
#xpath_year = '//span[contains(@class, "sc-86fea7d1-8 JTbpG dli-title-metadata-item")][1]'
#for film in film_tiles:
    #titles = await film.locator(xpath_title).text_content()
    #years = await film.locator(xpath_year).text_content()
    #row = {'titles' : titles,'years' : years}
    #data.append(row)