In [1]:
import time

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    StaleElementReferenceException,
    TimeoutException,
)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Scrape the IMDb Top 250 chart into a DataFrame and a CSV file.
#
# The scrape runs in two passes so the chart page is loaded exactly once:
#   1. read every row of the chart (title, year, duration, ...), remembering
#      each movie's detail-page link;
#   2. visit each detail page for director, description and genres.
# Any field that cannot be located is recorded as "Not Found".

NOT_FOUND = "Not Found"
WAIT_TIMEOUT = 10  # seconds to wait for expected elements to appear

# One <li> per movie on the chart page.
CHART_ITEM_XPATH = "//*[@id='__next']/main/div/div[3]/section/div/div[2]/div/ul/li"

# XPaths below are relative to a single chart <li>.
TITLE_XPATH = "./div/div/div/div/div[2]/div[1]/a"
CHART_FIELD_XPATHS = {
    'Release Year': "./div/div/div/div/div[2]/div[2]/span[1]",
    'Duration': "./div/div/div/div/div[2]/div[2]/span[2]",
    'Certification': "./div/div/div/div/div[2]/div[2]/span[3]",
    'Rating': "./div/div/div/div/div[2]/span/div/span/span[1]",
    'Total Ratings': "./div/div/div/div/div[2]/span/div/span/span[2]",
}

# XPaths used on a movie's own page.
DIRECTOR_XPATH = "//a[contains(@href, '/name/') and contains(@href, 'tt_ov_dr_1')]"
DESCRIPTION_XPATH = "//span[@role='presentation' and contains(@data-testid, 'plot-l')]"
GENRE_XPATH = "//div[@class='ipc-chip-list__scroller']/a/span"
DETAIL_FIELDS = ('Director', 'Description', 'Genre')

# Only element-lookup failures are treated as "field missing"; anything else
# (including Ctrl-C) propagates instead of being silently swallowed.
LOOKUP_ERRORS = (NoSuchElementException, StaleElementReferenceException)


def _text_or_not_found(scope, xpath):
    """Return the text of the first element matching *xpath* under *scope*.

    *scope* is a WebDriver or a WebElement. Returns NOT_FOUND when the
    element cannot be located.
    """
    try:
        return scope.find_element(By.XPATH, xpath).text
    except LOOKUP_ERRORS:
        return NOT_FOUND


def _scrape_chart_row(item):
    """Read one chart <li>; return ``(movie_info, movie_link)``.

    ``movie_link`` is the title anchor's href, or None when the title
    anchor cannot be located.
    """
    movie_info = {}
    try:
        title_element = item.find_element(By.XPATH, TITLE_XPATH)
        movie_info['Title'] = title_element.text
        movie_link = title_element.get_attribute("href")
    except LOOKUP_ERRORS:
        movie_info['Title'] = NOT_FOUND
        movie_link = None

    for field, xpath in CHART_FIELD_XPATHS.items():
        movie_info[field] = _text_or_not_found(item, xpath)
    return movie_info, movie_link


def _scrape_detail_page(browser, movie_link):
    """Open *movie_link* and return its Director / Description / Genre."""
    browser.get(movie_link)
    try:
        # Explicit wait instead of a fixed sleep: continue as soon as the
        # director link is present.
        WebDriverWait(browser, WAIT_TIMEOUT).until(
            EC.presence_of_element_located((By.XPATH, DIRECTOR_XPATH))
        )
    except TimeoutException:
        # Best effort: the lookups below record "Not Found" where needed.
        pass

    try:
        genres = [g.text for g in browser.find_elements(By.XPATH, GENRE_XPATH)]
    except LOOKUP_ERRORS:
        genres = []

    return {
        'Director': _text_or_not_found(browser, DIRECTOR_XPATH),
        'Description': _text_or_not_found(browser, DESCRIPTION_XPATH),
        # find_elements returns [] rather than raising, so an empty result
        # must be mapped to NOT_FOUND explicitly.
        'Genre': ', '.join(genres) if genres else NOT_FOUND,
    }


# Initialize WebDriver
driver = webdriver.Chrome()
url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

movies_data = []

try:
    driver.get(url)

    # Extract movie elements (wait until at least one chart row is present)
    try:
        movies_list = WebDriverWait(driver, WAIT_TIMEOUT).until(
            EC.presence_of_all_elements_located((By.XPATH, CHART_ITEM_XPATH))
        )
    except TimeoutException:
        movies_list = []

    # Pass 1: read every chart row while the chart page is still loaded, so
    # the row elements stay valid and the chart is never re-fetched.
    movie_links = []
    for item in movies_list:
        movie_info, movie_link = _scrape_chart_row(item)
        movies_data.append(movie_info)
        movie_links.append(movie_link)

    # Pass 2: open each movie page for additional details.
    for movie_info, movie_link in zip(movies_data, movie_links):
        if movie_link:
            movie_info.update(_scrape_detail_page(driver, movie_link))
        else:
            # Keep every row's keys consistent even without a detail page.
            movie_info.update(dict.fromkeys(DETAIL_FIELDS, NOT_FOUND))
finally:
    # Close the WebDriver even if scraping fails part-way through.
    driver.quit()

# Convert data to DataFrame
df = pd.DataFrame(movies_data)
print(df)

# Save to CSV
df.to_csv("IMDB_Top_Movies.csv", index=False)

print("Data successfully saved to IMDB_Top_Movies.csv")

                           Title Release Year   Duration Certification  \
0    1. The Shawshank Redemption         1994     2h 22m             R   
1               2. The Godfather         1972     2h 55m             R   
2             3. The Dark Knight         2008     2h 32m         PG-13   
3       4. The Godfather Part II         1974     3h 22m             R   
4                5. 12 Angry Men         1957     1h 36m      Approved   
..                           ...          ...        ...           ...   
245           246. Groundhog Day         1993     1h 41m            PG   
246                247. The Help         2011     2h 26m         PG-13   
247                    Not Found    Not Found  Not Found     Not Found   
248                    Not Found    Not Found  Not Found     Not Found   
249                    Not Found    Not Found  Not Found     Not Found   

        Rating Total Ratings              Director  \
0          9.3          (3M)        Frank Darabont   
1  