In [None]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By

from selenium.webdriver.support.ui import WebDriverWait

from selenium.webdriver.support import expected_conditions as EC
import time
from selenium.common.exceptions import WebDriverException, NoSuchWindowException


## 1. Setup driver

Khởi tạo trình duyệt Edge để phục vụ cho quá trình crawl dữ liệu trên IMDb.\
Hàm `setup` tạo WebDriver theo cấu hình mặc định của Edge, nhờ đó có thể quan sát trực tiếp toàn bộ quá trình crawl — phù hợp khi debug hoặc kiểm thử thủ công.

In [None]:
def setup(driver_path):
    try:
        cService =webdriver.EdgeService(executable_path = driver_path)
        
        driver= webdriver.Edge(service=cService)
        return driver
    except:
        print('Something went wrong!!')

## 2. Crawl phim

Sử dụng hàm `get_movie_details` để lấy các thông tin không xuất hiện ở trang danh sách bên ngoài, bao gồm:
- Genres 
- Budget

In [None]:
def setup(driver_path):
    """Setup Edge driver with optimizations for speed."""
    try:
        # Tạo options để tối ưu
        options = Options()
        
        # 1. Tắt hình ảnh (tăng tốc load trang đáng kể)
        prefs = {
            "profile.managed_default_content_settings.images": 2,
            "profile.default_content_setting_values.notifications": 2,
        }
        options.add_experimental_option("prefs", prefs)
        
        # 2. Các flag để tăng tốc
        options.add_argument('--disable-gpu')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        options.add_argument('--disable-extensions')
        
        # 3. Bật blink features để tải nhanh hơn
        options.add_argument('--blink-settings=imagesEnabled=false')
        
        # 4. (Optional) Headless mode - nhanh hơn nhưng không thấy browser
        # options.add_argument('--headless')
        
        cService = webdriver.EdgeService(executable_path=driver_path)
        driver = webdriver.Edge(service=cService, options=options)
        
        # 5. Set page load strategy (không đợi tất cả resources)
        # driver.set_page_load_timeout(10)
        
        return driver
    except Exception as e:
        print(f'Setup error: {e}')
        return None

## 2. Crawl phim

### 2.1 Crawl thông tin phim ở trang danh sách (danh mục)

In [None]:
def crawl_imdb_popular_movies(driver_path):
    """Scrape listing-level fields of IMDb feature films and save them to CSV.

    Opens the feature-film search page, reads the movie cards that are
    currently loaded, then clicks the "load more" button; this is repeated
    for at most three rounds. Fields that cannot be read stay NaN.

    Args:
        driver_path: Path to the msedgedriver executable, passed to setup().

    Returns:
        A list of dicts, one per movie, with the keys rank, title,
        release_year, run_time, mpa, metascore, vote_count and rating.
        An empty list is returned (and no file is written) when the browser
        could not be started.

    Side effects:
        Writes the rows to 'IMDB_movies.csv'. The browser is always closed,
        even when an unexpected error interrupts the crawl.
    """
    url = 'https://www.imdb.com/search/title/?title_type=feature'
    driver = setup(driver_path)
    if driver is None:
        # setup() returns None on failure; without this guard the first
        # driver call would fail with an opaque AttributeError.
        return []

    container_xpath = '//div[@class="ipc-metadata-list-summary-item__tc"]'
    load_more_xpath = '//button[contains(@class, "ipc-see-more__button")]'
    # NOTE(review): the "sc-..." class names below look build-generated and
    # may change when the site is redeployed; confirm if these fields come
    # back empty.
    metadata_box_xpath = './/div[@class="sc-3eaf0513-6 dMOIgq dli-title-metadata"]'
    metadata_item_xpath = './span[@class="sc-3eaf0513-7 hmmeot dli-title-metadata-item"]'

    row_template = {
        'rank': np.nan,
        'title': np.nan,
        'release_year': np.nan,
        'run_time': np.nan,
        'mpa': np.nan,
        'metascore': np.nan,
        'vote_count': np.nan,
        'rating': np.nan,
    }

    def text_of(scope, xpath):
        """Return the text of the first match below ``scope``, or NaN."""
        try:
            return scope.find_element(By.XPATH, xpath).text
        except WebDriverException:
            return np.nan

    data = []
    try:
        driver.get(url)
        time.sleep(3)

        for page in range(3):
            print(f"\n=== PAGE {page+1} ===")

            containers = driver.find_elements(By.XPATH, container_xpath)
            print(f"Found {len(containers)} movies so far.")

            # Cards are appended to the same page, so everything up to
            # len(data) has already been processed in earlier rounds.
            start_index = len(data)
            new_containers = containers[start_index:]

            for idx, container in enumerate(new_containers, start=start_index + 1):
                print(f"\nProcessing movie {idx}...")
                row = row_template.copy()

                # Heading text looks like "<rank>. <title>"; both fields
                # stay NaN when it is missing or has no dot.
                try:
                    heading = container.find_element(By.CLASS_NAME, 'ipc-title__text').text
                    rank, title = heading.split('.', 1)
                    row['rank'] = rank.strip()
                    row['title'] = title.strip()
                except (WebDriverException, ValueError):
                    pass

                # Metadata block: year, runtime and MPA rating, by position.
                try:
                    metadata_box = container.find_element(By.XPATH, metadata_box_xpath)
                except WebDriverException:
                    metadata_box = None

                if metadata_box is not None:
                    try:
                        items = metadata_box.find_elements(By.XPATH, metadata_item_xpath)
                        texts = [item.text for item in items]
                    except WebDriverException:
                        texts = []
                    # zip stops at the shorter sequence, so missing trailing
                    # items simply leave their fields as NaN.
                    for key, value in zip(('release_year', 'run_time', 'mpa'), texts):
                        row[key] = value

                    # The metascore is looked up inside the metadata block.
                    row['metascore'] = text_of(
                        metadata_box, './/span[contains(@class,"metacritic-score-box")]'
                    )

                row['rating'] = text_of(container, './/span[@class="ipc-rating-star--rating"]')
                row['vote_count'] = text_of(container, './/span[@class="ipc-rating-star--voteCount"]')

                data.append(row)

            # Load more
            try:
                load_more_btn = WebDriverWait(driver, 5).until(
                    EC.presence_of_element_located((By.XPATH, load_more_xpath))
                )
                driver.execute_script(
                    "arguments[0].scrollIntoView({block:'center'});", load_more_btn
                )
                time.sleep(1)
                # Click through JavaScript rather than a native click.
                driver.execute_script("arguments[0].click();", load_more_btn)
                time.sleep(2)
            except WebDriverException:
                print("No more movies to load.")
                break
    finally:
        # Always release the browser, even if the crawl is interrupted.
        driver.quit()

    df = pd.DataFrame(data)
    df.to_csv('IMDB_movies.csv', index=False)
    return data


### 2.2 Crawl thông tin chi tiết bên trong từng phim (genres, budget)

In [None]:
def get_movie_links_on_page(driver):
    """Collect the detail-page URL of every movie card currently loaded.

    For each listing card on the search page, the ``href`` of the first
    ``<a>`` element inside the card is taken.

    Args:
        driver: WebDriver positioned on the search results page.

    Returns:
        A list of href values, one per card, in page order.
    """
    card_xpath = '//div[@class="ipc-metadata-list-summary-item__tc"]'
    hrefs = []
    for card in driver.find_elements(By.XPATH, card_xpath):
        anchor = card.find_element(By.TAG_NAME, "a")
        hrefs.append(anchor.get_attribute("href"))
    return hrefs

In [None]:
def get_movie_details(driver, url):
    """Scrape genres and budget from a movie page opened in a temporary tab.

    A new tab is opened, the movie page is loaded in it, the two fields are
    read, and the tab is closed again. Focus is returned to the tab that was
    active when the function was called.

    Args:
        driver: Active WebDriver session.
        url: URL of the movie's detail page.

    Returns:
        A ``(genres, budget)`` tuple. ``genres`` is a comma-separated string
        and ``budget`` is the displayed budget text; either is None when it
        cannot be found. ``(None, None)`` is returned when no new tab
        appeared.

    Raises:
        WebDriverException: If opening the tab or loading the page fails,
            including a timeout while waiting for the page's ``<h1>``. The
            temporary tab is still closed in that case.
    """
    main_tab = driver.current_window_handle

    # Identify the new tab as the handle that did not exist before. Picking
    # "any handle that is not the main tab" could grab an unrelated tab
    # that was already open.
    handles_before = set(driver.window_handles)
    driver.execute_script("window.open('');")
    opened = [h for h in driver.window_handles if h not in handles_before]
    if not opened:
        return None, None
    new_tab = opened[0]

    genres = None
    budget = None
    try:
        driver.switch_to.window(new_tab)
        driver.get(url)

        # Wait for the page via an element that is always expected: the title.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )

        genres = _read_genres(driver)
        budget = _read_budget(driver)
    finally:
        # Close only the tab opened above. Re-selecting it first guarantees
        # that close() can never hit the main tab, which is what happened
        # before when the switch to the new tab had failed.
        try:
            driver.switch_to.window(new_tab)
            driver.close()
        except WebDriverException:
            pass
        driver.switch_to.window(main_tab)

    return genres, budget


def _read_genres(driver):
    """Return the storyline genres as a comma-separated string, or None."""
    try:
        storyline = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, '//span[text()="Storyline"]'))
        )
        # Scroll to the storyline heading before looking for the genre list.
        # NOTE(review): presumably that section is only rendered once it is
        # scrolled into view — confirm.
        driver.execute_script("arguments[0].scrollIntoView(true);", storyline)

        genre_block = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located(
                (By.XPATH, '//li[@data-testid="storyline-genres"]')
            )
        )
        items = genre_block.find_elements(By.XPATH, './/ul/li')
        # Read each element's text once and drop empty entries.
        names = [text for text in (item.text for item in items) if text]
        return ", ".join(names) or None
    except WebDriverException:
        return None


def _read_budget(driver):
    """Return the budget text from the box-office section, or None."""
    try:
        budget_item = WebDriverWait(driver, 4).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@data-testid="title-boxoffice-section"]//li[@data-testid="title-boxoffice-budget"]')
            )
        )
        budget_span = budget_item.find_element(
            By.XPATH, './/span[contains(@class,"ipc-metadata-list-item__list-content-item")]'
        )
        return budget_span.text.strip()
    except WebDriverException:
        return None


In [None]:
def crawl_imdb_popular_movies(driver_path, num_pages=201, start=70, end=80):
    """Crawl a window of IMDb feature-film result pages, including details.

    The search page grows by 50 movies per "load more" click. Pages before
    ``start`` are only loaded (clicked through); pages from ``start`` up to
    but not including ``end`` are scraped. For every scraped movie the
    listing fields are read from its card, and genres/budget are fetched
    from its detail page through ``get_movie_details``.

    Args:
        driver_path: Path to the msedgedriver executable, passed to setup().
        num_pages: Upper bound on the number of pages to iterate over.
        start: Zero-based index of the first page to scrape.
        end: Zero-based index of the page at which the crawl stops.

    Returns:
        A list of dicts, one per scraped movie, with the keys rank, title,
        genres, budget, release_year, run_time, mpa, metascore, vote_count
        and rating. Fields that cannot be read stay NaN (genres/budget may
        be None). An empty list is returned when the browser could not be
        started. If the browser session fails mid-crawl, the rows collected
        so far are returned.
    """
    url = 'https://www.imdb.com/search/title/?title_type=feature'
    driver = setup(driver_path)
    if driver is None:
        return []

    container_xpath = '//div[@class="ipc-metadata-list-summary-item__tc"]'
    load_more_xpath = '//button[contains(@class, "ipc-see-more__button")]'

    row_template = {
        'rank': np.nan,
        'title': np.nan,
        'genres': np.nan,
        'budget': np.nan,
        'release_year': np.nan,
        'run_time': np.nan,
        'mpa': np.nan,
        'metascore': np.nan,
        'vote_count': np.nan,
        'rating': np.nan
    }

    data = []
    try:
        driver.get(url)
        time.sleep(3)

        for page in range(num_pages):
            print(f"\n=== PAGE {page+1} ===")
            if page == end:
                print('da crawl xong')
                break

            if page >= start:
                expected_total = 50 * (page + 1)
                containers = driver.find_elements(By.XPATH, container_xpath)

                if len(containers) < expected_total:
                    print(f"⚠ IMDb chưa load đủ. Đang thấy: {len(containers)}, cần: {expected_total}")

                    # Give the page more time to render the new batch.
                    try:
                        WebDriverWait(driver, 10).until(
                            lambda d: len(d.find_elements(By.XPATH, container_xpath)) >= expected_total
                        )
                    except WebDriverException:
                        pass

                    # Final check after waiting.
                    containers = driver.find_elements(By.XPATH, container_xpath)
                    if len(containers) < expected_total:
                        print(f"❌ IMDb KHÔNG LOAD ĐỦ SAU KHI CHỜ")
                        print(f"Đang có {len(containers)}, cần {expected_total}")
                        break

                print(f"Total movies loaded so far: {len(containers)}")

                # Only this page's slice is processed. Capping at
                # expected_total keeps any extra cards for the next page
                # instead of scraping them twice.
                start_index = 50 * page
                new_containers = containers[start_index:expected_total]
                print(f"New movies to process: {len(new_containers)}")

                for i, container in enumerate(new_containers):
                    print(f"\nProcessing movie {start_index + i + 1}...")

                    row = row_template.copy()
                    _fill_listing_fields(container, row)

                    # Resolve the link for this card only; resolving links
                    # for every loaded card costs one driver round-trip per
                    # card on every page.
                    try:
                        detail_url = container.find_element(By.TAG_NAME, "a").get_attribute("href")
                        genres, budget = get_movie_details(driver, detail_url)
                        row['genres'] = genres
                        row['budget'] = budget
                    except WebDriverException as e:
                        print(f"Detail page failed: {type(e).__name__}")

                    print("Genres:", row['genres'])
                    print("Budget:", row['budget'])
                    print(" ✓ DONE")

                    data.append(row)

            # Load the next batch, both while skipping and after scraping.
            if not _click_load_more(driver, load_more_xpath):
                print("No more movies to load.")
                break
    except WebDriverException as e:
        # A dead browser session should not discard the rows collected so
        # far; report it and fall through to return them.
        print(f"Crawl aborted after {len(data)} movies: {e}")
    finally:
        # Always release the browser, even if the crawl is interrupted.
        driver.quit()

    return data


def _click_load_more(driver, button_xpath):
    """Click the "load more" button; return False if it cannot be clicked."""
    try:
        button = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.XPATH, button_xpath))
        )
        driver.execute_script("arguments[0].scrollIntoView(true);", button)
        time.sleep(1)
        # Click through JavaScript rather than a native click.
        driver.execute_script("arguments[0].click();", button)
        time.sleep(3)
        return True
    except WebDriverException:
        return False


def _fill_listing_fields(container, row):
    """Fill ``row`` in place with the fields shown on one listing card."""
    # Heading text looks like "<rank>. <title>".
    try:
        heading = container.find_element(By.CLASS_NAME, 'ipc-title__text').text
        rank, title = heading.split('.', 1)
        row['rank'] = rank.strip()
        row['title'] = title.strip()
        print("Rank:", row['rank'])
    except (WebDriverException, ValueError):
        pass

    known_mpa = {
        'G', 'PG', 'PG-13', 'R', 'NC-17', 'TV-MA', 'TV-14', 'TV-PG', 'TV-G',
        'Not Rated', 'Unrated', 'Approved',
    }

    # NOTE(review): the "sc-..." class names look build-generated and may
    # change when the site is redeployed; confirm if these fields come back
    # empty.
    metadata_box = None
    try:
        metadata_box = container.find_element(
            By.XPATH, './/div[@class="sc-3eaf0513-6 dMOIgq dli-title-metadata"]'
        )
        items = metadata_box.find_elements(
            By.XPATH, './span[@class="sc-3eaf0513-7 hmmeot dli-title-metadata-item"]'
        )
        # Items are classified by content rather than by position.
        for item in items:
            text = item.text.strip()
            if text.isdigit() and len(text) == 4:
                # Four digits: release year.
                row['release_year'] = text
                print("release_year:", row['release_year'])
            elif 'h' in text or 'm' in text:
                # Contains "h" or "m": treated as the runtime.
                row['run_time'] = text
                print("run_time:", row['run_time'])
            elif text in known_mpa:
                row['mpa'] = text
                print("mpa:", row['mpa'])
            elif text and len(text) < 15:
                # Short unknown label: treated as a rating. Empty text is
                # skipped so it cannot overwrite a value already found.
                row['mpa'] = text
    except WebDriverException:
        pass

    if metadata_box is not None:
        # The metascore is looked up inside the metadata block.
        try:
            row['metascore'] = metadata_box.find_element(
                By.XPATH, './/span[contains(@class,"metacritic-score-box")]'
            ).text
            print("metascore:", row['metascore'])
        except WebDriverException:
            pass

    try:
        row['rating'] = container.find_element(
            By.XPATH, './/span[@class="ipc-rating-star--rating"]'
        ).text
    except WebDriverException:
        pass

    try:
        row['vote_count'] = container.find_element(
            By.XPATH, './/span[@class="ipc-rating-star--voteCount"]'
        ).text
    except WebDriverException:
        pass


## 3. Main

In [None]:
# Run the crawl with the msedgedriver executable at the given path and keep
# the returned rows in `data` for the cells below.
# NOTE(review): the driver path is hard-coded and machine-specific.
data = crawl_imdb_popular_movies("C:/Program Files (x86)/Microsoft/Edge/Application/msedgedriver.exe")


In [None]:
# Quick sanity check of the crawl result: second row, last row, row count.
# NOTE(review): data[1] raises IndexError when fewer than two rows were
# collected (e.g. the crawl returned an empty list).
print(data[1])
print(data[-1])
print(len(data))

In [None]:
# Dump the collected rows to the scratch file nhap_imdb.csv, without the
# index column.
df= pd.DataFrame(data)
df.to_csv('nhap_imdb.csv',index=False)

In [None]:
# Append the collected rows to IMDB_movies.csv.
from pathlib import Path

df = pd.DataFrame(data)
csv_path = Path('IMDB_movies.csv')
if not df.empty:
    # Write the header only when the file is being created (or is empty);
    # otherwise the first batch would produce a CSV with no column names.
    # NOTE(review): rows are appended as-is — confirm that an existing file
    # has the same columns as `df`.
    write_header = not csv_path.exists() or csv_path.stat().st_size == 0
    df.to_csv(csv_path, index=False, mode='a', header=write_header, encoding='utf-8')
