In [2]:
import time
import random
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import ElementClickInterceptedException, NoSuchElementException, TimeoutException, StaleElementReferenceException

In [4]:
# Cấu hình Selenium
options = webdriver.ChromeOptions()
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)

# Danh sách các User-Agent phổ biến
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
]

# Chọn ngẫu nhiên 1 User-Agent từ danh sách
options.add_argument(f"user-agent={random.choice(user_agents)}")

driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 5)

# URL cần crawl
url = "https://www.imdb.com/search/title/?title_type=feature&num_votes=1000,&languages=en"
driver.get(url)

In [5]:
film_urls =[] #Danh sách link phim để đi vào và lấy user reviews
movies_data = []

In [None]:
# Số lần cần click vào "Load More" để tải đủ 10,000 bộ phim
load_more_clicks_needed = 110
current_clicks = 1

while current_clicks < load_more_clicks_needed:
    try:
        load_more_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.ipc-see-more__button'))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", load_more_button)
        time.sleep(1)
        load_more_button.click()
        current_clicks += 1
        time.sleep(random.uniform(1, 2))  

    except TimeoutException:
        print(f"Không tìm thấy nút '50 more' sau lần click thứ {current_clicks + 1}. Dừng việc click.")
        break

# Sau khi click xong, lấy danh sách các bộ phim
movies_list = driver.find_elements(By.CSS_SELECTOR, 'li.ipc-metadata-list-summary-item')

# Bắt đầu thu thập dữ liệu 
total_movies_to_crawl = 5100
i = 0

while i < total_movies_to_crawl and i < len(movies_list):
    try:
        movie = movies_list[i]

        # Cuộn tới bộ phim để lấy thông tin
        # Sử dụng vòng lặp để thử lại nếu gặp lỗi StaleElementReferenceException
        stale_attempts = 0
        while stale_attempts < 2:  # Thử lại tối đa 2 lần
            try:
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", movie)
                time.sleep(random.uniform(0.5, 1))  # Nghỉ ngẫu nhiên để tránh bị phát hiện là bot
                break  # Nếu không có lỗi, thoát khỏi vòng lặp
            except StaleElementReferenceException:
                print(f"Lỗi StaleElementReferenceException, thử lại lần {stale_attempts + 1}")
                stale_attempts += 1
                # Làm mới danh sách phần tử và thử lại
                movies_list = driver.find_elements(By.CSS_SELECTOR, 'li.ipc-metadata-list-summary-item')
                movie = movies_list[i]
        
        #Lấy link phim
        film_url = movie.find_element(By.CSS_SELECTOR, 'a.ipc-title-link-wrapper').get_attribute("href")
        film_urls.append(film_url)
        
        #Lấy hình phim
        srcset = movie.find_element(By.CSS_SELECTOR, 'div.ipc-media.ipc-media--poster-27x40.ipc-image-media-ratio--poster-27x40.ipc-media--media-radius.ipc-media--base.ipc-media--poster-m.ipc-poster__poster-image.ipc-media__img > img').get_attribute('srcset')
        image_links = re.findall(r'https?://\S+', srcset)
        image_link = image_links[-1]
        
        # Tìm nút "See more information" và click
        info_button = movie.find_element(By.CSS_SELECTOR, 'button[title^="See more information"]')
        info_button.click()
        time.sleep(random.uniform(1, 2))

        # Kiểm tra xem popup đã mở hay chưa
        if EC.presence_of_element_located((By.CSS_SELECTOR, 'div[role="dialog"]')):
            popup_container = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, 'div[role="dialog"]')))
        else:
            print(f"Không có popup container cho phim thứ {i + 1}, bỏ qua.")
            continue

        # Lấy các thông tin về phim
        title = popup_container.find_element(By.CSS_SELECTOR, 'h3.ipc-title__text.prompt-title-text').text
        print(f"Đang crawl phim thứ {i + 1}: {title}")


        # Lấy các thông tin khác
        try:
            info_list = popup_container.find_elements(By.CSS_SELECTOR, 'ul[data-testid="btp_ml"] li')
            year = info_list[0].text if len(info_list) > 0 else ""
            duration = info_list[1].text if len(info_list) > 1 else ""
            mpaa = info_list[2].text if len(info_list) > 2 else ""
        except NoSuchElementException:
            year, duration, mpaa = "", "", ""  

        try:
            genres = ', '.join([genre.text for genre in popup_container.find_elements(By.CSS_SELECTOR, 'ul[data-testid="btp_gl"] li.ipc-inline-list__item')])
        except NoSuchElementException:
            genres = ""

        try:
            imdb_rating = popup_container.find_element(By.CSS_SELECTOR, 'div.sc-b90eafb6-4.Qenpr > span > span.ipc-rating-star--rating').text
            imdb_rating = float(imdb_rating) if imdb_rating else None 
        except (NoSuchElementException):
            imdb_rating = None

        try:
            plot_summary = popup_container.find_element(By.CSS_SELECTOR, 'div.sc-8407191a-2.fQgJvQ').text 
        except NoSuchElementException:
            plot_summary = ""
        try:
            director = popup_container.find_element(By.CSS_SELECTOR, 'div.sc-1582ce06-3.iWfkOS > div:nth-child(1) > ul > li > a').text
        except NoSuchElementException:
            director = ""

        try:
            stars_elements = popup_container.find_elements(By.CSS_SELECTOR, 'div.sc-1582ce06-3.iWfkOS > div:nth-child(2) > ul li')
            stars = ', '.join([star.text for star in stars_elements if star.text != director]) if stars_elements else ""
        except NoSuchElementException:
            stars = ""

        # Lưu dữ liệu vào danh sách
        
        movie_data = {
            'Title': title,
            'Year': year,
            'Duration': duration,
            'MPAA': mpaa,
            'Genres': genres,
            'IMDb_Rating': imdb_rating,
            'Director': director,
            'Stars': stars,
            'Plot_Summary': plot_summary,
            'Image_link': image_link
        }


        movies_data.append(movie_data)

        # Đóng popup sau khi lấy thông tin
        try:
            close_button = popup_container.find_element(By.CSS_SELECTOR, 'button[title="Close Prompt"]')
            close_button.click()
            WebDriverWait(driver, 3).until(EC.invisibility_of_element(popup_container))
        except Exception as e:
            print(f"Lỗi khi đóng popup: {e}")
            continue

        i += 1
        time.sleep(random.uniform(0.5, 1))

    except (ElementClickInterceptedException, TimeoutException) as e:
        print(f"Lỗi khi click vào phần tử: {e}")
        continue


In [7]:
import pandas as pd

# Convert the data to a pandas DataFrame
df = pd.DataFrame(movies_data)
df.head(5)

Unnamed: 0,Title,Year,Duration,MPAA,Genres,IMDb_Rating,Director,Stars,Plot_Summary,Image_link
0,Gladiator II,2024,2h 28min,R,"Action, Adventure, Drama",6.9,Ridley Scott,"Paul Mescal, Denzel Washington, Pedro Pascal",After his home is conquered by the tyrannical ...,https://m.media-amazon.com/images/M/MV5BMWYzZT...
1,Moana 2,2024,1h 40min,PG,"Animation, Adventure, Comedy",7.1,David G. Derrick Jr.,"Auli'i Cravalho, Dwayne Johnson, Hualalai Chung",After receiving an unexpected call from her wa...,https://m.media-amazon.com/images/M/MV5BZDUxNT...
2,Gladiator,2000,2h 35min,R,"Action, Adventure, Drama",8.5,Ridley Scott,"Russell Crowe, Joaquin Phoenix, Connie Nielsen",A former Roman General sets out to exact venge...,https://m.media-amazon.com/images/M/MV5BYWQ4Ym...
3,The Substance,2024,2h 21min,R,"Drama, Horror",7.4,Coralie Fargeat,"Demi Moore, Margaret Qualley, Dennis Quaid",A fading celebrity takes a black-market drug: ...,https://m.media-amazon.com/images/M/MV5BZDQ1NG...


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2460 entries, 0 to 2459
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         2460 non-null   object 
 1   Year          2460 non-null   object 
 2   Duration      2460 non-null   object 
 3   MPAA          2460 non-null   object 
 4   Genres        2460 non-null   object 
 5   IMDb_Rating   2460 non-null   float64
 6   Director      2460 non-null   object 
 7   Stars         2460 non-null   object 
 8   Plot_Summary  2460 non-null   object 
 9   Image_link    2460 non-null   object 
dtypes: float64(1), object(9)
memory usage: 192.3+ KB


In [18]:
df.to_excel('FilmsData1.xlsx', index=False, sheet_name='Movies Data')

In [19]:
cleaned_urls = [url.split('/?')[0] for url in film_urls]
cleaned_urls

['https://www.imdb.com/title/tt1462758',
 'https://www.imdb.com/title/tt0088933',
 'https://www.imdb.com/title/tt0110005',
 'https://www.imdb.com/title/tt12964320',
 'https://www.imdb.com/title/tt0115571',
 'https://www.imdb.com/title/tt8580274',
 'https://www.imdb.com/title/tt3797868',
 'https://www.imdb.com/title/tt0066817',
 'https://www.imdb.com/title/tt0107616',
 'https://www.imdb.com/title/tt0265298',
 'https://www.imdb.com/title/tt0106965',
 'https://www.imdb.com/title/tt0114614',
 'https://www.imdb.com/title/tt1174732',
 'https://www.imdb.com/title/tt0116041',
 'https://www.imdb.com/title/tt0036855',
 'https://www.imdb.com/title/tt1020558',
 'https://www.imdb.com/title/tt1133985',
 'https://www.imdb.com/title/tt0116136',
 'https://www.imdb.com/title/tt0448694',
 'https://www.imdb.com/title/tt1068242',
 'https://www.imdb.com/title/tt5724948',
 'https://www.imdb.com/title/tt0049730',
 'https://www.imdb.com/title/tt0056801',
 'https://www.imdb.com/title/tt19244260',
 'https://www.

In [20]:
with open(r'film_urls1.txt', mode='w', encoding='utf-8') as file:
    for url in cleaned_urls:
        file.write(url + '\n')

In [None]:
driver.quit()