In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, ElementClickInterceptedException
import time
import json
import csv

In [None]:
# Collect the URL of every movie review page listed on MoMo.
# Result: `movie_urls`, a set of unique hrefs gathered from the two grids
# (#review and #reviewMoMo) of the review landing page.
main_url = "https://www.momo.vn/cinema/review?fromType=nav_menu"
movie_urls = set()
# Stop expanding the lists after this many consecutive non-timeout errors
# instead of retrying forever.
max_click_errors = 10


def _collect_movie_links(driver, table_selector):
    """Return the href of the first <a> in each direct <div> child of the grid.

    `table_selector` is the CSS selector of the grid element. The returned list
    has one entry per card; an entry is None when the anchor has no href.
    """
    table = driver.find_element(By.CSS_SELECTOR, table_selector)
    cards = table.find_elements(By.CSS_SELECTOR, ":scope > div")
    return [card.find_element(By.TAG_NAME, "a").get_attribute("href") for card in cards]


driver = webdriver.Chrome()
driver.implicitly_wait(15)
wait = WebDriverWait(driver, 15)

print("Getting the URLs of movies's review page...")
try:
    driver.get(main_url)
    try:
        # Keep clicking the "Xem tiếp" (show more) button until it no longer
        # becomes clickable within the wait timeout.
        click_errors = 0
        while True:
            try:
                show_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Xem tiếp')]")))
                show_button.click()
                click_errors = 0
            except TimeoutException:
                print("Clicked all the extend buttons. Moving to the next step...")
                break
            except Exception as e:
                print(e)
                click_errors += 1
                if click_errors >= max_click_errors:
                    print(f"Giving up on the extend buttons after {click_errors} consecutive errors; the URL list may be incomplete")
                    break
                print("Error ocurred, retrying. If still more errors, consider stopping the program and debug it")
                continue

        first_table_links = _collect_movie_links(driver, "#review div.grid")
        print(f'found {len(first_table_links)} movies from the first table')
        # Skip cards whose anchor carries no href so None never reaches the URL set.
        movie_urls.update(href for href in first_table_links if href)
        print(f'Got {len(movie_urls)} movie urls from the first table')

        second_table_links = _collect_movie_links(driver, "#reviewMoMo div.grid")
        print(f'found {len(second_table_links)} movies from the second table')
        movie_urls.update(href for href in second_table_links if href)
        print(f'Got a total of {len(movie_urls)} unique movie urls from the 2 tables')
    except Exception as e:
        print(e)
finally:
    # Always release the browser, even when the crawl above failed.
    driver.quit()

Getting the URLs of movies's review page...
Message: stale element reference: stale element not found in the current frame
  (Session info: chrome=142.0.7444.163); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#staleelementreferenceexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x7ff62587a235
	0x7ff6255d2630
	0x7ff6253616dd
	0x7ff6253694c8
	0x7ff62536c8c2
	0x7ff62540ce53
	0x7ff6253e2b0a
	0x7ff62540baba
	0x7ff6253ab0ed
	0x7ff6253abf63
	0x7ff6258a5d60
	0x7ff62589fe8a
	0x7ff6258c1005
	0x7ff6255ed71e
	0x7ff6255f4e1f
	0x7ff6255db7c4
	0x7ff6255db97f
	0x7ff6255c18e8
	0x7ffd1c38e8d7
	0x7ffd1cc4c53c

Error ocurred, retrying. If still more errors, consider stopping the program and debug it
Clicked all the extend buttons. Moving to the next step...
found 1052 movies from the first table
Got 1052 movie urls from the first table
found 37 movies from the second table
Got a total of 1056 unique movie 

In [None]:
# Save the crawled URLs.
# The set is written as a sorted list so the file (and therefore the crawl order
# and the movie ids assigned later) is the same on every run; empty/None hrefs
# are dropped because they cannot be visited.
with open('movie_urls.json', 'w', encoding='utf-8') as f:
    json.dump(sorted(url for url in movie_urls if url), f, ensure_ascii=False, indent=2)

In [None]:
# Go to each URL and crawl the movie metadata plus the rating attached to each comment.
# Outputs:
#   movies_metadata.csv - one row per successfully crawled movie
#   ratings.csv         - one row per comment that carries a numeric rating
#   failed_urls.json    - URLs that could not be crawled (input of the retry cell)
# NOTE: the CSV files are opened in append mode while user/movie ids restart at 1,
# so start from fresh (or empty) CSV files when running this cell from scratch.
with open('movie_urls.json', 'r', encoding='utf-8') as f:
    movie_urls = json.load(f)

options = webdriver.ChromeOptions()
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(5)
wait = WebDriverWait(driver, 5)

failed_urls = []
user_id_dict = {}   # user display name -> numeric user id
user_idx = 1        # next user id to hand out
movie_idx = 1       # id given to the next successfully crawled movie
max_None = 30       # stop reading a page after more than this many comments without a rating
max_try = 5         # retry budget while expanding the comment list

print("Extracting ratings from every movie's review page...")
movies_file = ratings_file = None
try:
    movies_file = open("movies_metadata.csv", "a", newline="", encoding="utf-8")
    movies_writer = csv.writer(movies_file)
    # Only write the header into an empty file so re-running the cell does not
    # insert a second header row in the middle of the data.
    if movies_file.tell() == 0:
        movies_writer.writerow(['id', 'name', 'genres', 'country', 'year', 'IMDb_score', 'MoMo_score'])

    ratings_file = open("ratings.csv", "a", newline="", encoding="utf-8")
    ratings_writer = csv.writer(ratings_file)
    if ratings_file.tell() == 0:
        ratings_writer.writerow(['userId', 'movieId', 'rating', 'date'])

    for movie_url in movie_urls: # you can change this to movie_urls[:i] to save time if you are just testing
        try:
            driver.get(movie_url)
        except Exception:
            print(f"took too long to load {movie_url}, skipping")
            failed_urls.append(movie_url)
            continue

        try:
            # --- scores: an <img> badge marks the MoMo score, an "h-auto" badge the IMDb score ---
            MoMo_score = IMDb_score = None
            score_elements = driver \
                .find_element(By.CSS_SELECTOR, "div.jsx-d074b6b0f0aeffcc.mt-1.flex") \
                .find_elements(By.CSS_SELECTOR, ":scope > div")
            for score_element in score_elements:
                score_sub_elements = score_element.find_elements(By.CSS_SELECTOR, ":scope > *")
                badge = score_sub_elements[0].find_element(By.CSS_SELECTOR, ":scope > *")
                if badge.tag_name == 'img':
                    MoMo_score = score_sub_elements[1].text
                elif "h-auto" in (badge.get_attribute("class") or ""):
                    IMDb_score = score_sub_elements[1].text

            # Pages without a MoMo score are skipped silently (not recorded as failures).
            if MoMo_score is not None:
                info_element = driver.find_element(By.CSS_SELECTOR, 'div.jsx-d074b6b0f0aeffcc.flex-1')

                name_element = info_element.find_element(By.TAG_NAME, "a")
                name = name_element.text

                further_info_elements = info_element.find_element(By.TAG_NAME, "ul").find_elements(By.CSS_SELECTOR, ":scope > li")
                # The slices drop a fixed-length prefix from each list item
                # (presumably the field label - TODO confirm against the page).
                genres = further_info_elements[0].text[12:]
                year = further_info_elements[1].text[20:]
                if len(further_info_elements) > 2:
                    country = further_info_elements[2].text[16:]
                else:
                    country = None

                # --- expand the comment list by clicking "Xem tiếp nhé!" until it is gone ---
                tries = 0        # consecutive polls in which the list did not grow after a click
                dom_errors = 0   # consecutive stale/intercepted failures; not reset by list growth
                pre_len = None   # number of comments seen before the last click
                while tries <= max_try and dom_errors <= max_try:
                    try:
                        rating_elements = driver \
                            .find_element(By.CSS_SELECTOR, "div.grid.grid-cols-1.divide-y") \
                            .find_elements(By.CSS_SELECTOR, ":scope > div")
                        length = len(rating_elements)
                        if pre_len is not None:
                            if length == pre_len:
                                # The previous click has not added comments yet: wait and poll again.
                                tries += 1
                                time.sleep(0.3)
                                continue
                            else:
                                tries = 0
                                pre_len = length
                        else:
                            pre_len = length

                        show_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Xem tiếp nhé!')]")))
                        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", show_button)
                        time.sleep(0.8)
                        show_button.click()
                        dom_errors = 0
                        time.sleep(0.2)
                    except TimeoutException:
                        # No clickable button within the wait timeout: every comment is loaded.
                        break
                    except StaleElementReferenceException:
                        print(f"DOM while loading {movie_url}, retrying")
                        dom_errors += 1
                        # Make pre_len differ from the current length so the next
                        # pass goes straight to clicking again.
                        if pre_len is not None:
                            pre_len -= 1
                        time.sleep(0.2)
                        continue
                    except ElementClickInterceptedException:
                        print(f'ElementClickInterceptedException encountered while working with {movie_url}, retrying')
                        dom_errors += 1
                        if pre_len is not None:
                            pre_len -= 1
                        time.sleep(1.6)
                        continue
                    except Exception as e:
                        print(e)
                        print(movie_url)
                        tries = max_try + 1
                        break

                if tries > max_try or dom_errors > max_try:
                    print(f"retried too many times or error occured, skipping {movie_url}")
                    failed_urls.append(movie_url)
                    continue

                # --- read the comments; rows are buffered so a failure halfway through
                # a page leaves no partial ratings behind under a movie id that the
                # next movie would reuse ---
                None_count = 0
                rating_rows = []
                for rating_element in rating_elements:
                    if None_count > max_None:
                        print(f"Too many ratings with no point in {movie_url}, skipping the rest")
                        break
                    user_name = rating_element.find_element(By.CSS_SELECTOR, 'div.text-md').text
                    date = rating_element.find_element(By.CSS_SELECTOR, 'div.text-xs').text
                    try:
                        rating = float(rating_element.find_element(By.CSS_SELECTOR, 'span.pl-0\\.5').text.split("/")[0])
                    except Exception:
                        # Comment without a parsable "<score>/..." text.
                        rating = None
                        None_count += 1

                    if rating is not None:
                        if user_name not in user_id_dict:
                            user_id_dict[user_name] = user_idx
                            user_idx += 1
                        rating_rows.append([user_id_dict[user_name], movie_idx, rating, date])

                ratings_writer.writerows(rating_rows)
                movies_writer.writerow([movie_idx, name, genres, country, year, IMDb_score, MoMo_score])
                movie_idx += 1
                ratings_file.flush()
                movies_file.flush()

        except Exception as e:
            print(e)
            failed_urls.append(movie_url)
            continue
finally:
    # Release the files and the browser even if the crawl is interrupted.
    for csv_file in (movies_file, ratings_file):
        if csv_file is not None:
            csv_file.close()
    driver.quit()

with open("failed_urls.json", "w", encoding="utf-8") as f:
    json.dump(failed_urls, f, ensure_ascii=False, indent=4)

Extracting ratings from every movie's review page...


In [None]:
# Persist the id bookkeeping (user name -> id map and the next user/movie ids)
# so a later cell can continue numbering from where the crawl stopped.
state = dict(
    user_id_dict=user_id_dict,
    user_idx=user_idx,
    movie_idx=movie_idx,
)

with open("crawl_state.json", "w", encoding="utf-8") as state_file:
    state_file.write(json.dumps(state, ensure_ascii=False, indent=2))

In [None]:
# If the main crawl recorded failures, run this cell to retry them with longer
# pauses and a larger retry budget.
# This cell is meant to be run once: a URL that fails again is skipped.
# It does not update failed_urls.json; URLs that fail again are only collected
# in the in-memory `failed_urls` list.
print("Retrying the failed urls with a 'less eager' approach. Fail again and we will skip the URL forever")

with open ('failed_urls.json', 'r', encoding='utf-8') as f:
    movie_urls = json.load(f)

options = webdriver.ChromeOptions()
options.page_load_strategy = 'eager'
driver = webdriver.Chrome(options=options)
driver.implicitly_wait(5)
wait = WebDriverWait(driver, 5)

# Continue the user/movie numbering from the saved state of the main crawl.
with open("crawl_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)
    user_id_dict = state["user_id_dict"]
    user_idx = state["user_idx"]
    movie_idx = state["movie_idx"]
max_None = 30   # stop reading a page after more than this many comments without a rating
max_try = 10    # retry budget while expanding the comment list
# Defined here so the cell does not depend on a variable left over from the
# main crawl cell; collects the URLs that failed again.
failed_urls = []

print("Extracting ratings from every movie's review page...")
movies_file = ratings_file = None
try:
    # Append to the files produced by the main crawl; their headers already exist.
    movies_file = open("movies_metadata.csv", "a", newline="", encoding="utf-8")
    movies_writer = csv.writer(movies_file)

    ratings_file = open("ratings.csv", "a", newline="", encoding="utf-8")
    ratings_writer = csv.writer(ratings_file)

    for movie_url in movie_urls:
        try:
            driver.get(movie_url)
        except Exception:
            print(f"took too long to load {movie_url}, skipping")
            failed_urls.append(movie_url)
            continue

        try:
            # --- scores: an <img> badge marks the MoMo score, an "h-auto" badge the IMDb score ---
            MoMo_score = IMDb_score = None
            score_elements = driver \
                .find_element(By.CSS_SELECTOR, "div.jsx-d074b6b0f0aeffcc.mt-1.flex") \
                .find_elements(By.CSS_SELECTOR, ":scope > div")
            for score_element in score_elements:
                score_sub_elements = score_element.find_elements(By.CSS_SELECTOR, ":scope > *")
                badge = score_sub_elements[0].find_element(By.CSS_SELECTOR, ":scope > *")
                if badge.tag_name == 'img':
                    MoMo_score = score_sub_elements[1].text
                elif "h-auto" in (badge.get_attribute("class") or ""):
                    IMDb_score = score_sub_elements[1].text

            # Pages without a MoMo score are skipped silently.
            if MoMo_score is not None:
                info_element = driver.find_element(By.CSS_SELECTOR, 'div.jsx-d074b6b0f0aeffcc.flex-1')

                name_element = info_element.find_element(By.TAG_NAME, "a")
                name = name_element.text

                further_info_elements = info_element.find_element(By.TAG_NAME, "ul").find_elements(By.CSS_SELECTOR, ":scope > li")
                # The slices drop a fixed-length prefix from each list item
                # (presumably the field label - TODO confirm against the page).
                genres = further_info_elements[0].text[12:]
                year = further_info_elements[1].text[20:]
                if len(further_info_elements) > 2:
                    country = further_info_elements[2].text[16:]
                else:
                    country = None

                # --- expand the comment list by clicking "Xem tiếp nhé!" until it is gone ---
                tries = 0        # consecutive polls in which the list did not grow after a click
                dom_errors = 0   # consecutive stale/intercepted failures; not reset by list growth
                pre_len = None   # number of comments seen before the last click
                while tries <= max_try and dom_errors <= max_try:
                    try:
                        rating_elements = driver \
                            .find_element(By.CSS_SELECTOR, "div.grid.grid-cols-1.divide-y") \
                            .find_elements(By.CSS_SELECTOR, ":scope > div")
                        length = len(rating_elements)
                        if pre_len is not None:
                            if length == pre_len:
                                # The previous click has not added comments yet: wait and poll again.
                                tries += 1
                                time.sleep(0.4)
                                continue
                            else:
                                tries = 0
                                pre_len = length
                        else:
                            pre_len = length

                        show_button = wait.until(EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Xem tiếp nhé!')]")))
                        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", show_button)
                        time.sleep(0.8)
                        show_button.click()
                        dom_errors = 0
                        time.sleep(0.4)
                    except TimeoutException:
                        # No clickable button within the wait timeout: every comment is loaded.
                        break
                    except StaleElementReferenceException:
                        print(f"DOM while loading {movie_url}, retrying")
                        dom_errors += 1
                        # Make pre_len differ from the current length so the next
                        # pass goes straight to clicking again.
                        if pre_len is not None:
                            pre_len -= 1
                        time.sleep(0.8)
                        continue
                    except ElementClickInterceptedException:
                        print(f'ElementClickInterceptedException encountered while working with {movie_url}, retrying')
                        dom_errors += 1
                        if pre_len is not None:
                            pre_len -= 1
                        time.sleep(1.6)
                        continue
                    except Exception as e:
                        print(e)
                        print(movie_url)
                        tries = max_try + 1
                        break

                if tries > max_try or dom_errors > max_try:
                    print(f"retried too many times or error occured, skipping {movie_url}")
                    failed_urls.append(movie_url)
                    continue

                # --- read the comments; rows are buffered so a failure halfway through
                # a page leaves no partial ratings behind under a movie id that the
                # next movie would reuse ---
                None_count = 0
                rating_rows = []
                for rating_element in rating_elements:
                    if None_count > max_None:
                        print(f"Too many ratings with no point in {movie_url}, skipping the rest")
                        break
                    user_name = rating_element.find_element(By.CSS_SELECTOR, 'div.text-md').text
                    date = rating_element.find_element(By.CSS_SELECTOR, 'div.text-xs').text
                    try:
                        rating = float(rating_element.find_element(By.CSS_SELECTOR, 'span.pl-0\\.5').text.split("/")[0])
                    except Exception:
                        # Comment without a parsable "<score>/..." text.
                        rating = None
                        None_count += 1

                    if rating is not None:
                        if user_name not in user_id_dict:
                            user_id_dict[user_name] = user_idx
                            user_idx += 1
                        rating_rows.append([user_id_dict[user_name], movie_idx, rating, date])

                ratings_writer.writerows(rating_rows)
                movies_writer.writerow([movie_idx, name, genres, country, year, IMDb_score, MoMo_score])
                movie_idx += 1
                ratings_file.flush()
                movies_file.flush()

        except Exception as e:
            print(e)
            failed_urls.append(movie_url)
            continue
finally:
    # Release the files and the browser even if the retry run is interrupted.
    for csv_file in (movies_file, ratings_file):
        if csv_file is not None:
            csv_file.close()
    driver.quit()

Retrying the failed urls with a 'less eager' approach. Fail again and we will skip the URL forever
Extracting ratings from every movie's review page...


In [16]:
# Write the current id bookkeeping back to crawl_state.json.
state = {}
state["user_id_dict"] = user_id_dict
state["user_idx"] = user_idx
state["movie_idx"] = movie_idx

with open("crawl_state.json", "w", encoding="utf-8") as state_file:
    serialized_state = json.dumps(state, ensure_ascii=False, indent=2)
    state_file.write(serialized_state)