# Scrap webtoons comments using selenium/bs4

In [37]:
import json
import re
import random
import datetime
import time
import os
import pandas as pd
import polars as pl
from bs4 import BeautifulSoup
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from urllib.parse import urlparse, parse_qs

## Fonctions

In [38]:
def scrape_comments(driver, episode_url, episode_name, episode_likes, episode_date):
    
    # Find all reply buttons
    reply_buttons = driver.find_elements(By.CSS_SELECTOR, "a.u_cbox_btn_reply")

    for button in reply_buttons:
        try:
            reply_count_text = button.find_element(By.CSS_SELECTOR, "span.u_cbox_reply_cnt").text
            reply_count = int(reply_count_text) if reply_count_text else 0
        except Exception:
            reply_count = 0

        if reply_count > 0:
            driver.execute_script("arguments[0].click();", button)  # Click the button
            time.sleep(3)  # Allow time for the comments to load

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    comments = soup.select('.u_cbox_comment')

    comment_data = []

    for comment in comments:
        data_info = comment['data-info']
        data_info = data_info.replace("'", "").replace(" ", "")
        data_info_dict = dict(item.split(":") for item in data_info.split(",") if ":" in item)

        comment_id = data_info_dict.get('commentNo')
        reply_level = data_info_dict.get('replyLevel')
        parent_comment_no = data_info_dict.get('parentCommentNo')
        comment_text = comment.select_one('.u_cbox_contents').text
        comment_date = comment.select_one('.u_cbox_date')['data-value']
        comment_author = comment.select_one('.u_cbox_nick').text

        # Extract likes and dislikes
        likes = comment.select_one('.u_cbox_tool .u_cbox_cnt_recomm').text if comment.select_one('.u_cbox_tool .u_cbox_cnt_recomm') else "0"
        dislikes = comment.select_one('.u_cbox_tool .u_cbox_cnt_unrecomm').text if comment.select_one('.u_cbox_tool .u_cbox_cnt_unrecomm') else "0"

        comment_data.append([episode_url, episode_name, episode_likes, episode_date,\
                             comment_id, reply_level, parent_comment_no, comment_text, comment_date, comment_author, likes, dislikes])
    
    # Find all reply buttons again after scraping
    reply_buttons_after_scraping = driver.find_elements(By.CSS_SELECTOR, "a.u_cbox_btn_reply")

    # Click the reply buttons again to close the expanded replies
    for button in reply_buttons_after_scraping:
        try:
            reply_count_text = button.find_element(By.CSS_SELECTOR, "span.u_cbox_reply_cnt").text
            reply_count = int(reply_count_text) if reply_count_text else 0
        except Exception:
            reply_count = 0

        if reply_count > 0:
            driver.execute_script("arguments[0].click();", button)  # Click the button
            time.sleep(3)  # Allow time for the comments to load
    
    return comment_data


In [39]:
# Function to scrape all comments from all episodes
def scrape_all_comments(driver, episodes):
    def next_button_exists(driver, next_page):
        # Check if the next button or next 10 pages button exists on the page
        try:
            driver.find_element(By.CSS_SELECTOR, f"a.u_cbox_page[data-param='{next_page}']")
            return True
        except NoSuchElementException:
            try:
                driver.find_element(By.CSS_SELECTOR, "a.u_cbox_next")
                return True
            except NoSuchElementException:
                return False

    all_comments_data = []

    # Iterate through all episodes
    for episode in episodes:
        episode_url = episode["url"]
        episode_name = episode['name']
        episode_likes = episode['likes']
        episode_date = episode['date']
        print(f"Scraping comments from {episode_name}")
        driver.get(episode_url)
        time.sleep(5)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)

        current_page = 1
        # Loop through all the pages of comments for the current episode
        while True:
            all_comments_data.extend(scrape_comments(driver, episode_url, episode_name, episode_likes, episode_date))

            # Check if the next page button exists
            next_page = current_page + 1
            if not next_button_exists(driver, next_page):
                break

            # Click the next page button
            try:
                next_button_css = f"a.u_cbox_page[data-param='{next_page}']"
                next_button = driver.find_element(By.CSS_SELECTOR, next_button_css)
            except NoSuchElementException:
                # If the button for the next_page is not found, try clicking the next 10 pages button
                next_button_css = "a.u_cbox_next"
                next_button = driver.find_element(By.CSS_SELECTOR, next_button_css)

            # Scroll the next_button into view
            driver.execute_script("arguments[0].scrollIntoView();", next_button)

            # Wait for the next_button to be clickable and click it
            wait = WebDriverWait(driver, 10)
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, next_button_css)))
            next_button.click()

            # Wait for the next page of comments to load
            time.sleep(5)

            # Update the current_page
            current_page = next_page

    return all_comments_data

In [40]:
# Function to extract information about all episodes
def extract_episodes_info(driver, main_url, min_page, max_page):

    episodes_info = []

    for page_num in range(min_page, max_page + 1):
        if page_num == min_page:
            page_url = f"{main_url}&page={page_num}?lang=en"
        else:
            page_url = f"{main_url}&page={page_num}"
        
        driver.get(page_url)

        episode_items = driver.find_elements(By.CSS_SELECTOR, 'li._episodeItem')

        for episode_item in episode_items:
            episode_name = episode_item.find_element(By.CSS_SELECTOR, 'span.subj').text.strip()
            episode_url = episode_item.find_element(By.CSS_SELECTOR, 'a').get_attribute('href')
            # episode likes
            likes_text = episode_item.find_element(By.CSS_SELECTOR, 'span.like_area').text.strip()
            likes_text = likes_text.replace(",", "")  # Remove commas
            episode_likes = int(re.search(r'\d+', likes_text).group())  # Extract the number and convert it to an integer
            episode_date = episode_item.find_element(By.CSS_SELECTOR, 'span.date').text.strip()

            episodes_info.append({
                'name': episode_name,
                'url': episode_url,
                'likes': episode_likes,
                'date': episode_date
            })

    return episodes_info

In [41]:
# convert the list of comments to a dataframe
def format_and_save_to_csv(all_comments):
    column_names = [
        "Episode_URL", "Episode_Name", "Episode_Likes", "Episode_Date",
        "Comment_ID", "Comment_Reply_Level", "Comment_Parent_Comment_No",
        "Comment_Text", "Comment_Date", "Comment_Author",
        "Comment_Likes", "Comment_Dislikes",
    ]

    df = pd.DataFrame(all_comments, columns=column_names)    

    # Convert the Comment_Reply_Level column to integer
    df['Comment_Reply_Level'] = df['Comment_Reply_Level'].astype(int)

    # max level of reply
    max_level = df['Comment_Reply_Level'].max()

    if max_level > 1:
        # Loop through the levels starting from 2
        for level in range(2, max_level + 1):
            if level == 2:
                # Merge the original DataFrame with the level 2 DataFrame
                merged_df = pd.merge(
                    df[df['Comment_Reply_Level'] == 1],
                    df[df['Comment_Reply_Level'] == 2],
                    left_on='Comment_ID',
                    right_on='Comment_Parent_Comment_No',
                    suffixes=('_level1', '_level2'),
                    how='left'
                )
            else:
                # Create a new DataFrame for the current level
                df_level = df[df['Comment_Reply_Level'] == level]
                
                # Merge the current level DataFrame with the previous merged DataFrame
                merged_df = pd.merge(
                    merged_df,
                    df_level,
                    left_on='Comment_ID_level{}'.format(level - 1),
                    right_on='Comment_Parent_Comment_No_level{}'.format(level),
                    suffixes=('_level{}'.format(level - 1), '_level{}'.format(level)),
                    how='left'
                )

        # Select the desired columns for the final DataFrame
        final_df = merged_df

        # Reset the index of the final DataFrame
        final_df.reset_index(drop=True, inplace=True)

    else:
        # Select the desired columns for the final DataFrame
        final_df = df

    # Create a new folder with the date and time in the folder name
    folder_name = datetime.datetime.now().strftime('%Y-%m-%d')
    os.makedirs(folder_name, exist_ok=True)

    # Use the first 'Episode_URL' to extract genre, series name, episode name, and episode number
    first_episode_url = df['Episode_URL'].iloc[-1]
    genre, series_name, episode_name, episode_number = parse_url(first_episode_url)

    # Save the DataFrame to a CSV file with the date and time in the filename
    comments_csv_path = f"{folder_name}/{genre}_{series_name}_{episode_number}_{episode_name}_comments_{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.csv"
    final_df.to_csv(comments_csv_path, index=False, sep=";", encoding="utf-8-sig")


In [42]:
# Parse the URL to extract genre, series name, episode name, and episode number
def parse_url(url):
    # get genre, series name, episode name
    path = urlparse(url).path
    components = path.split('/')
    genre = components[2]
    series_name = components[3]
    episode_name = components[4]
    # get id
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    episode_number = query_params.get('episode_no', [None])[0]
    return genre, series_name, episode_name, episode_number

## Scrap
### Webtoons

In [43]:
# URL setup
url_list = [
    {"url": "https://www.webtoons.com/en/action/weakhero/list?title_no=1726", "min_page": 1, "max_page": 24},
    {"url": "https://www.webtoons.com/en/challenge/how-to-be-a-dragon/list?title_no=696410", "min_page": 1, "max_page": 4},
    {"url": "https://www.webtoons.com/en/challenge/nerd-and-jock/list?title_no=135963", "min_page": 1, "max_page": 20},
    {"url": "https://www.webtoons.com/en/challenge/goth-girl-the-jock/list?title_no=764411", "min_page": 1, "max_page": 4},
    {"url": "https://www.webtoons.com/en/challenge/seekers-log/list?title_no=102095", "min_page": 1, "max_page": 16},
    {"url": "https://www.webtoons.com/en/challenge/power-pills/list?title_no=18222", "min_page": 1, "max_page": 40},
    {"url": "https://www.webtoons.com/en/challenge/a-life-through-selfies/list?title_no=64761", "min_page": 1, "max_page": 32},
    {"url": "https://www.webtoons.com/en/fantasy/my-husband-changes-every-night/list?title_no=5214", "min_page": 1, "max_page": 1},
    
]

'''
trop long : {"url": "https://www.webtoons.com/en/romance/lore-olympus/list?title_no=1320", "min_page": 1, "max_page": 25}
'''


'\ntrop long : {"url": "https://www.webtoons.com/en/romance/lore-olympus/list?title_no=1320", "min_page": 1, "max_page": 25}\n'

In [44]:
# skip episodes already scraped

# Directory containing the CSV files
csv_directory = "2023-05-18"

# A list to hold all the data
data = []

# Iterate over each file in the directory
for filename in os.listdir(csv_directory):
    # Check if the file is a CSV file
    if filename.endswith(".csv"):
        # Construct the full file path
        full_filepath = os.path.join(csv_directory, filename)
        
        # Load the data from the CSV file
        df = pl.read_csv(full_filepath, separator=";")
        
        # Add a column to track the original file
        df = df.with_columns(pl.lit(filename).alias('original_file'))
        
        # Add the data to our list
        data.append(df)

# Combine all the data into one DataFrame
all_data = pl.concat(data)

# get all the unique values in the first column
skip_urls = all_data['Episode_URL_level1'].unique().to_list()

# add problematic urls to skip_urls
skip_urls.extend(['https://www.webtoons.com/en/action/weakhero/s3-ep-158/viewer?title_no=1726&episode_no=159',
])


### Scrap comments

In [45]:
def scrape_webtoons(episodes_info, all_comments, skip_urls):
    # Scrape the comments
    with tqdm(total=len(episodes_info), desc="Scraping comments") as pbar_outer:
        for episode in tqdm(episodes_info, desc="Episodes"):
            # skip urls that are already scraped
            if episode['url'] not in skip_urls:
                comments = scrape_all_comments(driver, [episode])
                # Append comments to the DataFrame
                all_comments.extend(comments)
                # wait a random time between 1 and 5 seconds
                time.sleep(random.randint(1, 5))
                # Save the comments to a CSV file
                format_and_save_to_csv(all_comments)
                # Reset all_comments
                all_comments = []
            # Update the progress bar
            pbar_outer.update(1)

In [46]:
# Iterate over each URL
for url_dict in url_list:
    
    all_comments = []

    # Selenium setup
    chrome_options = Options()
    chrome_options.add_argument("--lang=en") # Set the language to English
    #chrome_options.add_argument("--headless") # Run the browser in headless mode, if desired
    driver = webdriver.Chrome(service=Service(executable_path=ChromeDriverManager().install()), options=chrome_options)

    try:
        # URL setup
        main_url = url_dict["url"]
        min_page = url_dict["min_page"]
        max_page = url_dict["max_page"]


        # Extract the episodes info
        episodes_info = extract_episodes_info(driver, main_url, min_page, max_page)

        # Scrape the comments
        scrape_webtoons(episodes_info, all_comments, skip_urls)

    except Exception as e:
        print(f"An error occurred: {e}")
    
    finally:
        # Close the browser
        driver.quit()

[WDM] - Downloading: 100%|██████████| 6.81M/6.81M [00:00<00:00, 76.6MB/s]
Episodes: 100%|██████████| 240/240 [00:00<00:00, 60015.08it/s]
Scraping comments: 100%|██████████| 240/240 [00:00<00:00, 40004.49it/s]
Episodes: 100%|██████████| 33/33 [00:00<00:00, 33002.39it/s]
Scraping comments: 100%|██████████| 33/33 [00:00<00:00, 8244.21it/s]
Scraping comments:   0%|          | 0/199 [00:00<?, ?it/s]

Scraping comments from Nerd and Jock Ep 199


Episodes:   0%|          | 0/199 [00:56<?, ?it/s]
Scraping comments:   0%|          | 0/199 [00:56<?, ?it/s]


An error occurred: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=113.0.5672.127)
Stacktrace:
Backtrace:
	GetHandleVerifier [0x00388893+48451]
	(No symbol) [0x0031B8A1]
	(No symbol) [0x00225058]
	(No symbol) [0x0020D073]
	(No symbol) [0x0026DEBB]
	(No symbol) [0x0027BFD3]
	(No symbol) [0x0026A0B6]
	(No symbol) [0x00247E08]
	(No symbol) [0x00248F2D]
	GetHandleVerifier [0x005E8E3A+2540266]
	GetHandleVerifier [0x00628959+2801161]
	GetHandleVerifier [0x0062295C+2776588]
	GetHandleVerifier [0x00412280+612144]
	(No symbol) [0x00324F6C]
	(No symbol) [0x003211D8]
	(No symbol) [0x003212BB]
	(No symbol) [0x00314857]
	BaseThreadInitThunk [0x76F400C9+25]
	RtlGetAppContainerNamedObjectPath [0x77657B4E+286]
	RtlGetAppContainerNamedObjectPath [0x77657B1E+238]



KeyboardInterrupt: 