In [3]:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
import re

# Initialize cloudscraper
scraper = cloudscraper.create_scraper()

# Function to scrape the list of threads on a forum page
def scrape_thread_list(page_number):
    url = f'https://voz.vn/f/diem-bao.33/page-{page_number}'
    try:
        response = scraper.get(url)
        response.raise_for_status()
    except Exception as e:
        print(f"Error fetching thread list page {page_number}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    threads_data = []

    for item in soup.find_all('div', class_='structItem-cell structItem-cell--main'):
        title_link = item.find('a', {'data-tp-primary': 'on'})
        user_link = item.find('a', class_='username')
        if title_link and user_link:
            title = title_link.get_text(strip=True)
            link = title_link['href']  # Extract the relative link
            user = user_link.get_text(strip=True)
            threads_data.append({'Title': title, 'Link': link, 'User': user})

    return threads_data

# Function to scrape posts on a single thread page
def scrape_thread_page(url):
    try:
        response = scraper.get(url)
        response.raise_for_status()
    except Exception as e:
        print(f"Error fetching thread page: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    posts_data = []

    for post in soup.find_all('article', class_='message--post'):
        # Extract username of the poster
        user_section = post.find('section', class_='message-user')
        user_name = user_section.find('a', class_='username').get_text(strip=True) if user_section else "Unknown"

        # Extract content
        content_div = post.find('div', class_='bbWrapper')
        if content_div:
            # Extract the original quoted comment (inside blockquote)
            og_comment = None
            reply_to_user = None
            reply_to_div = content_div.find('div', class_='bbCodeBlock-expandContent')
            blockquote = content_div.find('blockquote')

            # Extract the 'data-quote' attribute for the reply-to user
            if blockquote and blockquote.has_attr('data-quote'):
                reply_to_user = blockquote['data-quote']  # This gets the 'data-quote' attribute value

            # Get the full text of the quoted comment
            if reply_to_div:
                og_comment = reply_to_div.get_text(separator=" ", strip=True)

            # Extract the actual reply content (exclude the quoted text)
            actual_comment = content_div.get_text(separator=" ", strip=True)
            if blockquote:
                quoted_text = blockquote.get_text(separator=" ", strip=True)
                actual_comment = actual_comment.replace(quoted_text, "").strip()

            posts_data.append({
                'User': user_name,
                'Reply_To': reply_to_user,
                'Original_Comment': og_comment,
                'Reply_Content': actual_comment
            })

    return posts_data

# Function to scrape the first 20 pages of a thread
def scrape_first_20_pages(base_url):
    all_data = []
    for page_num in range(1, 31):  # Scrape the first 20 pages
        page_url = f"{base_url}page-{page_num}"
        print(f"Scraping page {page_num}: {page_url}")
        page_data = scrape_thread_page(page_url)
        if not page_data:  # Stop if no more data is found
            break
        all_data.extend(page_data)
    return pd.DataFrame(all_data).drop_duplicates()

# Main script to scrape data from the forum
all_threads_data = []
for page_number in range(1, 21):  # Scrape the first 30 pages of threads
    print(f"Scraping thread list page {page_number}")
    threads = scrape_thread_list(page_number)
    for thread in threads:
        thread_url = 'https://voz.vn' + thread['Link']  # Construct the full URL
        print(f"Scraping thread: {thread['Title']} by user: {thread['User']}")

        # Scrape the thread content
        thread_content_df = scrape_first_20_pages(thread_url)
        if not thread_content_df.empty:
            thread_content_df['Title'] = thread['Title']  # Add thread title
            thread_content_df['OriginalUser'] = thread['User']  # Add thread creator

            # Append the data
            all_threads_data.append(thread_content_df)

# Combine all the data into a single DataFrame
if all_threads_data:
    final_df = pd.concat(all_threads_data, ignore_index=True)

    # Save to CSV
    final_df.to_csv('voz_complete_data.csv', index=False)
    print("Data saved to voz_complete_data.csv")
else:
    print("No data scraped.")

Scraping thread list page 1
Scraping thread: Hậu bầu cử Mỹ ver 2 | Official Thread by user: thuyvan
Scraping page 1: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-1
Scraping page 2: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-2
Scraping page 3: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-3
Scraping page 4: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-4
Scraping page 5: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-5
Scraping page 6: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-6
Scraping page 7: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-7
Scraping page 8: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-8
Scraping page 9: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-9
Scraping page 10: https://voz.vn/t/hau-bau-cu-my-ver-2-official-thread.1039280/page-10
Scraping page 11: https://voz.vn/t/hau-bau-cu-my

In [5]:
import pandas as pd

data = pd.read_csv('voz_complete_data.csv')
# data['Content'][21711]

In [6]:
data

Unnamed: 0,User,Reply_To,Original_Comment,Reply_Content,Title,OriginalUser
0,thuyvan,,,Vì thread Hậu bầu cử ver 1 bị khóa do Google q...,Hậu bầu cử Mỹ ver 2 | Official Thread,thuyvan
1,theoluat,,,lại bị quét,Hậu bầu cử Mỹ ver 2 | Official Thread,thuyvan
2,eragonhnn,,,xí,Hậu bầu cử Mỹ ver 2 | Official Thread,thuyvan
3,Vive la Nation,,,Uýnh dấu,Hậu bầu cử Mỹ ver 2 | Official Thread,thuyvan
4,Anh Ba Ngọc Sơn,,,Mấy topic hay khi mà dài quá sẽ bị GG nó quét ...,Hậu bầu cử Mỹ ver 2 | Official Thread,thuyvan
...,...,...,...,...,...,...
21994,Ô Long Trà Ôn,tzuthegod,"Móa ăn lẩu bên đấy, đặc biệt là đồ Tứ Xuyên th...","Đúng, VN đi ăn lẩu Tứ Xuyên về cũng đau bụng, ...",5 thói quen ăn lẩu hại thận,CuteGecko
21995,Phanh Blank 2,,,6. Thói quen ăn LẨU CUa thiếu kỹ năng 7.,5 thói quen ăn lẩu hại thận,CuteGecko
21996,Cow Biền,,,Lẩu cua đồng,5 thói quen ăn lẩu hại thận,CuteGecko
21997,DungBanNickToiVer4,,,Lẩu cua đồng,5 thói quen ăn lẩu hại thận,CuteGecko


In [7]:
specific_row_df = data.iloc[[1097]]
specific_row_df

Unnamed: 0,User,Reply_To,Original_Comment,Reply_Content,Title,OriginalUser
1097,21041995,Anna Karenina,bớt thuyết âm mưu dùm cái. lên mặt báo cả tòa ...,Xin hình ảnh tòa nhà nát vụn cái?. Hay nguồn f...,Tình hình các điểm nóng xung đột vũ trang trên...,thuyvan


In [8]:
specific_row_df['Original_Comment']

1097    bớt thuyết âm mưu dùm cái. lên mặt báo cả tòa ...
Name: Original_Comment, dtype: object

In [9]:
pd.set_option('display.max_colwidth', None)
print(specific_row_df['Reply_Content'])

1097    Xin hình ảnh tòa nhà nát vụn cái?. Hay nguồn fake news cũng đc ?. Tôi để auto load news mà mới chỉ có vài hố tên lửa, chưa thấy nhà nào nát cả.
Name: Reply_Content, dtype: object
