In [18]:
import requests
import json
import csv
import time
import os
from urllib.parse import urlparse

In [19]:
# ====== Configuration ======
# Output locations. Only the CSV path is used by the cells visible in this
# notebook; json_output_file is not referenced by them.
json_output_file = './data/crawl/tiktok_comments.json'
csv_output_file = './data/crawl/tiktok_comments.csv'
# Seconds slept between consecutive page requests for the same video.
request_delay = 0.4
# Maximum number of attempts req() makes for a single page.
max_retries = 3
# Seconds slept after a failed attempt before the next one.
retry_delay = 0.5

# Videos whose comments are crawled. The video id is taken from the last path
# segment of each URL, so a query string (as on the final entry) does not
# affect the id. The full URL string, query included, is what gets stored in
# the CSV and compared against when resuming.
post_urls = [
    'https://www.tiktok.com/@vnexpress.official/video/7491513522380131591',
    'https://www.tiktok.com/@vnexpress.official/video/7488627753286159634',
    'https://www.tiktok.com/@tienvekechuyen/video/7486836163353824518',
    'https://www.tiktok.com/@hoangnamtien/video/7488600544383077638',
    'https://www.tiktok.com/@vietnamhungcuong92/video/7490965989215997192',
    'https://www.tiktok.com/@wesaigon/video/7495135477184466194',
    'https://www.tiktok.com/@tnhuy16/video/7493787518131834113',
    'https://www.tiktok.com/@theanh28entertainment/video/7496074735076396295',
    'https://www.tiktok.com/@nguyentien93/video/6995044063556750594?q=ch%C3%A1y%20nh%C3%A0&t=1747561808829'
]

# HTTP headers sent with every comment-list request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'https://www.tiktok.com/',
    'Accept': 'application/json',
}

In [20]:
# ====== Helper functions ======

def get_post_id(post_url):
    """Return the video id, i.e. the last non-empty path segment of ``post_url``.

    The query string and fragment are ignored because only the parsed path is
    inspected. A trailing slash is stripped first so that
    ``.../video/123/`` yields ``'123'`` rather than an empty string.

    Args:
        post_url: Full URL of the video page.

    Returns:
        The last path segment as a string ('' when the URL has no path).
    """
    path = urlparse(post_url).path.rstrip('/')
    return path.rsplit('/', 1)[-1]


def req(post_id, cursor, count=20):
    """Fetch one page of comments for a video, retrying on failure.

    Sends a GET to the comment-list endpoint with the module-level ``headers``
    and makes up to ``max_retries`` attempts. Every failed attempt (a non-200
    status, a network error, or a body that is not valid JSON) is followed by
    a ``retry_delay`` pause, except the last one.

    Args:
        post_id: Video id, sent as the ``aweme_id`` query parameter.
        cursor: Paging offset, sent as the ``cursor`` query parameter.
        count: Page size, sent as the ``count`` query parameter. Defaults to
            20, the value that was previously hard-coded.

    Returns:
        The decoded JSON body of the first HTTP 200 response, or ``None`` when
        all attempts fail.
    """
    # The parameters do not change between attempts, so build them once.
    params = {
        'aid': '1988',
        'aweme_id': post_id,
        'cursor': str(cursor),
        'count': str(count),
        'webcast_language': 'en',
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(
                'https://www.tiktok.com/api/comment/list/',
                headers=headers,
                params=params,
                timeout=10
            )

            if response.status_code == 200:
                return response.json()
            print(f"HTTP Error {response.status_code}")
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a 200 response whose body is not valid JSON.
            print(f"Attempt {attempt + 1} failed: {e}")

        # Back off after any failed attempt, but not after the final one,
        # where sleeping would only delay the return.
        if attempt < max_retries - 1:
            time.sleep(retry_delay)

    print("Max retries reached.")
    return None


def process_comments(data, post_url):
    """Flatten one comment-list response into a list of row dicts.

    Args:
        data: Decoded JSON response (a dict) or a falsy value.
        post_url: URL of the video, copied into every row.

    Returns:
        A ``(rows, has_more)`` tuple. ``rows`` holds one dict per comment that
        has non-empty text; ``has_more`` is True when the response's
        ``has_more`` field equals 1. When ``data`` is falsy, or its
        ``comments`` entry is missing or null, the result is ``([], False)``.
    """
    if not data or data.get('comments') is None:
        # A key that is present but null is treated the same as a missing key.
        return [], False

    comments = []
    for cm in data['comments']:
        # `or {}` also covers a key that is present with a null value; a
        # plain .get(key, {}) default only applies when the key is absent.
        share_info = cm.get('share_info') or {}
        user = cm.get('user') or {}

        # Fall back to the share description when the comment has no text.
        comment_text = cm.get('text') or share_info.get('desc', '')
        if not comment_text:
            continue

        comments.append({
            'post_url': post_url,
            'comment_id': cm.get('cid', ''),
            'text': comment_text,
            'user': user.get('unique_id', 'Unknown'),
            'nickname': user.get('nickname', 'Unknown'),
            'likes': cm.get('digg_count', 0),
            'timestamp': cm.get('create_time', 0),
            'reply_count': cm.get('reply_comment_total', 0)
        })

    return comments, data.get('has_more', 0) == 1


def crawl_video_comments(post_url):
    """Collect the top-level comments of one video, page by page.

    Pages are requested with ``req`` at increasing offsets until the request
    fails, the response carries no comments, or the response stops reporting
    ``has_more``.

    Args:
        post_url: Full URL of the video page.

    Returns:
        A list of comment row dicts as produced by ``process_comments``
        (possibly empty).
    """
    # Offset advance per page; must match the page size req() asks for.
    # NOTE(review): the response may carry its own next-page cursor — confirm,
    # and prefer it over a fixed stride if so.
    page_size = 20

    post_id = get_post_id(post_url)
    print(f"\n🟢 Crawling: {post_url}")
    comments = []
    cursor = 0

    while True:
        data = req(post_id, cursor)
        if not data:
            break

        # Stop on an empty page even if `has_more` is still set; otherwise a
        # response that keeps reporting more data with no comments would
        # loop forever.
        page = data.get('comments') if isinstance(data, dict) else None
        if not page:
            break

        batch, has_more = process_comments(data, post_url)
        comments.extend(batch)
        cursor += page_size
        print(f"   Collected: {len(comments)} comments so far...")

        if not has_more:
            break
        # Throttle only between requests, not after the last page.
        time.sleep(request_delay)

    return comments


def read_processed_urls(csv_file):
    """Return the set of ``post_url`` values already present in the CSV.

    Used to skip videos that were crawled in an earlier run.

    Args:
        csv_file: Path of the CSV written by ``append_comments_to_csv``.

    Returns:
        A set of non-empty ``post_url`` strings. The set is empty when the
        file does not exist, is empty, or has no ``post_url`` column. If
        reading fails part-way, the URLs read so far are returned and the
        error is printed.
    """
    processed = set()
    if not os.path.exists(csv_file):
        return processed

    try:
        # newline='' is what the csv module requires of file objects, so that
        # line breaks embedded in quoted fields are parsed correctly.
        with open(csv_file, 'r', newline='', encoding='utf-8') as f:
            reader = csv.DictReader(f)
            fieldnames = reader.fieldnames or []
            if fieldnames and 'post_url' not in fieldnames:
                print(f"Error reading CSV: no 'post_url' column in {csv_file}")
                return processed
            for row in reader:
                url = row.get('post_url')
                if url:
                    processed.add(url)
    except (OSError, csv.Error, UnicodeError) as e:
        print(f"Error reading CSV: {e}")
    return processed


def append_comments_to_csv(csv_file, comments):
    """Append comment rows to a CSV file, writing the header when needed.

    The parent directory is created if it does not exist, so a freshly
    crawled batch is not lost just because the output folder is missing.
    The header row is written only when the file is new or empty. Column
    names are taken from the keys of the first row.

    Args:
        csv_file: Destination CSV path.
        comments: List of row dicts sharing the same keys. Nothing is written
            when the list is empty.

    Returns:
        None. Write failures are printed rather than raised.
    """
    if not comments:
        return

    fieldnames = list(comments[0].keys())

    try:
        parent_dir = os.path.dirname(csv_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        write_header = not os.path.exists(csv_file) or os.path.getsize(csv_file) == 0

        with open(csv_file, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            if write_header:
                writer.writeheader()
            writer.writerows(comments)
        print(f"✅ Appended {len(comments)} comments to {csv_file}")
    except (OSError, csv.Error, ValueError) as e:
        # ValueError is what DictWriter raises for a row with unexpected keys.
        print(f"Error writing CSV: {e}")

In [21]:
# Resume support: skip every video whose URL already appears in the CSV.
processed_urls = read_processed_urls(csv_output_file)
remaining_urls = [url for url in post_urls if url not in processed_urls]
# Count only configured videos as processed, so the three numbers in the
# summary always add up even if the CSV holds URLs from other runs.
processed_count = len(post_urls) - len(remaining_urls)

print(f"\n📊 Tổng video: {len(post_urls)} | Đã xử lý: {processed_count} | Còn lại: {len(remaining_urls)}")

for url in remaining_urls:
    # A failure on one video is reported and must not stop the others.
    try:
        comments = crawl_video_comments(url)
        if comments:
            append_comments_to_csv(csv_output_file, comments)
        else:
            # Nothing is written for this URL, so it will be retried on the
            # next run; say so instead of skipping it silently.
            print(f"⚠️ No comments collected for {url}")
    except Exception as e:
        print(f"❌ Error crawling {url}: {e}")


📊 Tổng video: 9 | Đã xử lý: 9 | Còn lại: 0
