In [None]:
import requests
import json
import csv
import time
from urllib.parse import urlparse
import os

In [None]:
# Output locations for the freshly crawled data.
# csv_output_file is written by the CSV export cell; json_output_file is only
# referenced by the JSON export cell, which is currently commented out.
json_output_file = './data/crawl/tiktok_comments.json'
csv_output_file = './data/crawl/tiktok_comments.csv'

In [4]:
# TikTok videos whose comments will be crawled.
# The last path segment of each URL is used as the video ID (see get_post_id).
post_urls = [
    'https://www.tiktok.com/@vnexpress.official/video/7491513522380131591',
    'https://www.tiktok.com/@vnexpress.official/video/7488627753286159634',
    'https://www.tiktok.com/@tienvekechuyen/video/7486836163353824518',
    'https://www.tiktok.com/@hoangnamtien/video/7488600544383077638',
    'https://www.tiktok.com/@vietnamhungcuong92/video/7490965989215997192',
    'https://www.tiktok.com/@wesaigon/video/7495135477184466194',
    'https://www.tiktok.com/@tnhuy16/video/7493787518131834113',
    'https://www.tiktok.com/@theanh28entertainment/video/7496074735076396295'
]

In [5]:
# HTTP headers sent with every comment-list request made by req().
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    'Referer': 'https://www.tiktok.com/',
    'Accept': 'application/json',
}

# Collects one result dict per crawled video (filled by the crawl loop cell).
all_comments = []
max_retries = 3  # Attempts req() makes for a single page before giving up
retry_delay = 0.5  # Seconds slept between failed attempts
request_delay = 0.4  # Seconds to wait between consecutive page requests

In [6]:
def get_post_id(post_url):
    """Extract the video ID from a TikTok video URL.

    The ID is taken to be the last non-empty segment of the URL path, so the
    query string, the fragment and any trailing slashes are ignored.

    Args:
        post_url: Video URL, e.g.
            ``https://www.tiktok.com/@user/video/1234567890``.

    Returns:
        The last non-empty path segment as a string, or ``''`` when the URL
        has no path at all.
    """
    path = urlparse(post_url).path
    # Drop trailing slashes first; otherwise ".../video/123/" would yield ''.
    return path.rstrip('/').split('/')[-1]

In [7]:
def req(post_id, cursor):
    """Fetch one page of comments for a video from the TikTok web API.

    Up to ``max_retries`` attempts are made, sleeping ``retry_delay`` seconds
    between attempts. Network errors, non-200 responses and bodies that are
    not valid JSON are all treated as retryable failures.

    Args:
        post_id: Video ID, sent as the ``aweme_id`` query parameter.
        cursor: Pagination cursor, sent as the ``cursor`` query parameter.

    Returns:
        The decoded JSON payload, or ``None`` when every attempt failed.
    """
    # The query is identical for every attempt, so build it once.
    params = {
        'aid': '1988',
        'aweme_id': post_id,
        'cursor': str(cursor),
        'count': '20',
        'webcast_language': 'en',
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(
                'https://www.tiktok.com/api/comment/list/',
                headers=headers,
                params=params,
                timeout=10
            )

            if response.status_code != 200:
                # Raise a requests exception rather than a bare ValueError so
                # that a bad status is retried below instead of escaping req()
                # and aborting the whole video.
                raise requests.exceptions.HTTPError(
                    f"HTTP Status: {response.status_code}", response=response
                )

            return response.json()

        except ValueError:
            # JSON decoding errors are ValueError subclasses; HTTPError is
            # not, so it falls through to the next handler.
            print(f"Invalid JSON response (attempt {attempt + 1})")
        except requests.exceptions.RequestException as e:
            print(f"Request failed (attempt {attempt + 1}): {str(e)}")

        # Only reached after a failed attempt; no point sleeping after the last.
        if attempt < max_retries - 1:
            time.sleep(retry_delay)

    print("Max retries reached, skipping...")
    return None

In [8]:
def process_comments(data, post_url):
    """Convert one API response page into flat comment records.

    Args:
        data: Decoded response returned by ``req()``. Anything that is not a
            non-empty dict containing a ``'comments'`` key yields an empty
            result.
        post_url: URL of the video; copied into every record.

    Returns:
        A ``(comments, has_more)`` tuple. ``comments`` is a list of dicts with
        the keys ``post_url``, ``comment_id``, ``text``, ``user``,
        ``nickname``, ``likes``, ``timestamp`` and ``reply_count``; entries
        without any text are skipped. ``has_more`` is ``True`` only when
        ``data['has_more']`` equals 1.
    """
    if not data or not isinstance(data, dict) or 'comments' not in data:
        return [], False

    comments = []
    # The key can be present but null; treat that as an empty page instead of
    # letting the loop raise TypeError.
    for cm in data['comments'] or []:
        try:
            # Null sub-objects fall back to {} so a comment that has text is
            # not dropped just because its user/share_info is missing.
            share_info = cm.get('share_info') or {}
            user = cm.get('user') or {}

            comment_text = cm.get('text', '') or share_info.get('desc', '')
            if not comment_text:
                continue

            comments.append({
                'post_url': post_url,
                'comment_id': cm.get('cid', ''),
                'text': comment_text,
                'user': user.get('unique_id', 'Unknown'),
                'nickname': user.get('nickname', 'Unknown'),
                'likes': cm.get('digg_count', 0),
                'timestamp': cm.get('create_time', 0),
                'reply_count': cm.get('reply_comment_total', 0)
            })
        except Exception as e:
            # Best effort: one malformed entry must not lose the whole page.
            print(f"Error processing comment: {str(e)}")

    return comments, data.get('has_more', 0) == 1

In [9]:
def crawl_video_comments(post_url):
    """Crawl the comments of a single video, page by page.

    Pages are requested with ``req()`` and parsed with ``process_comments()``
    until the response reports no further pages, a request fails after all
    retries, or a page comes back empty. ``request_delay`` seconds are slept
    after each page.

    Args:
        post_url: URL of the TikTok video.

    Returns:
        A dict with the keys ``post_url``, ``post_id``, ``total_comments``
        (number of collected records) and ``comments`` (the records).
    """
    post_id = get_post_id(post_url)
    print(f"\nStarting to crawl comments for video: {post_url}")

    page_size = 20  # Must match the 'count' value that req() asks for
    video_comments = []
    cursor = 0
    has_more = True

    while has_more:
        data = req(post_id, cursor)
        if not data:
            break

        comments, has_more = process_comments(data, post_url)
        video_comments.extend(comments)

        # A page with no raw comments that still claims has_more would make
        # this loop spin forever, so stop instead of advancing the cursor.
        if has_more and not data.get('comments'):
            print("Received an empty page, stopping...")
            break

        cursor += page_size
        # Report what was actually collected; the cursor is only an offset and
        # overstates the count when a page is short or entries are skipped.
        print(f"Processed {len(video_comments)} comments...")
        time.sleep(request_delay)

    return {
        'post_url': post_url,
        'post_id': post_id,
        'total_comments': len(video_comments),
        'comments': video_comments
    }

In [10]:
# Crawl the comments of every video in the list.
# Start from an empty list so that re-running this cell does not append a
# second copy of every video's results to all_comments.
all_comments = []
for post_url in post_urls:
    try:
        video_data = crawl_video_comments(post_url)
        all_comments.append(video_data)
        print(f"Finished crawling {video_data['total_comments']} comments from {post_url}")
    except Exception as e:
        # Best effort: a failure on one video must not stop the remaining ones.
        print(f"Error crawling {post_url}: {str(e)}")
        continue


Starting to crawl comments for video: https://www.tiktok.com/@vnexpress.official/video/7491513522380131591
Processed 20 comments...
Processed 40 comments...
Processed 60 comments...
Processed 80 comments...
Processed 100 comments...
Processed 120 comments...
Processed 140 comments...
Processed 160 comments...
Processed 180 comments...
Processed 200 comments...
Processed 220 comments...
Processed 240 comments...
Processed 260 comments...
Processed 280 comments...
Processed 300 comments...
Processed 320 comments...
Processed 340 comments...
Processed 360 comments...
Processed 380 comments...
Processed 400 comments...
Processed 420 comments...
Processed 440 comments...
Processed 460 comments...
Processed 480 comments...
Processed 500 comments...
Processed 520 comments...
Processed 540 comments...
Processed 560 comments...
Processed 580 comments...
Processed 600 comments...
Processed 620 comments...
Processed 640 comments...
Processed 660 comments...
Processed 680 comments...
Processed 70

In [11]:
# NOTE: JSON export is intentionally disabled. Uncomment the lines below to
# also dump all_comments to json_output_file.
# try:
#     with open(json_output_file, 'w', encoding='utf-8') as f:
#         json.dump(all_comments, f, ensure_ascii=False, indent=4)
#     print(f"\nJSON: Saved to {json_output_file}")
# except Exception as e:
#     print(f"\nError saving JSON file: {str(e)}")

In [12]:
# Write every crawled comment to a single CSV file, one row per comment.
try:
    # Flatten first so the header comes from the first comment of ANY video.
    # Taking it from all_comments[0] alone would write nothing at all whenever
    # the first video happened to have no comments.
    rows = [comment for video in all_comments for comment in video['comments']]

    # open() does not create missing parent directories.
    output_dir = os.path.dirname(csv_output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(csv_output_file, 'w', newline='', encoding='utf-8') as csvfile:
        if rows:
            writer = csv.DictWriter(csvfile, fieldnames=list(rows[0].keys()))
            writer.writeheader()
            writer.writerows(rows)

    total_videos = len(all_comments)
    total_comments = len(rows)
    print(f"CSV: Saved {total_comments} comments from {total_videos} videos to {csv_output_file}")
except Exception as e:
    print(f"\nError saving CSV file: {str(e)}")

CSV: Saved 19611 comments from 8 videos to ./data/crawl/tiktok_comments.csv
