In [6]:
import time
import random
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import yt_dlp
import pandas as pd
from tqdm import tqdm

## 채널url 입력시 채널영상 크롤링

In [83]:
# ------------------------------
# 공통 유틸
# ------------------------------
def _sec_to_hms(sec: Optional[int]) -> Optional[str]:
    if sec is None:
        return None
    m, s = divmod(int(sec), 60)
    h, m = divmod(m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}"


# ------------------------------
# 1) 채널 업로드 목록에서 영상 URL 수집
# ------------------------------
def list_channel_video_urls(
    channel_url: str,
    max_videos: Optional[int] = None,
) -> Tuple[List[str], Dict]:
    """
    yt_dlp로 채널 업로드 목록(playlist처럼 동작)을 '평면 추출'해서 영상 URL만 빠르게 수집.
    필요시 '/videos' 탭으로 재시도.
    반환: (video_urls, channel_meta)
    """
    def _extract_entries(url: str):
        ydl_opts = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
            "noplaylist": False,          # 채널은 내부적으로 playlist처럼 처리
            "extract_flat": True,         # 목록만 빠르게
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            return ydl.extract_info(url, download=False)

    info = None
    try:
        info = _extract_entries(channel_url)
    except Exception:
        # /videos 탭으로 1차 보정
        if "/videos" not in channel_url:
            fixed = channel_url.rstrip("/") + "/videos"
            info = _extract_entries(fixed)
        else:
            raise

    # 엔트리 없으면 /videos로 최종 재시도
    entries = (info or {}).get("entries") or []
    if not entries and "/videos" not in channel_url:
        fixed = channel_url.rstrip("/") + "/videos"
        info = _extract_entries(fixed)
        entries = (info or {}).get("entries") or []

    if not entries:
        raise RuntimeError("채널 업로드 목록을 찾지 못했습니다. 채널 URL이 맞는지 확인하거나 '/videos' 탭 URL을 사용하세요.")

    # 채널 메타
    channel_meta = {
        "channel_title": info.get("channel") or info.get("uploader"),
        "channel_id": info.get("channel_id") or info.get("uploader_id"),
        "channel_url": info.get("uploader_url") or channel_url,
        "subscriber_count": info.get("channel_follower_count"),
    }

    # 평면 엔트리 → 동영상 URL 생성
    urls: List[str] = []
    for e in entries:
        if isinstance(e, dict):
            # e["url"]이 id만 들어있을 수 있으니 보정
            raw = e.get("url") or e.get("id")
            if not raw:
                continue
            full = raw if str(raw).startswith("http") else f"https://www.youtube.com/watch?v={raw}"
            # Shorts 등도 업로드 목록에 포함될 수 있음(필터는 상세 단계에서 가능)
            urls.append(full)
        elif isinstance(e, str):
            urls.append(f"https://www.youtube.com/watch?v={e}")

        if max_videos and len(urls) >= max_videos:
            break

    # 중복 제거(간혹 중복이 섞이는 경우 방지)
    urls = list(dict.fromkeys(urls))
    return urls, channel_meta


# ------------------------------
# 2) 단일 영상 상세 메타데이터 수집
# ------------------------------
def get_youtube_video_info(video_url: str) -> Dict:
    ydl_opts = {
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False)

    # 날짜 ISO 변환
    upload_date = info.get("upload_date")
    published_date = None
    if upload_date:
        try:
            published_date = datetime.strptime(upload_date, "%Y%m%d").date().isoformat()
        except Exception:
            published_date = upload_date  # 원본 유지

    # 가장 큰 썸네일
    thumbnail_url = None
    thumbs = info.get("thumbnails")
    if isinstance(thumbs, list) and thumbs:
        thumbnail_url = sorted(thumbs, key=lambda t: t.get("height", 0))[-1].get("url")

    duration_str = info.get("duration_string") or _sec_to_hms(info.get("duration"))
    channel_name = info.get("channel") or info.get("uploader")

    return {
        "title": info.get("title"),
        "video_id": info.get("id"),
        "video_url": f"https://www.youtube.com/watch?v={info.get('id')}" if info.get("id") else video_url,
        "published_date": published_date,
        "thumbnail_url": thumbnail_url,
        "view_count": info.get("view_count"),
        "like_count": info.get("like_count"),
        "comment_count": info.get("comment_count"),
        "duration_hms": duration_str,
        "duration_sec": info.get("duration"),
        "channel_name": channel_name,
        "channel_id": info.get("channel_id") or info.get("uploader_id"),
        "subscriber_count": info.get("channel_follower_count"),

        # 선택 필드
        "categories": info.get("categories"),
        "tags": info.get("tags"),
        "uploader_url": info.get("uploader_url"),
        "age_limit": info.get("age_limit"),
        "availability": info.get("availability"),
        "live_status": info.get("live_status"),
        "description": info.get("description"),
    }


# ------------------------------
# 3) 채널 → (URL 수집) → (각 영상 상세 수집)
# ------------------------------
def collect_channel_videos(
    channel_url: str,
    max_videos: Optional[int] = None,
    include_shorts: bool = True,
    sleep_range: Tuple[float, float] = (0.05, 0.2),
    retry: int = 2,
) -> Tuple[pd.DataFrame, Dict]:
    """
    channel_url: 채널 홈/핸들/UCID URL 모두 가능. 필요시 '/videos'로 자동 보정
    max_videos: 수집 개수 제한(None이면 전체)
    include_shorts: False면 duration<60인 영상(숏츠) 제외
    sleep_range: 요청 간 랜덤 지연(차단 방지)
    retry: 영상별 상세 수집 실패 시 재시도 횟수
    """
    video_urls, channel_meta = list_channel_video_urls(channel_url, max_videos=max_videos)

    rows = []
    for url in tqdm(video_urls):
        last_err = None
        for attempt in range(retry + 1):
            try:
                row = get_youtube_video_info(url)
                # 숏츠 제외 옵션
                if not include_shorts:
                    dur = row.get("duration_sec")
                    if dur is not None and int(dur) < 60:
                        # skip
                        pass
                    else:
                        rows.append(row)
                else:
                    rows.append(row)
                break
            except Exception as e:
                last_err = e
                if attempt < retry:
                    time.sleep(0.5 + attempt * 0.5)
                else:
                    # 실패해도 최소 정보 기록
                    rows.append({
                        "title": None,
                        "video_id": None,
                        "video_url": url,
                        "published_date": None,
                        "thumbnail_url": None,
                        "view_count": None,
                        "like_count": None,
                        "comment_count": None,
                        "duration_hms": None,
                        "duration_sec": None,
                        "channel_name": channel_meta.get("channel_title"),
                        "channel_id": channel_meta.get("channel_id"),
                        "subscriber_count": channel_meta.get("subscriber_count"),
                        "_error": str(last_err),
                    })
        time.sleep(random.uniform(*sleep_range))

    df = pd.DataFrame(rows)

    # 정렬(가능하면)
    if "published_date" in df.columns:
        try:
            df["published_date"] = pd.to_datetime(df["published_date"], errors="coerce")
            df = df.sort_values("published_date", ascending=False).reset_index(drop=True)
        except Exception:
            pass

    return df, channel_meta

In [None]:
if __name__ == "__main__":
    channel_url = "https://www.youtube.com/@KoreanCryingGuy/videos"

    # 옵션
    MAX_VIDEOS = 300
    INCLUDE_SHORTS = False
    SAVE_CSV = True
    CSV_PATH = "channel(@KoreanCryingGuy)_videos_metadata.csv"

    df, meta = collect_channel_videos(
        channel_url=channel_url,
        max_videos=MAX_VIDEOS,
        include_shorts=INCLUDE_SHORTS,
        sleep_range=(0.08, 0.25),
        retry=2
    )

    print("Channel Meta:", meta)
    print("Collected:", len(df))
    print(df.head(5))

    if SAVE_CSV:
        df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
        print(f"Saved -> {CSV_PATH}")

In [8]:
df=pd.read_csv('channel(@KoreanCryingGuy)_videos_metadata.csv', encoding='utf-8-sig')
df

Unnamed: 0,title,video_id,video_url,published_date,thumbnail_url,view_count,like_count,comment_count,duration_hms,duration_sec,channel_name,channel_id,subscriber_count,categories,tags,uploader_url,description
0,[KIA 타이거즈] 선수별 응원가 제안서.pdf,VzzZ70tWMJ8,https://www.youtube.com/watch?v=VzzZ70tWMJ8,2025-09-18,https://i.ytimg.com/vi/VzzZ70tWMJ8/maxresdefau...,174862.0,4022.0,368.0,24:24,1464.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,필라이트 클리어가 12캔에 만원...⁉️\n야구 1회마다 한 캔씩 마셔도 9회까지 ...
1,[무공해] 험한 것이 나와도 김고은이랑 무조건 공감합니다,_ttuPeDExTo,https://www.youtube.com/watch?v=_ttuPeDExTo,2025-09-12,https://i.ytimg.com/vi/_ttuPeDExTo/maxresdefau...,758030.0,10983.0,535.0,36:18,2178.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,"넷플릭스 시리즈 [은중과 상연] 드디어 👉오늘 공개 👈\n\n친구란 게... 참,\..."
2,[무공해] 폭주하는 공감 요정.. 아이브 레이랑 무조건 공감합니다,N5Zk-xH1e0k,https://www.youtube.com/watch?v=N5Zk-xH1e0k,2025-09-05,https://i.ytimg.com/vi/N5Zk-xH1e0k/maxresdefau...,677690.0,9259.0,569.0,35:35,2135.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,*사연자들은 모두 일반인입니다. 무분별한 비난은 삭제될 수 있습니다*\n\n공감이 ...
3,[ENG SUB] 직역된 가사보고 노래 맞히기 (w.태래 규빈 리키),-6vOqZs6CFA,https://www.youtube.com/watch?v=-6vOqZs6CFA,2025-09-02,https://i.ytimg.com/vi/-6vOqZs6CFA/maxresdefau...,192365.0,8331.0,516.0,21:34,1294.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,📌https://bit.ly/3ULB2X7\n👉 스픽XT1 코스를 무료 체험으로 딱...
4,근데 운전은 로이킴이,QxSIqNOMCm4,https://www.youtube.com/watch?v=QxSIqNOMCm4,2025-08-29,https://i.ytimg.com/vi_webp/QxSIqNOMCm4/maxres...,167341.0,1979.0,201.0,24:17,1457.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,쏘카는 보이는 가격 그대로😶\n이제 [총 결제 요금] 한 번만 내고 타세요\n\n5...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,[유병재 라이브] 상훈의 밤 - 랜덤 상황극,6SPQEm3CuB0,https://www.youtube.com/watch?v=6SPQEm3CuB0,2018-09-13,https://i.ytimg.com/vi_webp/6SPQEm3CuB0/maxres...,1250076.0,18173.0,1000.0,24:08,1448.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '유병재유튜브', '문학의밤', '유병재라이브', '유규선', '문상훈']",https://www.youtube.com/@KoreanCryingGuy,
292,[유병재 스케치] 600만 빚의 사나이,P6SwY359Tls,https://www.youtube.com/watch?v=P6SwY359Tls,2018-09-11,https://i.ytimg.com/vi/P6SwY359Tls/sddefault.jpg,385705.0,10696.0,926.0,6:39,399.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '스케치코미디', '유병재유튜브', 'snl']",https://www.youtube.com/@KoreanCryingGuy,유병재 스케치 코미디\nep.1 600만 빚의 사나이
293,[세상에서 가장 고독한 팬미팅] 2시간 풀버전!!,0SJ2z36tXwM,https://www.youtube.com/watch?v=0SJ2z36tXwM,2018-09-07,https://i.ytimg.com/vi_webp/0SJ2z36tXwM/maxres...,432138.0,5425.0,408.0,1:44:45,6285.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,[],https://www.youtube.com/@KoreanCryingGuy,
294,[세상에서 가장 고독한 팬미팅] 고독한 초대손님,QdMT51pQVeA,https://www.youtube.com/watch?v=QdMT51pQVeA,2018-09-06,https://i.ytimg.com/vi_webp/QdMT51pQVeA/maxres...,113727.0,1471.0,122.0,5:06,306.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '문학의밤', '고독한 팬미팅', '사이렌', '방탄소년단', 'BT...",https://www.youtube.com/@KoreanCryingGuy,


In [57]:
df.columns

Index(['title', 'video_id', 'video_url', 'published_date', 'thumbnail_url',
       'view_count', 'like_count', 'comment_count', 'duration_hms',
       'duration_sec', 'channel_name', 'channel_id', 'subscriber_count',
       'categories', 'tags', 'uploader_url', 'age_limit', 'availability',
       'live_status', 'description', '_error'],
      dtype='object')

In [3]:
import ast

def normalize_category(val):
    """
    categories가 ['Comedy'] 같은 리스트/문자열일 수 있어 이를 'Comedy'로 통일.
    """
    if val is None:
        return None
    # 이미 문자열인데 "['Comedy']" 형태인 경우
    if isinstance(val, str):
        val = val.strip()
        if val.startswith("[") and val.endswith("]"):
            try:
                parsed = ast.literal_eval(val)
                if isinstance(parsed, list) and parsed:
                    return str(parsed[0])
            except Exception:
                pass
        return val  # 정상 문자열
    # 리스트인 경우
    if isinstance(val, (list, tuple)) and val:
        return str(val[0])
    return None

In [81]:
# df=df.iloc[:296,:]
# df

Unnamed: 0,title,video_id,video_url,published_date,thumbnail_url,view_count,like_count,comment_count,duration_hms,duration_sec,...,channel_id,subscriber_count,categories,tags,uploader_url,age_limit,availability,live_status,description,_error
0,[KIA 타이거즈] 선수별 응원가 제안서.pdf,VzzZ70tWMJ8,https://www.youtube.com/watch?v=VzzZ70tWMJ8,2025-09-18,https://i.ytimg.com/vi/VzzZ70tWMJ8/maxresdefau...,174862.0,4022.0,368.0,24:24,1464.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,필라이트 클리어가 12캔에 만원...⁉️\n야구 1회마다 한 캔씩 마셔도 9회까지 ...,
1,[무공해] 험한 것이 나와도 김고은이랑 무조건 공감합니다,_ttuPeDExTo,https://www.youtube.com/watch?v=_ttuPeDExTo,2025-09-12,https://i.ytimg.com/vi/_ttuPeDExTo/maxresdefau...,758030.0,10983.0,535.0,36:18,2178.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,"넷플릭스 시리즈 [은중과 상연] 드디어 👉오늘 공개 👈\n\n친구란 게... 참,\...",
2,[무공해] 폭주하는 공감 요정.. 아이브 레이랑 무조건 공감합니다,N5Zk-xH1e0k,https://www.youtube.com/watch?v=N5Zk-xH1e0k,2025-09-05,https://i.ytimg.com/vi/N5Zk-xH1e0k/maxresdefau...,677690.0,9259.0,569.0,35:35,2135.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,*사연자들은 모두 일반인입니다. 무분별한 비난은 삭제될 수 있습니다*\n\n공감이 ...,
3,[ENG SUB] 직역된 가사보고 노래 맞히기 (w.태래 규빈 리키),-6vOqZs6CFA,https://www.youtube.com/watch?v=-6vOqZs6CFA,2025-09-02,https://i.ytimg.com/vi/-6vOqZs6CFA/maxresdefau...,192365.0,8331.0,516.0,21:34,1294.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,📌https://bit.ly/3ULB2X7\n👉 스픽XT1 코스를 무료 체험으로 딱...,
4,근데 운전은 로이킴이,QxSIqNOMCm4,https://www.youtube.com/watch?v=QxSIqNOMCm4,2025-08-29,https://i.ytimg.com/vi_webp/QxSIqNOMCm4/maxres...,167341.0,1979.0,201.0,24:17,1457.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,쏘카는 보이는 가격 그대로😶\n이제 [총 결제 요금] 한 번만 내고 타세요\n\n5...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,[유병재 라이브] 상훈의 밤 - 랜덤 상황극,6SPQEm3CuB0,https://www.youtube.com/watch?v=6SPQEm3CuB0,2018-09-13,https://i.ytimg.com/vi_webp/6SPQEm3CuB0/maxres...,1250076.0,18173.0,1000.0,24:08,1448.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '유병재유튜브', '문학의밤', '유병재라이브', '유규선', '문상훈']",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,,
292,[유병재 스케치] 600만 빚의 사나이,P6SwY359Tls,https://www.youtube.com/watch?v=P6SwY359Tls,2018-09-11,https://i.ytimg.com/vi/P6SwY359Tls/sddefault.jpg,385705.0,10696.0,926.0,6:39,399.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '스케치코미디', '유병재유튜브', 'snl']",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,유병재 스케치 코미디\nep.1 600만 빚의 사나이,
293,[세상에서 가장 고독한 팬미팅] 2시간 풀버전!!,0SJ2z36tXwM,https://www.youtube.com/watch?v=0SJ2z36tXwM,2018-09-07,https://i.ytimg.com/vi_webp/0SJ2z36tXwM/maxres...,432138.0,5425.0,408.0,1:44:45,6285.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,[],https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,,
294,[세상에서 가장 고독한 팬미팅] 고독한 초대손님,QdMT51pQVeA,https://www.youtube.com/watch?v=QdMT51pQVeA,2018-09-06,https://i.ytimg.com/vi_webp/QdMT51pQVeA/maxres...,113727.0,1471.0,122.0,5:06,306.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '문학의밤', '고독한 팬미팅', '사이렌', '방탄소년단', 'BT...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,,


In [85]:
df.describe()

Unnamed: 0,view_count,like_count,comment_count,duration_sec,subscriber_count,age_limit
count,296.0,296.0,296.0,296.0,296.0,296.0
mean,909948.8,13061.297297,724.925676,1408.662162,1690000.0,0.0
std,1289519.0,23244.062432,1297.686194,1980.570177,0.0,0.0
min,48786.0,674.0,41.0,88.0,1690000.0,0.0
25%,247666.2,3290.25,186.0,726.5,1690000.0,0.0
50%,521181.5,6874.5,346.5,999.5,1690000.0,0.0
75%,1033335.0,13817.25,740.75,1439.25,1690000.0,0.0
max,8720112.0,211460.0,10000.0,19818.0,1690000.0,0.0


In [93]:
# df=df.drop(columns=['age_limit', '_error', 'availability', 'live_status'])

In [95]:
# df.to_csv("channel(@KoreanCryingGuy)_videos_metadata.csv", index=False, encoding="utf-8-sig")

In [103]:
df['title'].head(30)

0                       [KIA 타이거즈] 선수별 응원가 제안서.pdf
1                  [무공해] 험한 것이 나와도 김고은이랑 무조건 공감합니다
2             [무공해] 폭주하는 공감 요정.. 아이브 레이랑 무조건 공감합니다
3           [ENG SUB] 직역된 가사보고 노래 맞히기 (w.태래 규빈 리키)
4                                      근데 운전은 로이킴이
5                  [무공해] 샤이니 T랑, 아니 키랑.. 무조건 공감합니다
6                         [SUB] 지드래곤의 웃으면 안되는 생일파티
7                    [궤도의 잠 못 드는 밤] 솔직히 진짜 안 졸ㄹ...
8                    [무딱싫] 무서운 거 딱 싫은디.. 자꾸 시킴.. ㅠ
9                        [무공해] 퇴사마려운 직장인분들과도 공감합니다
10                       [롯데 자이언츠] 선수별 응원가 제안서.pdf
11                   제 2회 소리내면 안되는 야구중계 (LG vs 롯데)
12    (꿀잼) 100% 애드립! 스우파 보는 척 하는 애들 [스웃파 시즌3 EP01]
13                           [무공해] 강하늘이랑 무조건 공감합니다
14                        [무딱싫] 무서운 거 딱 싫어.. 진짜루..
15                        [한화 이글스] 선수별 응원가 제안서.pdf
16                                  너의 목소리가 잘 안 보여
17                           [무공해] 살다살다 별걸 다 공감합니다
18                      [LG TWINS] 선수별 응원가 제안서.pdf
19                  귀 닫고 서로 덕질 

In [19]:
channel_main_cat = df["categories"].dropna().mode().iloc[0] if df["categories"].notna().any() else None
print("채널 최빈 카테고리:", channel_main_cat)

채널 최빈 카테고리: Comedy


## 관련 채널 인기 동영상 크롤링 (추천영상 활용)

### 추천 영상들 정보 추출

In [44]:
# pip install selenium webdriver-manager

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

def make_driver(headless=False):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--start-maximized")
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

def extract_cards_all_fields(url, headless=False):
    driver = make_driver(headless=headless)
    try:
        driver.get(url)

        # 메인 레이아웃 대기
        WebDriverWait(driver, 12).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ytd-watch-flexy"))
        )

        # (있으면) 쿠키/동의 닫기
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(.,'동의') or contains(.,'I agree') or contains(.,'Accept')]"))
            )
            btn.click()
            time.sleep(0.4)
        except Exception:
            pass

        # #secondary로 스크롤 후 카드 수집
        secondary = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#secondary"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block:'start'});", secondary)
        time.sleep(0.6)

        card_sel = "#secondary yt-lockup-view-model, #secondary ytd-compact-video-renderer"
        cards = driver.find_elements(By.CSS_SELECTOR, card_sel)

        data = []
        for idx, card in enumerate(cards, 1):
            # 1) 제목/영상 링크
            link = None
            for sel in ["a#video-title-link", "a#video-title", "a[href^='/watch']"]:
                els = card.find_elements(By.CSS_SELECTOR, sel)
                if els:
                    link = els[0]
                    break
            title = (link.text or link.get_attribute("title") or "").strip() if link else ""
            video_url = link.get_attribute("href") if link else None

            # 2) 메타(span)들: 채널명/조회수/업로드시점이 보통 들어있음
            spans_text = []
            metas = card.find_elements(By.CSS_SELECTOR, "yt-lockup-metadata-view-model yt-content-metadata-view-model")
            meta = metas[0] if metas else None
            if meta:
                divs = meta.find_elements(By.XPATH, "./div")
                for dv in divs:
                    spans = dv.find_elements(By.TAG_NAME, "span")
                    if spans:
                        t = spans[0].text.strip()
                        if t:
                            spans_text.append(t)

            # 3) 채널 링크(있으면), 없으면 span의 첫 항목을 채널명으로 사용
            ch_anchor = None
            for sel in [
                "yt-lockup-metadata-view-model a[href*='/@']",
                "yt-lockup-metadata-view-model a[href^='/channel/']",
                "a[href*='/@']",
                "a[href^='/channel/']",
            ]:
                els = card.find_elements(By.CSS_SELECTOR, sel)
                if els:
                    ch_anchor = els[0]
                    break
            channel_name = (ch_anchor.text.strip() if ch_anchor and ch_anchor.text else (spans_text[0] if spans_text else ""))
            channel_url = ch_anchor.get_attribute("href") if ch_anchor else None

            # 4) 길이/썸네일
            duration = None
            dur_els = card.find_elements(By.CSS_SELECTOR, "ytd-thumbnail-overlay-time-status-renderer span, .ytd-thumbnail-overlay-time-status-renderer span")
            if dur_els:
                dtxt = dur_els[0].text.strip()
                if dtxt:
                    duration = dtxt

            thumb = None
            for img in card.find_elements(By.CSS_SELECTOR, "ytd-thumbnail img, img.yt-core-image--loaded, img"):
                src = (img.get_attribute("src") or "").strip()
                if src.startswith("http"):
                    thumb = src
                    break

            data.append({
                "idx": idx,
                "title": title,
                "video_url": video_url,
                "channel_name": channel_name,
                "channel_url": channel_url,
                "spans": spans_text,      # 원시 메타 텍스트들(검증용)
                "duration": duration,
                "thumbnail": thumb,
            })

        return data
    finally:
        driver.quit()

# ===== 사용 예시 =====
if __name__ == "__main__":
    test_url = "https://www.youtube.com/watch?v=VzzZ70tWMJ8"
    items = extract_cards_all_fields(test_url, headless=False)
    print(f"cards: {len(items)}")
    for it in items[:5]:
        print(it)


cards: 20
{'idx': 1, 'title': '28:50', 'video_url': 'https://www.youtube.com/watch?v=lBOhRSfeOD4', 'channel_name': '유병재', 'channel_url': None, 'spans': ['유병재', '조회수 1만회'], 'duration': None, 'thumbnail': 'https://i.ytimg.com/vi/lBOhRSfeOD4/hqdefault.jpg?sqp=-oaymwEnCNACELwBSFryq4qpAxkIARUAAIhCGAHYAQHiAQoIGBACGAY4AUAB&rs=AOn4CLBdBygaBfc3dhPPCrYTbpNAdK3e8g'}
{'idx': 2, 'title': '33:08', 'video_url': 'https://www.youtube.com/watch?v=LedMWQQ-N2E', 'channel_name': '유병재', 'channel_url': None, 'spans': ['유병재', '조회수 45만회'], 'duration': None, 'thumbnail': 'https://i.ytimg.com/vi/LedMWQQ-N2E/hqdefault.jpg?sqp=-oaymwEnCNACELwBSFryq4qpAxkIARUAAIhCGAHYAQHiAQoIGBACGAY4AUAB&rs=AOn4CLBMPACykqNuRDl0R5at3JgUcjOqkA'}
{'idx': 3, 'title': '32:22', 'video_url': 'https://www.youtube.com/watch?v=UhQUSVzKEUE', 'channel_name': '카더정원', 'channel_url': None, 'spans': ['카더정원', '조회수 73만회'], 'duration': None, 'thumbnail': 'https://i.ytimg.com/vi/UhQUSVzKEUE/hqdefault.jpg?sqp=-oaymwEnCNACELwBSFryq4qpAxkIARUAAIhCGAHYAQH

### 채널별 인기동영상 url 추출

In [71]:
import re
import time, random
from selenium.common.exceptions import TimeoutException

# --- (이전 코드의 _make_driver, _sleep, _close_consent_if_any,
#      get_channel_url_from_video, list_popular_video_urls_from_channel 등은 그대로 둡니다) ---

def _norm_channel_name(name: str) -> str:
    """대소문자/공백차이를 줄여 중복 판단을 안정화"""
    if not name:
        return ""
    # 공백 제거 + 소문자
    return re.sub(r"\s+", "", name).casefold()

def dedupe_by_channel_name(items):
    """
    같은 channel_name 은 하나만 남기기.
    - 동일 채널명이 여러 개면 'channel_url'이 있는 항목을 우선 채택
    - channel_name이 비어있는 항목은 dedupe 대상에서 제외(그냥 그대로 둠)
    """
    chosen = {}
    order = []  # 출력 순서 보존용
    for it in items:
        name = (it.get("channel_name") or "").strip()
        if not name:
            # 이름이 없으면 중복 판단 불가 → 그대로 유지
            key = None
        else:
            key = _norm_channel_name(name)

        if not key:
            order.append(it)
            continue

        prev = chosen.get(key)
        if prev is None:
            chosen[key] = it
            order.append(it)
        else:
            # 둘 다 같은 채널명: channel_url 보유 쪽을 우선
            if (not prev.get("channel_url")) and it.get("channel_url"):
                # order 리스트 안에서도 교체
                idx = order.index(prev)
                order[idx] = it
                chosen[key] = it
            # 그 외에는 기존 유지
    return order

def collect_popular_videos_for_items(items, top_k=20, headless=False, dedupe=True):
    """
    items: [{'video_url':..., 'channel_name':..., 'channel_url':...?, ...}, ...]
    -> 각 item에 'popular_video_urls' 키로 리스트 추가하여 반환
    - dedupe=True 면 channel_name 기준으로 한 번만 수집
    """
    # 0) 채널명 기준 dedupe
    work_items = dedupe_by_channel_name(items) if dedupe else list(items)

    driver = _make_driver(headless=headless)
    try:
        results = []
        for it in work_items:
            video_url = it.get("video_url")
            ch_url = it.get("channel_url")

            # 1) 채널 URL이 없으면 영상 페이지에서 확보
            if not ch_url:
                try:
                    ch_url = get_channel_url_from_video(driver, video_url)
                except Exception:
                    ch_url = None

            # 2) 채널 인기 동영상 수집
            try:
                popular_urls = list_popular_video_urls_from_channel(driver, ch_url, top_k=top_k)
            except Exception:
                popular_urls = []

            # 3) 결과 병합
            out = dict(it)
            out["channel_url"] = ch_url
            out["popular_video_urls"] = popular_urls
            results.append(out)

            # 살짝 지터
            time.sleep(random.uniform(0.3, 0.7))

        return results
    finally:
        driver.quit()

In [73]:
out = collect_popular_videos_for_items(items, top_k=15, headless=False, dedupe=True)
print(len(out), "unique channels collected")
for row in out:
    print(row["channel_name"], "->", row["channel_url"], "  top:", len(row["popular_video_urls"]))

100%|████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 4540.74it/s]

17 unique channels collected
유병재 -> https://www.youtube.com/@KoreanCryingGuy   top: 15
카더정원 -> https://www.youtube.com/@carthejungwon   top: 15
운동부 둘이 왔어요 -> https://www.youtube.com/@SportsmenMukbang   top: 15
ootb STUDIO -> https://www.youtube.com/@ootbstudio   top: 15
SSG랜더스 -> https://www.youtube.com/@SSGLANDERS   top: 15
고재영 -> https://www.youtube.com/@gojaeyoung   top: 15
JTBC Voyage -> https://www.youtube.com/@JTBCvoyage   top: 15
기아타이거즈 - 갸티비 -> https://www.youtube.com/@kiatigerstv   top: 15
십이층 -> https://www.youtube.com/@12%EC%B8%B5   top: 15
뜬뜬 DdeunDdeun -> https://www.youtube.com/@ddeunddeun   top: 15
사이버 윤석민 -> https://www.youtube.com/@%EC%82%AC%EC%9D%B4%EB%B2%84%EC%9C%A4%EC%84%9D%EB%AF%BC   top: 15
Giants TV -> https://www.youtube.com/@giantstv   top: 15
김종국 GYM JONG KOOK -> https://www.youtube.com/@GYMJONGKOOK   top: 15
보다 BODA -> https://www.youtube.com/@%EB%B3%B4%EB%8B%A4BODA   top: 15
숏박스 -> https://www.youtube.com/@shortbox   top: 15
Eagles TV -> https://www.youtube.




### 인기동영상별 상세메타데이터 추출

In [78]:
# 필요 모듈
import time, random
from typing import List, Dict, Optional
import pandas as pd

# (안전장치) 네 함수가 참조하는 _sec_to_hms가 없다면 간단 구현
try:
    _ = _sec_to_hms  # noqa: F821
except NameError:
    def _sec_to_hms(sec: Optional[int]) -> Optional[str]:
        if sec is None:
            return None
        h = sec // 3600
        m = (sec % 3600) // 60
        s = sec % 60
        return f"{h:d}:{m:02d}:{s:02d}" if h else f"{m:d}:{s:02d}"

def _sleep_jitter(a=0.3, b=0.7):
    time.sleep(random.uniform(a, b))

def flatten_popular_urls(out: List[Dict], per_channel_limit: Optional[int] = None) -> List[Dict]:
    """
    out의 각 항목에서 popular_video_urls를 (채널 메타와 함께) 평탄화.
    - per_channel_limit: 채널당 상위 N개만 사용하고 싶을 때
    반환: [{'source_channel_name', 'source_channel_url', 'video_url'}, ...]
    """
    flat = []
    for ch in out:
        urls = (ch.get("popular_video_urls") or [])
        if per_channel_limit:
            urls = urls[:per_channel_limit]
        for u in urls:
            flat.append({
                "source_channel_name": ch.get("channel_name"),
                "source_channel_url": ch.get("channel_url"),
                "video_url": u,
            })
    return flat

def collect_video_metadata_with_ytdlp(
    out: List[Dict],
    per_channel_limit: Optional[int] = None,
    dedupe_urls: bool = True,
    sleep_range=(0.25, 0.6),
    save_path: Optional[str] = None,
) -> pd.DataFrame:
    """
    - out: 채널별 popular_video_urls가 들어있는 리스트 (너가 만든 out)
    - per_channel_limit: 채널당 몇 개까지만 수집할지 제한 (None이면 전부)
    - dedupe_urls: 서로 다른 채널에서 같은 영상이 나오면 1번만 수집
    - sleep_range: (min, max) 요청 사이 랜덤 딜레이
    - save_path: '...csv' | '...parquet' | '...json' 저장 경로 (선택)

    반환: 수집된 메타데이터 DataFrame
    """
    # 1) 평탄화 + (선택) URL 중복 제거
    flat = flatten_popular_urls(out, per_channel_limit=per_channel_limit)
    if dedupe_urls:
        seen = set()
        uniq = []
        for row in flat:
            u = row["video_url"]
            if u in seen:
                continue
            seen.add(u)
            uniq.append(row)
        flat = uniq

    # 2) 수집 실행
    rows = []
    for i, row in enumerate(flat, 1):
        url = row["video_url"]
        try:
            meta = get_youtube_video_info(url)  # 네가 이미 정의한 함수 사용
            # 채널 컨텍스트를 붙여 보존
            meta["source_channel_name"] = row["source_channel_name"]
            meta["source_channel_url"]  = row["source_channel_url"]
            rows.append(meta)
        except Exception as e:
            rows.append({
                "video_url": url,
                "error": str(e),
                "source_channel_name": row["source_channel_name"],
                "source_channel_url": row["source_channel_url"],
            })
        _sleep_jitter(*sleep_range)

    df = pd.DataFrame(rows)

    # 3) (선택) 저장
    if save_path:
        if save_path.lower().endswith(".csv"):
            df.to_csv(save_path, index=False, encoding="utf-8-sig")
        elif save_path.lower().endswith(".parquet"):
            df.to_parquet(save_path, index=False)
        elif save_path.lower().endswith(".json"):
            df.to_json(save_path, orient="records", force_ascii=False, indent=2)
        else:
            print(f"[WARN] 확장자를 알 수 없어 저장 생략: {save_path}")

    return df

In [85]:
df_meta = collect_video_metadata_with_ytdlp(
    out,
    per_channel_limit=3,
    dedupe_urls=True,
    sleep_range=(0.25, 0.6),
    save_path=None,   # 저장 원치 않으면 None
)

print(len(df_meta), "개 수집")
print(df_meta.head(3).to_string(index=False))

[download] Got error: HTTP Error 403: Forbidden
ERROR: [youtube] 13qE6rkL1gk: Join this channel to get access to members-only content like this video, and other exclusive perks.
[download] Got error: HTTP Error 403: Forbidden
ERROR: [youtube] jWTVr32rf2k: Join this channel to get access to members-only content like this video, and other exclusive perks.


51 개 수집
                          title    video_id                                   video_url published_date                                        thumbnail_url  view_count  like_count  comment_count duration_hms  duration_sec channel_name               channel_id  subscriber_count categories                                                              tags                             uploader_url  age_limit availability live_status                                                                                                                                                                                                                                                            description source_channel_name                       source_channel_url error
                너의 목소리가 아예 안 보여 lBOhRSfeOD4 https://www.youtube.com/watch?v=lBOhRSfeOD4     2025-09-22 https://i.ytimg.com/vi/lBOhRSfeOD4/maxresdefault.jpg     16931.0       595.0           64.0        28:49        1729.0         

In [95]:
df_meta.iloc[6,:]

title                  내 동생 천재 유격수 김재호랑 몸보신 먹방! (매운소갈비찜, 해신탕, 전복버터구이,...
video_id                                                     LgJazGXO02Q
video_url                    https://www.youtube.com/watch?v=LgJazGXO02Q
published_date                                                2025-09-17
thumbnail_url          https://i.ytimg.com/vi/LgJazGXO02Q/maxresdefau...
view_count                                                      232646.0
like_count                                                        3219.0
comment_count                                                      319.0
duration_hms                                                       18:40
duration_sec                                                      1120.0
channel_name                                                  운동부 둘이 왔어요
channel_id                                      UC__J0wvxgqq4kg3i5iTgkLQ
subscriber_count                                                500000.0
categories                                         