In [1]:
import time
import random
from datetime import datetime
from typing import List, Dict, Tuple, Optional
import yt_dlp
import pandas as pd
from tqdm import tqdm

## 채널url 입력시 채널영상 크롤링

In [3]:
# ------------------------------
# 공통 유틸
# ------------------------------
def _sec_to_hms(sec: Optional[int]) -> Optional[str]:
    if sec is None:
        return None
    m, s = divmod(int(sec), 60)
    h, m = divmod(m, 60)
    return f"{h:02d}:{m:02d}:{s:02d}"


# ------------------------------
# 1) 채널 업로드 목록에서 영상 URL 수집
# ------------------------------
def list_channel_video_urls(
    channel_url: str,
    max_videos: Optional[int] = None,
) -> Tuple[List[str], Dict]:
    """
    yt_dlp로 채널 업로드 목록(playlist처럼 동작)을 '평면 추출'해서 영상 URL만 빠르게 수집.
    필요시 '/videos' 탭으로 재시도.
    반환: (video_urls, channel_meta)
    """
    def _extract_entries(url: str):
        ydl_opts = {
            "quiet": True,
            "no_warnings": True,
            "skip_download": True,
            "noplaylist": False,          # 채널은 내부적으로 playlist처럼 처리
            "extract_flat": True,         # 목록만 빠르게
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            return ydl.extract_info(url, download=False)

    info = None
    try:
        info = _extract_entries(channel_url)
    except Exception:
        # /videos 탭으로 1차 보정
        if "/videos" not in channel_url:
            fixed = channel_url.rstrip("/") + "/videos"
            info = _extract_entries(fixed)
        else:
            raise

    # 엔트리 없으면 /videos로 최종 재시도
    entries = (info or {}).get("entries") or []
    if not entries and "/videos" not in channel_url:
        fixed = channel_url.rstrip("/") + "/videos"
        info = _extract_entries(fixed)
        entries = (info or {}).get("entries") or []

    if not entries:
        raise RuntimeError("채널 업로드 목록을 찾지 못했습니다. 채널 URL이 맞는지 확인하거나 '/videos' 탭 URL을 사용하세요.")

    # 채널 메타
    channel_meta = {
        "channel_title": info.get("channel") or info.get("uploader"),
        "channel_id": info.get("channel_id") or info.get("uploader_id"),
        "channel_url": info.get("uploader_url") or channel_url,
        "subscriber_count": info.get("channel_follower_count"),
    }

    # 평면 엔트리 → 동영상 URL 생성
    urls: List[str] = []
    for e in entries:
        if isinstance(e, dict):
            # e["url"]이 id만 들어있을 수 있으니 보정
            raw = e.get("url") or e.get("id")
            if not raw:
                continue
            full = raw if str(raw).startswith("http") else f"https://www.youtube.com/watch?v={raw}"
            # Shorts 등도 업로드 목록에 포함될 수 있음(필터는 상세 단계에서 가능)
            urls.append(full)
        elif isinstance(e, str):
            urls.append(f"https://www.youtube.com/watch?v={e}")

        if max_videos and len(urls) >= max_videos:
            break

    # 중복 제거(간혹 중복이 섞이는 경우 방지)
    urls = list(dict.fromkeys(urls))
    return urls, channel_meta


# ------------------------------
# 2) 단일 영상 상세 메타데이터 수집
# ------------------------------
def get_youtube_video_info(video_url: str) -> Dict:
    ydl_opts = {
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(video_url, download=False)

    # 날짜 ISO 변환
    upload_date = info.get("upload_date")
    published_date = None
    if upload_date:
        try:
            published_date = datetime.strptime(upload_date, "%Y%m%d").date().isoformat()
        except Exception:
            published_date = upload_date  # 원본 유지

    # 가장 큰 썸네일
    thumbnail_url = None
    thumbs = info.get("thumbnails")
    if isinstance(thumbs, list) and thumbs:
        thumbnail_url = sorted(thumbs, key=lambda t: t.get("height", 0))[-1].get("url")

    duration_str = info.get("duration_string") or _sec_to_hms(info.get("duration"))
    channel_name = info.get("channel") or info.get("uploader")

    return {
        "title": info.get("title"),
        "video_id": info.get("id"),
        "video_url": f"https://www.youtube.com/watch?v={info.get('id')}" if info.get("id") else video_url,
        "published_date": published_date,
        "thumbnail_url": thumbnail_url,
        "view_count": info.get("view_count"),
        "like_count": info.get("like_count"),
        "comment_count": info.get("comment_count"),
        "duration_hms": duration_str,
        "duration_sec": info.get("duration"),
        "channel_name": channel_name,
        "channel_id": info.get("channel_id") or info.get("uploader_id"),
        "subscriber_count": info.get("channel_follower_count"),

        # 선택 필드
        "categories": info.get("categories"),
        "tags": info.get("tags"),
        "uploader_url": info.get("uploader_url"),
        "age_limit": info.get("age_limit"),
        "availability": info.get("availability"),
        "live_status": info.get("live_status"),
        "description": info.get("description"),
    }


# ------------------------------
# 3) 채널 → (URL 수집) → (각 영상 상세 수집)
# ------------------------------
def collect_channel_videos(
    channel_url: str,
    max_videos: Optional[int] = None,
    include_shorts: bool = True,
    sleep_range: Tuple[float, float] = (0.05, 0.2),
    retry: int = 2,
) -> Tuple[pd.DataFrame, Dict]:
    """
    channel_url: 채널 홈/핸들/UCID URL 모두 가능. 필요시 '/videos'로 자동 보정
    max_videos: 수집 개수 제한(None이면 전체)
    include_shorts: False면 duration<60인 영상(숏츠) 제외
    sleep_range: 요청 간 랜덤 지연(차단 방지)
    retry: 영상별 상세 수집 실패 시 재시도 횟수
    """
    video_urls, channel_meta = list_channel_video_urls(channel_url, max_videos=max_videos)

    rows = []
    for url in tqdm(video_urls):
        last_err = None
        for attempt in range(retry + 1):
            try:
                row = get_youtube_video_info(url)
                # 숏츠 제외 옵션
                if not include_shorts:
                    dur = row.get("duration_sec")
                    if dur is not None and int(dur) < 60:
                        # skip
                        pass
                    else:
                        rows.append(row)
                else:
                    rows.append(row)
                break
            except Exception as e:
                last_err = e
                if attempt < retry:
                    time.sleep(0.5 + attempt * 0.5)
                else:
                    # 실패해도 최소 정보 기록
                    rows.append({
                        "title": None,
                        "video_id": None,
                        "video_url": url,
                        "published_date": None,
                        "thumbnail_url": None,
                        "view_count": None,
                        "like_count": None,
                        "comment_count": None,
                        "duration_hms": None,
                        "duration_sec": None,
                        "channel_name": channel_meta.get("channel_title"),
                        "channel_id": channel_meta.get("channel_id"),
                        "subscriber_count": channel_meta.get("subscriber_count"),
                        "_error": str(last_err),
                    })
        time.sleep(random.uniform(*sleep_range))

    df = pd.DataFrame(rows)

    # 정렬(가능하면)
    if "published_date" in df.columns:
        try:
            df["published_date"] = pd.to_datetime(df["published_date"], errors="coerce")
            df = df.sort_values("published_date", ascending=False).reset_index(drop=True)
        except Exception:
            pass

    return df, channel_meta

In [None]:
if __name__ == "__main__":
    channel_url = "https://www.youtube.com/@KoreanCryingGuy/videos"

    # 옵션
    MAX_VIDEOS = 300
    INCLUDE_SHORTS = False
    SAVE_CSV = True
    CSV_PATH = "channel(@KoreanCryingGuy)_videos_metadata.csv"

    df, meta = collect_channel_videos(
        channel_url=channel_url,
        max_videos=MAX_VIDEOS,
        include_shorts=INCLUDE_SHORTS,
        sleep_range=(0.08, 0.25),
        retry=2
    )

    print("Channel Meta:", meta)
    print("Collected:", len(df))
    print(df.head(5))

    if SAVE_CSV:
        df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
        print(f"Saved -> {CSV_PATH}")

In [27]:
df=pd.read_csv('channel(@KoreanCryingGuy)_videos_metadata.csv', encoding='utf-8-sig')
df

Unnamed: 0,title,video_id,video_url,published_date,thumbnail_url,view_count,like_count,comment_count,duration_hms,duration_sec,channel_name,channel_id,subscriber_count,categories,tags,uploader_url,description
0,[KIA 타이거즈] 선수별 응원가 제안서.pdf,VzzZ70tWMJ8,https://www.youtube.com/watch?v=VzzZ70tWMJ8,2025-09-18,https://i.ytimg.com/vi/VzzZ70tWMJ8/maxresdefau...,174862.0,4022.0,368.0,24:24,1464.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,필라이트 클리어가 12캔에 만원...⁉️\n야구 1회마다 한 캔씩 마셔도 9회까지 ...
1,[무공해] 험한 것이 나와도 김고은이랑 무조건 공감합니다,_ttuPeDExTo,https://www.youtube.com/watch?v=_ttuPeDExTo,2025-09-12,https://i.ytimg.com/vi/_ttuPeDExTo/maxresdefau...,758030.0,10983.0,535.0,36:18,2178.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,"넷플릭스 시리즈 [은중과 상연] 드디어 👉오늘 공개 👈\n\n친구란 게... 참,\..."
2,[무공해] 폭주하는 공감 요정.. 아이브 레이랑 무조건 공감합니다,N5Zk-xH1e0k,https://www.youtube.com/watch?v=N5Zk-xH1e0k,2025-09-05,https://i.ytimg.com/vi/N5Zk-xH1e0k/maxresdefau...,677690.0,9259.0,569.0,35:35,2135.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,*사연자들은 모두 일반인입니다. 무분별한 비난은 삭제될 수 있습니다*\n\n공감이 ...
3,[ENG SUB] 직역된 가사보고 노래 맞히기 (w.태래 규빈 리키),-6vOqZs6CFA,https://www.youtube.com/watch?v=-6vOqZs6CFA,2025-09-02,https://i.ytimg.com/vi/-6vOqZs6CFA/maxresdefau...,192365.0,8331.0,516.0,21:34,1294.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,📌https://bit.ly/3ULB2X7\n👉 스픽XT1 코스를 무료 체험으로 딱...
4,근데 운전은 로이킴이,QxSIqNOMCm4,https://www.youtube.com/watch?v=QxSIqNOMCm4,2025-08-29,https://i.ytimg.com/vi_webp/QxSIqNOMCm4/maxres...,167341.0,1979.0,201.0,24:17,1457.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,쏘카는 보이는 가격 그대로😶\n이제 [총 결제 요금] 한 번만 내고 타세요\n\n5...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,[유병재 라이브] 상훈의 밤 - 랜덤 상황극,6SPQEm3CuB0,https://www.youtube.com/watch?v=6SPQEm3CuB0,2018-09-13,https://i.ytimg.com/vi_webp/6SPQEm3CuB0/maxres...,1250076.0,18173.0,1000.0,24:08,1448.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '유병재유튜브', '문학의밤', '유병재라이브', '유규선', '문상훈']",https://www.youtube.com/@KoreanCryingGuy,
292,[유병재 스케치] 600만 빚의 사나이,P6SwY359Tls,https://www.youtube.com/watch?v=P6SwY359Tls,2018-09-11,https://i.ytimg.com/vi/P6SwY359Tls/sddefault.jpg,385705.0,10696.0,926.0,6:39,399.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '스케치코미디', '유병재유튜브', 'snl']",https://www.youtube.com/@KoreanCryingGuy,유병재 스케치 코미디\nep.1 600만 빚의 사나이
293,[세상에서 가장 고독한 팬미팅] 2시간 풀버전!!,0SJ2z36tXwM,https://www.youtube.com/watch?v=0SJ2z36tXwM,2018-09-07,https://i.ytimg.com/vi_webp/0SJ2z36tXwM/maxres...,432138.0,5425.0,408.0,1:44:45,6285.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,[],https://www.youtube.com/@KoreanCryingGuy,
294,[세상에서 가장 고독한 팬미팅] 고독한 초대손님,QdMT51pQVeA,https://www.youtube.com/watch?v=QdMT51pQVeA,2018-09-06,https://i.ytimg.com/vi_webp/QdMT51pQVeA/maxres...,113727.0,1471.0,122.0,5:06,306.0,유병재,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '문학의밤', '고독한 팬미팅', '사이렌', '방탄소년단', 'BT...",https://www.youtube.com/@KoreanCryingGuy,


In [5]:
df.columns

Index(['title', 'video_id', 'video_url', 'published_date', 'thumbnail_url',
       'view_count', 'like_count', 'comment_count', 'duration_hms',
       'duration_sec', 'channel_name', 'channel_id', 'subscriber_count',
       'categories', 'tags', 'uploader_url', 'description'],
      dtype='object')

In [5]:
import ast

def normalize_category(val):
    """
    categories가 ['Comedy'] 같은 리스트/문자열일 수 있어 이를 'Comedy'로 통일.
    """
    if val is None:
        return None
    # 이미 문자열인데 "['Comedy']" 형태인 경우
    if isinstance(val, str):
        val = val.strip()
        if val.startswith("[") and val.endswith("]"):
            try:
                parsed = ast.literal_eval(val)
                if isinstance(parsed, list) and parsed:
                    return str(parsed[0])
            except Exception:
                pass
        return val  # 정상 문자열
    # 리스트인 경우
    if isinstance(val, (list, tuple)) and val:
        return str(val[0])
    return None

In [81]:
# df=df.iloc[:296,:]
# df

Unnamed: 0,title,video_id,video_url,published_date,thumbnail_url,view_count,like_count,comment_count,duration_hms,duration_sec,...,channel_id,subscriber_count,categories,tags,uploader_url,age_limit,availability,live_status,description,_error
0,[KIA 타이거즈] 선수별 응원가 제안서.pdf,VzzZ70tWMJ8,https://www.youtube.com/watch?v=VzzZ70tWMJ8,2025-09-18,https://i.ytimg.com/vi/VzzZ70tWMJ8/maxresdefau...,174862.0,4022.0,368.0,24:24,1464.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,필라이트 클리어가 12캔에 만원...⁉️\n야구 1회마다 한 캔씩 마셔도 9회까지 ...,
1,[무공해] 험한 것이 나와도 김고은이랑 무조건 공감합니다,_ttuPeDExTo,https://www.youtube.com/watch?v=_ttuPeDExTo,2025-09-12,https://i.ytimg.com/vi/_ttuPeDExTo/maxresdefau...,758030.0,10983.0,535.0,36:18,2178.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,"넷플릭스 시리즈 [은중과 상연] 드디어 👉오늘 공개 👈\n\n친구란 게... 참,\...",
2,[무공해] 폭주하는 공감 요정.. 아이브 레이랑 무조건 공감합니다,N5Zk-xH1e0k,https://www.youtube.com/watch?v=N5Zk-xH1e0k,2025-09-05,https://i.ytimg.com/vi/N5Zk-xH1e0k/maxresdefau...,677690.0,9259.0,569.0,35:35,2135.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,*사연자들은 모두 일반인입니다. 무분별한 비난은 삭제될 수 있습니다*\n\n공감이 ...,
3,[ENG SUB] 직역된 가사보고 노래 맞히기 (w.태래 규빈 리키),-6vOqZs6CFA,https://www.youtube.com/watch?v=-6vOqZs6CFA,2025-09-02,https://i.ytimg.com/vi/-6vOqZs6CFA/maxresdefau...,192365.0,8331.0,516.0,21:34,1294.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,📌https://bit.ly/3ULB2X7\n👉 스픽XT1 코스를 무료 체험으로 딱...,
4,근데 운전은 로이킴이,QxSIqNOMCm4,https://www.youtube.com/watch?v=QxSIqNOMCm4,2025-08-29,https://i.ytimg.com/vi_webp/QxSIqNOMCm4/maxres...,167341.0,1979.0,201.0,24:17,1457.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['코미디', '개그', '라이브코미디', 'comedy', 'korea comed...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,쏘카는 보이는 가격 그대로😶\n이제 [총 결제 요금] 한 번만 내고 타세요\n\n5...,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,[유병재 라이브] 상훈의 밤 - 랜덤 상황극,6SPQEm3CuB0,https://www.youtube.com/watch?v=6SPQEm3CuB0,2018-09-13,https://i.ytimg.com/vi_webp/6SPQEm3CuB0/maxres...,1250076.0,18173.0,1000.0,24:08,1448.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '유병재유튜브', '문학의밤', '유병재라이브', '유규선', '문상훈']",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,,
292,[유병재 스케치] 600만 빚의 사나이,P6SwY359Tls,https://www.youtube.com/watch?v=P6SwY359Tls,2018-09-11,https://i.ytimg.com/vi/P6SwY359Tls/sddefault.jpg,385705.0,10696.0,926.0,6:39,399.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '스케치코미디', '유병재유튜브', 'snl']",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,유병재 스케치 코미디\nep.1 600만 빚의 사나이,
293,[세상에서 가장 고독한 팬미팅] 2시간 풀버전!!,0SJ2z36tXwM,https://www.youtube.com/watch?v=0SJ2z36tXwM,2018-09-07,https://i.ytimg.com/vi_webp/0SJ2z36tXwM/maxres...,432138.0,5425.0,408.0,1:44:45,6285.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,[],https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,,
294,[세상에서 가장 고독한 팬미팅] 고독한 초대손님,QdMT51pQVeA,https://www.youtube.com/watch?v=QdMT51pQVeA,2018-09-06,https://i.ytimg.com/vi_webp/QdMT51pQVeA/maxres...,113727.0,1471.0,122.0,5:06,306.0,...,UCHw9p667e9l0qoYfY8calaA,1690000,Comedy,"['유병재', '문학의밤', '고독한 팬미팅', '사이렌', '방탄소년단', 'BT...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,,


In [85]:
df.describe()

Unnamed: 0,view_count,like_count,comment_count,duration_sec,subscriber_count,age_limit
count,296.0,296.0,296.0,296.0,296.0,296.0
mean,909948.8,13061.297297,724.925676,1408.662162,1690000.0,0.0
std,1289519.0,23244.062432,1297.686194,1980.570177,0.0,0.0
min,48786.0,674.0,41.0,88.0,1690000.0,0.0
25%,247666.2,3290.25,186.0,726.5,1690000.0,0.0
50%,521181.5,6874.5,346.5,999.5,1690000.0,0.0
75%,1033335.0,13817.25,740.75,1439.25,1690000.0,0.0
max,8720112.0,211460.0,10000.0,19818.0,1690000.0,0.0


In [93]:
# df=df.drop(columns=['age_limit', '_error', 'availability', 'live_status'])

In [95]:
# df.to_csv("channel(@KoreanCryingGuy)_videos_metadata.csv", index=False, encoding="utf-8-sig")

In [103]:
df['title'].head(30)

0                       [KIA 타이거즈] 선수별 응원가 제안서.pdf
1                  [무공해] 험한 것이 나와도 김고은이랑 무조건 공감합니다
2             [무공해] 폭주하는 공감 요정.. 아이브 레이랑 무조건 공감합니다
3           [ENG SUB] 직역된 가사보고 노래 맞히기 (w.태래 규빈 리키)
4                                      근데 운전은 로이킴이
5                  [무공해] 샤이니 T랑, 아니 키랑.. 무조건 공감합니다
6                         [SUB] 지드래곤의 웃으면 안되는 생일파티
7                    [궤도의 잠 못 드는 밤] 솔직히 진짜 안 졸ㄹ...
8                    [무딱싫] 무서운 거 딱 싫은디.. 자꾸 시킴.. ㅠ
9                        [무공해] 퇴사마려운 직장인분들과도 공감합니다
10                       [롯데 자이언츠] 선수별 응원가 제안서.pdf
11                   제 2회 소리내면 안되는 야구중계 (LG vs 롯데)
12    (꿀잼) 100% 애드립! 스우파 보는 척 하는 애들 [스웃파 시즌3 EP01]
13                           [무공해] 강하늘이랑 무조건 공감합니다
14                        [무딱싫] 무서운 거 딱 싫어.. 진짜루..
15                        [한화 이글스] 선수별 응원가 제안서.pdf
16                                  너의 목소리가 잘 안 보여
17                           [무공해] 살다살다 별걸 다 공감합니다
18                      [LG TWINS] 선수별 응원가 제안서.pdf
19                  귀 닫고 서로 덕질 

In [19]:
channel_main_cat = df["categories"].dropna().mode().iloc[0] if df["categories"].notna().any() else None
print("채널 최빈 카테고리:", channel_main_cat)

채널 최빈 카테고리: Comedy


## 관련 채널 인기 동영상 크롤링 (추천영상 활용)

### 추천 영상들 정보 추출

In [7]:
# pip install selenium webdriver-manager

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time

def make_driver(headless=False):
    opts = Options()
    if headless:
        opts.add_argument("--headless=new")
    opts.add_argument("--start-maximized")
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

def extract_cards_all_fields(url, headless=False):
    driver = make_driver(headless=headless)
    try:
        driver.get(url)

        # 메인 레이아웃 대기
        WebDriverWait(driver, 12).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "ytd-watch-flexy"))
        )

        # (있으면) 쿠키/동의 닫기
        try:
            btn = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(.,'동의') or contains(.,'I agree') or contains(.,'Accept')]"))
            )
            btn.click()
            time.sleep(0.4)
        except Exception:
            pass

        # #secondary로 스크롤 후 카드 수집
        secondary = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "#secondary"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block:'start'});", secondary)
        time.sleep(0.6)

        card_sel = "#secondary yt-lockup-view-model, #secondary ytd-compact-video-renderer"
        cards = driver.find_elements(By.CSS_SELECTOR, card_sel)

        data = []
        for idx, card in enumerate(cards, 1):
            # 1) 제목/영상 링크
            link = None
            for sel in ["a#video-title-link", "a#video-title", "a[href^='/watch']"]:
                els = card.find_elements(By.CSS_SELECTOR, sel)
                if els:
                    link = els[0]
                    break
            title = (link.text or link.get_attribute("title") or "").strip() if link else ""
            video_url = link.get_attribute("href") if link else None

            # 2) 메타(span)들: 채널명/조회수/업로드시점이 보통 들어있음
            spans_text = []
            metas = card.find_elements(By.CSS_SELECTOR, "yt-lockup-metadata-view-model yt-content-metadata-view-model")
            meta = metas[0] if metas else None
            if meta:
                divs = meta.find_elements(By.XPATH, "./div")
                for dv in divs:
                    spans = dv.find_elements(By.TAG_NAME, "span")
                    if spans:
                        t = spans[0].text.strip()
                        if t:
                            spans_text.append(t)

            # 3) 채널 링크(있으면), 없으면 span의 첫 항목을 채널명으로 사용
            ch_anchor = None
            for sel in [
                "yt-lockup-metadata-view-model a[href*='/@']",
                "yt-lockup-metadata-view-model a[href^='/channel/']",
                "a[href*='/@']",
                "a[href^='/channel/']",
            ]:
                els = card.find_elements(By.CSS_SELECTOR, sel)
                if els:
                    ch_anchor = els[0]
                    break
            channel_name = (ch_anchor.text.strip() if ch_anchor and ch_anchor.text else (spans_text[0] if spans_text else ""))
            channel_url = ch_anchor.get_attribute("href") if ch_anchor else None

            # 4) 길이/썸네일
            duration = None
            dur_els = card.find_elements(By.CSS_SELECTOR, "ytd-thumbnail-overlay-time-status-renderer span, .ytd-thumbnail-overlay-time-status-renderer span")
            if dur_els:
                dtxt = dur_els[0].text.strip()
                if dtxt:
                    duration = dtxt

            thumb = None
            for img in card.find_elements(By.CSS_SELECTOR, "ytd-thumbnail img, img.yt-core-image--loaded, img"):
                src = (img.get_attribute("src") or "").strip()
                if src.startswith("http"):
                    thumb = src
                    break

            data.append({
                "idx": idx,
                "title": title,
                "video_url": video_url,
                "channel_name": channel_name,
                "channel_url": channel_url,
                "spans": spans_text,      # 원시 메타 텍스트들(검증용)
                "duration": duration,
                "thumbnail": thumb,
            })

        return data
    finally:
        driver.quit()

In [9]:
import json, traceback

def flatten_items(items, source_url):
    """extract_cards_all_fields()가 반환한 카드 리스트(list[dict])를 CSV-friendly하게 평탄화."""
    rows = []
    for d in items:
        row = dict(d)                 # idx, title, video_url, channel_name, channel_url, spans, duration, thumbnail ...
        row["source_watch_url"] = source_url
        rows.append(row)
    return rows

def save_on_stop(all_rows, all_items_full):
    n = len(all_items_full)
    csv_name  = f"all_items_{n}.csv"
    json_name = f"all_items_full_{n}.json"
    try:
        # CSV: 평탄화된 카드 단위
        pd.DataFrame(all_rows).to_csv(csv_name, index=False, encoding="utf-8-sig")
        # JSON: 원본 구조(list[list[dict]]) 보존
        with open(json_name, "w", encoding="utf-8") as f:
            json.dump(all_items_full, f, ensure_ascii=False, indent=2)
        print(f"[SAVED] {csv_name} / {json_name}")
    except Exception as e:
        print(f"[SAVE-ERR] 저장 실패: {e}")

In [15]:
# ===== 메인 루프 =====
all_items_full = []   # URL별 카드 리스트를 그대로 보관 (list[list[dict]])
all_rows       = []   # CSV 저장용(카드 평탄화)
processed      = 0

try:
    for test_url in tqdm(df.iloc[:100, 2].tolist()):
        try:
            items = extract_cards_all_fields(test_url, headless=False)
        except Exception as e:
            print(f"[WARN] extract 실패: {test_url}\n  -> {e}")
            traceback.print_exc(limit=1)
            continue  # 개별 URL 에러는 건너뜀

        processed += 1
        all_items_full.append(items)
        all_rows.extend(flatten_items(items, test_url))

        # (요청에 따라 중간 저장 없음)

except KeyboardInterrupt:
    print("\n[STOP] KeyboardInterrupt 감지 → 진행분 저장합니다.")
    save_on_stop(all_rows, all_items_full)

except Exception as e:
    print(f"\n[STOP] 치명 오류로 중단 → 진행분 저장합니다. 에러: {e}")
    traceback.print_exc(limit=2)
    save_on_stop(all_rows, all_items_full)

else:
    # 정상 종료 시에는 저장하지 않음(요청 사항)
    # 필요하면 아래 주석 해제:
    save_on_stop(all_rows, all_items_full)
    print(f"\n[DONE] 정상 완료 (processed={processed}, items_blocks={len(all_items_full)})")

 69%|███████████████████████████████████████████████████████▉                         | 69/100 [31:02<13:56, 27.00s/it]



[STOP] KeyboardInterrupt 감지 → 진행분 저장합니다.
[SAVED] all_items_69.csv / all_items_full_69.json


### 채널별 인기동영상 url 추출

In [175]:
items=pd.read_csv('all_items_69.csv', encoding='utf-8')
items

Unnamed: 0,idx,title,video_url,channel_name,channel_url,spans,duration,thumbnail,source_watch_url
0,1,33:08,https://www.youtube.com/watch?v=LedMWQQ-N2E,유병재,,"['유병재', '조회수 45만회']",,https://i.ytimg.com/vi/LedMWQQ-N2E/hqdefault.j...,https://www.youtube.com/watch?v=VzzZ70tWMJ8
1,2,52:06,https://www.youtube.com/watch?v=EQXy86XKsRU,십이층,,"['십이층', '조회수 49만회']",,https://i.ytimg.com/vi/EQXy86XKsRU/hqdefault.j...,https://www.youtube.com/watch?v=VzzZ70tWMJ8
2,3,28:50,https://www.youtube.com/watch?v=lBOhRSfeOD4,유병재,,"['유병재', '조회수 33만회']",,https://i.ytimg.com/vi/lBOhRSfeOD4/hqdefault.j...,https://www.youtube.com/watch?v=VzzZ70tWMJ8
3,4,22:34,https://www.youtube.com/watch?v=aicuBmhDOSo,기아타이거즈 - 갸티비,,"['기아타이거즈 - 갸티비', '조회수 10만회']",,https://i.ytimg.com/vi/aicuBmhDOSo/hqdefault.j...,https://www.youtube.com/watch?v=VzzZ70tWMJ8
4,5,32:22,https://www.youtube.com/watch?v=UhQUSVzKEUE,카더정원,,"['카더정원', '조회수 86만회']",,https://i.ytimg.com/vi/UhQUSVzKEUE/hqdefault.j...,https://www.youtube.com/watch?v=VzzZ70tWMJ8
...,...,...,...,...,...,...,...,...,...
1337,17,38:41,https://www.youtube.com/watch?v=FFDumzq6ATg,차린건 쥐뿔도 없지만,,"['차린건 쥐뿔도 없지만', '조회수 716만회']",,,https://www.youtube.com/watch?v=l08L-lj7WLM
1338,18,2:58:34,https://www.youtube.com/watch?v=vnLKdZtcw0w,유병재,,"['유병재', '조회수 121만회']",,,https://www.youtube.com/watch?v=l08L-lj7WLM
1339,19,실시간,https://www.youtube.com/watch?v=fNPdR6ucWvc,Jazzne | 기분Jazz네,,"['Jazzne | 기분Jazz네', '1.7천명 시청 중']",,,https://www.youtube.com/watch?v=l08L-lj7WLM
1340,20,27:25,https://www.youtube.com/watch?v=yqoKLTsce20,동네스타K,,"['동네스타K', '조회수 75만회']",,,https://www.youtube.com/watch?v=l08L-lj7WLM


In [177]:
items_unique = items.drop_duplicates(subset='channel_name', keep='first')
len(items_unique)

194

In [127]:
for i in items_unique['channel_name']:
    print(i)

유병재
십이층
기아타이거즈 - 갸티비
카더정원
빠니보틀 Pani Bottle
보다 BODA
MBC every1
추성훈 ChooSungHoon
KBO
노홍철
뜬뜬 DdeunDdeun
침착맨
하하 PD HAHA PD
고재영
TEO 테오
워크맨-Workman
원지의하루
요정재형
ootb STUDIO
오늘의 주우재
유 퀴즈 온 더 튜브
빠더너스 BDNS
채널십오야
너덜트
M2
KODE 코드
동네스타K
꼰대희
GENIE MUSIC
KBS Entertain
STUDIO CHOOM [스튜디오 춤]
미미미누
Mnet K-POP
KBS Kpop
스튜디오 수제
에픽하이
효연의 레벨업 Hyo's Level Up
VIVO TV - 비보티비
텍텍붐 textextboom
집대성
일일칠 - 117
숏박스
kiu기우쌤
정형돈의 제목없음TV
디글 클래식 :Diggle Classic
김종국 GYM JONG KOOK
14F 일사에프
감스트GAMST
B tv 이동진의 파이아키아
달리 [SBS DALI] - SBS 공식 교양 채널
mono radio
초특가퀴즈쇼
tvN D ENT
Giants TV
Pixid
유희관희유
현장응원맛집
진돌
Eagles TV
스포츠게임 하는 사람
LGTWINSTV
이넉살
침착맨 플러스
은지랑 이은지
퀸민카이
마플 마인크래프트 채널
우아 스튜디오 [Ooh-ah Studio]
KBS대전 [CULTURE&DOCUMENTARY]
다인이공
윤하다(YUNHADA)
닿다
김준표
최프진
찰스엔터
상상상상
라이프교회
공부왕찐천재 홍진경
nan
배지현 BaeJihyun
⌗나나☆彡
불방맹이TV
스브스스포츠 SUBUSU SPORTS
치과의사 매직박
스튜디오투쁠(Studio++)
이병훈소장TV
MC뻐꾹
CEC 아카데미
윤성 굿맨
SSG랜더스
김태균 [TK52]
샾잉 #ing
별빛
재훍 영상툰
우정잉
고독한 낭독회
tvN Joy
장동선의 궁금한 뇌
Jazzne | 기분Jazz네
JTBC Voyage
play 채널A
그것이 알고싶다
올끌 (All of MBClassic)
혜안
토대장
디글 :Di

In [189]:
import re
import time, random
from selenium.common.exceptions import TimeoutException

# --- (이전 코드의 _make_driver, _sleep, _close_consent_if_any,
#      get_channel_url_from_video, list_popular_video_urls_from_channel 등은 그대로 둡니다) ---

def _norm_channel_name(name: str) -> str:
    """대소문자/공백차이를 줄여 중복 판단을 안정화"""
    if not name:
        return ""
    # 공백 제거 + 소문자
    return re.sub(r"\s+", "", name).casefold()

def dedupe_by_channel_name(items):
    """
    같은 channel_name 은 하나만 남기기.
    - 동일 채널명이 여러 개면 'channel_url'이 있는 항목을 우선 채택
    - channel_name이 비어있는 항목은 dedupe 대상에서 제외(그냥 그대로 둠)
    """
    chosen = {}
    order = []  # 출력 순서 보존용
    for it in items:
        name = (it.get("channel_name") or "").strip()
        if not name:
            # 이름이 없으면 중복 판단 불가 → 그대로 유지
            key = None
        else:
            key = _norm_channel_name(name)

        if not key:
            order.append(it)
            continue

        prev = chosen.get(key)
        if prev is None:
            chosen[key] = it
            order.append(it)
        else:
            # 둘 다 같은 채널명: channel_url 보유 쪽을 우선
            if (not prev.get("channel_url")) and it.get("channel_url"):
                # order 리스트 안에서도 교체
                idx = order.index(prev)
                order[idx] = it
                chosen[key] = it
            # 그 외에는 기존 유지
    return order

def collect_popular_videos_for_items(items, top_k=20, headless=False, dedupe=True):
    """
    items: [{'video_url':..., 'channel_name':..., 'channel_url':...?, ...}, ...]
    -> 각 item에 'popular_video_urls' 키로 리스트 추가하여 반환
    - dedupe=True 면 channel_name 기준으로 한 번만 수집
    """
    # 0) 채널명 기준 dedupe
    work_items = dedupe_by_channel_name(items) if dedupe else list(items)

    driver = _make_driver(headless=headless)
    try:
        results = []
        for it in tqdm(work_items):
            video_url = it.get("video_url")
            ch_url = it.get("channel_url")

            # 1) 채널 URL이 없으면 영상 페이지에서 확보
            if not ch_url:
                try:
                    ch_url = get_channel_url_from_video(driver, video_url)
                except Exception:
                    ch_url = None

            # 2) 채널 인기 동영상 수집
            try:
                popular_urls = list_popular_video_urls_from_channel(driver, ch_url, top_k=top_k)
            except Exception:
                popular_urls = []

            # 3) 결과 병합
            out = dict(it)
            out["channel_url"] = ch_url
            out["popular_video_urls"] = popular_urls
            results.append(out)

            # 살짝 지터
            time.sleep(random.uniform(0.3, 0.7))

        return results
    finally:
        driver.quit()

In [181]:
# 컬럼 존재 보정
required_cols = ["video_url", "channel_name"]
for c in required_cols:
    if c not in items_unique.columns:
        raise ValueError(f"[ERR] CSV에 '{c}' 컬럼이 없습니다.")

# channel_url 없으면 생성
if "channel_url" not in items_unique.columns:
    items_unique["channel_url"] = None

# 결측/공백 정리 + video_url 없는 행 제거
items_unique["video_url"] = items_unique["video_url"].astype(str).str.strip()
items_unique["channel_name"] = items_unique["channel_name"].astype(str).str.strip()
items_unique["channel_url"] = items_unique["channel_url"].astype(str).str.strip().replace({"": None, "nan": None, "None": None})

items_unique = items_unique[items_unique["video_url"].str.startswith("http")].copy()

# (선택) 위에서 만들었던 '맨 위만 남기기' 로직을 이미 했다면 스킵 가능
# 여기서는 강제 중복 제거는 하지 않고, collect 함수의 dedupe=True로 처리함.

# list[dict]로 변환 (collect_popular_videos_for_items가 기대하는 형태)
items = items_unique[["video_url", "channel_name", "channel_url"]].to_dict(orient="records")
len(items), items[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  items_unique["video_url"] = items_unique["video_url"].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  items_unique["channel_name"] = items_unique["channel_name"].astype(str).str.strip()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  items_unique["channel_url"] = items_unique[

(193,
 {'video_url': 'https://www.youtube.com/watch?v=LedMWQQ-N2E',
  'channel_name': '유병재',
  'channel_url': None})

In [185]:
items=items[:194]

In [191]:
# 방금 구성한 items를 이용해 인기영상 수집
out = collect_popular_videos_for_items(items, top_k=5, headless=True, dedupe=True)

print(len(out), "unique channels collected")
for row in out:
    print(row.get("channel_name"), "->", row.get("channel_url"), "  top:", len(row.get("popular_video_urls") or []))

100%|████████████████████████████████████████████████████████████████████████████████| 193/193 [54:03<00:00, 16.81s/it]


193 unique channels collected
유병재 -> https://www.youtube.com/@KoreanCryingGuy   top: 5
십이층 -> https://www.youtube.com/@12%EC%B8%B5   top: 5
기아타이거즈 - 갸티비 -> https://www.youtube.com/@kiatigerstv   top: 5
카더정원 -> https://www.youtube.com/@carthejungwon   top: 5
빠니보틀 Pani Bottle -> https://www.youtube.com/@PaniBottle   top: 5
보다 BODA -> https://www.youtube.com/@%EB%B3%B4%EB%8B%A4BODA   top: 5
MBC every1 -> https://www.youtube.com/@MBCevery1   top: 5
추성훈 ChooSungHoon -> https://www.youtube.com/@Choosunghoon_ajossi   top: 5
KBO -> https://www.youtube.com/@KBO1982   top: 5
노홍철 -> https://www.youtube.com/@luckyhongchul   top: 5
뜬뜬 DdeunDdeun -> https://www.youtube.com/@ddeunddeun   top: 5
침착맨 -> https://www.youtube.com/@ChimChakMan_Official   top: 5
하하 PD HAHA PD -> https://www.youtube.com/@%ED%95%98%ED%95%98PD   top: 5
고재영 -> https://www.youtube.com/@gojaeyoung   top: 5
TEO 테오 -> https://www.youtube.com/@TEO_universe   top: 5
워크맨-Workman -> https://www.youtube.com/@workman   top: 5
원지의하루 -> ht

In [None]:
with open('channel(@KoreanCryingGuy)_related_videos.json', "w", encoding="utf-8") as f:
            json.dump(out, f, ensure_ascii=False, indent=2)

### 인기동영상별 상세메타데이터 추출

In [17]:
with open("channel(@KoreanCryingGuy)_related_videos.json", "r", encoding="utf-8") as f:
    out = json.load(f)

In [19]:
# 필요 모듈
import time, random
from typing import List, Dict, Optional
import pandas as pd
from tqdm.auto import tqdm

# (안전장치) 네 함수가 참조하는 _sec_to_hms가 없다면 간단 구현
try:
    _ = _sec_to_hms  # noqa: F821
except NameError:
    def _sec_to_hms(sec: Optional[int]) -> Optional[str]:
        if sec is None:
            return None
        h = sec // 3600
        m = (sec % 3600) // 60
        s = sec % 60
        return f"{h:d}:{m:02d}:{s:02d}" if h else f"{m:d}:{s:02d}"

def _sleep_jitter(a=0.3, b=0.7):
    time.sleep(random.uniform(a, b))

def flatten_popular_urls(out: List[Dict], per_channel_limit: Optional[int] = None) -> List[Dict]:
    """
    out의 각 항목에서 popular_video_urls를 (채널 메타와 함께) 평탄화.
    - per_channel_limit: 채널당 상위 N개만 사용하고 싶을 때
    반환: [{'source_channel_name', 'source_channel_url', 'video_url'}, ...]
    """
    flat = []
    for ch in out:
        urls = (ch.get("popular_video_urls") or [])
        if per_channel_limit:
            urls = urls[:per_channel_limit]
        for u in urls:
            flat.append({
                "source_channel_name": ch.get("channel_name"),
                "source_channel_url": ch.get("channel_url"),
                "video_url": u,
            })
    return flat

def collect_video_metadata_with_ytdlp(
    out: List[Dict],
    per_channel_limit: Optional[int] = None,
    dedupe_urls: bool = True,
    sleep_range=(0.25, 0.6),
    save_path: Optional[str] = None,
) -> pd.DataFrame:
    """
    - out: 채널별 popular_video_urls가 들어있는 리스트 (너가 만든 out)
    - per_channel_limit: 채널당 몇 개까지만 수집할지 제한 (None이면 전부)
    - dedupe_urls: 서로 다른 채널에서 같은 영상이 나오면 1번만 수집
    - sleep_range: (min, max) 요청 사이 랜덤 딜레이
    - save_path: '...csv' | '...parquet' | '...json' 저장 경로 (선택)

    반환: 수집된 메타데이터 DataFrame
    """
    # 1) 평탄화 + (선택) URL 중복 제거
    flat = flatten_popular_urls(out, per_channel_limit=per_channel_limit)
    if dedupe_urls:
        seen = set()
        uniq = []
        for row in flat:
            u = row["video_url"]
            if u in seen:
                continue
            seen.add(u)
            uniq.append(row)
        flat = uniq

    # 2) 수집 실행
    rows = []
    for i, row in enumerate(
        tqdm(flat, total=len(flat), desc="Collecting video metadata",
             unit="video", dynamic_ncols=True), 1):
        url = row["video_url"]
        try:
            meta = get_youtube_video_info(url)  # 네가 이미 정의한 함수 사용
            # 채널 컨텍스트를 붙여 보존
            meta["source_channel_name"] = row["source_channel_name"]
            meta["source_channel_url"]  = row["source_channel_url"]
            rows.append(meta)
        except Exception as e:
            rows.append({
                "video_url": url,
                "error": str(e),
                "source_channel_name": row["source_channel_name"],
                "source_channel_url": row["source_channel_url"],
            })
        _sleep_jitter(*sleep_range)

    df = pd.DataFrame(rows)

    # 3) (선택) 저장
    if save_path:
        if save_path.lower().endswith(".csv"):
            df.to_csv(save_path, index=False, encoding="utf-8-sig")
        elif save_path.lower().endswith(".parquet"):
            df.to_parquet(save_path, index=False)
        elif save_path.lower().endswith(".json"):
            df.to_json(save_path, orient="records", force_ascii=False, indent=2)
        else:
            print(f"[WARN] 확장자를 알 수 없어 저장 생략: {save_path}")

    return df

In [21]:
df_meta = collect_video_metadata_with_ytdlp(
    out,
    per_channel_limit=5,   # 총 1000개
    dedupe_urls=True,
    sleep_range=(0.25, 0.6),
    save_path=None,   # 저장 원치 않으면 None
)

print(len(df_meta), "개 수집")
print(df_meta.head(3))

Collecting video metadata:   0%|                                                            | 0/936 [00:00<?, …

ERROR: [youtube] 13qE6rkL1gk: Join this channel to get access to members-only content like this video, and other exclusive perks.
ERROR: [youtube] HNNpyeTjoDQ: Sign in to confirm your age. This video may be inappropriate for some users. Use --cookies-from-browser or --cookies for the authentication. See  https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp  for how to manually pass cookies. Also see  https://github.com/yt-dlp/yt-dlp/wiki/Extractors#exporting-youtube-cookies  for tips on effectively exporting YouTube cookies
ERROR: [youtube] -338ZqRtjsk: This video is available to this channel's members on level: 😋제육볶음🥩 (or any higher level). Join this channel to get access to members-only content and other exclusive perks.
ERROR: [youtube] N_IB_7UsdYU: This video is available to this channel's members on level: 😋제육볶음🥩 (or any higher level). Join this channel to get access to members-only content and other exclusive perks.
ERROR: [youtube] OoDLCcNQFwM: Join this cha

936 개 수집
                             title     video_id  \
0                  너의 목소리가 아예 안 보여  lBOhRSfeOD4   
1       [KIA 타이거즈] 선수별 응원가 제안서.pdf  VzzZ70tWMJ8   
2  [무공해] 험한 것이 나와도 김고은이랑 무조건 공감합니다  _ttuPeDExTo   

                                     video_url published_date  \
0  https://www.youtube.com/watch?v=lBOhRSfeOD4     2025-09-22   
1  https://www.youtube.com/watch?v=VzzZ70tWMJ8     2025-09-18   
2  https://www.youtube.com/watch?v=_ttuPeDExTo     2025-09-12   

                                       thumbnail_url  view_count  like_count  \
0  https://i.ytimg.com/vi/lBOhRSfeOD4/maxresdefau...    360217.0      4333.0   
1  https://i.ytimg.com/vi/VzzZ70tWMJ8/maxresdefau...    200291.0      4356.0   
2  https://i.ytimg.com/vi/_ttuPeDExTo/maxresdefau...    826049.0     11836.0   

   comment_count duration_hms  duration_sec  ... categories  \
0          255.0        28:49        1729.0  ...   [Comedy]   
1          412.0        24:24        1464.0  ...   [Comedy]   
2          569.

In [23]:
df_meta

Unnamed: 0,title,video_id,video_url,published_date,thumbnail_url,view_count,like_count,comment_count,duration_hms,duration_sec,...,categories,tags,uploader_url,age_limit,availability,live_status,description,source_channel_name,source_channel_url,error
0,너의 목소리가 아예 안 보여,lBOhRSfeOD4,https://www.youtube.com/watch?v=lBOhRSfeOD4,2025-09-22,https://i.ytimg.com/vi/lBOhRSfeOD4/maxresdefau...,360217.0,4333.0,255.0,28:49,1729.0,...,[Comedy],"[코미디, 개그, 라이브코미디, comedy, korea comedy, 유병재, 문...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,[8/6(수)에 진행한 라이브 방송입니다],유병재,https://www.youtube.com/@KoreanCryingGuy,
1,[KIA 타이거즈] 선수별 응원가 제안서.pdf,VzzZ70tWMJ8,https://www.youtube.com/watch?v=VzzZ70tWMJ8,2025-09-18,https://i.ytimg.com/vi/VzzZ70tWMJ8/maxresdefau...,200291.0,4356.0,412.0,24:24,1464.0,...,[Comedy],"[코미디, 개그, 라이브코미디, comedy, korea comedy, 유병재, 문...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,필라이트 클리어가 12캔에 만원...⁉️\n야구 1회마다 한 캔씩 마셔도 9회까지 ...,유병재,https://www.youtube.com/@KoreanCryingGuy,
2,[무공해] 험한 것이 나와도 김고은이랑 무조건 공감합니다,_ttuPeDExTo,https://www.youtube.com/watch?v=_ttuPeDExTo,2025-09-12,https://i.ytimg.com/vi/_ttuPeDExTo/maxresdefau...,826049.0,11836.0,569.0,36:18,2178.0,...,[Comedy],"[코미디, 개그, 라이브코미디, comedy, korea comedy, 유병재, 문...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,"넷플릭스 시리즈 [은중과 상연] 드디어 👉오늘 공개 👈\n\n친구란 게... 참,\...",유병재,https://www.youtube.com/@KoreanCryingGuy,
3,[무공해] 폭주하는 공감 요정.. 아이브 레이랑 무조건 공감합니다,N5Zk-xH1e0k,https://www.youtube.com/watch?v=N5Zk-xH1e0k,2025-09-05,https://i.ytimg.com/vi/N5Zk-xH1e0k/maxresdefau...,698686.0,9461.0,581.0,35:35,2135.0,...,[Comedy],"[코미디, 개그, 라이브코미디, comedy, korea comedy, 유병재, 문...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,*사연자들은 모두 일반인입니다. 무분별한 비난은 삭제될 수 있습니다*\n\n공감이 ...,유병재,https://www.youtube.com/@KoreanCryingGuy,
4,[ENG SUB] 직역된 가사보고 노래 맞히기 (w.태래 규빈 리키),-6vOqZs6CFA,https://www.youtube.com/watch?v=-6vOqZs6CFA,2025-09-02,https://i.ytimg.com/vi/-6vOqZs6CFA/maxresdefau...,196934.0,8444.0,521.0,21:34,1294.0,...,[Comedy],"[코미디, 개그, 라이브코미디, comedy, korea comedy, 유병재, 문...",https://www.youtube.com/@KoreanCryingGuy,0.0,public,not_live,📌https://bit.ly/3ULB2X7\n👉 내가 좋아하는 T1 선수들과 스픽X...,유병재,https://www.youtube.com/@KoreanCryingGuy,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
931,"(SUB) 박규영 | “촬영 동안 돌아가시게 만든 분들이 너무 많아요” | 사마귀,...",8GS4S-ubOPE,https://www.youtube.com/watch?v=8GS4S-ubOPE,2025-09-24,https://i.ytimg.com/vi/8GS4S-ubOPE/maxresdefau...,211645.0,2211.0,140.0,38:43,2323.0,...,[Entertainment],"[내편하자, 찐친, 나래식, 박나래, 나래바, 박나래 유튜브, 코빅, 코미디빅리그,...",https://www.youtube.com/@나래식,0.0,public,not_live,작품에 따라 이미지가 확확 바뀌는\n천의 얼굴 박규영✨\n나도 넷플릭스 딸이었는데\...,나래식,https://www.youtube.com/@%EB%82%98%EB%9E%98%EC...,
932,"(SUB) 윤남노 | “감사한 기억 때문에 나오고 싶었어요” | 박나래 미담, 윤남...",fkBJJ8gPSiI,https://www.youtube.com/watch?v=fkBJJ8gPSiI,2025-09-17,https://i.ytimg.com/vi/fkBJJ8gPSiI/maxresdefau...,631572.0,5345.0,200.0,37:02,2222.0,...,[Entertainment],"[내편하자, 찐친, 나래식, 박나래, 나래바, 박나래 유튜브, 코빅, 코미디빅리그,...",https://www.youtube.com/@나래식,0.0,public,not_live,털릴대로 털린 나래시피...\n고오급 레시피 한번 배워보려고 뫼신\n ‘요리하는 돌...,나래식,https://www.youtube.com/@%EB%82%98%EB%9E%98%EC...,
933,(SUB) 송중기X천우희 | 🙋우희: 나 오늘 삐뚤어지고 싶어!! 💁‍♂️중기:난 ...,MANALVR7L7I,https://www.youtube.com/watch?v=MANALVR7L7I,2025-09-10,https://i.ytimg.com/vi_webp/MANALVR7L7I/maxres...,760945.0,4671.0,212.0,38:40,2320.0,...,[Entertainment],"[내편하자, 찐친, 나래식, 박나래, 나래바, 박나래 유튜브, 코빅, 코미디빅리그,...",https://www.youtube.com/@나래식,0.0,public,not_live,일탈해 보는 것도 청춘이잖아~😎 \n전공 분야에 신이난 주인장의 일탈 강의부터\n모...,나래식,https://www.youtube.com/@%EB%82%98%EB%9E%98%EC...,
934,(SUB) 베이비복스 | 황소개구리 먹고 대한해협까지 건넌 1세대 걸그룹 | 야만의...,VOLXrIgk6nk,https://www.youtube.com/watch?v=VOLXrIgk6nk,2025-09-03,https://i.ytimg.com/vi/VOLXrIgk6nk/maxresdefau...,626737.0,7087.0,704.0,41:59,2519.0,...,[Entertainment],"[내편하자, 찐친, 나래식, 박나래, 나래바, 박나래 유튜브, 코빅, 코미디빅리그,...",https://www.youtube.com/@나래식,0.0,public,not_live,데뷔가 번지점프?!🥶\n지금은 상상도 못 할 일들 다 겪은\n원조 쇠맛 테토녀 1세...,나래식,https://www.youtube.com/@%EB%82%98%EB%9E%98%EC...,


In [25]:
df_meta=df_meta.drop(columns=['age_limit', 'availability', 'live_status', 'source_channel_name', 'source_channel_url'])

In [33]:
df_meta=df_meta.iloc[5:,:]

In [35]:
df_all=pd.concat([df, df_meta])
df_all.describe()

Unnamed: 0,view_count,like_count,comment_count,duration_sec,subscriber_count
count,1171.0,1152.0,1121.0,1170.0,1171.0
mean,507328.2,9233.955729,697.747547,1857.963248,1602591.0
std,950595.2,20253.893475,2039.557901,3420.825403,2337990.0
min,12.0,0.0,1.0,20.0,5.0
25%,40747.0,794.25,85.0,709.25,331000.0
50%,202089.0,3586.0,254.0,1173.5,1360000.0
75%,540530.5,9110.5,652.0,1797.5,1690000.0
max,8720112.0,256684.0,44000.0,42459.0,21900000.0


In [39]:
df_all.to_csv('channel(@KoreanCryingGuy)_all_videos.csv', index=False, encoding='utf-8')