In [None]:
pip install opencv-python pillow easyocr ultralytics deepface tf-keras

In [1]:
import json
import os
import pandas as pd
import isodate

with open('video_info.json', "r", encoding="utf-8") as file:
        data = json.load(file)

flattened_data = [video for sublist in data for video in sublist]
df = pd.DataFrame(flattened_data)

def convert_duration(iso_duration):
    if pd.isna(iso_duration):  # NaN 값 처리
        return "00:00:00"
    if not isinstance(iso_duration, str):  # 문자열이 아닐 경우 처리
        return "00:00:00"

    try:
        duration = isodate.parse_duration(iso_duration)
        total_seconds = int(duration.total_seconds())
        return f"{total_seconds // 3600:02}:{(total_seconds % 3600) // 60:02}:{total_seconds % 60:02}"
    except Exception as e:
        print(f"Error parsing duration: {iso_duration} -> {e}")
        return "00:00:00"

# duration 변환 적용
df['duration'] = df['duration'].apply(convert_duration)

# 1분 이상인 데이터 필터링
df = df[df['duration'] > "00:01:00"]

In [2]:
df=df[200:203]
df

Unnamed: 0,title,video_id,published_date,thumbnail_url,view_count,like_count,comment_count,category_id,duration,channel_id
255,"결혼, 퇴사 후 유학 가는 이유 | 전공? 현실?",QAZnK8hausc,2024-10-31,https://i.ytimg.com/vi/QAZnK8hausc/hqdefault.jpg,386462.0,8497.0,1326.0,22,00:09:58,"[유네린LIN, UCpGmwseGcRtyG8GxRmozKPA]"
256,유학 준비부터 최종합격까지 4개월 | 그동안 숨긴 이유,KivJ4cMSgQw,2024-11-03,https://i.ytimg.com/vi/KivJ4cMSgQw/hqdefault.jpg,362688.0,7174.0,435.0,22,00:09:00,"[유네린LIN, UCpGmwseGcRtyG8GxRmozKPA]"
257,호주 유학 떠나는 날 | 호주 도착했어요! ✈️,eNAn_1xDZA8,2025-03-02,https://i.ytimg.com/vi/eNAn_1xDZA8/hqdefault.jpg,219345.0,2774.0,266.0,22,00:16:45,"[유네린LIN, UCpGmwseGcRtyG8GxRmozKPA]"


## 썸네일에서 피쳐 추출하기

In [10]:
import requests
import numpy as np
import cv2
from collections import Counter
from sklearn.cluster import KMeans
import easyocr
import torch
import webcolors

# YOLO 모델 로드 (Ultralytics YOLOv5 예제)
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
ocr_reader = easyocr.Reader(['en', 'ko'])  # 영어 & 한글 OCR 지원

# 🔹 이미지 로드 (URL에서 다운로드)
def load_image_from_url(url):
    try:
        response = requests.get(url, stream=True, timeout=5)
        response.raise_for_status()
        image = np.asarray(bytearray(response.content), dtype=np.uint8)
        image = cv2.imdecode(image, cv2.IMREAD_COLOR)
        if image is None:
            raise ValueError("이미지를 디코딩할 수 없습니다.")
        return image
    except requests.RequestException as e:
        print(f"이미지를 불러오는 중 오류 발생: {e}")
        return None

# 🔹 이미지 전처리 (그레이스케일 + 이진화)
def preprocess_image(image):
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    denoised_image = cv2.GaussianBlur(binary_image, (5, 5), 0)
    return denoised_image

# 🔹 텍스트 분석 (OCR)
def extract_text(image, confidence_threshold=0.7):
    results = ocr_reader.readtext(image)
    text_info = []
    for (bbox, text, prob) in results:
        if prob >= confidence_threshold:  # Filter text based on confidence threshold
            (top_left, _, bottom_right, _) = bbox
            x, y = int(top_left[0]), int(top_left[1])
            width = int(bottom_right[0] - top_left[0])
            height = int(bottom_right[1] - top_left[1])
            area = width * height
            # 신뢰도에 상관없이 모든 텍스트 정보 추출
            text_info.append({
                "text": text,
                "x": x, "y": y,
                "width": width, "height": height,
                "area": area,
                "probability": prob  # 신뢰도 정보도 함께 반환
            })
    return text_info

# 🔹 객체 탐지 (YOLO)
def detect_objects(image):
    if image is None:
        return {"objects": [], "central_focus": False}
    
    results = yolo_model(image)
    objects = []
    central_focus = False
    height, width, _ = image.shape
    
    for result in results.xyxy[0]:  # YOLOv5 결과
        x1, y1, x2, y2, conf, cls = map(int, result[:6])
        label = yolo_model.names[cls]
        
        # 객체의 중앙 여부 확인
        obj_center_x = (x1 + x2) / 2
        obj_center_y = (y1 + y2) / 2
        if (width * 0.3) < obj_center_x < (width * 0.7) and (height * 0.3) < obj_center_y < (height * 0.7):
            central_focus = True
        
        objects.append({"label": label, "x": x1, "y": y1, "width": x2-x1, "height": y2-y1})
    
    return {"objects": objects, "central_focus": central_focus}


# 주요 색상 추출 및 이름 변환
def closest_colour(requested_colour):
    """주어진 RGB 값과 가장 가까운 CSS3 색상명을 찾음"""
    min_colours = {}
    for name in webcolors.names("css3"):
        r_c, g_c, b_c = webcolors.name_to_rgb(name)
        rd = (r_c - requested_colour[0]) ** 2
        gd = (g_c - requested_colour[1]) ** 2
        bd = (b_c - requested_colour[2]) ** 2
        min_colours[(rd + gd + bd)] = name
    return min_colours[min(min_colours.keys())]

def get_color_name_from_rgb(r, g, b):
    """정확한 색상이 있으면 반환, 없으면 가장 가까운 색상명 반환"""
    try:
        return webcolors.rgb_to_name((r, g, b), spec='css3')
    except ValueError:
        return closest_colour((r, g, b))

def extract_colors(image, num_colors=5):
    """이미지에서 주요 색상을 추출하고, 각 색상의 비율을 반환"""
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).reshape((-1, 3))
    kmeans = KMeans(n_clusters=num_colors, n_init="auto")
    labels = kmeans.fit_predict(image)
    palette = kmeans.cluster_centers_.astype(int)
    
    # 색상의 출현 빈도 계산
    counter = np.bincount(labels)
    total_count = np.sum(counter)
    
    # 주요 색상 및 비율 저장
    color_ratios = {tuple(palette[i]): counter[i] / total_count for i in range(len(palette))}
    
    # 색상명을 가져오도록 수정
    color_list = [(get_color_name_from_rgb(*color), ratio) for color, ratio in color_ratios.items()]
    
    return sorted(color_list, key=lambda x: x[1], reverse=True)

# 🔹 밝기 및 대비 계산
def calculate_brightness(image):
    return np.mean(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

def calculate_contrast(image):
    return np.std(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

# 🔹 전체 썸네일 분석 함수
def analyze_image(image_url):
    image = load_image_from_url(image_url)
    if image is None:
        return None
    
    # Step 1: 이미지 특성 추출 (색상, 밝기, 대비)
    colors = extract_colors(image)
    brightness = calculate_brightness(image)
    contrast = calculate_contrast(image)
    
    # Step 2: 이미지 전처리 후 OCR 텍스트 추출
    processed_image = preprocess_image(image)
    text_data = extract_text(processed_image, confidence_threshold=0.7)
    
    # Step 3: 객체 탐지 (YOLO)
    object_data = detect_objects(image)
    
    # 결과 리턴
    result = {
        "dominant_colors": colors,
        "brightness": brightness,
        "contrast": contrast,
        "text_details": text_data,
        "object_details": object_data
    }
    
    return result

brightness_li=[]
contrast_li=[]
dominant_colors_li=[]
text_details_li=[]
largest_text_li=[]
objects_details_li=[]

for i in df['thumbnail_url']:
    image_url = i
    analysis_result = analyze_image(image_url)
    
    brightness_li.append(analysis_result['brightness'])
    contrast_li.append(analysis_result['contrast'])
    dominant_colors_li.append(analysis_result['dominant_colors'])
    text_details_li.append(analysis_result['text_details'])
    objects_details_li.append(analysis_result['object_details']['objects'])

df['brightness']=brightness_li
df['contrast']=contrast_li
df['dominant_colors']=dominant_colors_li
df['text_details']=text_details_li
df['objects_details']=objects_details_li

Using cache found in C:\Users\duwjd/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-3-18 Python-3.11.7 torch-2.6.0+cu118 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):


In [11]:
brightness_li=[]
contrast_li=[]
dominant_colors_li=[]
text_details_li=[]
largest_text_li=[]
objects_details_li=[]

for i in df['thumbnail_url']:
    image_url = i
    analysis_result = analyze_image(image_url)
    
    brightness_li.append(analysis_result['brightness'])
    contrast_li.append(analysis_result['contrast'])
    dominant_colors_li.append(analysis_result['dominant_colors'])
    text_details_li.append(analysis_result['text_details'])
    objects_details_li.append(analysis_result['object_details']['objects'])

df['brightness']=brightness_li
df['contrast']=contrast_li
df['dominant_colors']=dominant_colors_li
df['text_details']=text_details_li
df['objects_details']=objects_details_li

  with amp.autocast(autocast):
  with amp.autocast(autocast):
  with amp.autocast(autocast):


## 썸네일 분석결과 정리 및 칼럼 추출

In [197]:
url='https://i.ytimg.com/vi/FR4E9DPYG3A/hqdefault.jpg'
response = requests.get(url, stream=True, timeout=5)
response.raise_for_status()
image = np.asarray(bytearray(response.content), dtype=np.uint8)
image = cv2.imdecode(image, cv2.IMREAD_COLOR)
# 이미지 크기 얻기
height, width, _ = image.shape  # height와 width 값을 가져옵니다.
print(f"Image Width: {width}, Image Height: {height}")

Image Width: 480, Image Height: 360


In [15]:
def classify_positions(image_width, image_height, text_details, object_details):
    def classify_position(x, y, width, height):
        """ 위치를 왼쪽/중간/오른쪽, 위/중간/아래, 크기별로 분류 """
        right_x = x + width
        bottom_y = y + height

        # 가로 위치 (left, middle, right)
        if right_x < image_width * 0.33:
            horizontal_position = "left"
        elif x > image_width * 0.67:
            horizontal_position = "right"
        else:
            horizontal_position = "middle"

        # 세로 위치 (up, middle, down)
        if bottom_y < image_height * 0.33:
            vertical_position = "up"
        elif y > image_height * 0.67:
            vertical_position = "down"
        else:
            vertical_position = "middle"

        # 크기 분류 (s, m, l)
        area = width * height
        size_category = "s" if area < 10000 else ("m" if area < 30000 else "l")

        return f"{horizontal_position} {vertical_position} {size_category}"

    # 📌 Text 위치 분석
    text_positions = [classify_position(td['x'], td['y'], td['width'], td['height']) for td in text_details]
    if not text_positions:  # 텍스트가 없을 경우
        text_positions = ["텍스트 없음"]

    # 📌 Person 위치 분석
    person_positions = [
        classify_position(obj['x'], obj['y'], obj['width'], obj['height'])
        for obj in object_details if obj['label'] == 'person'
    ]
    if not person_positions:  # 사람 객체가 없을 경우
        person_positions = ["사람 없음"]

    # 📌 신뢰도 높은 텍스트 추출 (probability ≥ 0.7)
    high_confidence_texts = [td["text"] for td in text_details if td.get("probability", 0) >= 0.7]
    prob_text = high_confidence_texts if high_confidence_texts else ["해당 없음"]

    return text_positions, person_positions, prob_text


# 🖼️ DataFrame에 새로운 컬럼 추가
df[["text_positions", "person_positions", "prob_text"]] = df.apply(
    lambda row: pd.Series(
        classify_positions(
            image_width=800,  # 이미지 가로 크기
            image_height=600,  # 이미지 세로 크기
            text_details=row["text_details"],
            object_details=row["objects_details"]
        )
    ),
    axis=1
)

In [17]:
objects_li=[]
for i in df['objects_details']:
    object_li=[]
    for j in i:
        object_li.append(j['label'])
    objects_li.append(object_li)

assert len(df) == len(objects_li), "Length of text_position_li does not match the number of rows in df"
df['contain_object'] = objects_li

del df['text_details']
del df['objects_details']

In [45]:
print(df.iloc[2])

title                                      호주 유학 떠나는 날 | 호주 도착했어요! ✈️
video_id                                                  eNAn_1xDZA8
published_date                                             2025-03-02
thumbnail_url        https://i.ytimg.com/vi/eNAn_1xDZA8/hqdefault.jpg
view_count                                                   219345.0
like_count                                                     2774.0
comment_count                                                   266.0
category_id                                                        22
duration                                                     00:16:45
channel_id                         [유네린LIN, UCpGmwseGcRtyG8GxRmozKPA]
brightness                                                 110.232338
contrast                                                    76.880678
dominant_colors     [(darkgrey, 0.30931712962962965), (black, 0.28...
text_positions                                               [텍스트 없음]
person_positions    

In [47]:
import requests
from bs4 import BeautifulSoup

# 크롤링하려는 YouTube 채널 URL
channel_url = 'https://www.youtube.com/channel/UCpGmwseGcRtyG8GxRmozKPA'  # 예시 채널 URL

# HTTP 요청 보내기
response = requests.get(channel_url)

# BeautifulSoup을 사용해 HTML 파싱
soup = BeautifulSoup(response.text, 'html.parser')

# 채널 이름 추출
channel_name = soup.find('meta', {'property': 'og:title'})['content']
print(f"Channel Name: {channel_name}")

# 구독자 수 추출 (구독자 수는 페이지 내에 'yt-subscription-button-subscriber-count-branded-horizontal' 클래스에 포함되어 있음)
subscriber_count = soup.find('span', {'class': 'yt-subscription-button-subscriber-count-branded-horizontal'})
if subscriber_count:
    print(f"Subscriber Count: {subscriber_count.text.strip()}")
else:
    print("Subscriber count not found")

Channel Name: 유네린LIN
Subscriber count not found


In [137]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time

channel_name_li=[]
subscriber_count_li=[]
video_count_li=[]

# Chrome 웹드라이버 설정
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

for i in df['channel_id']:
    # 크롤링하려는 YouTube 채널 URL
    channel_url = f'https://www.youtube.com/channel/{i[1]}'
    
    # 채널 페이지 열기
    driver.get(channel_url)
    
    # 페이지 로딩 대기 (동적 콘텐츠가 로딩될 시간을 줍니다)
    time.sleep(1)
    
    # 정보 가져오기
    channel_name = driver.find_element(By.XPATH, '//meta[@property="og:title"]').get_attribute('content')
    channel_name_li.append(channel_name)
    subscriber_count = driver.find_element(By.XPATH, '//*[@id="page-header"]/yt-page-header-renderer/yt-page-header-view-model/div/div[1]/div/yt-content-metadata-view-model/div[2]/span[1]').text
    subscriber_count_li.append(subscriber_count)
    video_count = driver.find_element(By.XPATH, '//*[@id="page-header"]/yt-page-header-renderer/yt-page-header-view-model/div/div[1]/div/yt-content-metadata-view-model/div[2]/span[2]').text
    video_count_li.append(video_count)

# 브라우저 종료
driver.quit()


In [141]:
video_count_li

['•', '•', '•']

In [145]:
channel_url = 'https://www.youtube.com/channel/UCpGmwseGcRtyG8GxRmozKPA'
    
    # 채널 페이지 열기
driver.get(channel_url)
    

subscriber_count = driver.find_element(By.XPATH, '//*[@id="page-header"]/yt-page-header-renderer/yt-page-header-view-model/div/div[1]/div/yt-content-metadata-view-model/div[2]/span[3]').text
subscriber_count_li.append(subscriber_count)

MaxRetryError: HTTPConnectionPool(host='localhost', port=60506): Max retries exceeded with url: /session/a47ca2b1d1adbc4e2e1b3422223312eb/url (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000188C40C6450>: Failed to establish a new connection: [WinError 10061] 대상 컴퓨터에서 연결을 거부했으므로 연결하지 못했습니다'))