In [None]:
import json
import os
import pandas as pd
import isodate
import requests
import numpy as np
import cv2
from collections import Counter
from sklearn.cluster import KMeans
import easyocr
import torch
import webcolors
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
from tqdm import tqdm
from selenium.webdriver.chrome.options import Options
import random

## 썸네일

In [None]:
# Load the YOLO model (Ultralytics YOLOv5 example): the 'yolov5s' entry point
# of the 'ultralytics/yolov5' torch.hub repository.
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
# Shared EasyOCR reader used by extract_text().
ocr_reader = easyocr.Reader(['en', 'ko'])  # English & Korean OCR support

# 이미지 로드 (URL에서 다운로드)
def load_image_from_url(url):
    """Download an image from *url* and decode it into an OpenCV colour array.

    Args:
        url: Address of the image to fetch.

    Returns:
        The array produced by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``, or
        ``None`` when the download fails, the response body is empty, or the
        payload cannot be decoded as an image.
    """
    try:
        # The whole body is needed for decoding, so streaming brings nothing;
        # reading ``content`` inside the ``try`` keeps mid-transfer network
        # errors covered by the handler below.
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        payload = response.content
    except requests.RequestException as e:
        print(f"이미지를 불러오는 중 오류 발생: {e}")
        return None

    if not payload:
        # An empty buffer cannot be decoded; treat it like any other
        # undecodable payload.
        print("이미지를 디코딩할 수 없습니다.")
        return None

    image = cv2.imdecode(np.frombuffer(payload, dtype=np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        # Previously a ValueError was raised here, but it was not caught by
        # the ``requests.RequestException`` handler and escaped to the caller.
        # Callers test for ``None``, so report the failure and return ``None``.
        print("이미지를 디코딩할 수 없습니다.")
        return None
    return image

# 이미지 전처리 (그레이스케일 + 이진화)
def preprocess_image(image):
    """Turn a BGR image into a blurred, Otsu-binarised grayscale image.

    Steps: BGR -> grayscale, binary threshold with Otsu's method, then a
    5x5 Gaussian blur of the binarised result.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    otsu_binary = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    # cv2.threshold returns (threshold_used, image); only the image is needed.
    binarised = cv2.threshold(grayscale, 0, 255, otsu_binary)[1]
    return cv2.GaussianBlur(binarised, (5, 5), 0)

# 텍스트 분석 (OCR)
def extract_text(image, confidence_threshold=0.7):
    """Run OCR on *image* and describe every sufficiently confident detection.

    Args:
        image: Image passed straight to ``ocr_reader.readtext``.
        confidence_threshold: Minimum OCR probability a detection must reach
            to be kept.

    Returns:
        A list of dicts with keys ``text``, ``x``, ``y``, ``width``,
        ``height``, ``area`` and ``probability``. The box is derived from the
        first and third points of the OCR bounding box, which are treated as
        the top-left and bottom-right corners.
    """
    kept = []
    for corners, content, score in ocr_reader.readtext(image):
        if not (score >= confidence_threshold):
            continue

        upper_left, _, lower_right, _ = corners
        box_w = int(lower_right[0] - upper_left[0])
        box_h = int(lower_right[1] - upper_left[1])
        kept.append({
            "text": content,
            "x": int(upper_left[0]),
            "y": int(upper_left[1]),
            "width": box_w,
            "height": box_h,
            "area": box_w * box_h,
            "probability": score,
        })
    return kept

# 객체 탐지 (YOLO)
def detect_objects(image):
    """Detect objects with the YOLO model and flag centrally placed ones.

    Args:
        image: Image array whose ``shape`` unpacks to (height, width, _), or
            ``None``.

    Returns:
        ``{"objects": [...], "central_focus": bool}``. Each object is a dict
        with ``label``, ``x``, ``y``, ``width`` and ``height``.
        ``central_focus`` is True when the centre of at least one box lies
        strictly inside the middle 30%-70% band on both axes. A ``None``
        image yields an empty result.
    """
    if image is None:
        return {"objects": [], "central_focus": False}

    detections = yolo_model(image)
    img_h, img_w, _ = image.shape
    x_low, x_high = img_w * 0.3, img_w * 0.7
    y_low, y_high = img_h * 0.3, img_h * 0.7

    found = []
    centred = False
    for row in detections.xyxy[0]:  # YOLOv5 result rows
        # Every field, including the confidence, is truncated to int here.
        left, top, right, bottom, _conf, class_id = map(int, row[:6])
        name = yolo_model.names[class_id]

        mid_x = (left + right) / 2
        mid_y = (top + bottom) / 2
        inside_x = x_low < mid_x < x_high
        inside_y = y_low < mid_y < y_high
        if inside_x and inside_y:
            centred = True

        found.append({
            "label": name,
            "x": left,
            "y": top,
            "width": right - left,
            "height": bottom - top,
        })

    return {"objects": found, "central_focus": centred}


# 주요 색상 추출 및 이름 변환
def closest_colour(requested_colour):
    """Return the CSS3 colour name nearest to the given RGB value.

    Distance is the squared Euclidean distance in RGB space. When several
    names share the same distance, the one listed last by
    ``webcolors.names("css3")`` is returned.
    """
    want_r = requested_colour[0]
    want_g = requested_colour[1]
    want_b = requested_colour[2]

    def squared_distance(colour_name):
        red, green, blue = webcolors.name_to_rgb(colour_name)
        return (red - want_r) ** 2 + (green - want_g) ** 2 + (blue - want_b) ** 2

    # Keyed by distance: a later name with an equal distance replaces the
    # earlier one.
    name_by_distance = {
        squared_distance(colour_name): colour_name
        for colour_name in webcolors.names("css3")
    }
    return name_by_distance[min(name_by_distance)]

def get_color_name_from_rgb(r, g, b):
    """Return the exact CSS3 name for (r, g, b), or the nearest one if none matches."""
    rgb = (r, g, b)
    try:
        exact_name = webcolors.rgb_to_name(rgb, spec='css3')
    except ValueError:
        # No CSS3 colour has exactly this value; fall back to the nearest.
        return closest_colour(rgb)
    return exact_name

def extract_colors(image, num_colors=5):
    """Extract the dominant colours of an image and the share of pixels in each.

    Args:
        image: BGR image array (converted to RGB before clustering).
        num_colors: Number of K-Means clusters, i.e. colours to report.

    Returns:
        A list of ``(css3_colour_name, ratio)`` tuples, one per cluster,
        sorted by ratio in descending order. The ratios sum to 1.
    """
    pixels = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).reshape((-1, 3))
    kmeans = KMeans(n_clusters=num_colors, n_init="auto")
    labels = kmeans.fit_predict(pixels)
    palette = kmeans.cluster_centers_.astype(int)

    # Pixel count per cluster; minlength keeps the counts aligned with the
    # palette even if the highest-numbered cluster received no pixels.
    counts = np.bincount(labels, minlength=len(palette))
    total_count = np.sum(counts)

    # Pair each cluster with its own ratio directly. Keying a dict by the
    # truncated RGB tuple would silently drop a cluster whenever two centres
    # truncate to the same colour, so the ratios would no longer sum to 1.
    color_list = [
        (get_color_name_from_rgb(*rgb), count / total_count)
        for rgb, count in zip(palette, counts)
    ]

    return sorted(color_list, key=lambda pair: pair[1], reverse=True)

# 🔹 밝기 및 대비 계산
def calculate_brightness(image):
    """Return the mean grayscale intensity of a BGR image."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return np.mean(grayscale)

def calculate_contrast(image):
    """Return the standard deviation of the grayscale intensities of a BGR image."""
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return np.std(grayscale)

# 🔹 전체 썸네일 분석 함수
def analyze_image(image_url):
    """Run the full thumbnail analysis for the image at *image_url*.

    Returns:
        ``None`` when the image cannot be loaded; otherwise a dict with
        ``dominant_colors``, ``brightness``, ``contrast``, ``text_details``
        (OCR on the preprocessed image, threshold 0.7) and ``object_details``
        (YOLO detections on the original image).
    """
    image = load_image_from_url(image_url)
    if image is None:
        return None

    # Dict values are evaluated top to bottom: colour/brightness/contrast
    # first, then OCR on the preprocessed image, then object detection.
    return {
        "dominant_colors": extract_colors(image),
        "brightness": calculate_brightness(image),
        "contrast": calculate_contrast(image),
        "text_details": extract_text(
            preprocess_image(image), confidence_threshold=0.7
        ),
        "object_details": detect_objects(image),
    }

# Per-row accumulators for the thumbnail features; each ends up with exactly
# one entry per row of df so they can be assigned back as columns.
brightness_li = []
contrast_li = []
dominant_colors_li = []
text_details_li = []
largest_text_li = []
objects_details_li = []

for image_url in df['thumbnail_url']:
    analysis_result = analyze_image(image_url)

    if analysis_result is None:
        # analyze_image returns None when the thumbnail cannot be loaded.
        # Subscripting None would raise TypeError, so append placeholders
        # instead: NaN for the numeric features and empty lists for the
        # list-valued ones, which the later per-row processing can iterate.
        brightness_li.append(np.nan)
        contrast_li.append(np.nan)
        dominant_colors_li.append([])
        text_details_li.append([])
        objects_details_li.append([])
        continue

    brightness_li.append(analysis_result['brightness'])
    contrast_li.append(analysis_result['contrast'])
    dominant_colors_li.append(analysis_result['dominant_colors'])
    text_details_li.append(analysis_result['text_details'])
    objects_details_li.append(analysis_result['object_details']['objects'])

df['brightness'] = brightness_li
df['contrast'] = contrast_li
df['dominant_colors'] = dominant_colors_li
df['text_details'] = text_details_li
df['objects_details'] = objects_details_li

In [None]:
def classify_positions(image_width, image_height, text_details, object_details):
    """Classify where texts and persons sit in an image and list the confident texts.

    Args:
        image_width: Image width used for the horizontal thirds.
        image_height: Image height used for the vertical thirds.
        text_details: Dicts with ``x``, ``y``, ``width``, ``height``, ``text``
            and optionally ``probability``.
        object_details: Dicts with ``label``, ``x``, ``y``, ``width``, ``height``.

    Returns:
        A tuple ``(text_positions, person_positions, prob_text)``. Positions
        are strings of the form ``"<left|middle|right> <up|middle|down>
        <s|m|l>"``. Each list falls back to a single placeholder entry when
        it would otherwise be empty.
    """
    left_edge = image_width * 0.33
    right_edge = image_width * 0.67
    upper_edge = image_height * 0.33
    lower_edge = image_height * 0.67

    def describe(box):
        """Build the position/size string for one bounding-box dict."""
        x, y, w, h = box['x'], box['y'], box['width'], box['height']

        # Horizontal band: entirely within the left third, starting in the
        # right third, or anything else.
        if x + w < left_edge:
            column = "left"
        elif x > right_edge:
            column = "right"
        else:
            column = "middle"

        # Vertical band, same scheme.
        if y + h < upper_edge:
            row = "up"
        elif y > lower_edge:
            row = "down"
        else:
            row = "middle"

        # Size bucket by pixel area.
        area = w * h
        if area < 10000:
            size = "s"
        elif area < 30000:
            size = "m"
        else:
            size = "l"

        return f"{column} {row} {size}"

    text_positions = [describe(entry) for entry in text_details] or ["텍스트 없음"]

    person_positions = [
        describe(entry) for entry in object_details if entry['label'] == 'person'
    ] or ["사람 없음"]

    # Texts whose OCR probability is at least 0.7.
    prob_text = [
        entry["text"] for entry in text_details if entry.get("probability", 0) >= 0.7
    ] or ["해당 없음"]

    return text_positions, person_positions, prob_text


def _positions_for_row(row):
    """Return the three classify_positions results for one row as a Series."""
    # NOTE(review): the 800x600 size is hard-coded; it presumably does not
    # match every thumbnail's real dimensions -- confirm against the data.
    return pd.Series(
        classify_positions(
            image_width=800,  # image width
            image_height=600,  # image height
            text_details=row["text_details"],
            object_details=row["objects_details"],
        )
    )


# Row-wise apply; the three returned values fill the three new columns.
df[["text_positions", "person_positions", "prob_text"]] = df.apply(
    _positions_for_row, axis=1
)

In [None]:
# For every row, collect the labels of all detected objects.
objects_li = [
    [detected['label'] for detected in row_objects]
    for row_objects in df['objects_details']
]

# Sanity check before assigning the column (the message previously named an
# unrelated variable).
assert len(df) == len(objects_li), "Length of objects_li does not match the number of rows in df"
df['contain_object'] = objects_li

# The raw detail columns are dropped here, after the derived columns are built.
del df['text_details']
del df['objects_details']

## 제목

In [None]:
import re
import emoji
import pandas as pd
from konlpy.tag import Okt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

from nltk.corpus import stopwords

# Initialise KoNLPy's Okt morphological analyser (used for noun extraction).
okt = Okt()

# Stop-word lists (Korean and English) filtered out of the extracted nouns.
stopwords_korean = [
    '이', '그', '저', '그리고', '하지만', '그래서', '또는', '왜', '어떻게', '나', '너', '저희', '우리', '그녀', '그의', 
    '그들', '같은', '많은', '다', '좀', '그렇지만', '여기', '거기', '이것', '그것', '이야', '할', '지금', '시간', '것', 
    '수', '같이', '되다', '하다', '있다', '없다', '위해', '왜냐하면', '하기', '까지', '좀', '나중에'
]
stopwords_english = [
    'the', 'and', 'a', 'an', 'in', 'on', 'at', 'for', 'with', 'about', 'as', 'by', 'of', 'to', 'from', 'that', 'which', 
    'who', 'whom', 'this', 'it', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'has', 'have', 'had', 'having', 'do', 
    'does', 'did', 'doing', 'doing', 'themselves', 'yours', 'ours', 'its', 'their', 'theirs', 'what', 'how', 'why', 'where', 
    'when', 'i', 'you', 'he', 'she', 'we', 'they', 'all', 'any', 'one', 'some', 'each', 'every', 'no', 'not', 'nor', 'only', 
    'own', 'same', 'so', 'than', 'too', 'very', 'just', 'don’t', 'should', 'now', 'up', 'down', 'here', 'there', 'when', 
    'where', 'why'
]
# Initialise the SentimentIntensityAnalyzer used for title sentiment.
analyzer = SentimentIntensityAnalyzer()

# 텍스트 전처리 함수 (특수문자 제거 및 소문자 변환)
def preprocess_text(text):
    """Remove every character that is neither a word character nor whitespace, then lowercase."""
    return re.sub(r'[^\w\s]', '', text).lower()

# 제목 길이 계산
def calculate_title_length(title):
    """Return ``len(title)``."""
    length = len(title)
    return length

# 감성 분석 함수
def sentiment_analysis(title):
    """Label *title* as 'positive', 'negative' or 'neutral' from its compound score.

    NOTE(review): NLTK's SentimentIntensityAnalyzer is presumably
    English-oriented, so Korean titles may mostly come out 'neutral' --
    confirm before relying on this feature.
    """
    compound = analyzer.polarity_scores(title)['compound']
    if compound > 0:
        return 'positive'
    if compound < 0:
        return 'negative'
    return 'neutral'

# 이모티콘 포함 여부 확인
def contains_emoji(title):
    """Return True when *title* contains at least one emoji.

    Uses ``emoji.emoji_count`` so that emoji made of several code points
    (flags, keycaps, ZWJ sequences) are recognised. Testing each character
    with ``emoji.is_emoji`` misses a title whose only emoji are sequences
    built from code points that are not emoji on their own.
    """
    return emoji.emoji_count(title) > 0

def count_emojis(title):
    """Return the number of emoji in *title*.

    Uses ``emoji.emoji_count`` so that a multi-code-point emoji (flag,
    keycap, ZWJ sequence) counts once. Counting characters that pass
    ``emoji.is_emoji`` miscounts such sequences.
    """
    return emoji.emoji_count(title)

# 특수문자 포함 여부 확인
def contains_special_characters(title):
    """Return True when *title* has any character that is neither a word character nor whitespace."""
    first_special = re.search(r'[^\w\s]', title)
    return first_special is not None

# 핵심 키워드 추출 함수 (KoNLPy를 사용한 명사 추출)
def extract_keywords_korean(title):
    """Extract up to three keyword nouns from *title*.

    Nouns are taken from the Okt analyser, stop words (Korean and English)
    are removed, and the remaining nouns are counted with CountVectorizer.

    Returns:
        The (up to) three most frequent tokens, most frequent first. An
        empty list is returned when no noun survives the stop-word filter
        or when the vectorizer ends up with an empty vocabulary.
    """
    nouns = okt.nouns(title)  # nouns only
    filtered_nouns = [
        word for word in nouns
        if word not in stopwords_korean and word not in stopwords_english
    ]

    if not filtered_nouns:
        return []

    # Unigram counts over the single "document" made of the filtered nouns.
    vectorizer = CountVectorizer(stop_words=None, ngram_range=(1, 1))
    try:
        X = vectorizer.fit_transform([' '.join(filtered_nouns)])
    except ValueError:
        # CountVectorizer's default token pattern only keeps tokens of two or
        # more word characters. When every remaining noun is one character
        # long the vocabulary is empty and fit_transform raises ValueError;
        # treat that as "no keywords" instead of crashing.
        return []
    word_freq = dict(zip(vectorizer.get_feature_names_out(), X.toarray().flatten()))

    # Top three by frequency (stable sort keeps the vectorizer's order on ties).
    sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
    return [word for word, freq in sorted_words[:3]]

# 전체 피쳐 추출 함수
def extract_features_from_titles(df):
    """Build a DataFrame of title features, one row per entry of ``df['title']``.

    Columns: ``title``, ``title_length``, ``sentiment``, ``has_emoji``,
    ``emoji_count``, ``has_special_characters`` and ``keywords``. Keywords
    are extracted from the preprocessed title; every other feature uses the
    raw title.
    """
    def features_for(raw_title):
        """Compute the feature dict for a single title."""
        cleaned_title = preprocess_text(raw_title)
        return {
            'title': raw_title,
            'title_length': calculate_title_length(raw_title),
            'sentiment': sentiment_analysis(raw_title),
            'has_emoji': contains_emoji(raw_title),
            'emoji_count': count_emojis(raw_title),
            'has_special_characters': contains_special_characters(raw_title),
            'keywords': extract_keywords_korean(cleaned_title),
        }

    rows = [features_for(raw_title) for raw_title in df['title']]
    return pd.DataFrame(rows)

# Extract the title features for every row of df.
feature_df = extract_features_from_titles(df)