In [45]:
import ast
import cv2
import numpy as np
import requests
import torch  # YOLOv5는 PyTorch로 구현됨
from collections import Counter, defaultdict
from sklearn.cluster import KMeans
import webcolors
import easyocr
import pandas as pd
from datetime import datetime
from konlpy.tag import Okt
import emoji
import re

# 사용자 입력 받기
title = '저 드디어 강남에 샵 오픈했어요😭취향 가득 담은 제 샵을 소개합니다🤍 | 인테리어 브이로그 | 반셀프 인테리어 | 뷰티샵 인테리어 | 재유JEYU'
thumbnail_url = 'https://i.ytimg.com/vi/IbxI43fHWnk/maxresdefault.jpg'
duration = '08:30'
subscriber_count = 200000

# 현재 시간 기준으로 pub_year, pub_month, pub_weekday 구하기
current_time = datetime.now()
pub_year = current_time.year
pub_month = current_time.month
pub_weekday = current_time.weekday()  # 월요일=0, 일요일=6

# 입력받은 데이터를 dictionary로 저장
data = {
    "title": title,
    "thumbnail_url": thumbnail_url,
    "duration": duration,
    "subscriber_count": subscriber_count,
    "pub_year": pub_year,
    "pub_month": pub_month,
    "pub_weekday": pub_weekday
}

# DataFrame으로 변환
df = pd.DataFrame([data])

# OCR Reader 설정
ocr_reader = easyocr.Reader(['en', 'ko'])  # 영어 & 한글 OCR 지원

def convert_duration_to_minutes(duration_str):
    try:
        hours, minutes = map(int, duration_str.split(":"))
        return hours * 60 + minutes
    except Exception as e:
        print(f"Error converting duration: {e}")
        return 0

# 'duration' 컬럼을 분으로 변환
df['duration'] = df['duration'].apply(convert_duration_to_minutes)

# 이미지 로드 (URL에서 다운로드)
def load_image_from_url(url):
    fallback_urls = [
        url,
        url.replace('maxresdefault', 'sddefault'),
        url.replace('sddefault', 'hqdefault'),
        url.replace('hqdefault', 'mqdefault'),
        url.replace('mqdefault', 'default')
    ]

    for new_url in fallback_urls:
        try:
            response = requests.get(new_url, stream=True, timeout=5)
            response.raise_for_status()
            image = np.asarray(bytearray(response.content), dtype=np.uint8)
            image = cv2.imdecode(image, cv2.IMREAD_COLOR)
            if image is not None:
                height, width, _ = image.shape
                return image, height, width
        except requests.RequestException:
            print(f"이미지 로드 실패: {new_url}")

    raise ValueError("모든 URL에서 이미지를 불러올 수 없습니다.")

# 이미지 전처리 (그레이스케일 + 이진화)
def preprocess_image(image):
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    denoised_image = cv2.GaussianBlur(binary_image, (5, 5), 0)
    return denoised_image

# 텍스트 분석 (OCR)
def extract_text(image, confidence_threshold=0.7):
    results = ocr_reader.readtext(image)
    text_info = []
    for (bbox, text, prob) in results:
        if prob >= confidence_threshold:
            (top_left, _, bottom_right, _) = bbox
            x, y = int(top_left[0]), int(top_left[1])
            width = int(bottom_right[0] - top_left[0])
            height = int(bottom_right[1] - top_left[1])
            area = width * height
            
            text_info.append({
                "text": text,
                "x": x, "y": y,
                "width": width, "height": height,
                "area": area,
                "probability": prob
            })
    return text_info

# YOLOv5 모델 로드 (PyTorch Hub)
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')  # 'yolov5s'는 작은 모델 (빠름)

# 객체 탐지 (YOLOv5)
def detect_objects_with_yolov5(image):
    if image is None:
        return {"objects": [], "central_focus": False}
    
    # YOLOv5 모델을 사용하여 객체 감지
    results = model(image)  # 이미지에서 객체 감지
    objects = []
    central_focus = False
    height, width, _ = image.shape

    # 감지된 객체들
    for det in results.xywh[0].cpu().numpy():  # 결과는 (x_center, y_center, width, height, confidence, class) 형식
        x_center, y_center, w, h, conf, cls = det
        if conf > 0.5:  # confidence score가 0.5 이상인 객체만 사용
            label = results.names[int(cls)]  # 객체의 클래스 이름 (예: 'person', 'car', etc.)
            x1 = int((x_center - w / 2) * width)
            y1 = int((y_center - h / 2) * height)
            x2 = int((x_center + w / 2) * width)
            y2 = int((y_center + h / 2) * height)
            objects.append({"label": label, "x": x1, "y": y1, "width": x2 - x1, "height": y2 - y1})
            
            # 중앙에 가까운지 확인
            if (width * 0.3) < x_center < (width * 0.7) and (height * 0.3) < y_center < (height * 0.7):
                central_focus = True
    
    # 감지된 객체와 중앙에 있는지 여부 리턴
    return {"objects": objects, "central_focus": central_focus}

# 주요 색상 추출 및 이름 변환
def closest_colour(requested_colour):
    """주어진 RGB 값과 가장 가까운 CSS3 색상명을 찾음"""
    min_colours = {}
    for name in webcolors.names("css3"):
        r_c, g_c, b_c = webcolors.name_to_rgb(name)
        rd = (r_c - requested_colour[0]) ** 2
        gd = (g_c - requested_colour[1]) ** 2
        bd = (b_c - requested_colour[2]) ** 2
        min_colours[(rd + gd + bd)] = name
    return min_colours[min(min_colours.keys())]

def get_color_name_from_rgb(r, g, b):
    """정확한 색상이 있으면 반환, 없으면 가장 가까운 색상명 반환"""
    try:
        return webcolors.rgb_to_name((r, g, b), spec='css3')
    except ValueError:
        return closest_colour((r, g, b))

def extract_colors(image, num_colors=3):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).reshape((-1, 3))
    kmeans = KMeans(n_clusters=num_colors, n_init="auto")
    labels = kmeans.fit_predict(image)
    palette = kmeans.cluster_centers_.astype(int)
    
    # 색상의 출현 빈도 계산
    counter = np.bincount(labels)
    total_count = np.sum(counter)
    # 주요 색상 및 비율 저장
    color_ratios = {tuple(palette[i]): counter[i] / total_count for i in range(len(palette))}
    # 색상명을 가져오도록 수정
    color_list = [(get_color_name_from_rgb(*color), ratio) for color, ratio in color_ratios.items()]
    
    return sorted(color_list, key=lambda x: x[1], reverse=True)

# 밝기 및 대비 계산
def calculate_brightness(image):
    return np.mean(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

def calculate_contrast(image):
    return np.std(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

# 썸네일 분석 함수
def analyze_image(image_url):
    image, height, width = load_image_from_url(image_url)
    if image is None:
        return None
    
    # Step 1: 이미지 특성 추출 (색상, 밝기, 대비)
    colors = extract_colors(image)
    brightness = calculate_brightness(image)
    contrast = calculate_contrast(image)
    
    # Step 2: 이미지 전처리 후 OCR 텍스트 추출
    processed_image = preprocess_image(image)
    text_data = extract_text(processed_image, confidence_threshold=0.7)
    
    # Step 3: 객체 탐지 (YOLOv5)
    object_data = detect_objects_with_yolov5(image)
    
    # 결과 리턴
    result = {
        "dominant_colors": colors,
        "brightness": brightness,
        "contrast": contrast,
        "text_details": text_data,
        "object_details": object_data
    }
    
    return result, height, width

# Get the size of the thumbnail image (width, height)
def get_thumbnail_size(url):
    image, height, width = load_image_from_url(url)
    return (height, width) if image is not None else (None, None)

# Add 'thumbnail_size' column
df['thumbnail_size'] = df['thumbnail_url'].apply(get_thumbnail_size)

# 형태소 분석기 초기화
okt = Okt()

# 불용어 리스트
stopwords = set([
    '은', '는', '이', '가', '을', '를', '에', '의', '도', '로', '과', '와', '한', '하다',
    '에서', '에게', '까지', '부터', '보다', '처럼', '만', '없이', '수', '것', '좀', '더', '이',
    '또', '등', '그', '이것', '저것', '그것', '거', '때', '건', '중', '나', '너', '저', '우리',
    '누구', '뭐', '왜', '어디', '어떻게', '영상', '채널', '오늘', '이제', '정말', '진짜',
    '완전', '그냥', '내가', '당신', '내용', '제목', '시작', '끝', '다시', '모두',
    '최고', '대박', '소름', '헐', 'ㅋㅋ', 'ㅎㅎ', 'ㅠㅠ', '와우','자막','브이','로그','일상',
    '정보', '필독', '업로드', '자막', '구독', '좋아요', '댓글', '시청', '확인',
    '보세요'
])

# 클릭 유도 키워드
clickbait_keywords = [
    '실화', '충격', '대박', '소름', '반전', '최초', '드디어', '헐', '진실',
    '믿기지', '이게', '무조건', '죽기 전에', '꼭 봐야할'
]

# 피처 추출 함수
def extract_korean_title_features(title, video_id):
    features = {}
    features['video_id'] = video_id
    features['title'] = title
    features['title_length'] = len(title)
    features['word_count'] = len(okt.morphs(title))

    # 이모지 관련 피처
    features['emoji_count'] = sum(1 for char in title if char in emoji.EMOJI_DATA)
    features['has_emoji'] = int(features['emoji_count'] > 0)

    # 특수문자 수
    special_chars = re.findall(r"[!\"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]", title)
    features['special_char_count'] = len(special_chars)

    # 클릭 유도 키워드 포함 여부
    features['is_clickbait'] = int(any(word in title for word in clickbait_keywords))

    # 구두점 포함 여부
    features['has_question_mark'] = '?' in title
    features['has_exclamation'] = '!' in title

    # 주요 명사 3개 추출 (불용어 제거 포함)
    nouns = okt.nouns(title)
    filtered_nouns = [noun for noun in nouns if noun not in stopwords and len(noun) > 1]
    noun_freq = Counter(filtered_nouns)
    top_nouns = [word for word, _ in noun_freq.most_common(3)]
    for i in range(3):
        features[f'top_noun_{i+1}'] = top_nouns[i] if i < len(top_nouns) else ''

    return features

# Extract features from title and thumbnail
title_features = extract_korean_title_features(df.iloc[0, 0], '0')
thumbnail_features, height, width = analyze_image(df.iloc[0, 1])

# Add features to the DataFrame
# Title features
for key, value in title_features.items():
    df[key] = value

# Thumbnail features (dominant_colors, brightness, contrast, etc.)
df['dominant_colors'] = [', '.join([color[0] for color in thumbnail_features['dominant_colors']])]

df['brightness'] = thumbnail_features['brightness']
df['contrast'] = thumbnail_features['contrast']
df['text_details'] = [thumbnail_features['text_details']]  # Keep the structure intact for text details
df['object_details'] = [thumbnail_features['object_details']]  # Keep the structure intact for object details

def classify_positions(image_width, image_height, text_details, object_details):
    def classify_position(x, y, width, height):
        """ 위치를 왼쪽/중간/오른쪽, 위/중간/아래, 크기별로 분류 """
        right_x = x + width
        bottom_y = y + height

        # 가로 위치 (left, middle, right)
        if right_x < image_width * 0.33:
            horizontal_position = "left"
        elif x > image_width * 0.67:
            horizontal_position = "right"
        else:
            horizontal_position = "middle"

        # 세로 위치 (up, middle, down)
        if bottom_y < image_height * 0.33:
            vertical_position = "up"
        elif y > image_height * 0.67:
            vertical_position = "down"
        else:
            vertical_position = "middle"

        # 크기 분류 (s, m, l)
        area = width * height
        size_category = "s" if area < 10000 else ("m" if area < 30000 else "l")

        return f"{horizontal_position} {vertical_position} {size_category}"

    # 텍스트 위치 분석
    text_positions = [classify_position(td['x'], td['y'], td['width'], td['height']) for td in text_details]
    if not text_positions:  # 텍스트가 없을 경우
        text_positions = ["텍스트 없음"]

    # 사람 위치 분석
    person_positions = []
    
    # Ensure 'object_details' is a dictionary and contains 'objects' key
    if isinstance(object_details, dict) and 'objects' in object_details:
        for obj in object_details['objects']:
            if isinstance(obj, dict) and obj.get('label') == 'person':
                # 'person' 객체만 위치 분류
                position = classify_position(obj['x'], obj['y'], obj['width'], obj['height'])
                person_positions.append(position)

    if not person_positions:  # 사람 객체가 없을 경우
        person_positions = ["사람 없음"]

    # 신뢰도 높은 텍스트 추출 (probability ≥ 0.7)
    high_confidence_texts = [td["text"] for td in text_details if td.get("probability", 0) >= 0.7]
    prob_text = high_confidence_texts if high_confidence_texts else ["해당 없음"]

    return text_positions, person_positions, prob_text


# Apply the function to classify text and person positions
df[["text_positions", "person_positions", "prob_text"]] = df.apply(
    lambda row: pd.Series(
        classify_positions(
            image_width=row["thumbnail_size"][1],   # width (두 번째 요소)
            image_height=row["thumbnail_size"][0],  # height (첫 번째 요소)
            text_details=ast.literal_eval(row["text_details"]) if isinstance(row["text_details"], str) else row["text_details"],
            object_details=ast.literal_eval(row["object_details"]) if isinstance(row["object_details"], str) else row["object_details"]
        )
    ),
    axis=1
)

# Ensure that 'person_positions' is a list, if it's a string, convert it to list
df['person_positions'] = df['person_positions'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

def extract_labels(obj_detail_str):
    try:
        # If obj_detail_str is a string, convert it to a dictionary
        obj_details = ast.literal_eval(obj_detail_str) if isinstance(obj_detail_str, str) else obj_detail_str
        
        # Check if the 'objects' key exists and it contains a list
        if isinstance(obj_details, dict) and 'objects' in obj_details and isinstance(obj_details['objects'], list):
            # Extract labels from the objects list
            labels = [obj["label"] for obj in obj_details['objects'] if "label" in obj]
            return labels if labels else ["없음"]
        else:
            return ["에러"]
    except Exception as e:
        # Handle any errors (e.g., invalid format, missing 'objects' key)
        print(f"Error extracting labels: {e}")
        return ["에러"]

df["object_labels"] = df["object_details"].apply(extract_labels)


# 문자열을 실제 리스트로 변환 (object_labels 칼럼에만 적용)
df['object_labels'] = df['object_labels'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

def safe_literal_eval(val):
    if isinstance(val, str):
        try:
            return ast.literal_eval(val)
        except (ValueError, SyntaxError):
            return []
    return val if isinstance(val, list) else []

df['person_count'] = df['object_labels'].apply(lambda x: x.count('person') if isinstance(x, list) else 0)
df['object_count'] = df['object_labels'].apply(lambda x: len([obj for obj in x if obj != 'person']) if isinstance(x, list) else 0)

df['has_text'] = df['text_positions'].apply(lambda x: int(x != ['텍스트 없음']))

# 사람 위치
df['person_left'] = df['person_positions'].apply(lambda x: sum('left' in p for p in x))
df['person_middle'] = df['person_positions'].apply(lambda x: sum('middle' in p for p in x))
df['person_right'] = df['person_positions'].apply(lambda x: sum('right' in p for p in x))
df['person_small'] = df['person_positions'].apply(lambda x: sum('s' in p for p in x))
df['person_medium'] = df['person_positions'].apply(lambda x: sum('m' in p for p in x))
df['person_large'] = df['person_positions'].apply(lambda x: sum('l' in p for p in x))

# 텍스트 위치
df['text_left'] = df['text_positions'].apply(lambda x: sum('left' in p for p in x))
df['text_middle'] = df['text_positions'].apply(lambda x: sum('middle' in p for p in x))
df['text_right'] = df['text_positions'].apply(lambda x: sum('right' in p for p in x))
df['text_small'] = df['text_positions'].apply(lambda x: sum('s' in p for p in x))
df['text_medium'] = df['text_positions'].apply(lambda x: sum('m' in p for p in x))
df['text_large'] = df['text_positions'].apply(lambda x: sum('l' in p for p in x))

import pandas as pd
from collections import defaultdict
import webcolors

# 웹 색상 이름을 기반으로 한 더 광범위한 색상 그룹 설정
color_groups = {
    'red': {'red', 'crimson', 'firebrick', 'darkred', 'salmon', 'indianred', 'tomato', 'orangered', 'darkorange', 'lightcoral', 'maroon', 'brown'},
    'blue': {'blue', 'navy', 'dodgerblue', 'deepskyblue', 'royalblue', 'skyblue', 'slateblue', 'mediumblue', 'cornflowerblue', 'steelblue', 'lightblue', 'powderblue', 'midnightblue', 'lightsteelblue'},
    'green': {'green', 'lime', 'forestgreen', 'seagreen', 'springgreen', 'mediumseagreen', 'darkgreen', 'lawngreen', 'yellowgreen', 'olive', 'olivedrab', 'chartreuse'},
    'yellow': {'yellow', 'gold', 'khaki', 'lemonchiffon', 'lightyellow', 'palegoldenrod', 'lightgoldenrodyellow', 'goldenrod', 'darkgoldenrod'},
    'purple': {'purple', 'magenta', 'violet', 'orchid', 'mediumorchid', 'mediumpurple', 'darkviolet', 'blueviolet', 'darkorchid', 'thistle', 'lavender', 'plum'},
    'brown': {'brown', 'sienna', 'chocolate', 'peru', 'saddlebrown', 'tan', 'burlywood', 'rosybrown', 'darkkhaki', 'khaki'},
    'grey': {'grey', 'gray', 'lightgrey', 'darkgrey', 'dimgrey', 'slategrey', 'gainsboro', 'darkslategrey', 'lightsteelblue', 'silver', 'dimgray'},
    'white': {'white', 'snow', 'ivory', 'ghostwhite', 'whitesmoke', 'floralwhite', 'seashell', 'beige', 'linen', 'mintcream', 'seashell'},
    'pink': {'pink', 'lightpink', 'hotpink', 'lavenderblush', 'deeppink', 'mediumvioletred', 'palevioletred'},
    'black': {'black', 'darkslategray', 'dimgray', 'charcoal'},
    'other': set()
}

# 색상을 그룹으로 매핑
def map_color_to_group(color):
    for group, colors in color_groups.items():
        if color.lower() in colors:
            return group
    return 'other'

# 각 색상 그룹별로 0.0 초기화
for col in color_groups.keys():
    df[f'color_{col}'] = 0.0

# 색상 그룹 확률을 계산하는 함수
def count_color_groups(color_list):
    counter = defaultdict(float)
    for color in color_list:
        group = map_color_to_group(color)
        counter[group] += 1  # 각 색상은 1씩 추가됨
    return counter

# dominant_colors 컬럼이 문자열로 되어있으므로, 이를 리스트로 변환
for idx, row in df.iterrows():
    if isinstance(row['dominant_colors'], str):
        color_list = row['dominant_colors'].split(', ')  # ',' 기준으로 분할하여 리스트로 변환
        counter = count_color_groups(color_list)
        for group in counter:
            df.at[idx, f'color_{group}'] = counter[group]

# 색상 컬럼이 0이면 0, 0이 아니면 1로 변환
color_columns = [
    'color_red', 'color_blue', 'color_green', 'color_yellow',
    'color_purple', 'color_brown', 'color_grey', 'color_white',
    'color_pink', 'color_black', 'color_other'
]

df[color_columns] = df[color_columns].applymap(lambda x: 0 if x == 0 else 1)

feature_cols = [
    'duration', 'subscriber_count', 'brightness', 'contrast',
       'title_length', 'word_count', 'emoji_count', 'has_emoji',
       'special_char_count', 'is_clickbait', 'has_question_mark',
       'has_exclamation', 'pub_year', 'pub_month', 'pub_weekday', 'color_red',
       'color_blue', 'color_green', 'color_yellow', 'color_purple',
       'color_brown', 'color_grey', 'color_white', 'color_pink',
       'person_count', 'object_count', 'has_text', 'person_left',
       'person_middle', 'person_right', 'person_small', 'person_medium',
       'person_large', 'text_left', 'text_middle', 'text_right', 'text_small',
       'text_medium', 'text_large'
]
model_df=df[feature_cols]



import joblib  # or use pickle if you prefer

# 모델 로드 (예: RandomForest, XGBoost, LinearRegression 등)
model = joblib.load('saved_models/model_cluster_0.pkl')
# 예측에 필요한 특성만 선택 (모델 학습에 사용한 특성과 동일해야 함)
X = model_df[feature_cols]  # 특성들이 담긴 DataFrame
# 예측 수행
y_pred = model.predict(X)

# 예측 결과를 DataFrame에 추가
model_df['predicted_views'] = y_pred
# 예측 결과 출력
print(model_df['predicted_views'])

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
Using cache found in C:\Users\duwjd/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2025-4-24 Python-3.11.7 torch-2.7.0+cpu CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients, 16.4 GFLOPs
Adding AutoShape... 
  with amp.autocast(autocast):


0    11.883536
Name: predicted_views, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['predicted_views'] = y_pred


In [59]:
import joblib  # or use pickle if you prefer
model_df['subscriber_count']=100000000
# 모델 로드 (예: RandomForest, XGBoost, LinearRegression 등)
model = joblib.load('saved_models/model_cluster_3.pkl')
# 예측에 필요한 특성만 선택 (모델 학습에 사용한 특성과 동일해야 함)
X = model_df[feature_cols]  # 특성들이 담긴 DataFrame
# 예측 수행
y_pred = model.predict(X)

# 예측 결과를 DataFrame에 추가
model_df['predicted_views'] = y_pred
# 예측 결과 출력
print(model_df['predicted_views'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['subscriber_count']=100000000


0    10.027542
Name: predicted_views, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_df['predicted_views'] = y_pred
