# 조회수 예측 모델 적용 위한 피쳐 추출

## 1. 제목

#### 후보들 제목 그대로 활용

In [2]:
import pandas as pd
import re
import emoji
from collections import Counter
from konlpy.tag import Okt

df_recommendations=pd.read_csv('title_recommend_candidates.csv', encoding='utf-8')

# 형태소 분석기 초기화
okt = Okt()

# 불용어 리스트
stopwords = set([
    '은', '는', '이', '가', '을', '를', '에', '의', '도', '로', '과', '와', '한', '하다',
    '에서', '에게', '까지', '부터', '보다', '처럼', '만', '없이', '수', '것', '좀', '더', '이',
    '또', '등', '그', '이것', '저것', '그것', '거', '때', '건', '중', '나', '너', '저', '우리',
    '누구', '뭐', '왜', '어디', '어떻게', '영상', '채널', '오늘', '이제', '정말', '진짜',
    '완전', '그냥', '내가', '당신', '내용', '제목', '시작', '끝', '다시', '모두',
    '최고', '대박', '소름', '헐', 'ㅋㅋ', 'ㅎㅎ', 'ㅠㅠ', '와우','자막','브이','로그','일상',
    '정보', '필독', '업로드', '자막', '구독', '좋아요', '댓글', '시청', '확인', '보세요'
])

# 클릭 유도 키워드
clickbait_keywords = [
    '실화', '충격', '대박', '소름', '반전', '최초', '드디어', '헐', '진실',
    '믿기지', '이게', '무조건', '죽기 전에', '꼭 봐야할'
]

# 제목 기반 피처 추출 함수
def extract_korean_title_features(title, video_id):
    features = {}
    features['video_id'] = video_id
    features['title'] = title
    features['title_length'] = len(title)
    features['word_count'] = len(okt.morphs(title))

    # 이모지 관련 피처
    features['emoji_count'] = sum(1 for char in title if char in emoji.EMOJI_DATA)
    features['has_emoji'] = int(features['emoji_count'] > 0)

    # 특수문자 수
    special_chars = re.findall(r"[!\"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]", title)
    features['special_char_count'] = len(special_chars)

    # 클릭유도 키워드 여부
    features['is_clickbait'] = int(any(word in title for word in clickbait_keywords))

    # 구두점
    features['has_question_mark'] = '?' in title
    features['has_exclamation'] = '!' in title

    # 주요 명사 3개
    nouns = okt.nouns(title)
    filtered_nouns = [n for n in nouns if n not in stopwords and len(n) > 1]
    noun_freq = Counter(filtered_nouns)
    top_nouns = [word for word, _ in noun_freq.most_common(3)]
    for i in range(3):
        features[f'top_noun_{i+1}'] = top_nouns[i] if i < len(top_nouns) else ''

    return features

titles = df_recommendations['title_candidate']
video_ids = range(len(titles))  # 후보 고유 ID 부여

title_features_df = pd.DataFrame([
    extract_korean_title_features(title, vid)
    for title, vid in zip(titles, video_ids)
])

title_features_df.head()

Unnamed: 0,video_id,title,title_length,word_count,emoji_count,has_emoji,special_char_count,is_clickbait,has_question_mark,has_exclamation,top_noun_1,top_noun_2,top_noun_3
0,0,게임 탐험하기,7,3,0,0,0,0,False,False,게임,탐험,
1,1,마인크래프트 탐험하기,11,3,0,0,0,0,False,False,마인크래프트,탐험,
2,2,게임 살펴보기,7,2,0,0,0,0,False,False,게임,,
3,3,마인크래프트 살펴보기,11,2,0,0,0,0,False,False,마인크래프트,,
4,4,게임 전국일주,7,3,0,0,0,0,False,False,게임,전국,일주


## 2. 썸네일

#### 조회수 예측 모델 학습에 활용한 데이터들의 평균값

In [3]:
# 썸네일 평균 피처 로드
thumbnail_df = pd.read_csv("../../1학기/캡스톤디자인(1)/Youtube-prediction-and-feedback/model/model_datas.csv")

# feature_cols에 포함된 썸네일 관련 컬럼만 선택
thumbnail_cols = [
    'brightness', 'contrast', 'color_red', 'color_blue', 'color_green',
    'color_yellow', 'color_purple', 'color_brown', 'color_grey', 'color_white', 'color_pink',
    'person_count', 'object_count', 'has_text',
    'person_left', 'person_middle', 'person_right',
    'person_small', 'person_medium', 'person_large',
    'text_left', 'text_middle', 'text_right',
    'text_small', 'text_medium', 'text_large'
]

# ✅ cluster=2인 행만 선택
if 'cluster' in thumbnail_df.columns:
    thumbnail_cluster_df = thumbnail_df[thumbnail_df['cluster'] == 2]
else:
    print("'cluster' 컬럼이 없습니다. 전체 평균으로 계산합니다.")
    thumbnail_cluster_df = thumbnail_df

thumbnail_means = thumbnail_cluster_df[thumbnail_cols].mean().to_dict()
thumbnail_means

{'brightness': 140.29969908503438,
 'contrast': 70.04647056030761,
 'color_red': 0.22727272727272727,
 'color_blue': 0.14015151515151514,
 'color_green': 0.041666666666666664,
 'color_yellow': 0.24621212121212122,
 'color_purple': 0.10606060606060606,
 'color_brown': 0.49242424242424243,
 'color_grey': 0.9015151515151515,
 'color_white': 0.48484848484848486,
 'color_pink': 0.03787878787878788,
 'person_count': 1.621212121212121,
 'object_count': 0.7159090909090909,
 'has_text': 0.7159090909090909,
 'person_left': 0.20454545454545456,
 'person_middle': 1.571969696969697,
 'person_right': 0.24242424242424243,
 'person_small': 0.07954545454545454,
 'person_medium': 1.5757575757575757,
 'person_large': 1.5909090909090908,
 'text_left': 0.4015151515151515,
 'text_middle': 1.3068181818181819,
 'text_right': 0.3333333333333333,
 'text_small': 0.4431818181818182,
 'text_medium': 1.4242424242424243,
 'text_large': 1.5378787878787878}

## 3. 채널 및 영상 정보

#### 내 채널 데이터들의 평균값

In [6]:
from datetime import datetime
# === 채널 평균 로드 ===
channel_df = pd.read_csv("data/channel(@KoreanCryingGuy)_videos_metadata.csv")
channel_df = channel_df.rename(columns={'duration_sec':'duration'})
channel_cols = ['duration','subscriber_count']
channel_means = channel_df[channel_cols].mean().to_dict()

# === 현재 시점 게시일 피처 추가 ===
now = datetime.now()
pub_features = {
    'pub_year': now.year,
    'pub_month': now.month,
    'pub_weekday': now.weekday()  # 월=0, 일=6
}

In [8]:
# === 세 종류의 피처 병합 ===
base_features = {**thumbnail_means, **channel_means, **pub_features}

full_features = []
for _, row in title_features_df.iterrows():
    merged = {**base_features, **row.to_dict()}
    full_features.append(merged)

feature_df = pd.DataFrame(full_features)

# 모델 입력용 feature_cols 순서대로 정리
feature_cols = [
    'duration', 'subscriber_count', 'brightness', 'contrast',
    'title_length', 'word_count', 'emoji_count',
    'special_char_count', 'is_clickbait', 'has_question_mark',
    'has_exclamation', 'pub_year', 'pub_month', 'pub_weekday', 'color_red',
    'color_blue', 'color_green', 'color_yellow', 'color_purple',
    'color_brown', 'color_grey', 'color_white', 'color_pink',
    'person_count', 'object_count', 'has_text', 'person_left',
    'person_middle', 'person_right', 'person_small', 'person_medium',
    'person_large', 'text_left', 'text_middle', 'text_right', 'text_small',
    'text_medium', 'text_large'
]

model_input_df = feature_df[feature_cols + ['title','video_id']]
model_input_df

Unnamed: 0,duration,subscriber_count,brightness,contrast,title_length,word_count,emoji_count,special_char_count,is_clickbait,has_question_mark,...,person_medium,person_large,text_left,text_middle,text_right,text_small,text_medium,text_large,title,video_id
0,1408.662162,1690000.0,140.299699,70.046471,7,3,0,0,0,False,...,1.575758,1.590909,0.401515,1.306818,0.333333,0.443182,1.424242,1.537879,게임 탐험하기,0
1,1408.662162,1690000.0,140.299699,70.046471,11,3,0,0,0,False,...,1.575758,1.590909,0.401515,1.306818,0.333333,0.443182,1.424242,1.537879,마인크래프트 탐험하기,1
2,1408.662162,1690000.0,140.299699,70.046471,7,2,0,0,0,False,...,1.575758,1.590909,0.401515,1.306818,0.333333,0.443182,1.424242,1.537879,게임 살펴보기,2
3,1408.662162,1690000.0,140.299699,70.046471,11,2,0,0,0,False,...,1.575758,1.590909,0.401515,1.306818,0.333333,0.443182,1.424242,1.537879,마인크래프트 살펴보기,3
4,1408.662162,1690000.0,140.299699,70.046471,7,3,0,0,0,False,...,1.575758,1.590909,0.401515,1.306818,0.333333,0.443182,1.424242,1.537879,게임 전국일주,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,1408.662162,1690000.0,140.299699,70.046471,22,8,0,0,0,False,...,1.575758,1.590909,0.401515,1.306818,0.333333,0.443182,1.424242,1.537879,귀신의집 금지된 학교축제 창자빠지게 놀기,545
546,1408.662162,1690000.0,140.299699,70.046471,25,9,0,0,0,False,...,1.575758,1.590909,0.401515,1.306818,0.333333,0.443182,1.424242,1.537879,Lady Ok 금지된 학교축제 창자빠지게 놀기,546
547,1408.662162,1690000.0,140.299699,70.046471,19,7,0,0,0,False,...,1.575758,1.590909,0.401515,1.306818,0.333333,0.443182,1.424242,1.537879,찜질방 리모델링기념 창자빠지게 놀기,547
548,1408.662162,1690000.0,140.299699,70.046471,20,7,0,0,0,False,...,1.575758,1.590909,0.401515,1.306818,0.333333,0.443182,1.424242,1.537879,귀신의집 리모델링기념 창자빠지게 놀기,548


In [19]:
import joblib
import numpy as np
import pandas as pd

# -------------------------------
# 1️⃣ 모델 불러오기
# -------------------------------
model = joblib.load("../../1학기/캡스톤디자인(1)/Youtube-prediction-and-feedback/model/saved_models/model_cluster_2.pkl")

# -------------------------------
# 2️⃣ 예측 수행
# -------------------------------
X = model_input_df[feature_cols]  # 입력 피처만 추출
predicted_views = model.predict(X)

# 결과를 DataFrame에 추가
model_input_df['predicted_views'] = predicted_views

# -------------------------------
# 3️⃣ df_recommendations와 병합
# -------------------------------
# df_recommendations에는 각 제목의 template 정보가 있음
# 로그값을 원래 스케일로 변환
model_input_df['predicted_views_actual'] = np.expm1(model_input_df['predicted_views'])  # log1p 썼을 경우
# 만약 np.log(view_count+1) 이 아니라 np.log(view_count)만 썼다면 → np.exp()
# merged_df['predicted_views_actual'] = np.exp(merged_df['predicted_views'])

model_input_df.columns

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_input_df['predicted_views'] = predicted_views
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  model_input_df['predicted_views_actual'] = np.expm1(model_input_df['predicted_views'])  # log1p 썼을 경우


Index(['duration', 'subscriber_count', 'brightness', 'contrast',
       'title_length', 'word_count', 'emoji_count', 'special_char_count',
       'is_clickbait', 'has_question_mark', 'has_exclamation', 'pub_year',
       'pub_month', 'pub_weekday', 'color_red', 'color_blue', 'color_green',
       'color_yellow', 'color_purple', 'color_brown', 'color_grey',
       'color_white', 'color_pink', 'person_count', 'object_count', 'has_text',
       'person_left', 'person_middle', 'person_right', 'person_small',
       'person_medium', 'person_large', 'text_left', 'text_middle',
       'text_right', 'text_small', 'text_medium', 'text_large', 'title',
       'video_id', 'predicted_views', 'predicted_views_actual'],
      dtype='object')

In [21]:
model_input_df.to_csv('model_input_df.csv', index=0, encoding='utf-8')

## gpt 프롬프트 기반 제목 수정

# def build_prompt(template, candidates, avg_features, top10_features):
    prompt = f"""
You are an expert YouTube strategist who improves Korean titles to maximize views.

Below are candidate titles for a YouTube video template:
Template: {template}
Candidates:
{chr(10).join('- ' + c for c in candidates)}

We analyzed 1000 videos from this channel. 
Here are the style statistics for Cluster 2 videos and their Top 10% performers.

[Cluster 2 Average Style]
- Title length: {avg_features['title_length']} characters
- Word count: {avg_features['word_count']}
- Emoji count: {avg_features['emoji_count']}
- Special characters: {avg_features['special_char_count']}
- Clickbait presence: {avg_features['is_clickbait']}
- Has question mark: {avg_features['has_question_mark']}
- Has exclamation mark: {avg_features['has_exclamation']}

[Top 10% Average Style]
- Title length: {top10_features['title_length']} characters
- Word count: {top10_features['word_count']}
- Emoji count: {top10_features['emoji_count']}
- Special characters: {top10_features['special_char_count']}
- Clickbait presence: {top10_features['is_clickbait']}
- Has question mark: {top10_features['has_question_mark']}
- Has exclamation mark: {top10_features['has_exclamation']}

Goal:
- Modify or slightly improve one candidate title so that it stylistically aligns with the **Top 10% style**.
- Keep the original meaning and tone natural.
- Add or remove emojis, special characters, or words if needed to fit the target pattern.
- Output only ONE final optimized title in JSON format:

{{
  "template": "{template}",
  "optimized_title": "..."
}}
"""
    return prompt

In [120]:
from openai import OpenAI
import json

MY_API_KEY=getpass.getpass("OpenAI API Key:")
client = OpenAI(api_key=MY_API_KEY)

def optimize_title_with_gpt(template, candidates, avg_features, top10_features):
    prompt = build_prompt(template, candidates, avg_features, top10_features)

    response = client.chat.completions.create(
        model="gpt-5",
        messages=[
            {"role": "system", "content": "You are a YouTube title optimization assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    try:
        return json.loads(response.choices[0].message.content)
    except:
        return {"template": template, "optimized_title": response.choices[0].message.content}

OpenAI API Key: ········


In [122]:
# cluster=2 데이터만
df_cluster2 = thumbnail_df[thumbnail_df['cluster'] == 2]

# 상위 10% 기준
threshold = df_cluster2['view_count'].quantile(0.9)
df_top10 = df_cluster2[df_cluster2['view_count'] >= threshold]

# 평균 피처 계산
title_features = [
    'title_length', 'word_count', 'emoji_count',
    'special_char_count', 'is_clickbait',
    'has_question_mark', 'has_exclamation'
]

avg_features = df_cluster2[title_features].mean().to_dict()
top10_features = df_top10[title_features].mean().to_dict()

In [124]:
from tqdm import tqdm

results = []

for template in tqdm(df_recommendations['template'].unique()):
    candidates = df_recommendations[df_recommendations['template'] == template]['title_candidate'].tolist()

    output = optimize_title_with_gpt(template, candidates, avg_features, top10_features)
    results.append(output)

df_final_titles = pd.DataFrame(results)

100%|██████████████████████████████████████████████████████████████████████████████████| 15/15 [12:08<00:00, 48.58s/it]


In [126]:
df_final_titles

Unnamed: 0,template,optimized_title
0,{TOPIC} {SITUATION},마인크래프트 탐험하기 | ⛏️ 신비한 바이옴·숨겨진 유적과 보물 던전 한 번에!
1,{SITUATION} {TOPIC},"그냥 내비대로 가주세요: 기념일 드라이브, 예상 못한 코스로 가봅니다! 🎉"
2,{TOPIC} {EVENT},"퇴사자 당근마켓 레전드 | 실제 거래 모음 · 실화 썰 대방출! 노컷 하이라이트, 현웃 주의 🤯"
3,{SERIES} {TOPIC} {ENG},[LG TWINS] 노래 모음 플리 | 응원감성 가득! Playlist 🎧
4,{SERIES} {TOPIC} {EVENT},[예고] 다크소울 플레이리스트 | 보스전·전투 몰입도 BGM 타격감 폭발! 🔥
5,{SERIES} {SITUATION} {TOPIC} {REACTION},[웃소] 퇴사마려운 직장인 레전드 명장면 모음.zip 웃픈 회사생활 공감 😂!
6,{TOPIC} {SITUATION} {LOC},"여행, 쓸쓸하지 않은 제주도｜혼자여도 따뜻한 감성 코스 추천!｜카페·해변·노을 🏝️"
7,{ENG} {PERSON}의 {SITUATION} {TOPIC},[ENG SUB] 아이유의 웃으면 안 되는 생일파티 🎂 레전드 모먼트 모음!
8,{SERIES} {TOPIC} {SITUATION} {NUM} {ENG},[영업중] 돈가스 가게에서 생기는 일: 바삭 비밀 공개!🍽️ 노컷 00 EN
9,{PERSON} {SITUATION},감스트 인턴 첫 출근합니다! | 오늘 브이로그 비하인드 공개 👔 [현장 스케치]


In [128]:
df_final_titles.to_csv('title_recommend_final.csv', index=0, encoding='utf-8')