In [1]:
import json
import os
import pandas as pd
import isodate
import requests
import numpy as np
import cv2
from collections import Counter
from sklearn.cluster import KMeans
import time
from tqdm import tqdm
import random

In [117]:
merge_df=pd.read_csv("final_video_datas.csv", encoding="utf-8-sig")

In [43]:
merge_df.info()
merge_df.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7308 entries, 0 to 7307
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   title_x             7308 non-null   object        
 1   video_id            7308 non-null   object        
 2   published_date      7308 non-null   datetime64[ns]
 3   thumbnail_url       7308 non-null   object        
 4   view_count          7308 non-null   float64       
 5   like_count          7261 non-null   float64       
 6   comment_count       7262 non-null   float64       
 7   duration            7308 non-null   float64       
 8   channel_id          7308 non-null   object        
 9   subscriber_count    7308 non-null   float64       
 10  brightness          7308 non-null   float64       
 11  contrast            7308 non-null   float64       
 12  dominant_colors     7308 non-null   object        
 13  thumbnail_size      7308 non-null   object      

  merge_df.describe(include='all')


Unnamed: 0,title_x,video_id,published_date,thumbnail_url,view_count,like_count,comment_count,duration,channel_id,subscriber_count,...,special_char_count,is_clickbait,has_question_mark,has_exclamation,top_noun_1,top_noun_2,top_noun_3,like_missing,comment_missing,upload_weekday
count,7308,7308,7308,7308,7308.0,7261.0,7262.0,7308.0,7308,7308.0,...,7308.0,7308.0,7308,7308,7283,7128,6846,7308.0,7308.0,7308
unique,7289,7308,1073,7308,,,,,2453,,...,,,2,2,2789,3106,3158,,,7
top,[12뉴스] 오늘의 주요뉴스 / SBS,t-zwVkTOZA8,2025-03-28 00:00:00,https://i.ytimg.com/vi/t-zwVkTOZA8/maxresdefau...,,,,,"(' YTN', '@ytnnews24')",,...,,,False,False,테슬라,버라이어티,하바나,,,Friday
freq,11,1,221,1,,,,,201,,...,,,5861,5578,284,101,67,,,1332
first,,,2020-04-06 00:00:00,,,,,,,,...,,,,,,,,,,
last,,,2025-04-01 00:00:00,,,,,,,,...,,,,,,,,,,
mean,,,,,648182.3,9387.620438,770.885018,925.517378,,1517002.0,...,4.297072,0.057745,,,,,,0.018199,0.025588,
std,,,,,1460531.0,29128.381466,3495.486735,709.398507,,3908397.0,...,3.278131,0.233276,,,,,,0.13368,0.157915,
min,,,,,432.0,0.0,0.0,60.0,,528.0,...,0.0,0.0,,,,,,0.0,0.0,
25%,,,,,31518.25,467.0,61.0,355.75,,119000.0,...,2.0,0.0,,,,,,0.0,0.0,


In [19]:
cat_cols = ['has_emoji', 'is_clickbait', 'has_question_mark', 'has_exclamation']
for col in cat_cols:
    print(merge_df[col].value_counts())

0    5863
1    1445
Name: has_emoji, dtype: int64
0    6886
1     422
Name: is_clickbait, dtype: int64
False    5861
True     1447
Name: has_question_mark, dtype: int64
False    5578
True     1730
Name: has_exclamation, dtype: int64


In [119]:
def fill_by_group_median(df, group_col, target_col):
    return df[target_col].fillna(
        df.groupby(group_col)[target_col].transform('median')
    )

# 1. 결측 여부 피처
merge_df['like_missing'] = merge_df['like_count'].isnull().astype(int)
merge_df['comment_missing'] = merge_df['comment_count'].isnull().astype(int)

# 2. 구독자수 구간화 → subscriber_group 컬럼 생성
merge_df['subscriber_group'] = pd.qcut(
    merge_df['subscriber_count'], 
    q=5,
    labels=False,
    duplicates='drop'
)

# 3. subscriber_group 기준으로 중앙값 채우기
merge_df['like_count'] = fill_by_group_median(merge_df, 'subscriber_group', 'like_count')
merge_df['comment_count'] = fill_by_group_median(merge_df, 'subscriber_group', 'comment_count')

In [121]:
# 업로드 날짜, 요일
merge_df['published_date'] = pd.to_datetime(merge_df['published_date'])
merge_df['upload_weekday'] = merge_df['published_date'].dt.day_name()

In [71]:
# 트리기반모델 쓰는 경우 생략 가능
# from sklearn.preprocessing import MinMaxScaler
# # 정규화
# scaler = MinMaxScaler()
# scaled_cols = ['like_count', 'comment_count', 'subscriber_count', 'duration', 'brightness', 'contrast']
# merge_df[scaled_cols] = scaler.fit_transform(merge_df[scaled_cols])

### 주요단어 벡터화

#### TF-IDF

In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np

# 1. 주요 명사 결합
merge_df['nouns_combined'] = merge_df[['top_noun_1', 'top_noun_2', 'top_noun_3']].fillna('').apply(
    lambda row: ' '.join([str(noun).strip() for noun in row if str(noun).strip() != '']), axis=1
)

# 2. TF-IDF 벡터화
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(merge_df['nouns_combined'])

# 3. 전체 단어별 평균 TF-IDF를 계산해 상위 n개 선택
tfidf_means = np.asarray(tfidf_matrix.mean(axis=0)).flatten()
top_n = 30
top_n_indices = np.argsort(tfidf_means)[::-1][:top_n]
top_n_features = [tfidf.get_feature_names_out()[i] for i in top_n_indices]

# 4. 상위 단어만 선택한 DataFrame 만들기
top_n_matrix = tfidf_matrix[:, top_n_indices]
top_n_df = pd.DataFrame(top_n_matrix.toarray(), columns=[f"tfidf_{word}" for word in top_n_features])

# 5. 병합
merge_df = pd.concat([merge_df.reset_index(drop=True), top_n_df.reset_index(drop=True)], axis=1)

'tfidf_테슬라', 'tfidf_뉴스', 'tfidf_전기차', 'tfidf_닌텐도',
       'tfidf_아이폰', 'tfidf_스위치', 'tfidf_게임', 'tfidf_경제', 'tfidf_한국',
       'tfidf_인공', 'tfidf_여행', 'tfidf_버라이어티', 'tfidf_미국', 'tfidf_호텔',
       'tfidf_일상', 'tfidf_먹방', 'tfidf_운동', 'tfidf_중국', 'tfidf_지능', 'tfidf_갱생',
       'tfidf_하바나', 'tfidf_메이크업', 'tfidf_트럼프', 'tfidf_머스크', 'tfidf_지금',
       'tfidf_이유', 'tfidf_토크쇼', 'tfidf_일본', 'tfidf_사람', 'tfidf_패션'

#### 차원 축소

In [81]:
from sklearn.decomposition import TruncatedSVD

# 기존 TF-IDF 전체 벡터 그대로 사용
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(merge_df['nouns_combined'])

# SVD로 차원 축소
svd = TruncatedSVD(n_components=20, random_state=42)
svd_matrix = svd.fit_transform(tfidf_matrix)

# SVD 결과를 DataFrame으로 변환
svd_df = pd.DataFrame(svd_matrix, columns=[f"tfidf_svd_{i+1}" for i in range(svd_matrix.shape[1])])

# 병합
merge_df = pd.concat([merge_df.reset_index(drop=True), svd_df.reset_index(drop=True)], axis=1)


 'tfidf_svd_1', 'tfidf_svd_2', 'tfidf_svd_3', 'tfidf_svd_4',
       'tfidf_svd_5', 'tfidf_svd_6', 'tfidf_svd_7', 'tfidf_svd_8',
       'tfidf_svd_9', 'tfidf_svd_10', 'tfidf_svd_11', 'tfidf_svd_12',
       'tfidf_svd_13', 'tfidf_svd_14', 'tfidf_svd_15', 'tfidf_svd_16',
       'tfidf_svd_17', 'tfidf_svd_18', 'tfidf_svd_19', 'tfidf_svd_20'

## modeling

In [173]:
import ast
# 문자열을 실제 리스트로 변환 (object_labels 칼럼에만 적용)
merge_df['object_labels'] = merge_df['object_labels'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)
merge_df['dominant_colors'] = merge_df['dominant_colors'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

In [175]:
from collections import Counter
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

def process_object_labels(df, top_n_objects=10):
    df = df.copy()

    # 1. person 개수 세기
    df['person_count'] = df['object_labels'].apply(
        lambda objs: sum(1 for obj in objs if obj == 'person') if isinstance(objs, list) else 0
    )

    # 2. person 제외한 객체에서 상위 top_n 추출
    filtered_objects = df['object_labels'].dropna().apply(
        lambda x: list(set(x) - {'person'}) if isinstance(x, list) else []
    ).explode()

    top_objects = [obj for obj, _ in Counter(filtered_objects).most_common(top_n_objects)]

    # 3. 상위 객체 OHE
    for obj in top_objects:
        df[f'obj_{obj}'] = df['object_labels'].apply(
            lambda x: int(isinstance(x, list) and obj in x and obj != 'person')
        )

    return df

def preprocess_for_model(df, top_n_colors=10, top_n_objects=10, top_tfidf_n=50):
    df = df.copy()

    all_colors = df['dominant_colors'].dropna().explode()
    color_names = [
        item[0] for item in all_colors 
        if isinstance(item, tuple) and len(item) == 2 and isinstance(item[0], str)
    ]
    top_colors = [color for color, _ in Counter(color_names).most_common(top_n_colors)]

    for color in top_colors:
        df[f'color_{color}'] = df['dominant_colors'].apply(
            lambda x: int(any(isinstance(c, tuple) and len(c) == 2 and c[0] == color for c in x)) if isinstance(x, list) else 0
        )

    df = process_object_labels(df, top_n_objects=top_n_objects)

    return df

    
model_df = preprocess_for_model(merge_df, top_n_colors=10, top_n_objects=10, top_tfidf_n=50)

In [177]:
model_df.iloc[0,:]

title_x                                                 결국 긴팔문신 제거 하는 조두팔
video_id                                                      t-zwVkTOZA8
published_date                                        2025-03-28 00:00:00
thumbnail_url           https://i.ytimg.com/vi/t-zwVkTOZA8/maxresdefau...
view_count                                                       244420.0
like_count                                                         3045.5
comment_count                                                       221.0
duration                                                            459.0
channel_id                                             ('조두팔', '@조두팔이라고')
subscriber_count                                                 412000.0
brightness                                                     109.295617
contrast                                                        50.267187
dominant_colors         [(dimgrey, 0.312705078125), (grey, 0.288639322...
thumbnail_size                        

In [179]:
cat_cols = ['obj_nan','obj_없음','obj_tie','obj_car','obj_bowl','obj_cup','obj_chair','obj_cell phone',
'obj_cake', 'obj_bottle']
for col in cat_cols:
    print(model_df[col].value_counts())

0    7308
Name: obj_nan, dtype: int64
0    6183
1    1125
Name: obj_없음, dtype: int64
0    6577
1     731
Name: obj_tie, dtype: int64
0    6825
1     483
Name: obj_car, dtype: int64
0    6931
1     377
Name: obj_bowl, dtype: int64
0    6992
1     316
Name: obj_cup, dtype: int64
0    7003
1     305
Name: obj_chair, dtype: int64
0    7004
1     304
Name: obj_cell phone, dtype: int64
0    7083
1     225
Name: obj_cake, dtype: int64
0    7093
1     215
Name: obj_bottle, dtype: int64
