### Youtube View Predict Project

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from transformers import BertModel, BertTokenizer
import torch
import ast
from tqdm import tqdm

In [6]:
# 1. 데이터 로드
df = pd.read_csv(r"C:\Users\user\Desktop\USvideos.csv")
df = df[['title', 'views']].dropna()
df = df[df['views'] > 0]
df['log_views'] = df['views'].apply(np.log1p)

df = df.head(3000).copy()

# 2. BERT tokenizer 및 모델 로드
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.eval()

# 3. CLS 벡터 추출 함수
def extract_cls_embedding(text):
    encoding = tokenizer(text,
                         max_length=32,
                         truncation=True,
                         padding='max_length',
                         return_tensors='pt')
    input_ids_tensor = encoding['input_ids']
    attention_mask_tensor = encoding['attention_mask']
    with torch.no_grad():
        outputs = bert_model(input_ids=input_ids_tensor, attention_mask=attention_mask_tensor)
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return cls_embedding

# 4. 모든 제목에 대해 임베딩 추출
embeddings = []
for title in tqdm(df['title'], desc="Extracting CLS embeddings"):
    try:
        vec = extract_cls_embedding(title)
        embeddings.append(vec)
    except Exception as e:
        print(f"오류 발생: {e}")
        embeddings.append(np.zeros(768))

X = np.array(embeddings)
y = df['log_views'].values  # 로그 변환된 조회수

# 5. 선형 회귀 모델 훈련
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

# 6. 평가 지표 출력
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R^2: {r2:.4f}")

Extracting CLS embeddings: 100%|██████████| 3000/3000 [11:54<00:00,  4.20it/s]


RMSE: 0.3963
R^2: 0.9541
