### 앙상블

In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt




In [None]:
# 1. Load the data
csv_path = r"C:\Users\user\Desktop\Youtube_Game2_10000.csv"  # adjust the file path
# Keep only the two columns used below and drop rows where either is missing.
df = (
    pd.read_csv(csv_path)
    .loc[:, ['title', 'views']]
    .dropna()
)
df = df.loc[df['views'] > 0]  # keep only rows with views > 0 (removes zero-view rows)

# Log transform: store log1p(views) = log(1 + views) in a new 'log_views' column
df = df.assign(log_views=np.log1p(df['views']))

# 2. BERT embeddings
# Encode every title with the pretrained 'bert-base-nli-mean-tokens'
# sentence-transformers model; the result is used as the feature matrix.
model = SentenceTransformer('bert-base-nli-mean-tokens')
titles = df['title'].tolist()
embeddings = model.encode(titles, show_progress_bar=True)

# 3. Train/test split
# Features are the title embeddings, target is log_views; 20% of the rows
# are held out, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    df['log_views'],
    random_state=42,
    test_size=0.2,
)

# 4. Build the ensemble model (stacking)
# Three base regressors; the tree-based ones use a fixed seed.
base_models = [
    ('ridge', Ridge(alpha=1.0)),
    ('rf', RandomForestRegressor(random_state=42, n_estimators=100)),
    ('gbr', GradientBoostingRegressor(random_state=42, n_estimators=100)),
]

# A linear regression combines the base models' predictions.
# passthrough=True additionally feeds the original input features to the
# final estimator alongside those predictions.
stack_model = StackingRegressor(
    passthrough=True,
    final_estimator=LinearRegression(),
    estimators=base_models,
)

# 5. Train the model
stack_model.fit(X_train, y_train)

# 6. Predict and evaluate
# y_test / y_pred are on the log1p(views) scale, so both metrics below are
# measured in log space rather than in raw view counts.
y_pred = stack_model.predict(X_test)
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6 (it now raises TypeError). Taking the
# square root of the MSE yields the same RMSE on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

# 7. Visualize the predictions
# Invert the log1p transform so both axes show raw view counts.
actual_views = np.expm1(y_test)
predicted_views = np.expm1(y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(actual_views, predicted_views, alpha=0.3)
ax.set_xlabel("Actual Views")
ax.set_ylabel("Predicted Views")
ax.set_title("YouTube Views Prediction (BERT + Stacking)")
ax.set_xscale('log')
ax.set_yscale('log')
# Red dashed y = x reference line spanning 1 to 1e8: points on this line
# are predictions that match the actual view count exactly.
diagonal = [1, 1e8]
ax.plot(diagonal, diagonal, linestyle='--', color='red')
ax.grid(True)
plt.show()


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.77k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/435 [00:00<?, ?it/s]