In [1]:
import pandas as pd
import numpy as np
import re

In [4]:
# Load the dataset.
# NOTE: this is an absolute, machine-specific Windows path.
DATASET_CSV = r"C:\Users\user\Desktop\dataset v2.csv"
df = pd.read_csv(DATASET_CSV)

# Title preprocessing
def preprocess_title(title):
    """Lowercase a title and strip every character outside the allowed set.

    The value is coerced with ``str`` first, so non-string inputs are
    processed as their string form. After lowercasing, only ASCII letters,
    digits, Hangul syllables, ``!``, ``?`` and spaces are kept.

    Returns:
        The cleaned, lowercased title string.
    """
    lowered = str(title).lower()
    return re.sub(r"[^a-zA-Z0-9가-힣!? ]", "", lowered)

# Rule-based feature extraction
def extract_features(title):
    """Extract the rule-based features of a video title.

    Text features are computed on the output of ``preprocess_title``. The
    comedy emoji are looked up in the raw title instead, because
    ``preprocess_title`` strips every character outside letters, digits,
    Hangul, ``!``, ``?`` and spaces, so an emoji can never be found in the
    cleaned text.

    Args:
        title: The video title; coerced with ``str``.

    Returns:
        A dict with the keys ``length``, ``num_exclamations``,
        ``num_questions``, ``has_number``, ``has_year``, ``has_comedy`` and
        ``has_part``. The ``has_*`` values are 0 or 1.
    """
    raw_title = str(title)
    cleaned = preprocess_title(title)

    has_comedy_word = 'comedy' in cleaned or 'funny' in cleaned
    # Checked on the raw title: emoji do not survive preprocessing.
    has_comedy_emoji = '😂' in raw_title or '🤣' in raw_title

    return {
        'length': len(cleaned),
        'num_exclamations': cleaned.count('!'),
        'num_questions': cleaned.count('?'),
        'has_number': int(bool(re.search(r'\d+', cleaned))),
        'has_year': int(bool(re.search(r'20\d{2}', cleaned))),
        'has_comedy': int(has_comedy_word or has_comedy_emoji),
        # Plain substring test, so words that merely contain "part" also match.
        'has_part': int('part' in cleaned),
    }

# Scoring function (the weights are hand-picked and may be tuned later)
def compute_rule_score(features):
    """Return the weighted sum of the rule-based title features.

    Args:
        features: Mapping with the keys ``length``, ``num_exclamations``,
            ``num_questions``, ``has_number``, ``has_year``, ``has_comedy``
            and ``has_part``.

    Returns:
        The accumulated score.
    """
    # (feature name, weight) pairs, applied in this order.
    weighted_terms = (
        ('length', -0.05),          # negative weight: shorter titles score higher
        ('num_exclamations', 0.3),
        ('num_questions', 0.4),
        ('has_number', 0.5),
        ('has_year', 0.4),
        ('has_comedy', 0.6),
        ('has_part', 0.5),
    )

    total = 0
    for name, weight in weighted_terms:
        total += weight * features[name]
    return total

# Compute the feature dict and the rule score for every title
df['features'] = df['title'].apply(extract_features)
df['rule_score'] = df['features'].apply(compute_rule_score)

# View-count prediction function
def score_to_views(score, base=1e5):
    """Map a rule score to a view-count estimate.

    The estimate is ``base * exp(score / 3)`` truncated to an int, so a score
    of 0 yields ``base``.

    Args:
        score: The rule score.
        base: Multiplier applied to the exponential term (default ``1e5``).

    Returns:
        The estimate as an ``int``.
    """
    growth = np.exp(score / 3)
    return int(base * growth)

df['predicted_views'] = df['rule_score'].apply(score_to_views)

# Inspect the results, sorted by actual views in descending order
result_columns = ['title', 'views', 'rule_score', 'predicted_views']
df_result = df[result_columns].sort_values(by='views', ascending=False)


### 머신러닝 기반 Rule-Based 모델 설계

In [13]:
import pandas as pd
import numpy as np
import re

In [None]:
# Load the dataset.
# NOTE: this is an absolute, machine-specific Windows path.
DATASET_CSV = r"C:\Users\user\Desktop\dataset v2.csv"
df = pd.read_csv(DATASET_CSV)

# Feature extraction specialised for game videos
def extract_game_video_features(title):
    """Extract rule features from a game-video title.

    Most features are computed on the lowercased title with every character
    outside ASCII letters, digits, Hangul, ``!``, ``?`` and space removed.
    Two features need information that this cleaning destroys and therefore
    read other forms of the title:

    * ``is_caps_heavy`` counts uppercase letters, so it uses the filtered
      title with its original casing.
    * ``has_vs`` looks for ``'v.'``, so that part is tested on the lowercased
      title before punctuation is stripped.

    Args:
        title: The video title; coerced with ``str``.

    Returns:
        A dict of ``length`` plus nine 0/1 flags: ``has_question``,
        ``has_exclam``, ``has_number``, ``starts_with_number``,
        ``has_patch_note``, ``has_hype_word``, ``has_keyword``,
        ``is_caps_heavy`` and ``has_vs``.
    """
    disallowed = r"[^a-zA-Z0-9가-힣!? ]"

    raw = str(title)
    lowered = raw.lower()
    cleaned = re.sub(disallowed, "", lowered)
    # Same filter without lowercasing, so uppercase letters can still be counted.
    cased = re.sub(disallowed, "", raw)

    keywords = [
        'rank', 'op', 'patch', 'update', 'buff', 'nerf', 'new', 'epic',
        'trick', 'tips', 'glitch', 'record', 'kill', 'highlight', 'champion'
    ]
    hype_words = ['insane', 'crazy', 'unbelievable', 'god', '1v5', 'clutch']

    uppercase_count = sum(1 for c in cased if c.isupper())

    return {
        'length': len(cleaned),
        'has_question': int('?' in cleaned),
        'has_exclam': int('!' in cleaned),
        'has_number': int(bool(re.search(r'\d+', cleaned))),
        'starts_with_number': int(bool(re.match(r'^\d+', cleaned))),
        'has_patch_note': int('patch' in cleaned or 'update' in cleaned),
        # Substring tests: a keyword embedded in a longer word also matches.
        'has_hype_word': int(any(word in cleaned for word in hype_words)),
        'has_keyword': int(any(word in cleaned for word in keywords)),
        # More than 30% of the filtered characters are uppercase.
        'is_caps_heavy': int(uppercase_count > len(cased) * 0.3),
        # 'v.' must be tested before the '.' is stripped by the filter.
        'has_vs': int(' vs ' in cleaned or 'v.' in lowered),
    }

# Build the feature table: one row per title, one column per feature.
# The frame is constructed once from the list of feature dicts; .apply(pd.Series)
# would instead create a separate Series object for every row.
feature_records = df['title'].apply(extract_game_video_features).tolist()
df_features = pd.DataFrame(feature_records, index=df.index)

# Rule score function
def compute_rule_score(row):
    """Return the weighted sum of the game-video features in ``row``.

    Args:
        row: Mapping (dict or DataFrame row) with the keys ``length``,
            ``has_question``, ``has_exclam``, ``has_number``,
            ``starts_with_number``, ``has_patch_note``, ``has_hype_word``,
            ``has_keyword``, ``is_caps_heavy`` and ``has_vs``.

    Returns:
        The weighted sum of those values.
    """
    # (feature name, weight) pairs; terms are added in this order.
    weighted_terms = (
        ('length', -0.05),
        ('has_question', 0.3),
        ('has_exclam', 0.4),
        ('has_number', 0.6),
        ('starts_with_number', 0.3),
        ('has_patch_note', 0.5),
        ('has_hype_word', 0.7),
        ('has_keyword', 0.6),
        ('is_caps_heavy', 0.2),
        ('has_vs', 0.5),
    )

    first_name, first_weight = weighted_terms[0]
    total = first_weight * row[first_name]
    for name, weight in weighted_terms[1:]:
        total = total + weight * row[name]
    return total

# Attach the rule score and the prediction derived from it
rule_scores = df_features.apply(compute_rule_score, axis=1)
df_features['rule_score'] = rule_scores
df_features['predicted_views'] = np.expm1(rule_scores)

# Result frame: titles side by side with their features and scores
title_column = df[['title']]
df_result = pd.concat([title_column, df_features], axis=1)

=== Rule-Based 모델 성능 (Lasso 기반 가중치 적용) ===
RMSE: 36,775,461.19
MAE: 12,118,988.12
R²: -0.1218

=== 사용된 Rule Feature 및 가중치 ===
length: -0.1364
has_question_mark: -0.1043
has_interest_word: -0.1357
has_clickbait: 0.1255
has_emotion_word: -0.3075
has_eng_kor_mix: 0.0997


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# 1. Define the inputs (X) and the target (y)
# Every derived score/prediction column is excluded, including the *_optimized
# columns that a later cell appends to df_features. Without this, re-running the
# cell would feed the model's own output back in as features. errors='ignore'
# skips the columns that do not exist yet.
derived_columns = [
    'rule_score', 'predicted_views',
    'rule_score_optimized', 'predicted_views_optimized',
]
X = df_features.drop(columns=derived_columns, errors='ignore')
y = np.log1p(df['views'])  # log-transform the view counts to stabilise their scale

# 2. Train/validation split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# 3. Fit the linear regression model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# 4. Store the learned weights, keyed by feature name
optimal_weights = dict(zip(X.columns, lr_model.coef_))

# 5. Rule score function using the learned weights
def compute_optimized_rule_score(row):
    """Return the weighted sum of ``row`` using the learned weights.

    Reads the module-level ``X`` (for the feature column names) and
    ``optimal_weights`` (feature name -> weight).

    Args:
        row: Mapping or DataFrame row indexable by every name in ``X.columns``.

    Returns:
        The sum of ``row[name] * optimal_weights[name]`` over ``X.columns``.
    """
    contributions = (row[name] * optimal_weights[name] for name in X.columns)
    return sum(contributions)

# 6. Compute the optimized rule score and the prediction derived from it
df_features['rule_score_optimized'] = X.apply(compute_optimized_rule_score, axis=1)
# The regression was fitted on log1p(views) and predicts X·coef + intercept.
# The rule score holds only the X·coef part, so the fitted intercept is added
# back before expm1 inverts the log transform.
df_features['predicted_views_optimized'] = np.expm1(
    df_features['rule_score_optimized'] + lr_model.intercept_
)

# 7. Merge titles and features into the result DataFrame
df_result = pd.concat([df[['title']], df_features], axis=1)
