In [11]:
# Cell 1: 필요한 라이브러리 설치
!pip install transformers peft datasets accelerate yfinance scikit-learn kagglehub -q



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
import os
import json
import random
import numpy as np
import pandas as pd
from pathlib import Path

import yfinance as yf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, mean_squared_error
from sklearn.linear_model import LinearRegression

import torch
from torch.utils.data import Dataset

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

import kagglehub


In [13]:
path_thor_2012 = r"C:\Users\starw\Downloads\data_2012_frame_thor_output.json"
path_thor_business = r"C:\Users\starw\Downloads\data_business_thor_output_1200.json"

df_2012 = pd.read_json(path_thor_2012)
df_business = pd.read_json(path_thor_business)

df_2012_sub = df_2012[["headline", "polarity"]].copy()
df_business_sub = df_business[["headline", "polarity"]].copy()

df_2012_sub.columns = ["text", "label_raw"]
df_business_sub.columns = ["text", "label_raw"]

df_thor = pd.concat([df_2012_sub, df_business_sub], ignore_index=True)
df_thor["label_raw"] = df_thor["label_raw"].str.lower().str.strip()
label_map = {"negative": 0, "neutral": 1, "positive": 2}
df_thor["label"] = df_thor["label_raw"].map(label_map)

df_thor = df_thor.dropna(subset=["label"])
df_thor.head()


Unnamed: 0,text,label_raw,label
0,"Currently, the company foresees its pre-tax pr...",negative,0
1,The agreement strengthens our long-term partne...,positive,2
2,Talvivaara Mining Company Plc Talvivaara Minin...,positive,2
3,Snap Shares Tumble As Short Sellers Move In,negative,0
4,3 Steps to Creating the Company Culture You Want,positive,2


In [14]:
# FinancialPhraseBank 다운로드
path_fpb = kagglehub.dataset_download("ankurzing/sentiment-analysis-for-financial-news")
print("FPB path:", path_fpb)

csv_path = list(Path(path_fpb).rglob("*.csv"))[0]
fpb = pd.read_csv(csv_path, names=["label_raw", "text"], encoding="latin-1")
fpb["label_raw"] = fpb["label_raw"].str.lower().str.strip()

fpb_map = {"negative": 0, "neutral": 1, "positive": 2}
fpb["label"] = fpb["label_raw"].map(fpb_map)
fpb = fpb.dropna(subset=["label"])

print("FPB size:", len(fpb))
fpb.head()


FPB path: C:\Users\starw\.cache\kagglehub\datasets\ankurzing\sentiment-analysis-for-financial-news\versions\5
FPB size: 4846


Unnamed: 0,label_raw,text,label
0,neutral,"According to Gran , the company has no plans t...",1
1,neutral,Technopolis plans to develop in stages an area...,1
2,negative,The international electronic industry company ...,0
3,positive,With the new production plant the company woul...,2
4,positive,According to the company 's updated strategy f...,2


In [15]:
# 대용량 뉴스 데이터 다운로드
path_usnews = kagglehub.dataset_download("jeet2016/us-financial-news-articles")
print("US News path:", path_usnews)

root_us = Path(path_usnews)
json_files = list(root_us.rglob("*.json"))
print("Total JSON files:", len(json_files))


US News path: C:\Users\starw\.cache\kagglehub\datasets\jeet2016\us-financial-news-articles\versions\1
Total JSON files: 306242


In [16]:
titles = []
dates = []

MAX_FILES = 60000  # 속도: 필요하면 줄여도 됨

for i, fp in enumerate(json_files):
    if i >= MAX_FILES:
        break
    try:
        data = json.loads(fp.read_text(encoding="utf-8"))
    except:
        continue

    # title 찾기
    title = data.get("title") or data.get("thread", {}).get("title")
    if not title:
        continue
    
    # published date 찾기
    published = data.get("published") or data.get("thread", {}).get("published")
    if not published:
        continue
    
    try:
        dt = pd.to_datetime(published).date()
    except:
        continue
    
    titles.append(title)
    dates.append(dt)

news_df = pd.DataFrame({"date": dates, "title": titles})
print("Parsed US news:", news_df.shape)
news_df.head()


Parsed US news: (60000, 2)


Unnamed: 0,date,title
0,2018-01-03,Emerging markets are set for an even bigger ra...
1,2018-01-03,Cramer reflects on how Trump's actions are fue...
2,2018-01-03,The Wall Street Journal: Peter Thiel’s VC firm...
3,2018-01-02,Hoda Kotb Will Replace Matt Lauer on NBC’s ‘To...
4,2018-01-01,UK's Compass says new CEO to start Jan 1 after...


In [17]:
positive_keywords = ["soared", "jumped", "skyrocketed", "surged", "climbed", "beat expectations"]
negative_keywords = ["slumped", "tumbled", "plunged", "collapsed", "fell", "warned"]

def rule_label(t):
    t = t.lower()
    if any(w in t for w in positive_keywords):
        return 2
    if any(w in t for w in negative_keywords):
        return 0
    return None

news_df["label"] = news_df["title"].apply(rule_label)
news_labeled = news_df.dropna(subset=["label"]).copy()
news_labeled.rename(columns={"title": "text"}, inplace=True)

print("Rule-based labeled news:", len(news_labeled))
news_labeled.head()


Rule-based labeled news: 102


Unnamed: 0,date,text,label
260,2018-01-12,China's 2017 exports rose 10.8% in yuan terms ...,2.0
560,2018-01-12,BlackRock earnings beat expectations; assets u...,2.0
572,2018-01-12,Intel just warned that its patches can cause p...,0.0
811,2018-01-16,Jared Kushner reportedly was warned about his ...,0.0
1075,2018-01-17,2017 Chinese foreign direct investment fell 35...,0.0


In [18]:
full_data = pd.concat([
    fpb[["text", "label"]],
    df_thor[["text", "label"]],
    news_labeled[["text", "label"]],
], ignore_index=True)

full_data = full_data.dropna(subset=["text", "label"]).reset_index(drop=True)
print("Total dataset size:", len(full_data))
print(full_data["label"].value_counts())
full_data.head()


Total dataset size: 6151
label
1.0    3061
2.0    1821
0.0    1269
Name: count, dtype: int64


Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",1.0
1,Technopolis plans to develop in stages an area...,1.0
2,The international electronic industry company ...,0.0
3,With the new production plant the company woul...,2.0
4,According to the company 's updated strategy f...,2.0


In [19]:
train_df, val_df = train_test_split(
    full_data,
    test_size=0.1,
    stratify=full_data["label"],
    random_state=42
)

train_df.shape, val_df.shape


((5535, 2), (616, 2))

In [20]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

base_model = AutoModelForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=3
)

# Set problem type explicitly
base_model.config.problem_type = "single_label_classification"

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["query", "value"],
    task_type="SEQ_CLS"
)

model = get_peft_model(base_model, lora_config)

# Ensure problem type is preserved after PEFT
model.config.problem_type = "single_label_classification"

print(model.print_trainable_parameters())


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.2707
None


Exception ignored in: <function tqdm.__del__ at 0x000001E7F56CEDE0>
Traceback (most recent call last):
  File "c:\Users\starw\AppData\Local\Programs\Python\Python313\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "c:\Users\starw\AppData\Local\Programs\Python\Python313\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm' object has no attribute 'disp'


In [21]:
class FinDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128):
        self.texts = df["text"].tolist()
        self.labels = df["label"].astype(int).tolist()
        self.tk = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        enc = self.tk(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

train_ds = FinDataset(train_df, tokenizer)
val_ds = FinDataset(val_df, tokenizer)


In [22]:
def metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro")
    }

args = TrainingArguments(
    output_dir="./finbert_lora_all",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=metrics
)


In [23]:
trainer.train()
trainer.evaluate()




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.1509,0.747311,0.696429,0.589293
2,0.665,0.539554,0.792208,0.771372
3,0.545,0.515551,0.801948,0.785491




{'eval_loss': 0.5155511498451233,
 'eval_accuracy': 0.801948051948052,
 'eval_f1': 0.7854905367374053,
 'eval_runtime': 1122.8734,
 'eval_samples_per_second': 0.549,
 'eval_steps_per_second': 0.018,
 'epoch': 3.0}

In [24]:
# Cell 14: Twitter Financial News Sentiment Dataset 다운로드

import os
import kagglehub

# Kaggle에서 데이터 내려받기
twitter_path = kagglehub.dataset_download(
    "borhanitrash/twitter-financial-news-sentiment-dataset"
)

print("Twitter financial dataset path:", twitter_path)
print("Files:", os.listdir(twitter_path))


Twitter financial dataset path: C:\Users\starw\.cache\kagglehub\datasets\borhanitrash\twitter-financial-news-sentiment-dataset\versions\1
Files: ['README.md', 'sent_dataset_meta.txt', 'sent_train.csv', 'sent_valid.csv']


In [25]:
# Cell 15: CSV 로딩 및 텍스트/라벨 컬럼 식별

import pandas as pd

# 보통 파일 이름이 이런 식이라서 우선 이렇게 시도
train_csv_path = os.path.join(twitter_path, "sent_train.csv")
valid_csv_path = os.path.join(twitter_path, "sent_valid.csv")

df_train_tw = pd.read_csv(train_csv_path)
df_valid_tw = pd.read_csv(valid_csv_path)

print("Train shape:", df_train_tw.shape)
print("Valid shape:", df_valid_tw.shape)
print("Train columns:", df_train_tw.columns.tolist())
print("Valid columns:", df_valid_tw.columns.tolist())

# 텍스트/라벨 컬럼 자동 탐색 (데이터셋 구조가 조금 달라도 버티게)
def detect_text_label_cols(df):
    text_candidates = ["text", "sentence", "tweet", "content"]
    label_candidates = ["label", "sentiment", "target", "labels"]

    text_col = None
    label_col = None

    for c in df.columns:
        if c.lower() in text_candidates:
            text_col = c
        if c.lower() in label_candidates:
            label_col = c

    if text_col is None or label_col is None:
        raise ValueError(
            f"텍스트/라벨 컬럼을 찾지 못했습니다. 실제 컬럼명을 확인해서 코드에서 직접 지정해 주세요. "
            f"(현재 컬럼들: {df.columns.tolist()})"
        )
    return text_col, label_col

text_col_train, label_col_train = detect_text_label_cols(df_train_tw)
text_col_valid, label_col_valid = detect_text_label_cols(df_valid_tw)

print("Detected text/label columns (train):", text_col_train, label_col_train)
print("Detected text/label columns (valid):", text_col_valid, label_col_valid)

# 공통 포맷으로 정리
df_train_tw = df_train_tw[[text_col_train, label_col_train]].rename(
    columns={text_col_train: "text", label_col_train: "label_raw"}
)
df_valid_tw = df_valid_tw[[text_col_valid, label_col_valid]].rename(
    columns={text_col_valid: "text", label_col_valid: "label_raw"}
)

print(df_train_tw.head())
print(df_valid_tw.head())


Train shape: (9543, 2)
Valid shape: (2388, 2)
Train columns: ['text', 'label']
Valid columns: ['text', 'label']
Detected text/label columns (train): text label
Detected text/label columns (valid): text label
                                                text  label_raw
0  $BYND - JPMorgan reels in expectations on Beyo...          0
1  $CCL $RCL - Nomura points to bookings weakness...          0
2  $CX - Cemex cut at Credit Suisse, J.P. Morgan ...          0
3  $ESS: BTIG Research cuts to Neutral https://t....          0
4  $FNKO - Funko slides after Piper Jaffray PT cu...          0
                                                text  label_raw
0  $ALLY - Ally Financial pulls outlook https://t...          0
1  $DELL $HPE - Dell, HPE targets trimmed on comp...          0
2  $PRTY - Moody's turns negative on Party City h...          0
3                   $SAN: Deutsche Bank cuts to Hold          0
4                  $SITC: Compass Point cuts to Sell          0


In [26]:
# Cell 16: Twitter 라벨 → FinBERT 라벨로 매핑

def map_twitter_label(v):
    # 문자열일 때
    if isinstance(v, str):
        v_clean = v.strip().lower()
        mapping_str = {
            "bearish": 0,   # negative
            "neutral": 1,
            "bullish": 2,   # positive
        }
        if v_clean not in mapping_str:
            raise ValueError(f"예상치 못한 문자열 라벨: {v}")
        return mapping_str[v_clean]

    # 숫자일 때 (HF 정의: 0=Bearish, 1=Bullish, 2=Neutral)
    try:
        v_int = int(v)
    except Exception as e:
        raise ValueError(f"라벨을 int로 변환할 수 없음: {v}") from e

    mapping_num = {0: 0, 2: 1, 1: 2}  # 0=neg,2=neu,1=pos로 재정렬
    if v_int not in mapping_num:
        raise ValueError(f"예상치 못한 숫자 라벨: {v_int}")
    return mapping_num[v_int]

df_train_tw["label"] = df_train_tw["label_raw"].apply(map_twitter_label)
df_valid_tw["label"] = df_valid_tw["label_raw"].apply(map_twitter_label)

print(df_train_tw["label"].value_counts())
print(df_valid_tw["label"].value_counts())

# 중복 제거 및 기본 확인
print("Train unique texts:", df_train_tw["text"].nunique(), "/", len(df_train_tw))
print("Valid unique texts:", df_valid_tw["text"].nunique(), "/", len(df_valid_tw))

df_train_tw.sample(5)


label
1    6178
2    1923
0    1442
Name: count, dtype: int64
label
1    1566
2     475
0     347
Name: count, dtype: int64
Train unique texts: 9543 / 9543
Valid unique texts: 2388 / 2388


Unnamed: 0,text,label_raw,label
5470,When staffers in WeWork's New York City headqu...,2,1
8982,ADMA Biologics down 9% premarket after pricing...,0,0
4704,Illegal Tender podcast: What it was like to be...,2,1
1954,The Splunk Data-to-Everything Platform Brings ...,2,1
3146,OPEC's share of Indian oil imports in October ...,2,1


In [27]:
# Cell 16.5: tokenizer 다시 로드 (Cell 17 실행 전에!)

from transformers import AutoTokenizer

# 파인튜닝에 사용한 같은 체크포인트로 맞춰줄 것
model_name_or_path = "ProsusAI/finbert"      # 또는 너의 fine-tuned 모델 디렉토리 경로

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)


In [28]:
# Cell 17: HuggingFace Dataset으로 변환 후 토크나이즈

from datasets import Dataset

# 평가용으로 validation split만 사용할 수도 있지만,
# 여기서는 train+valid 모두 합쳐서 "외부 평가셋"으로 사용 (원하는 대로 조절 가능)
df_twitter_all = pd.concat([df_train_tw[["text", "label"]],
                            df_valid_tw[["text", "label"]]],
                           ignore_index=True)

twitter_dataset = Dataset.from_pandas(df_twitter_all)

def preprocess_twitter(examples):
    # 앞에서 쓰던 tokenize_function이 있으면 그대로 써도 됨
    # 예: return tokenize_function(examples)
    tokenized = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )
    tokenized["labels"] = examples["label"]
    return tokenized

twitter_encoded = twitter_dataset.map(preprocess_twitter, batched=True)

# Trainer용 포맷 설정
twitter_encoded.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

twitter_encoded


Map:   0%|          | 0/11931 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 11931
})

In [29]:
# Cell 18: Twitter 금융 트윗 감성 분류 성능 평가

twitter_metrics = trainer.evaluate(twitter_encoded)
print("===== Twitter Financial News Sentiment – External Eval =====")
for k, v in twitter_metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, (int, float)) else f"{k}: {v}")




===== Twitter Financial News Sentiment – External Eval =====
eval_loss: 0.6721
eval_accuracy: 0.7261
eval_f1: 0.6681
eval_runtime: 1873.6256
eval_samples_per_second: 6.3680
eval_steps_per_second: 0.1990
epoch: 3.0000


In [33]:
# Cell 18.5: 뉴스 데이터에 감성 점수 예측하여 daily summary 생성

# 1) 모델과 토크나이저 준비 (이미 로드되어 있으면 재사용)
try:
    model
    tokenizer
except NameError:
    print("Loading model and tokenizer...")
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from peft import PeftModel
    
    tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
    base_model = AutoModelForSequenceClassification.from_pretrained(
        "ProsusAI/finbert",
        num_labels=3
    )
    model = PeftModel.from_pretrained(base_model, "./finbert_lora_all")
    model.config.problem_type = "single_label_classification"

# 2) news_df에 날짜가 있어야 함 (Cell 6에서 생성)
# news_df = DataFrame with columns: ['date', 'title']
print("News data shape:", news_df.shape)
print("Date range:", news_df['date'].min(), "to", news_df['date'].max())

# 3) 각 뉴스에 대해 감성 점수 예측
import torch
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def predict_sentiment(text):
    """텍스트에 대한 감성 점수 예측 (0=neg, 1=neu, 2=pos)"""
    inputs = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt"
    ).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=-1)
        pred_label = torch.argmax(probs, dim=-1).item()
        
    # 감성 점수: negative=-1, neutral=0, positive=1
    sentiment_map = {0: -1, 1: 0, 2: 1}
    return sentiment_map[pred_label], probs[0].cpu().numpy()

# 4) 배치 처리로 예측 (속도 향상)
sentiments = []
pos_probs = []
neg_probs = []

print("Predicting sentiment for news articles...")
for idx, row in tqdm(news_df.iterrows(), total=len(news_df)):
    try:
        sent_score, probs = predict_sentiment(row['title'])
        sentiments.append(sent_score)
        pos_probs.append(probs[2])  # positive probability
        neg_probs.append(probs[0])  # negative probability
    except Exception as e:
        sentiments.append(0)
        pos_probs.append(0.33)
        neg_probs.append(0.33)

news_df['sentiment'] = sentiments
news_df['pos_prob'] = pos_probs
news_df['neg_prob'] = neg_probs

# 5) 날짜별로 집계
daily_sentiment = news_df.groupby('date').agg(
    news_count=('sentiment', 'count'),
    sent_mean=('sentiment', 'mean'),
    sent_std=('sentiment', 'std'),
    sent_max=('sentiment', 'max'),
    sent_min=('sentiment', 'min'),
    pos_mean=('pos_prob', 'mean'),
    neg_mean=('neg_prob', 'mean')
).reset_index()

daily_sentiment.columns = ['date_only', 'news_count', 'sent_mean', 'sent_std', 
                           'sent_max', 'sent_min', 'pos_mean', 'neg_mean']

# std가 NaN인 경우 (뉴스 1개만 있는 날) 0으로 채우기
daily_sentiment['sent_std'] = daily_sentiment['sent_std'].fillna(0)

# 6) CSV로 저장
daily_sentiment.to_csv("sentiment_daily_summary.csv", index=False)
print(f"\nSaved sentiment_daily_summary.csv with {len(daily_sentiment)} days")
print(daily_sentiment.head(10))

News data shape: (60000, 3)
Date range: 2017-12-07 to 2018-02-28
Predicting sentiment for news articles...


100%|██████████| 60000/60000 [10:35:54<00:00,  1.57it/s]       




Saved sentiment_daily_summary.csv with 75 days
    date_only  news_count  sent_mean  sent_std  sent_max  sent_min  pos_mean  \
0  2017-12-07           3  -0.333333   0.57735         0        -1  0.119003   
1  2017-12-08           3   0.333333   0.57735         1         0  0.415521   
2  2017-12-10           1   0.000000   0.00000         0         0  0.134113   
3  2017-12-13           1   0.000000   0.00000         0         0  0.087634   
4  2017-12-14           3   0.000000   0.00000         0         0  0.159874   
5  2017-12-15           3   0.000000   1.00000         1        -1  0.256659   
6  2017-12-18           1  -1.000000   0.00000        -1        -1  0.020795   
7  2017-12-20           1   0.000000   0.00000         0         0  0.169697   
8  2017-12-21           4   0.000000   0.00000         0         0  0.143694   
9  2017-12-22           2   0.000000   0.00000         0         0  0.129280   

   neg_mean  
0  0.353256  
1  0.079328  
2  0.211022  
3  0.090975  
4

In [35]:
import pandas as pd
import yfinance as yf

# 1) sentiment summary 파일 불러오기
sent_df = pd.read_csv("sentiment_daily_summary.csv")  
# 예: columns = ['date_only','news_count','sent_mean','sent_std','sent_max','sent_min','pos_mean','neg_mean']

sent_df['date_only'] = pd.to_datetime(sent_df['date_only'])

# 2) S&P500 가격 불러오기
sp500 = yf.download("^GSPC", start="2012-01-01", end="2025-01-01")

# MultiIndex 컬럼을 flatten (yfinance가 MultiIndex를 반환하는 경우 대비)
if isinstance(sp500.columns, pd.MultiIndex):
    sp500.columns = sp500.columns.get_level_values(0)

sp500 = sp500.reset_index()
sp500 = sp500.rename(columns={"Date": "date_only"})

# daily return 생성
sp500["ret"] = sp500["Close"].pct_change()
sp500["ret_next"] = sp500["ret"].shift(-1)

# 3) sentiment + market merge
merged = pd.merge(sent_df, sp500[["date_only","Close","ret","ret_next"]],
                  on="date_only", how="inner")

print("Merged shape:", merged.shape)
print(merged.head())

  sp500 = yf.download("^GSPC", start="2012-01-01", end="2025-01-01")
[*********************100%***********************]  1 of 1 completed

Merged shape: (53, 11)
   date_only  news_count  sent_mean  sent_std  sent_max  sent_min  pos_mean  \
0 2017-12-07           3  -0.333333   0.57735         0        -1  0.119003   
1 2017-12-08           3   0.333333   0.57735         1         0  0.415521   
2 2017-12-13           1   0.000000   0.00000         0         0  0.087634   
3 2017-12-14           3   0.000000   0.00000         0         0  0.159874   
4 2017-12-15           3   0.000000   1.00000         1        -1  0.256659   

   neg_mean        Close       ret  ret_next  
0  0.353256  2636.979980  0.002932  0.005506  
1  0.079328  2651.500000  0.005506  0.003202  
2  0.090975  2662.850098 -0.000473 -0.004071  
3  0.101922  2652.010010 -0.004071  0.008974  
4  0.422308  2675.810059  0.008974  0.005363  





In [36]:
# Cell 19: 특징/타깃 정의 (baseline vs sentiment 비교)

import numpy as np
from sklearn.model_selection import train_test_split

# 1) 감성 feature들
sent_cols = [
    "news_count",
    "sent_mean",
    "sent_std",
    "sent_max",
    "sent_min",
    "pos_mean",
    "neg_mean",
]

# 2) baseline feature (예: 전일 수익률 하나만)
base_cols = ["ret"]

# 3) full model: baseline + sentiment
full_cols = base_cols + sent_cols

# 결측치 정리
data = merged.copy().dropna(subset=["ret_next"])  # 라벨 없는 날 제거
data[sent_cols] = data[sent_cols].fillna(0.0)

# time-series니까 섞지 말고 뒤쪽 20%를 test로 사용
split_idx = int(len(data) * 0.8)
train_df = data.iloc[:split_idx].copy()
test_df  = data.iloc[split_idx:].copy()

y_train = train_df["ret_next"].values
y_test  = test_df["ret_next"].values

X_base_train = train_df[base_cols].values
X_base_test  = test_df[base_cols].values

X_sent_train = train_df[sent_cols].values
X_sent_test  = test_df[sent_cols].values

X_full_train = train_df[full_cols].values
X_full_test  = test_df[full_cols].values

print("Train size:", len(train_df), "Test size:", len(test_df))
print("Feature sets:",
      "baseline =", X_base_train.shape[1],
      ", sentiment =", X_sent_train.shape[1],
      ", full =", X_full_train.shape[1])


Train size: 42 Test size: 11
Feature sets: baseline = 1 , sentiment = 7 , full = 8


In [37]:
# Cell 20: RandomForest로 baseline / sentiment-only / full 모델 학습 및 예측

from sklearn.ensemble import RandomForestRegressor

rf_base = RandomForestRegressor(
    n_estimators=300,
    max_depth=6,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)
rf_sent = RandomForestRegressor(
    n_estimators=300,
    max_depth=6,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)
rf_full = RandomForestRegressor(
    n_estimators=300,
    max_depth=6,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
)

rf_base.fit(X_base_train, y_train)
rf_sent.fit(X_sent_train, y_train)
rf_full.fit(X_full_train, y_train)

y_pred_base = rf_base.predict(X_base_test)
y_pred_sent = rf_sent.predict(X_sent_test)
y_pred_full = rf_full.predict(X_full_test)


In [40]:
# Cell 21: 세 모델 성능 비교 (R², RMSE, 방향(up/down) 정확도)

from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

def eval_reg(y_true, y_pred, name="model"):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    dir_true = (y_true > 0).astype(int)
    dir_pred = (y_pred > 0).astype(int)
    acc = (dir_true == dir_pred).mean()
    print(f"===== {name} =====")
    print(f"R^2   : {r2:.4f}")
    print(f"RMSE  : {rmse:.6f}")
    print(f"DirAcc: {acc:.4f}")
    return r2, rmse, acc

r2_b, rmse_b, acc_b = eval_reg(y_test, y_pred_base, "Baseline (ret only)")
r2_s, rmse_s, acc_s = eval_reg(y_test, y_pred_sent, "Sentiment-only")
r2_f, rmse_f, acc_f = eval_reg(y_test, y_pred_full, "Full (ret + sentiment)")

===== Baseline (ret only) =====
R^2   : 0.2403
RMSE  : 0.009271
DirAcc: 0.7273
===== Sentiment-only =====
R^2   : -0.1175
RMSE  : 0.011244
DirAcc: 0.4545
===== Full (ret + sentiment) =====
R^2   : 0.1198
RMSE  : 0.009979
DirAcc: 0.6364
