In [2]:
import yfinance as yf
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import softmax
from sklearn.preprocessing import MinMaxScaler
from torch import nn
import numpy as np
from torch.utils.data import DataLoader, TensorDataset

In [10]:
# Install the browser-automation packages used by the news scraper further down.
!pip install selenium
# webdriver_manager provides ChromeDriverManager, which the scraper uses to obtain a chromedriver binary.
!pip install webdriver_manager

Collecting selenium
  Downloading selenium-4.31.0-py3-none-any.whl.metadata (7.5 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.29.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.31.0-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.29.0-py3-none-any.whl (492 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.9/492.9 kB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio_websocket-0.12.2-py3-none-any.whl (21 kB)
Downloading outcome-1.3.0.post0-py2.py3-

In [12]:
# Google Colab setup: install Chromium's chromedriver via apt and make sure selenium is present.
# Standard output of every command is discarded (> /dev/null); messages on stderr still appear.
!apt-get update > /dev/null
!apt install -y chromium-chromedriver > /dev/null
# Copy the driver into /usr/bin. In the recorded run cp reported that the source and
# destination are the same file, i.e. nothing was copied.
!cp /usr/lib/chromium-browser/chromedriver /usr/bin
!pip install selenium > /dev/null

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


cp: '/usr/lib/chromium-browser/chromedriver' and '/usr/bin/chromedriver' are the same file


In [13]:
# 1. Data collection: daily prices for the ticker from Yahoo Finance.
ticker = "AAPL"
prices_raw = yf.download(ticker, start="2021-01-01", end="2025-04-17")
# Recent yfinance releases return two-level (price field, ticker) column labels
# even for a single ticker. Keep only the price-field level so the frame has
# plain "Date"/"Close" columns; otherwise the later merge against the
# single-level news frame fails on mismatched column levels.
if isinstance(prices_raw.columns, pd.MultiIndex):
    prices_raw.columns = prices_raw.columns.get_level_values(0)
# Move the date index into a regular "Date" column without mutating in place.
stock = prices_raw[["Close"]].reset_index()

# 2. News collection (headlines are crawled from Naver News search with Selenium)
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time
from datetime import datetime

def get_naver_news_selenium(query="애플", pages=3):
    """Scrape Naver News search results for ``query`` using headless Chrome.

    Args:
        query: Search keyword. It is URL-encoded before being placed in the
            request URL, so spaces and non-ASCII text are safe.
        pages: Number of result pages to visit; the ``start`` offset sent to
            Naver advances by 10 per page.

    Returns:
        pandas.DataFrame with the columns ``date``, ``headline``, ``link`` and
        ``press``, one row per successfully parsed article. The columns are
        present even when nothing could be scraped.

    Notes:
        ``date`` is the day the scrape ran, not the article's publication
        date, so every row of one call carries the same date.
        NOTE(review): the recorded run of this notebook failed while starting
        Chrome (WebDriverException); that looks environmental -- confirm a
        working Chrome/Chromium binary is installed before relying on this.
    """
    from urllib.parse import urlencode

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # do not open a browser window
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

    columns = ["date", "headline", "link", "press"]
    scraped_on = datetime.now().strftime("%Y-%m-%d")
    news_list = []

    try:
        for page in range(1, pages + 1):
            start = (page - 1) * 10 + 1
            params = urlencode({"where": "news", "query": query, "start": start})
            url = f"https://search.naver.com/search.naver?{params}"
            driver.get(url)
            time.sleep(1)  # wait for the page to load

            articles = driver.find_elements(By.CSS_SELECTOR, "div.news_area")

            for article in articles:
                try:
                    # Look the title anchor up once; it carries both title and link.
                    title_anchor = article.find_element(By.CSS_SELECTOR, "a.news_tit")
                    press = article.find_element(By.CSS_SELECTOR, "a.info.press").text
                    news_list.append({
                        "date": scraped_on,
                        "headline": title_anchor.get_attribute("title"),
                        "link": title_anchor.get_attribute("href"),
                        "press": press
                    })
                except Exception as e:
                    # Best effort: report the article that could not be parsed and move on.
                    print("에러:", e)
    finally:
        # Always shut the browser down, even if navigation or parsing raised.
        driver.quit()

    return pd.DataFrame(news_list, columns=columns)

# Example usage: scrape three result pages for the keyword passed as the first argument.
news_df = get_naver_news_selenium("애플", pages=3)
print(news_df.head())


# 3. Load FinBERT: tokenizer and sequence-classification model from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
model = AutoModelForSequenceClassification.from_pretrained("yiyanghkust/finbert-tone")

# 4. News sentiment analysis
def get_sentiment(text):
    """Return the sentiment class probabilities the model assigns to ``text``.

    Args:
        text: Text to classify (a headline in this notebook). It is truncated
            by the tokenizer if it is too long for the model.

    Returns:
        1-D numpy array holding the softmax of the model's logits for this
        single input; entries follow the model's own label-index order.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping instead of recording a graph
    # for every headline and detaching from it afterwards.
    with torch.no_grad():
        logits = model(**inputs).logits
    return softmax(logits, dim=1)[0].numpy()

# Compute sentiment scores: one probability vector per headline.
sentiment_scores = news_df["headline"].apply(get_sentiment)
# The yiyanghkust/finbert-tone checkpoint orders its classes as
# 0 = neutral, 1 = positive, 2 = negative (see its model card). The columns
# must be named in that order, otherwise every probability is mislabelled.
sentiment_df = pd.DataFrame(
    sentiment_scores.tolist(),
    columns=["neutral", "positive", "negative"],
    index=news_df.index,  # keep rows aligned with news_df for the concat below
)
news_data = pd.concat([news_df, sentiment_df], axis=1)

# 5. Join sentiment with prices on the calendar date.
# Both key columns are converted to datetime first so the join compares like with like.
news_data["date"] = pd.to_datetime(news_data["date"])
stock["Date"] = pd.to_datetime(stock["Date"])
# Default (inner) join: only dates present in both frames survive; the
# redundant right-hand key column is dropped afterwards.
merged = stock.merge(news_data, left_on="Date", right_on="date")
merged = merged.drop(columns=["date"])

# 6. Build the LSTM inputs
features = merged[["positive", "negative", "neutral"]].values
targets = merged["Close"].values

# Use one scaler per quantity. Re-fitting a single scaler on the targets would
# throw away the feature fit, so new feature rows could no longer be scaled
# consistently. `scaler` remains the target scaler, exactly as it ended up
# before, so it can still be used to map scaled prices back.
feature_scaler = MinMaxScaler()
features_scaled = feature_scaler.fit_transform(features)
scaler = MinMaxScaler()
targets_scaled = scaler.fit_transform(targets.reshape(-1, 1))

# Sequence generation: each sample is `window_size` consecutive feature rows,
# labelled with the scaled target of the row immediately after that window.
window_size = 5
X = np.array([
    features_scaled[i - window_size:i]
    for i in range(window_size, len(features_scaled))
])
y = np.array([
    targets_scaled[i]
    for i in range(window_size, len(features_scaled))
])

# Tensor conversion and batching (batches of 32, reshuffled every epoch).
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)
dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# 7. LSTM model definition
class StockLSTM(nn.Module):
    """LSTM followed by a linear layer that emits one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers (default 1).
    """

    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Map a batch of sequences to one prediction each.

        The per-step outputs and the cell state are ignored; only the final
        hidden state of the last LSTM layer is fed to the linear head.
        """
        _outputs, (final_hidden, _final_cell) = self.lstm(x)
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state)

# Model, loss and optimiser: 3 input features per step, 50 hidden units.
model_lstm = StockLSTM(input_size=3, hidden_size=50)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_lstm.parameters(), lr=0.001)

# 8. Training loop
for epoch in range(10):  # number of epochs; adjust as needed
    model_lstm.train()
    epoch_loss = 0  # sum of the per-batch losses for this epoch
    for batch_x, batch_y in loader:
        # Give the prediction the same shape as the target. Squeezing it to
        # (batch,) while the target stays (batch, 1) makes MSELoss broadcast
        # the pair to (batch, batch) and average over every prediction/target
        # combination, which is not the intended per-sample error.
        pred = model_lstm(batch_x).reshape(batch_y.shape)
        loss = criterion(pred, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1} Loss: {epoch_loss:.4f}")


[*********************100%***********************]  1 of 1 completed


WebDriverException: Message: unknown error: Chrome failed to start: exited abnormally.
  (unknown error: DevToolsActivePort file doesn't exist)
  (The process started from chrome location /usr/bin/chromium-browser is no longer running, so ChromeDriver is assuming that Chrome has crashed.)
Stacktrace:
#0 0x589e0f4eb4e3 <unknown>
#1 0x589e0f21ac76 <unknown>
#2 0x589e0f243d78 <unknown>
#3 0x589e0f240029 <unknown>
#4 0x589e0f27eccc <unknown>
#5 0x589e0f27e47f <unknown>
#6 0x589e0f275de3 <unknown>
#7 0x589e0f24b2dd <unknown>
#8 0x589e0f24c34e <unknown>
#9 0x589e0f4ab3e4 <unknown>
#10 0x589e0f4af3d7 <unknown>
#11 0x589e0f4b9b20 <unknown>
#12 0x589e0f4b0023 <unknown>
#13 0x589e0f47e1aa <unknown>
#14 0x589e0f4d46b8 <unknown>
#15 0x589e0f4d4847 <unknown>
#16 0x589e0f4e4243 <unknown>
#17 0x79b047d58ac3 <unknown>
