In [1]:
import os
import re
import warnings
from datetime import datetime, timezone

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import praw
import torch
import torch.nn.functional as F
import yfinance as yf
from fredapi import Fred
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
def collect_data(firm):
    """Assemble a per-date feature table for a single ticker.

    Four sources are fetched and joined on the calendar date:

    * five years of price history from yfinance,
    * ten FRED macro series, forward-filled to daily frequency,
    * selected statement items from yfinance, forward-filled to daily
      frequency and extended up to the last available price date,
    * Reddit posts matching ``"$" + firm`` in four investing subreddits,
      scored with FinBERT (class probabilities and CLS embedding) and
      aggregated per posting date.

    Credentials are read from the environment instead of being stored in the
    notebook: ``FRED_API_KEY``, ``REDDIT_CLIENT_ID`` and
    ``REDDIT_CLIENT_SECRET`` must be set before calling.

    Parameters
    ----------
    firm : str
        Ticker symbol, e.g. ``"AAPL"``.

    Returns
    -------
    pandas.DataFrame
        One row per date present in the price, macro and statement data.
        Columns are the yfinance price columns, the macro indicators, the
        statement items that were available, ``ROA``, ``Current_Ratio``,
        the Reddit aggregates (``vote``, ``negative``, ``neutral``,
        ``positive``, ``emb_0`` ... ``emb_{d-1}``; NaN on dates without
        posts) and ``Target_1day`` / ``Target_1week`` / ``Target_1month`` /
        ``Target_1year``.

    Raises
    ------
    RuntimeError
        If one of the credential environment variables is missing or empty.
    ValueError
        If the sentiment checkpoint does not expose exactly the
        negative / neutral / positive classes.
    """
    def require_env(name):
        """Return environment variable ``name`` or raise a descriptive error."""
        value = os.environ.get(name)
        if not value:
            raise RuntimeError(
                f"Environment variable {name} must be set before calling collect_data"
            )
        return value

    # Resolve every credential up front so a missing one fails before any download.
    fred_api_key = require_env("FRED_API_KEY")
    reddit_client_id = require_env("REDDIT_CLIENT_ID")
    reddit_client_secret = require_env("REDDIT_CLIENT_SECRET")

    ##### Historical Stock Price #####
    stock = yf.Ticker(firm)
    df_stock = pd.DataFrame(stock.history(period="5y"))
    # Keep only the calendar date of each bar; it is turned back into a
    # datetime index in the merge section below.
    df_stock.index = df_stock.index.date
    df_stock.index.name = None
    print(f"{firm}: Historical Stock Price collected")

    ##### Macroeconomic Indicators #####
    fred = Fred(api_key=fred_api_key)

    # Display name -> FRED series id.
    indicators = {
        "GDP (Billions USD)": "GDP",
        "Unemployment Rate (%)": "UNRATE",
        "Producer Price Index (PPI)": "PPIACO",
        "Retail Sales (Millions USD)": "RSAFS",
        "Industrial Production Index": "INDPRO",
        "Housing Starts (Thousands)": "HOUST",
        "Personal Consumption Expenditures (PCE)": "PCE",
        "Trade Balance (Billions USD)": "BOPGSTB",
        "M2 Money Supply (Billions USD)": "M2",
        "Consumer Confidence Index": "UMCSENT",
    }

    df_macro = pd.DataFrame(
        {name: fred.get_series(code) for name, code in indicators.items()}
    )
    print(f"{firm}: Macroeconomic Indicators collected")

    ##### Microeconomic Indicators #####
    keep_cols = [
        "Total Revenue", "Operating Income", "Net Income", "EBITDA", "Gross Profit", "Interest Expense",
        "Total Assets", "Total Liabilities Net Minority Interest", "Stockholders Equity",
        "Cash And Cash Equivalents", "Accounts Receivable", "Inventory", "Current Assets",
        "Current Liabilities", "Long Term Debt",
        "Operating Cash Flow", "Capital Expenditure", "Free Cash Flow",
        "Depreciation And Amortization", "Repurchase Of Capital Stock"
    ]

    def select_columns(df):
        """Keep the wanted statement items that this statement actually has."""
        return df[[col for col in keep_cols if col in df.columns]]

    # Statements come with items as rows; transpose so each row is a report date.
    df_micro = pd.concat(
        [
            select_columns(stock.financials.T),
            select_columns(stock.balance_sheet.T),
            select_columns(stock.cashflow.T),
        ],
        axis=1,
    )
    print(f"{firm}: Microeconomic Indicators collected")

    ##### Reddit (synchronous) #####
    reddit_columns = ["Date Posted", "Title", "Body", "Vote"]
    reddit = praw.Reddit(
        client_id=reddit_client_id,
        client_secret=reddit_client_secret,
        user_agent="my_reddit_scraper"
    )

    def fetch_reddit_sync(query, subreddit_name, sort_method="new", limit=500):
        """Return one record per search hit for ``query`` in ``subreddit_name``."""
        posts = reddit.subreddit(subreddit_name).search(
            query, limit=limit, sort=sort_method, time_filter='all'
        )
        return [
            {
                # created_utc is an epoch timestamp; format it as a UTC calendar date.
                "Date Posted": datetime.fromtimestamp(post.created_utc, tz=timezone.utc).strftime('%Y-%m-%d'),
                "Title": post.title,
                "Body": post.selftext,
                "Vote": post.score,
            }
            for post in posts
        ]

    subreddit_lists = ["wallstreetbets", "stocks", "investing", "StockMarket"]
    reddit_rows = []
    for subreddit in subreddit_lists:
        reddit_rows.extend(fetch_reddit_sync("$" + firm, subreddit))

    # Explicit columns keep the frame well-formed even when no post was found.
    df_reddit = pd.DataFrame(reddit_rows, columns=reddit_columns)
    df_reddit = df_reddit.drop_duplicates()
    df_reddit = df_reddit.sort_values(by='Date Posted', ascending=False).reset_index(drop=True)
    df_reddit['Date Posted'] = pd.to_datetime(df_reddit['Date Posted'])
    df_reddit = df_reddit.set_index('Date Posted')
    df_reddit.index.name = None
    print(f"{firm}: Reddit Data collected")

    ##### Merge stock, macro, micro #####
    df_stock.index = pd.to_datetime(df_stock.index)
    df_stock = df_stock.reset_index().rename(columns={"index": "Date"})

    # Forward-fill the lower-frequency series, drop the leading rows where some
    # series has no value yet, then upsample to one row per calendar day.
    df_macro.index = pd.to_datetime(df_macro.index)
    df_macro = df_macro.sort_index().ffill().dropna()
    df_macro = df_macro.resample("D").ffill().reset_index().rename(columns={"index": "Date"})

    df_micro.index = pd.to_datetime(df_micro.index)
    df_micro = df_micro.sort_index()
    # The earliest report row is discarded.
    # NOTE(review): presumably because the oldest period is mostly empty - confirm.
    df_micro = df_micro.drop(df_micro.index[0])
    df_micro = df_micro.ffill()
    df_micro = df_micro.resample("D").ffill().reset_index().rename(columns={"index": "Date"})

    # Extend the statement data from its last report date to the last price
    # date by carrying the latest values forward.
    last_date = df_micro["Date"].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), end=df_stock['Date'].max(), freq="D")
    future_df = pd.DataFrame({"Date": future_dates})
    df_micro = pd.concat([df_micro, future_df], ignore_index=True).ffill()

    merged_df = df_stock.merge(df_macro, on='Date', how='inner')
    merged_df = merged_df.merge(df_micro, on='Date', how='inner')

    def safe_ratio(frame, numerator, denominator):
        """Return numerator / denominator, or NaN when either column is absent."""
        if numerator in frame.columns and denominator in frame.columns:
            return frame[numerator] / frame[denominator]
        return np.nan

    # select_columns tolerates missing statement items, so the ratios must too.
    merged_df['ROA'] = safe_ratio(merged_df, 'Net Income', 'Total Assets')
    merged_df['Current_Ratio'] = safe_ratio(merged_df, 'Current Assets', 'Current Liabilities')

    ##### Word Embedding & Sentiment analysis #####

    # Load FinBERT: one head for class probabilities, one bare encoder for embeddings.
    model_name = "yiyanghkust/finbert-tone"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    embedding_model = AutoModel.from_pretrained(model_name)

    # Take the class order from the checkpoint itself; a hard-coded order can
    # silently attach the probabilities to the wrong sentiment names.
    sentiment_cols = ['negative', 'neutral', 'positive']
    id2label = sentiment_model.config.id2label
    labels = [str(id2label[i]).lower() for i in range(sentiment_model.config.num_labels)]
    if sorted(labels) != sorted(sentiment_cols):
        raise ValueError(
            f"Unexpected sentiment labels {labels} for {model_name}; expected {sentiment_cols}"
        )

    # Embedding width comes from the encoder config instead of a literal 768.
    emb_cols = [f"emb_{i}" for i in range(embedding_model.config.hidden_size)]
    reddit_cols = ['vote'] + sentiment_cols + emb_cols

    # Use the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sentiment_model.to(device)
    embedding_model.to(device)

    def process(text):
        """Return (label -> probability dict, CLS embedding vector) for one text."""
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

        with torch.no_grad():
            # Sentiment probabilities for the single input row.
            sent_out = sentiment_model(**inputs)
            probs = F.softmax(sent_out.logits, dim=-1)[0].cpu().numpy()

            # Embedding: hidden state of the first ([CLS]) token.
            emb_out = embedding_model(**inputs)
            cls_vec = emb_out.last_hidden_state[0, 0, :].cpu().numpy()

        return dict(zip(labels, probs)), cls_vec

    # Title and body are scored together as one text.
    df_reddit["text"] = df_reddit["Title"].fillna("") + " " + df_reddit["Body"].fillna("")

    if df_reddit.empty:
        # No posts at all: keep the schema so the left merge below still
        # produces the Reddit columns (all NaN).
        final_df = pd.DataFrame(columns=["Date"] + reddit_cols)
    else:
        sentiment_list = []
        embedding_list = []
        for text in df_reddit["text"]:
            sentiment, embedding = process(text)
            sentiment_list.append(sentiment)
            embedding_list.append(embedding)

        # One row per post: posting date, score, class probabilities, embedding.
        features_df = pd.concat(
            [
                pd.Series(pd.to_datetime(df_reddit.index), name="Date"),
                df_reddit["Vote"].reset_index(drop=True).rename("vote"),
                pd.DataFrame(sentiment_list),
                pd.DataFrame(np.vstack(embedding_list), columns=emb_cols),
            ],
            axis=1,
        )

        # Collapse to one row per posting date.
        final_df = features_df.groupby("Date").agg({
            "vote": "sum",                               # total score of that day's posts
            **{col: "mean" for col in sentiment_cols},   # average class probabilities
            **{col: "mean" for col in emb_cols},         # average embedding
        }).reset_index()
    print(f'{firm}: Word Embedding & Sentiment Analysis completed')

    # Join on plain dates so both sides use the same key type.
    final_df["Date"] = pd.to_datetime(final_df["Date"]).dt.date
    merged_df['Date'] = pd.to_datetime(merged_df['Date']).dt.date

    df = merged_df.merge(final_df, on='Date', how='left')

    # Forward price changes; the shift counts rows, not calendar days.
    df["Target_1day"] = df["Close"].shift(-1) - df['Close']
    df["Target_1week"] = df["Close"].shift(-5) - df['Close']
    df["Target_1month"] = df["Close"].shift(-20) - df['Close']
    df["Target_1year"] = df["Close"].shift(-250) - df['Close']

    return df

In [None]:
def collect_data(firm):
    """Assemble a per-date feature table for a single ticker.

    Four sources are fetched and joined on the calendar date:

    * five years of price history from yfinance,
    * ten FRED macro series, forward-filled to daily frequency,
    * selected statement items from yfinance, forward-filled to daily
      frequency and extended up to the last available price date,
    * Reddit posts matching ``"$" + firm`` in four investing subreddits,
      scored with FinBERT (class probabilities and CLS embedding) and
      aggregated per posting date.

    Credentials are read from the environment instead of being stored in the
    notebook: ``FRED_API_KEY``, ``REDDIT_CLIENT_ID`` and
    ``REDDIT_CLIENT_SECRET`` must be set before calling.

    Parameters
    ----------
    firm : str
        Ticker symbol, e.g. ``"AAPL"``.

    Returns
    -------
    pandas.DataFrame
        One row per date present in the price, macro and statement data.
        Columns are the yfinance price columns, the macro indicators, the
        statement items that were available, ``ROA``, ``Current_Ratio``,
        the Reddit aggregates (``vote``, ``negative``, ``neutral``,
        ``positive``, ``emb_0`` ... ``emb_{d-1}``; NaN on dates without
        posts) and ``Target_1day`` / ``Target_1week`` / ``Target_1month`` /
        ``Target_1year``.

    Raises
    ------
    RuntimeError
        If one of the credential environment variables is missing or empty.
    ValueError
        If the sentiment checkpoint does not expose exactly the
        negative / neutral / positive classes.
    """
    def require_env(name):
        """Return environment variable ``name`` or raise a descriptive error."""
        value = os.environ.get(name)
        if not value:
            raise RuntimeError(
                f"Environment variable {name} must be set before calling collect_data"
            )
        return value

    # Resolve every credential up front so a missing one fails before any download.
    fred_api_key = require_env("FRED_API_KEY")
    reddit_client_id = require_env("REDDIT_CLIENT_ID")
    reddit_client_secret = require_env("REDDIT_CLIENT_SECRET")

    ##### Historical Stock Price #####
    stock = yf.Ticker(firm)
    df_stock = pd.DataFrame(stock.history(period="5y"))
    # Keep only the calendar date of each bar; it is turned back into a
    # datetime index in the merge section below.
    df_stock.index = df_stock.index.date
    df_stock.index.name = None
    print(f"{firm}: Historical Stock Price collected")

    ##### Macroeconomic Indicators #####
    fred = Fred(api_key=fred_api_key)

    # Display name -> FRED series id.
    indicators = {
        "GDP (Billions USD)": "GDP",
        "Unemployment Rate (%)": "UNRATE",
        "Producer Price Index (PPI)": "PPIACO",
        "Retail Sales (Millions USD)": "RSAFS",
        "Industrial Production Index": "INDPRO",
        "Housing Starts (Thousands)": "HOUST",
        "Personal Consumption Expenditures (PCE)": "PCE",
        "Trade Balance (Billions USD)": "BOPGSTB",
        "M2 Money Supply (Billions USD)": "M2",
        "Consumer Confidence Index": "UMCSENT",
    }

    df_macro = pd.DataFrame(
        {name: fred.get_series(code) for name, code in indicators.items()}
    )
    print(f"{firm}: Macroeconomic Indicators collected")

    ##### Microeconomic Indicators #####
    keep_cols = [
        "Total Revenue", "Operating Income", "Net Income", "EBITDA", "Gross Profit", "Interest Expense",
        "Total Assets", "Total Liabilities Net Minority Interest", "Stockholders Equity",
        "Cash And Cash Equivalents", "Accounts Receivable", "Inventory", "Current Assets",
        "Current Liabilities", "Long Term Debt",
        "Operating Cash Flow", "Capital Expenditure", "Free Cash Flow",
        "Depreciation And Amortization", "Repurchase Of Capital Stock"
    ]

    def select_columns(df):
        """Keep the wanted statement items that this statement actually has."""
        return df[[col for col in keep_cols if col in df.columns]]

    # Statements come with items as rows; transpose so each row is a report date.
    df_micro = pd.concat(
        [
            select_columns(stock.financials.T),
            select_columns(stock.balance_sheet.T),
            select_columns(stock.cashflow.T),
        ],
        axis=1,
    )
    print(f"{firm}: Microeconomic Indicators collected")

    ##### Reddit (synchronous) #####
    reddit_columns = ["Date Posted", "Title", "Body", "Vote"]
    reddit = praw.Reddit(
        client_id=reddit_client_id,
        client_secret=reddit_client_secret,
        user_agent="my_reddit_scraper"
    )

    def fetch_reddit_sync(query, subreddit_name, sort_method="new", limit=500):
        """Return one record per search hit for ``query`` in ``subreddit_name``."""
        posts = reddit.subreddit(subreddit_name).search(
            query, limit=limit, sort=sort_method, time_filter='all'
        )
        return [
            {
                # created_utc is an epoch timestamp; format it as a UTC calendar date.
                "Date Posted": datetime.fromtimestamp(post.created_utc, tz=timezone.utc).strftime('%Y-%m-%d'),
                "Title": post.title,
                "Body": post.selftext,
                "Vote": post.score,
            }
            for post in posts
        ]

    subreddit_lists = ["wallstreetbets", "stocks", "investing", "StockMarket"]
    reddit_rows = []
    for subreddit in subreddit_lists:
        reddit_rows.extend(fetch_reddit_sync("$" + firm, subreddit))

    # Explicit columns keep the frame well-formed even when no post was found.
    df_reddit = pd.DataFrame(reddit_rows, columns=reddit_columns)
    df_reddit = df_reddit.drop_duplicates()
    df_reddit = df_reddit.sort_values(by='Date Posted', ascending=False).reset_index(drop=True)
    df_reddit['Date Posted'] = pd.to_datetime(df_reddit['Date Posted'])
    df_reddit = df_reddit.set_index('Date Posted')
    df_reddit.index.name = None
    print(f"{firm}: Reddit Data collected")

    ##### Merge stock, macro, micro #####
    df_stock.index = pd.to_datetime(df_stock.index)
    df_stock = df_stock.reset_index().rename(columns={"index": "Date"})

    # Forward-fill the lower-frequency series, drop the leading rows where some
    # series has no value yet, then upsample to one row per calendar day.
    df_macro.index = pd.to_datetime(df_macro.index)
    df_macro = df_macro.sort_index().ffill().dropna()
    df_macro = df_macro.resample("D").ffill().reset_index().rename(columns={"index": "Date"})

    df_micro.index = pd.to_datetime(df_micro.index)
    df_micro = df_micro.sort_index()
    # The earliest report row is discarded.
    # NOTE(review): presumably because the oldest period is mostly empty - confirm.
    df_micro = df_micro.drop(df_micro.index[0])
    df_micro = df_micro.ffill()
    df_micro = df_micro.resample("D").ffill().reset_index().rename(columns={"index": "Date"})

    # Extend the statement data from its last report date to the last price
    # date by carrying the latest values forward.
    last_date = df_micro["Date"].max()
    future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), end=df_stock['Date'].max(), freq="D")
    future_df = pd.DataFrame({"Date": future_dates})
    df_micro = pd.concat([df_micro, future_df], ignore_index=True).ffill()

    merged_df = df_stock.merge(df_macro, on='Date', how='inner')
    merged_df = merged_df.merge(df_micro, on='Date', how='inner')

    def safe_ratio(frame, numerator, denominator):
        """Return numerator / denominator, or NaN when either column is absent."""
        if numerator in frame.columns and denominator in frame.columns:
            return frame[numerator] / frame[denominator]
        return np.nan

    # select_columns tolerates missing statement items, so the ratios must too.
    merged_df['ROA'] = safe_ratio(merged_df, 'Net Income', 'Total Assets')
    merged_df['Current_Ratio'] = safe_ratio(merged_df, 'Current Assets', 'Current Liabilities')

    ##### Word Embedding & Sentiment analysis #####

    # Load FinBERT: one head for class probabilities, one bare encoder for embeddings.
    model_name = "yiyanghkust/finbert-tone"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)
    embedding_model = AutoModel.from_pretrained(model_name)

    # Take the class order from the checkpoint itself; a hard-coded order can
    # silently attach the probabilities to the wrong sentiment names.
    sentiment_cols = ['negative', 'neutral', 'positive']
    id2label = sentiment_model.config.id2label
    labels = [str(id2label[i]).lower() for i in range(sentiment_model.config.num_labels)]
    if sorted(labels) != sorted(sentiment_cols):
        raise ValueError(
            f"Unexpected sentiment labels {labels} for {model_name}; expected {sentiment_cols}"
        )

    # Embedding width comes from the encoder config instead of a literal 768.
    emb_cols = [f"emb_{i}" for i in range(embedding_model.config.hidden_size)]
    reddit_cols = ['vote'] + sentiment_cols + emb_cols

    # Use the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sentiment_model.to(device)
    embedding_model.to(device)

    def process(text):
        """Return (label -> probability dict, CLS embedding vector) for one text."""
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)

        with torch.no_grad():
            # Sentiment probabilities for the single input row.
            sent_out = sentiment_model(**inputs)
            probs = F.softmax(sent_out.logits, dim=-1)[0].cpu().numpy()

            # Embedding: hidden state of the first ([CLS]) token.
            emb_out = embedding_model(**inputs)
            cls_vec = emb_out.last_hidden_state[0, 0, :].cpu().numpy()

        return dict(zip(labels, probs)), cls_vec

    # Title and body are scored together as one text.
    df_reddit["text"] = df_reddit["Title"].fillna("") + " " + df_reddit["Body"].fillna("")

    if df_reddit.empty:
        # No posts at all: keep the schema so the left merge below still
        # produces the Reddit columns (all NaN).
        final_df = pd.DataFrame(columns=["Date"] + reddit_cols)
    else:
        sentiment_list = []
        embedding_list = []
        for text in df_reddit["text"]:
            sentiment, embedding = process(text)
            sentiment_list.append(sentiment)
            embedding_list.append(embedding)

        # One row per post: posting date, score, class probabilities, embedding.
        features_df = pd.concat(
            [
                pd.Series(pd.to_datetime(df_reddit.index), name="Date"),
                df_reddit["Vote"].reset_index(drop=True).rename("vote"),
                pd.DataFrame(sentiment_list),
                pd.DataFrame(np.vstack(embedding_list), columns=emb_cols),
            ],
            axis=1,
        )

        # Collapse to one row per posting date.
        final_df = features_df.groupby("Date").agg({
            "vote": "sum",                               # total score of that day's posts
            **{col: "mean" for col in sentiment_cols},   # average class probabilities
            **{col: "mean" for col in emb_cols},         # average embedding
        }).reset_index()
    print(f'{firm}: Word Embedding & Sentiment Analysis completed')

    # Join on plain dates so both sides use the same key type.
    final_df["Date"] = pd.to_datetime(final_df["Date"]).dt.date
    merged_df['Date'] = pd.to_datetime(merged_df['Date']).dt.date

    df = merged_df.merge(final_df, on='Date', how='left')

    # Forward price changes; the shift counts rows, not calendar days.
    df["Target_1day"] = df["Close"].shift(-1) - df['Close']
    df["Target_1week"] = df["Close"].shift(-5) - df['Close']
    df["Target_1month"] = df["Close"].shift(-20) - df['Close']
    df["Target_1year"] = df["Close"].shift(-250) - df['Close']

    return df

In [None]:
# Tickers to collect; each one yields its own feature frame.
firms = ['AAPL', 'NVDA', 'MSFT', 'AMZN', 'GOOGL', 'META', 'TSLA', 'AVGO', 'ORCL', 'AMD']
dfs = []

for firm in firms:
    # Tag every row with its ticker so the frames stay distinguishable once stacked.
    df = collect_data(firm).assign(Ticker=firm)
    dfs += [df]

In [None]:
# Stack the per-ticker frames, order rows by date then ticker, and renumber them.
df = (
    pd.concat(dfs, axis=0, ignore_index=True)
    .sort_values(['Date', 'Ticker'])
    .reset_index(drop=True)
)
df