In [7]:
!pip install datasets
!pip install finnhub-python
!pip install yfinance
!pip install openai
!pip install utils

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [8]:
import os
import time
import json
import math
import torch
import yfinance as yf
import finnhub
import pandas as pd
from datetime import datetime
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForCausalLM

In [10]:
# Credentials are read from the environment so that they are never stored in
# the notebook itself. Export HF_TOKEN and FINNHUB_API_KEY before starting
# the kernel (or set them with `%env` in a cell that is not committed).
# NOTE(review): the token and key that used to be hardcoded in this cell have
# been exposed and should be revoked and reissued.
access_token = os.environ.get("HF_TOKEN", "")
finnhub_key = os.environ.get("FINNHUB_API_KEY", "")

_missing_credentials = [
    name
    for name, value in (("HF_TOKEN", access_token), ("FINNHUB_API_KEY", finnhub_key))
    if not value
]
if _missing_credentials:
    # Warn instead of raising so the remaining cells can still be defined;
    # calls that need the missing credential are expected to fail later.
    print(f"Warning: missing environment variable(s): {', '.join(_missing_credentials)}")

In [11]:
# Finnhub API client shared as a module-level global; fetch_company_news and
# fetch_basic_financials in this notebook call it directly.
finnhub_client = finnhub.Client(api_key=finnhub_key)

Fetch the basic financials and the company news, and prepare the data. DeepSeek is used to do the analysis.

In [12]:
def categorize_return(ret):
    """Turn a fractional return into a direction/magnitude label.

    The label starts with "U" when ``ret`` is zero or positive and with "D"
    otherwise. It is followed by the absolute return expressed in percent and
    rounded up to the next integer; anything above 5 is collapsed into "5+".

    Examples: 0.012 -> "U2", -0.034 -> "D4", 0.08 -> "U5+".
    """
    if ret >= 0:
        sign = "U"
    else:
        sign = "D"

    percent_bucket = math.ceil(abs(100 * ret))
    if percent_bucket > 5:
        return f"{sign}5+"
    return f"{sign}{percent_bucket}"


def fetch_stock_returns(symbol, start_date, end_date):
    """Download prices for ``symbol`` and build a table of weekly returns.

    Prices come from ``yf.download``; the "Adj Close" column is used when the
    download provides it, otherwise "Close". The series is sampled at pandas
    "W" (weekly) timestamps with forward fill, and each output row pairs one
    weekly sample with the next one.

    Args:
        symbol: Ticker passed to ``yf.download``.
        start_date: Start of the download range.
        end_date: End of the download range.

    Returns:
        DataFrame with columns "Start Date", "Start Price", "End Date",
        "End Price", "Weekly Returns" and "Bin Label" (the label produced by
        ``categorize_return``), indexed 0..n-1.

    Raises:
        ValueError: If the download returns no rows.
    """
    stock_data = yf.download(symbol, start=start_date, end=end_date)
    if stock_data.empty:
        raise ValueError(f" No stock data found for {symbol}")

    price_column = "Adj Close" if "Adj Close" in stock_data.columns else "Close"

    weekly_prices = stock_data[price_column].resample("W").ffill()
    weekly_returns = weekly_prices.pct_change()

    # Slice the returns positionally, exactly like the dates and prices, so
    # all five columns always have the same length. Dropping NaNs from the
    # return series first would shorten only that column whenever more than
    # the leading NaN is present, and the constructor would then fail.
    data = pd.DataFrame({
        "Start Date": weekly_prices.index[:-1],
        "Start Price": weekly_prices.iloc[:-1].values.ravel(),
        "End Date": weekly_prices.index[1:],
        "End Price": weekly_prices.iloc[1:].values.ravel(),
        "Weekly Returns": weekly_returns.iloc[1:].values.ravel()
    })

    # Weeks without a usable return cannot be labelled; remove them and keep
    # a 0..n-1 index because fetch_basic_financials uses the index label as a
    # row position.
    data = data.dropna(subset=["Weekly Returns"]).reset_index(drop=True)

    data["Bin Label"] = data["Weekly Returns"].apply(categorize_return)
    return data


def fetch_company_news(symbol, data):
    """Attach a JSON-encoded list of news items to every row of ``data``.

    For each row, the module-level ``finnhub_client`` is asked for company
    news between the row's "Start Date" and "End Date". Items whose
    "datetime" field is missing or not positive are skipped; the remaining
    ones are reduced to date/headline/summary, ordered by date and serialised
    with ``json.dumps``. If anything fails for a row, the error is printed
    and that row gets an empty JSON list.

    The function pauses one second before every request, writes the results
    to a new "News" column of ``data`` (modifying it in place) and returns
    the same DataFrame.
    """
    news_column = []

    for _, week in data.iterrows():
        window_from = week["Start Date"].strftime("%Y-%m-%d")
        window_to = week["End Date"].strftime("%Y-%m-%d")
        time.sleep(1)

        try:
            raw_items = finnhub_client.company_news(symbol, _from=window_from, to=window_to)

            articles = []
            for item in raw_items:
                if item.get("datetime", 0) > 0:
                    articles.append({
                        "date": datetime.fromtimestamp(item["datetime"]).strftime("%Y-%m-%d %H:%M:%S"),
                        "headline": item["headline"],
                        "summary": item["summary"]
                    })

            # The date strings are zero-padded, so text order is time order.
            week_json = json.dumps(sorted(articles, key=lambda article: article["date"]))
        except Exception as e:
            print(f" Error fetching news for {symbol}: {e}")
            week_json = json.dumps([])

        news_column.append(week_json)

    data["News"] = news_column
    return data


def fetch_basic_financials(symbol, data):
    """Attach JSON-encoded quarterly metrics to every row of ``data``.

    The quarterly series is requested once from the module-level
    ``finnhub_client``. For each row, a metric is kept when one of its
    records has a "period" that falls in the window starting at the
    "Start Date" of the row two positions earlier (clamped to the first row)
    and ending just before this row's "End Date". When several records of a
    metric match, the last one in the series wins.

    The per-row dictionaries are serialised with ``json.dumps`` into a new
    "Basics" column (``data`` is modified in place). If anything fails, the
    error is printed and every row receives an empty JSON object instead.
    Returns the same DataFrame.
    """
    try:
        response = finnhub_client.company_basic_financials(symbol, "all")
        quarterly_series = response.get("series", {}).get("quarterly", {})

        basics_column = []
        for idx, week in data.iterrows():
            window_end = week["End Date"].strftime("%Y-%m-%d")
            # The index label doubles as a row position here.
            window_start = data.iloc[max(idx - 2, 0)]["Start Date"].strftime("%Y-%m-%d")

            snapshot = {}
            for metric, records in quarterly_series.items():
                # Dates are compared as YYYY-MM-DD strings.
                in_window = [
                    record["v"]
                    for record in records
                    if window_start <= record["period"] < window_end
                ]
                if in_window:
                    snapshot[metric] = in_window[-1]

            basics_column.append(json.dumps(snapshot))

        data["Basics"] = basics_column
    except Exception as e:
        print(f" Error fetching financials for {symbol}: {e}")
        data["Basics"] = json.dumps({})

    return data

def prepare_stock_data(symbol, data_dir, start_date, end_date, include_basics=True):
    """Build the weekly dataset for ``symbol`` and save it as a CSV file.

    Runs fetch_stock_returns, then fetch_company_news and, when
    ``include_basics`` is true, fetch_basic_financials. The result is written
    to ``{data_dir}/{symbol}_{start_date}_{end_date}.csv`` without the index.

    Args:
        symbol: Ticker symbol to process.
        data_dir: Directory for the CSV file; created if it does not exist.
        start_date: Start of the price range, also used in the file name.
        end_date: End of the price range, also used in the file name.
        include_basics: Whether to add the "Basics" column.

    Returns:
        The assembled DataFrame.
    """
    # to_csv does not create missing directories. Create the target up front
    # so a bad path fails before the slow network fetching instead of after.
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)

    print(f" Fetching stock data for {symbol} from {start_date} to {end_date}...")
    stock_data = fetch_stock_returns(symbol, start_date, end_date)
    stock_data = fetch_company_news(symbol, stock_data)

    if include_basics:
        stock_data = fetch_basic_financials(symbol, stock_data)

    file_name = f"{data_dir}/{symbol}_{start_date}_{end_date}.csv"
    stock_data.to_csv(file_name, index=False)
    print(f" Data saved to {file_name}")

    return stock_data