In [1]:
import pandas as pd
import os
from tqdm import tqdm
import gc
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast
import kagglehub
from kagglehub import KaggleDatasetAdapter
from typing import Optional, Iterable
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cudf
import cupy as cp
import numpy as np
import pandas as pd
from typing import Optional, Iterable
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Construct relative path
finance_news = os.path.join(".", "FINANCIAL NEWS", "finance_data.csv")

# Fail loudly instead of calling exit(): inside a notebook exit() shuts the
# kernel down, discarding all state and hiding the real cause of the failure.
if not os.path.exists(finance_news):
    raise FileNotFoundError(f"File not found at: {os.path.abspath(finance_news)}")

# Load datasets
df = pd.read_csv(finance_news)
sp500_tickers = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "andrewmvd/sp-500-stocks",
    "sp500_companies.csv",
)

# Verify the schema BEFORE the merge: the merge itself needs 'Stock_symbol',
# so checking afterwards could never report a missing column cleanly.
required_columns = ['Date', 'Article_title', 'Stock_symbol', 'Article']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"Error: Missing required columns {missing_columns}. "
        f"Available columns: {list(df.columns)}"
    )

# Keep only articles about S&P 500 constituents. Symbols are de-duplicated so
# a repeated symbol in the ticker table cannot multiply article rows.
sp500_symbols = sp500_tickers[['Symbol']].drop_duplicates()
df = pd.merge(df, sp500_symbols, how='inner', left_on='Stock_symbol', right_on='Symbol')
df = df.drop(columns=['Symbol'])

# Prepare text data. Missing titles/bodies become empty strings (not the
# literal "nan"), and the result is stripped so that rows with no content are
# truly empty and can be recognised as such during inference.
article_title = df['Article_title'].fillna('').astype(str)
article_body = df['Article'].fillna('').astype(str)
df['text'] = (article_title + ' ' + article_body).str.strip()
df = df[['Date', 'Stock_symbol', 'text']]

# Load FinBERT model and tokenizer (model is moved to `device` and put in eval mode)
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert").to(device).eval()

# Custom Dataset that serves raw text; tokenization happens later, per batch
class TextDataset(Dataset):
    """Map-style dataset that serves raw strings from a pandas Series.

    No tokenization happens here: each item is the text itself plus a flag
    telling the consumer whether that text is blank.
    """

    def __init__(self, series, max_length=512):
        self.series = series
        # Stored on the instance; not consulted by __getitem__.
        self.max_length = max_length

    def __len__(self):
        return len(self.series)

    def __getitem__(self, idx):
        raw_value = self.series.iloc[idx]
        # Missing values are served as an empty string.
        text = "" if pd.isna(raw_value) else str(raw_value)
        is_blank = text.strip() == ""
        return {'text': text, 'is_empty': is_blank}

# Batch inference function with mixed precision
def batch_sentiment_scores(series, tokenizer, batch_size=32, chunk_size=100000, use_amp=True):
    """Run the sentiment model over ``series`` in chunks and batches.

    Relies on the module-level ``model`` and ``device`` created when the
    model was loaded.

    Args:
        series: pandas Series of texts (sliced positionally with ``.iloc``).
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of texts per forward pass.
        chunk_size: Number of texts wrapped in one Dataset/DataLoader.
        use_amp: When True and CUDA is available, the forward pass runs under
            ``torch.autocast`` (mixed precision). Pass False for full float32.

    Returns:
        ``(scores, sentiments)``: two lists with one entry per input text.
        ``scores`` holds the highest class probability (NaN for blank texts);
        ``sentiments`` holds the matching label ("neutral" for blank texts).
    """
    scores, sentiments = [], []
    # NOTE(review): this order is assumed to match model.config.id2label — confirm.
    labels = ['positive', 'negative', 'neutral']
    is_cuda = torch.cuda.is_available()
    amp_enabled = bool(use_amp) and is_cuda

    for start in tqdm(range(0, len(series), chunk_size), desc="Processing chunks"):
        # .iloc keeps the slice positional whatever the Series index looks like.
        dataset = TextDataset(series.iloc[start:start + chunk_size])
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        with torch.no_grad():
            for batch in tqdm(dataloader, desc=f"Processing batches (chunk {start//chunk_size + 1})"):
                inputs = tokenizer(
                    batch['text'],
                    return_tensors="pt",
                    max_length=512,
                    truncation=True,
                    padding='longest'
                ).to(device)

                # Mixed precision is only enabled on CUDA; elsewhere this
                # context manager is a no-op.
                with torch.autocast(device_type=device.type, enabled=amp_enabled):
                    outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

                # Softmax in float32 so probabilities are not computed in half precision.
                probs = torch.nn.functional.softmax(outputs.logits.float(), dim=-1).cpu().numpy()
                batch_scores = np.max(probs, axis=1)
                batch_sentiments = [labels[np.argmax(p)] for p in probs]

                # Blank texts get no score and a neutral label.
                for i, empty in enumerate(batch['is_empty']):
                    if empty:
                        batch_scores[i] = np.nan
                        batch_sentiments[i] = "neutral"

                scores.extend(batch_scores)
                sentiments.extend(batch_sentiments)

                if is_cuda:
                    torch.cuda.empty_cache()

        gc.collect()

    return scores, sentiments

# Score every article text; returns one score and one label per row of df
scores, sentiments = batch_sentiment_scores(
    series=df['text'],
    tokenizer=tokenizer,
    batch_size=1024,
)

Using device: cuda


  df = pd.read_csv(finance_news)
Processing batches (chunk 1):  42%|████▏     | 41/98 [00:39<00:54,  1.05it/s]
Processing chunks:   0%|          | 0/7 [00:39<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Assign results by position. Wrapping the lists in pd.Series would align on
# the index instead, silently producing NaN for any row whose label is not in
# 0..len-1 and hiding a length mismatch (e.g. after an interrupted run).
if len(scores) != len(df) or len(sentiments) != len(df):
    raise ValueError(
        f"Expected {len(df)} sentiment results, got {len(scores)} scores "
        f"and {len(sentiments)} labels"
    )
df['sentiment_score'] = np.asarray(scores, dtype="float32")
df['sentiment'] = pd.Categorical(sentiments)

# Signed score: label direction (+1 / 0 / -1) times the winning class probability
df['sentiment_numeric'] = df['sentiment'].map({'positive': 1, 'neutral': 0, 'negative': -1}).astype('int64')
df['adjusted_sentiment_score'] = df['sentiment_numeric'] * df['sentiment_score']
df['date_only'] = pd.to_datetime(df['Date']).dt.date

# Aggregate to one mean score per (ticker, calendar day)
df = df[["date_only", "Stock_symbol", "adjusted_sentiment_score"]].groupby(
    ['Stock_symbol', 'date_only']
).agg({'adjusted_sentiment_score': 'mean'}).reset_index()

# Save and print results
df.to_csv('output.csv', index=False)
print("Final DataFrame:")
print(df)
start_date = df['date_only'].min()
end_date = df['date_only'].max()
# Clean up: the aggregated frame is persisted in output.csv
del df
gc.collect()

In [None]:
# Reload the aggregated sentiment table and record the first/last day it covers
df = pd.read_csv('output.csv')
start_date, end_date = df['date_only'].min(), df['date_only'].max()

In [None]:
sp500_tickers = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "andrewmvd/sp-500-stocks",
    "sp500_companies.csv",
)
# Unique, non-null constituent symbols as a plain list for yfinance
tickers = sp500_tickers['Symbol'].dropna().unique().tolist()

# Download historical data for all tickers
# 'group_by' keeps data per ticker in separate subframes
data = yf.download(
    tickers=tickers,
    start="2009-01-01",
    end="2020-12-31",
    group_by='ticker',
    auto_adjust=False,
    threads=True
)

# Stop here rather than writing an empty CSV that later cells would load.
if data.empty:
    raise ValueError("Failed to download data from yfinance. The API might be temporarily unavailable or the tickers are invalid.")

# Flatten the MultiIndex DataFrame into long form. The index levels are named
# explicitly so the result always has 'Date' and 'Ticker' columns, instead of
# depending on the auto-generated 'level_1' name that only appears when the
# column levels are unnamed.
sp500_history = data.stack(level=0).rename_axis(['Date', 'Ticker']).reset_index()
sp500_history.columns.name = None  # remove any column name metadata

# Save to CSV
sp500_history.to_csv("sp500_yfinance_data.csv", index=False)

In [3]:
import cudf
import cupy as cp
import numpy as np
import pandas as pd
from typing import Optional, Iterable
import matplotlib.pyplot as plt
import os
import kagglehub
import yfinance as yf

# ---------------------------
# Basic moving averages
# ---------------------------
# Small positive constant added to denominators (see money_flow_index and adx)
# so that a zero denominator does not produce inf/NaN.
EPSILON = 1e-10
def sma(series: cudf.Series, window: int) -> cudf.Series:
    """Simple moving average of ``series`` over ``window`` rows.

    With ``min_periods=1`` the leading rows are averaged over however many
    observations are available so far.
    """
    rolling_view = series.rolling(window=window, min_periods=1)
    return rolling_view.mean()

def wma(series: cudf.Series, window: int) -> cudf.Series:
    """
    Linearly weighted moving average.

    Within each rolling window the oldest value gets weight 1 and the newest
    gets weight ``len(window)``; the weighted sum is divided by the sum of
    the weights. The kernel is passed to cudf's ``rolling(...).apply``.
    """
    def weighted_mean_kernel(x):
        # Kept to plain loops and scalar arithmetic, as in the file's other
        # rolling kernel.
        n = len(x)
        if n == 0:
            return np.nan

        weighted_total = 0.0
        weight = 1
        for value in x:
            weighted_total += value * weight
            weight += 1

        # 1 + 2 + ... + n
        return weighted_total / ((n * (n + 1)) / 2.0)

    rolling_windows = series.rolling(window=window, min_periods=1)
    return rolling_windows.apply(weighted_mean_kernel)

def ema(series: cudf.Series, window: int) -> cudf.Series:
    """Exponential moving average with ``span=window`` and ``adjust=False``."""
    smoother = series.ewm(span=window, adjust=False)
    return smoother.mean()

# ---------------------------
# Momentum / Oscillators
# ---------------------------
def rsi(series: cudf.Series, window: int = 14) -> cudf.Series:
    """Relative Strength Index.

    Gains and losses are taken from the one-step difference and each is
    smoothed exponentially with ``alpha = 1 / window`` before the ratio is
    mapped onto the 0-100 scale.
    """
    change = series.diff()
    gains = change.clip(lower=0)
    losses = -change.clip(upper=0)
    avg_gain = gains.ewm(alpha=1/window, adjust=False).mean()
    avg_loss = losses.ewm(alpha=1/window, adjust=False).mean()
    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

def macd(series: cudf.Series, fast: int = 12, slow: int = 26, signal: int = 9):
    """MACD line, its signal line, and the histogram (line minus signal).

    Returns:
        Tuple ``(macd_line, signal_line, hist)``.
    """
    macd_line = ema(series, fast) - ema(series, slow)
    # The signal line is the same span-based EMA applied to the MACD line.
    signal_line = ema(macd_line, signal)
    return macd_line, signal_line, macd_line - signal_line

def stochastic_oscillator(high: cudf.Series, low: cudf.Series, close: cudf.Series, k_window: int = 14, d_window: int = 3):
    """Stochastic oscillator.

    %K places the close within the rolling high/low range over ``k_window``
    rows; %D is the rolling mean of %K over ``d_window`` rows.

    Returns:
        Tuple ``(percent_k, percent_d)``.
    """
    window_high = high.rolling(window=k_window, min_periods=1).max()
    window_low = low.rolling(window=k_window, min_periods=1).min()
    price_range = window_high - window_low
    percent_k = 100 * (close - window_low) / price_range
    percent_d = percent_k.rolling(window=d_window, min_periods=1).mean()
    return percent_k, percent_d

def williams_r(high: cudf.Series, low: cudf.Series, close: cudf.Series, window: int = 14) -> cudf.Series:
    """Williams %R: distance of the close below the rolling high, scaled by
    the rolling high/low range and multiplied by -100."""
    window_low = low.rolling(window=window, min_periods=1).min()
    window_high = high.rolling(window=window, min_periods=1).max()
    return -100 * (window_high - close) / (window_high - window_low)

def roc(series: cudf.Series, window: int = 12) -> cudf.Series:
    """Rate of change: percentage change versus ``window`` rows earlier."""
    fractional_change = series.pct_change(window)
    return fractional_change * 100

# ---------------------------
# Volatility / Bands / Ranges
# ---------------------------
def bollinger_bands(series: cudf.Series, window: int = 20, n_std: float = 2.0):
    """Bollinger Bands around the simple moving average.

    Returns:
        Tuple ``(mid, upper, lower, bandwidth, percent_b)`` where bandwidth is
        the band width relative to ``mid`` and percent_b is the position of
        ``series`` between the lower and upper band.
    """
    mid = sma(series, window)
    rolling_std = series.rolling(window=window, min_periods=1).std()
    offset = n_std * rolling_std
    upper = mid + offset
    lower = mid - offset
    band_width = upper - lower
    return mid, upper, lower, band_width / mid, (series - lower) / band_width

def atr(high: cudf.Series, low: cudf.Series, close: cudf.Series, window: int = 14) -> cudf.Series:
    """Average True Range.

    True range is the row-wise maximum of high-low, |high - previous close|
    and |low - previous close|; it is then smoothed exponentially with
    ``alpha = 1 / window``.
    """
    prev_close = close.shift(1)
    candidates = cudf.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    )
    true_range = candidates.max(axis=1)
    return true_range.ewm(alpha=1/window, adjust=False).mean()

def keltner_channels(high: cudf.Series, low: cudf.Series, close: cudf.Series, ema_window: int = 20, atr_window: int = 10, multiplier: float = 2.0):
    """Keltner Channels: EMA of the close, plus/minus ``multiplier`` x ATR.

    Returns:
        Tuple ``(mid, upper, lower)``.
    """
    mid = ema(close, ema_window)
    band_offset = multiplier * atr(high, low, close, atr_window)
    return mid, mid + band_offset, mid - band_offset

def donchian_channel(high: cudf.Series, low: cudf.Series, window: int = 20):
    """Donchian Channel: rolling highest high, rolling lowest low and their midpoint.

    Returns:
        Tuple ``(mid, upper, lower)``.
    """
    lower = low.rolling(window=window, min_periods=1).min()
    upper = high.rolling(window=window, min_periods=1).max()
    return (upper + lower) / 2, upper, lower

# ---------------------------
# Volume-based indicators
# ---------------------------

def obv(close: cudf.Series, volume: cudf.Series) -> cudf.Series:
    """On-Balance Volume: running total of volume signed by the direction of
    the close-to-close change."""
    # The first row has no predecessor; treat it as "no change".
    change = close.diff().fillna(0)

    # Wrap the sign result in a cudf Series carrying close's index so the
    # multiplication below lines up with `volume`.
    direction = cudf.Series(cp.sign(change), index=close.index)

    signed_volume = (direction * volume).fillna(0)
    return signed_volume.cumsum()

def chaikin_adi(high: cudf.Series, low: cudf.Series, close: cudf.Series, volume: cudf.Series) -> cudf.Series:
    """Chaikin Accumulation/Distribution line: cumulative sum of the money
    flow multiplier times volume."""
    numerator = (close - low) - (high - close)
    multiplier = numerator / (high - low)
    # Rows where the division is undefined contribute nothing to the sum.
    multiplier = multiplier.replace([cp.inf, -cp.inf], 0).fillna(0)
    return (multiplier * volume).cumsum()

def money_flow_index(high: cudf.Series, low: cudf.Series, close: cudf.Series, volume: cudf.Series, window: int = 14) -> cudf.Series:
    """Money Flow Index.

    Raw money flow (typical price x volume) is split into rows where the
    typical price rose and rows where it fell; each side is summed over the
    rolling window and the positive share is scaled to 0-100.
    """
    typical = (high + low + close) / 3
    prev_typical = typical.shift(1)
    raw_flow = typical * volume
    inflow = raw_flow.where(typical > prev_typical, 0.0).rolling(window=window, min_periods=1).sum()
    outflow = raw_flow.where(typical < prev_typical, 0.0).rolling(window=window, min_periods=1).sum()
    # EPSILON keeps the denominator non-zero when there is no flow at all.
    return 100 * (inflow / (inflow + outflow + EPSILON))

def force_index(close: cudf.Series, volume: cudf.Series, window: int = 13) -> cudf.Series:
    """Force Index: close-to-close change times volume, smoothed with a
    span-based EMA."""
    raw_force = close.diff() * volume
    smoother = raw_force.ewm(span=window, adjust=False)
    return smoother.mean()

def vwap(df: cudf.DataFrame, window: Optional[int] = None) -> cudf.Series:
    """Volume-weighted average of the typical price.

    Args:
        df: Frame with 'High', 'Low', 'Close' and 'Volume' columns.
        window: Rolling window length; when None the average is cumulative
            over all rows of ``df``.
    """
    typical = (df['High'] + df['Low'] + df['Close']) / 3
    traded_value = typical * df['Volume']
    if window is not None:
        return traded_value.rolling(window).sum() / df['Volume'].rolling(window).sum()
    return traded_value.cumsum() / df['Volume'].cumsum()

# ---------------------------
# Trend / Directional Movement (ADX)
# ---------------------------
def _dm_plus(high: cudf.Series, low: cudf.Series) -> cudf.Series:
    """Positive directional movement: the rise in the high where it is
    positive and larger than the drop in the low, otherwise 0."""
    rise = high.diff()
    fall = -low.diff()
    qualifies = (rise > fall) & (rise > 0)
    return rise.where(qualifies, 0.0)

def _dm_minus(high: cudf.Series, low: cudf.Series) -> cudf.Series:
    """Negative directional movement: the drop in the low where it is
    positive and larger than the rise in the high, otherwise 0."""
    rise = high.diff()
    fall = -low.diff()
    qualifies = (fall > rise) & (fall > 0)
    return fall.where(qualifies, 0.0)

def adx(high: cudf.Series, low: cudf.Series, close: cudf.Series, window: int = 14) -> cudf.Series:
    """Average Directional Index.

    True range and both directional movements are smoothed exponentially
    with ``alpha = 1 / window``, turned into the +DI / -DI pair, combined
    into DX, and DX is smoothed once more. EPSILON guards each denominator.
    """
    prev_close = close.shift(1)
    true_range = cudf.concat(
        [(high - low).abs(), (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)
    smoothed_tr = true_range.ewm(alpha=1/window, adjust=False).mean()

    plus_dm = _dm_plus(high, low).ewm(alpha=1/window, adjust=False).mean()
    minus_dm = _dm_minus(high, low).ewm(alpha=1/window, adjust=False).mean()

    plus_di = 100 * (plus_dm / (smoothed_tr + EPSILON))
    minus_di = 100 * (minus_dm / (smoothed_tr + EPSILON))

    directional_index = 100 * (plus_di - minus_di).abs() / ((plus_di + minus_di) + EPSILON)

    return directional_index.ewm(alpha=1/window, adjust=False).mean()

# ---------------------------
# Statistical / Composite
# ---------------------------
def _mad_kernel(x):
    """
    Numba-compatible kernel to calculate Mean Absolute Deviation.
    This will be JIT-compiled for the GPU by cudf.
    """
    # Guard against empty windows
    if len(x) == 0:
        return np.nan
        
    # Step 1: Calculate the mean of the current window using a simple loop
    mean_x = 0.0
    for val in x:
        mean_x += val
    mean_x /= len(x)
    
    # Step 2: Calculate the mean of the absolute deviations from the mean
    mad_val = 0.0
    for val in x:
        mad_val += abs(val - mean_x) # Numba can compile python's built-in abs()
    mad_val /= len(x)
    
    return mad_val

def cci(high: cudf.Series, low: cudf.Series, close: cudf.Series, window: int = 20) -> cudf.Series:
    """Commodity Channel Index: deviation of the typical price from its
    rolling mean, divided by 0.015 times the rolling mean absolute deviation."""
    typical = (high + low + close) / 3
    rolling_mean = typical.rolling(window=window, min_periods=1).mean()

    # Mean absolute deviation per window comes from the _mad_kernel UDF.
    mean_abs_dev = typical.rolling(window=window, min_periods=1).apply(_mad_kernel)

    # The constant 0.015 is used to scale the CCI to a common range
    return (typical - rolling_mean) / (0.015 * mean_abs_dev)

# ---------------------------
# Parabolic SAR (loop implementation, CPU fallback)
# ---------------------------
def parabolic_sar(high: cudf.Series, low: cudf.Series, close: cudf.Series, af_start: float = 0.02, af_step: float = 0.02, af_max: float = 0.2) -> cudf.Series:
    """
    Parabolic SAR, computed with a sequential loop on the host.

    Each value depends on the previous one, so the highs and lows are copied
    to host memory and iterated in Python.

    Args:
        high, low: Price series; the result carries ``high``'s index.
        close: Part of the signature but not used by the calculation.
        af_start: Acceleration factor used at the start and after a reversal.
        af_step: Increment applied whenever a new extreme point is set.
        af_max: Upper bound for the acceleration factor.

    Returns:
        cudf Series of SAR values (an empty float Series for empty input).
    """
    # Only highs and lows drive the recursion, so `close` is not copied
    # off the device.
    highs = high.to_pandas().values
    lows = low.to_pandas().values
    length = len(highs)
    if length == 0:
        return cudf.Series(dtype=float)

    sar = np.zeros(length)
    bull = True          # start by assuming an up-trend
    af = af_start
    ep = highs[0]        # extreme point of the current trend
    sar[0] = lows[0]

    for i in range(1, length):
        prev = sar[i - 1]
        if bull:
            sar[i] = prev + af * (ep - prev)
            # SAR may not rise above the lows of the previous two bars.
            sar[i] = min(sar[i], lows[i-1], lows[i-2] if i >= 2 else lows[i-1])
            if lows[i] < sar[i]:
                # Price fell through the SAR: flip to a down-trend.
                bull = False
                sar[i] = ep
                ep = lows[i]
                af = af_start
        else:
            sar[i] = prev + af * (ep - prev)
            # SAR may not drop below the highs of the previous two bars.
            sar[i] = max(sar[i], highs[i-1], highs[i-2] if i >= 2 else highs[i-1])
            if highs[i] > sar[i]:
                # Price rose through the SAR: flip to an up-trend.
                bull = True
                sar[i] = ep
                ep = highs[i]
                af = af_start
        # A new extreme in the trend direction accelerates the SAR.
        if bull:
            if highs[i] > ep:
                ep = highs[i]
                af = min(af + af_step, af_max)
        else:
            if lows[i] < ep:
                ep = lows[i]
                af = min(af + af_step, af_max)

    return cudf.Series(sar, index=high.index)

# ---------------------------
# Ichimoku Cloud
# ---------------------------
def ichimoku(high: cudf.Series, low: cudf.Series, close: cudf.Series,
             tenkan: int = 9, kijun: int = 26, senkou_b: int = 52, shift: int = 26):
    """Ichimoku Cloud components.

    Returns:
        Tuple ``(conv, base, span_a, span_b, lagging)``. Both spans are
        shifted forward by ``shift`` rows. ``lagging`` is the close shifted
        backward by ``shift`` rows, so each row holds a later close.
    """
    def midpoint(period):
        # Midpoint of the rolling high/low range over `period` rows.
        return (high.rolling(window=period).max() + low.rolling(window=period).min()) / 2

    conv = midpoint(tenkan)
    base = midpoint(kijun)
    span_a = ((conv + base) / 2).shift(shift)
    span_b = midpoint(senkou_b).shift(shift)
    lagging = close.shift(-shift)
    return conv, base, span_a, span_b, lagging

# ---------------------------
# Helper: add indicators for all tickers
# ---------------------------
def add_all_indicators(df: cudf.DataFrame, additional: Optional[Iterable[str]] = None, prefix: str = "", verbose: bool = True) -> cudf.DataFrame:
    """
    Compute and append the full indicator set for every ticker in ``df``.

    Rows are sorted by (Ticker, Date) and each ticker is processed on its own
    through ``groupby('Ticker').apply``.

    Args:
        df: Frame containing at least Ticker, Date, High, Low, Close, Volume.
            It is copied first, so the caller's frame is left untouched.
        additional: Accepted but currently not used.
        prefix: String prepended to every generated column name.
        verbose: When True (the default) a progress line is printed for every
            ticker and every indicator. Pass False to silence the output,
            which otherwise runs to about 25 lines per ticker.

    Returns:
        The input columns plus the indicator columns, with +/-inf replaced by NaN.

    Raises:
        ValueError: If a required column is missing.
    """
    required = {"Ticker", "Date", "High", "Low", "Close", "Volume"}
    if not required.issubset(df.columns):
        raise ValueError(f"DataFrame must contain columns: {required}")
    df = df.copy()
    df['Date'] = cudf.to_datetime(df['Date'])
    df = df.sort_values(['Ticker', 'Date'])

    def log(message: str) -> None:
        # Single switch for all progress output.
        if verbose:
            print(message)

    def apply_indicators(group: cudf.DataFrame) -> cudf.DataFrame:
        high = group["High"]
        low = group["Low"]
        close = group["Close"]
        volume = group["Volume"]

        def put(name, values):
            # Column order follows the order of these calls.
            group[f"{prefix}{name}"] = values

        log(f"Processing ticker: {group['Ticker'].iloc[0]}")

        for span in (20, 50, 200):
            log(f"Computing SMA_{span}...")
            put(f"SMA_{span}", sma(close, span))
        for span in (12, 26):
            log(f"Computing EMA_{span}...")
            put(f"EMA_{span}", ema(close, span))
        log("Computing WMA_50...")
        put("WMA_50", wma(close, 50))

        log("Computing RSI_14...")
        put("RSI_14", rsi(close, 14))
        log("Computing MACD...")
        macd_line, macd_signal, macd_hist = macd(close)
        put("MACD", macd_line)
        put("MACD_Signal", macd_signal)
        put("MACD_Hist", macd_hist)
        log("Computing Stochastic Oscillator...")
        stoch_k, stoch_d = stochastic_oscillator(high, low, close)
        put("Stoch_%K", stoch_k)
        put("Stoch_%D", stoch_d)
        log("Computing Williams %R...")
        put("WilliamsR_14", williams_r(high, low, close, 14))
        log("Computing ROC_12...")
        put("ROC_12", roc(close, 12))

        log("Computing Bollinger Bands...")
        bb_mid, bb_up, bb_low, bb_bw, bb_pctb = bollinger_bands(close, 20, 2)
        put("BB_Mid", bb_mid)
        put("BB_Upper", bb_up)
        put("BB_Lower", bb_low)
        put("BB_Bandwidth", bb_bw)
        put("BB_pctB", bb_pctb)
        log("Computing ATR_14...")
        put("ATR_14", atr(high, low, close, 14))
        log("Computing Keltner Channels...")
        k_mid, k_up, k_low = keltner_channels(high, low, close, 20, 10, 2.0)
        put("KC_Mid", k_mid)
        put("KC_Upper", k_up)
        put("KC_Lower", k_low)
        log("Computing Donchian Channels...")
        d_mid, d_up, d_low = donchian_channel(high, low, 20)
        put("Donchian_Mid", d_mid)
        put("Donchian_Upper", d_up)
        put("Donchian_Lower", d_low)

        log("Computing OBV...")
        put("OBV", obv(close, volume))
        log("Computing Chaikin AD...")
        put("Chaikin_AD", chaikin_adi(high, low, close, volume))
        log("Computing MFI_14...")
        put("MFI_14", money_flow_index(high, low, close, volume, 14))
        log("Computing Force Index...")
        put("ForceIndex_13", force_index(close, volume, 13))
        log("Computing VWAP...")
        put("VWAP", vwap(group))

        log("Computing ADX_14...")
        put("ADX_14", adx(high, low, close, 14))
        log("Computing CCI_20...")
        put("CCI_20", cci(high, low, close, 20))
        log("Computing Parabolic SAR...")
        put("Parabolic_SAR", parabolic_sar(high, low, close))
        log("Computing Ichimoku Cloud...")
        conv, base, span_a, span_b, lag = ichimoku(high, low, close)
        put("Ichimoku_Conv", conv)
        put("Ichimoku_Base", base)
        put("Ichimoku_SpanA", span_a)
        put("Ichimoku_SpanB", span_b)
        put("Ichimoku_Lagging", lag)

        log(f"Finished processing ticker: {group['Ticker'].iloc[0]}")
        return group

    result = df.groupby('Ticker').apply(apply_indicators)
    # Divisions by zero inside the indicators surface as +/-inf; report them as missing.
    result = result.replace([cp.inf, -cp.inf], cp.nan)
    return result

def plot_basic_with_indicators(df: cudf.DataFrame, ticker: str):
    """
    Plot close price with SMA 50 / WMA 50 and, below it, RSI 14 for one ticker.

    The rows for ``ticker`` are converted to pandas, drawn on a two-panel
    figure and saved as ``<ticker>_plot.png``. If the ticker has no rows a
    message is printed and nothing is drawn.
    """
    print(f"\n--- Plotting results for {ticker} ---")
    ticker_df = df[df['Ticker'] == ticker].to_pandas()
    if ticker_df.empty:
        print(f"No data found for ticker {ticker} to plot.")
        return

    fig, (price_ax, rsi_ax) = plt.subplots(2, 1, figsize=(15, 10))
    dates = ticker_df['Date']

    # Top panel: price and moving averages
    price_ax.plot(dates, ticker_df['Close'], label='Close')
    price_ax.plot(dates, ticker_df['SMA_50'], label='SMA 50')
    price_ax.plot(dates, ticker_df['WMA_50'], label='WMA 50', linestyle='--')
    price_ax.set_title(f'{ticker} Close Price and Moving Averages')
    price_ax.legend()

    # Bottom panel: RSI with reference lines at 70 and 30
    rsi_ax.plot(dates, ticker_df['RSI_14'], label='RSI 14')
    rsi_ax.axhline(70, linestyle='--', color='r', alpha=0.5)
    rsi_ax.axhline(30, linestyle='--', color='g', alpha=0.5)
    rsi_ax.set_title(f'{ticker} RSI')
    rsi_ax.legend()

    fig.tight_layout()
    fig.savefig(f"{ticker}_plot.png")
    print(f"Plot saved to {ticker}_plot.png")


if __name__ == "__main__":
    # Use the cached price history when it exists; otherwise download it once.
    data_file = "sp500_yfinance_data.csv"
    if os.path.exists(data_file):
        print(f"Loading existing data from {data_file}...")
        # Reading with pandas first is often more robust for varied CSVs
        sp500_history_pd = pd.read_csv(data_file)
        sp500_history = cudf.from_pandas(sp500_history_pd)
    else:
        print("Downloading S&P 500 tickers and historical data...")
        # Load S&P 500 tickers from the specified Kaggle dataset
        sp500_tickers_path = kagglehub.dataset_download("andrewmvd/sp-500-stocks")
        sp500_companies_file = os.path.join(sp500_tickers_path, "sp500_companies.csv")
        sp500_tickers = pd.read_csv(sp500_companies_file)

        tickers = sp500_tickers['Symbol'].dropna().unique().tolist()

        print(f"Downloading data for {len(tickers)} tickers...")
        # Download historical data
        data = yf.download(
            tickers=tickers,
            start="2019-01-01", # Using a shorter period for demonstration
            end="2020-12-31",
            group_by='ticker',
            auto_adjust=False,
            threads=True
        )

        if data.empty:
             raise ValueError("Failed to download data from yfinance. The API might be temporarily unavailable or the tickers are invalid.")

        # Flatten the MultiIndex DataFrame. Naming the index levels explicitly
        # guarantees 'Date' and 'Ticker' columns instead of relying on the
        # auto-generated 'level_1' name.
        sp500_history = data.stack(level=0).rename_axis(['Date', 'Ticker']).reset_index()
        sp500_history.columns.name = None
        # Save to CSV
        sp500_history.to_csv(data_file, index=False)
        # Convert to cuDF
        sp500_history = cudf.from_pandas(sp500_history)

    # Filter out empty or problematic data groups before applying indicators
    sp500_history = sp500_history.dropna(subset=['High', 'Low', 'Close', 'Volume'])

    print("Computing indicators for all tickers...")
    result_df = add_all_indicators(sp500_history)

    print("\n--- Final DataFrame Tail ---")
    cols = [c for c in result_df.columns if any(k in c for k in ["Ticker", "Date", "Close", "SMA_20", "WMA_50", "RSI_14", "MACD", "BB_Upper", "ATR_14", "ADX_14", "VWAP"])]
    print(result_df[cols].tail(5).to_pandas())

    # Example plot for a specific ticker.
    # `in` on a pandas Series tests the index labels, not the values, so the
    # unique tickers are materialised as a set before the membership test.
    available_tickers = set(result_df['Ticker'].unique().to_pandas())
    if 'AAPL' in available_tickers:
        plot_basic_with_indicators(result_df, "AAPL")

Loading existing data from sp500_yfinance_data.csv...
Computing indicators for all tickers...




Processing ticker: A
Computing SMA_20...
Computing SMA_50...
Computing SMA_200...
Computing EMA_12...
Computing EMA_26...
Computing WMA_50...
Computing RSI_14...
Computing MACD...
Computing Stochastic Oscillator...
Computing Williams %R...
Computing ROC_12...
Computing Bollinger Bands...
Computing ATR_14...
Computing Keltner Channels...
Computing Donchian Channels...
Computing OBV...
Computing Chaikin AD...
Computing MFI_14...
Computing Force Index...
Computing VWAP...
Computing ADX_14...
Computing CCI_20...
Computing Parabolic SAR...
Computing Ichimoku Cloud...
Finished processing ticker: A
Processing ticker: AAPL
Computing SMA_20...
Computing SMA_50...
Computing SMA_200...
Computing EMA_12...
Computing EMA_26...
Computing WMA_50...
Computing RSI_14...
Computing MACD...
Computing Stochastic Oscillator...
Computing Williams %R...
Computing ROC_12...
Computing Bollinger Bands...
Computing ATR_14...
Computing Keltner Channels...
Computing Donchian Channels...
Computing OBV...
Computing C

In [5]:
# Persist the indicator table to result.csv.
# NOTE(review): index=False is not passed here, unlike the other to_csv calls
# in this notebook, so the index is presumably written too — confirm intended.
result_df.to_csv("result.csv")

In [7]:
# Display the column names of the indicator table
result_df.columns

Index(['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume',
       'SMA_20', 'SMA_50', 'SMA_200', 'EMA_12', 'EMA_26', 'WMA_50', 'RSI_14',
       'MACD', 'MACD_Signal', 'MACD_Hist', 'Stoch_%K', 'Stoch_%D',
       'WilliamsR_14', 'ROC_12', 'BB_Mid', 'BB_Upper', 'BB_Lower',
       'BB_Bandwidth', 'BB_pctB', 'ATR_14', 'KC_Mid', 'KC_Upper', 'KC_Lower',
       'Donchian_Mid', 'Donchian_Upper', 'Donchian_Lower', 'OBV', 'Chaikin_AD',
       'MFI_14', 'ForceIndex_13', 'VWAP', 'ADX_14', 'CCI_20', 'Parabolic_SAR',
       'Ichimoku_Conv', 'Ichimoku_Base', 'Ichimoku_SpanA', 'Ichimoku_SpanB',
       'Ichimoku_Lagging'],
      dtype='object')

In [None]:
if __name__ == "__main__":
    # Use the cached price history when it exists; otherwise download it once.
    data_file = "sp500_yfinance_data.csv"
    if os.path.exists(data_file):
        print(f"Loading existing data from {data_file}...")
        sp500_history = cudf.from_pandas(pd.read_csv(data_file))
    else:
        print("Downloading S&P 500 tickers and historical data...")
        # Load S&P 500 tickers. The adapter enum is reached through the
        # package as KaggleDatasetAdapter.PANDAS, matching the other cells;
        # kagglehub.PANDAS is not an attribute of the package.
        sp500_tickers = kagglehub.dataset_load(
            kagglehub.KaggleDatasetAdapter.PANDAS,
            "andrewmvd/sp-500-stocks",
            "sp500_companies.csv",
        )
        tickers = sp500_tickers['Symbol'].dropna().unique().tolist()
        # Download historical data
        data = yf.download(
            tickers=tickers,
            start="2009-01-01",
            end="2020-12-31",
            group_by='ticker',
            auto_adjust=False,
            threads=True
        )
        # Stop here rather than caching an empty CSV.
        if data.empty:
            raise ValueError("Failed to download data from yfinance. The API might be temporarily unavailable or the tickers are invalid.")
        # Flatten the MultiIndex DataFrame. Naming the index levels explicitly
        # guarantees 'Date' and 'Ticker' columns instead of relying on the
        # auto-generated 'level_1' name.
        sp500_history = data.stack(level=0).rename_axis(['Date', 'Ticker']).reset_index()
        sp500_history.columns.name = None
        # Save to CSV
        sp500_history.to_csv(data_file, index=False)
        # Convert to cuDF
        sp500_history = cudf.from_pandas(sp500_history)

    # Drop rows with missing prices/volume before computing indicators, as the
    # other indicator cell does.
    sp500_history = sp500_history.dropna(subset=['High', 'Low', 'Close', 'Volume'])

    print("Computing indicators for all tickers...")
    result_df = add_all_indicators(sp500_history)
    cols = [c for c in result_df.columns if any(k in c for k in ["Close", "SMA_20", "RSI_14", "MACD", "BB_Upper", "ATR_14", "ADX_14", "VWAP"])]
    print(result_df[cols].tail(5))
    plot_basic_with_indicators(result_df, "AAPL")

Loading existing data from sp500_yfinance_data.csv...
Computing indicators for all tickers...
Processing ticker: A
Computing SMA_20...
Computing SMA_50...
Computing SMA_200...
Computing EMA_12...
Computing EMA_26...
Computing WMA_50...
Computing RSI_14...
Computing MACD...
Computing Stochastic Oscillator...
Computing Williams %R...
Computing ROC_12...
Computing Bollinger Bands...
Computing ATR_14...
Computing Keltner Channels...
Computing Donchian Channels...
Computing OBV...
Computing Chaikin AD...
Computing MFI_14...
Computing Force Index...
Computing VWAP...
Computing ADX_14...
Computing CCI_20...
Computing Parabolic SAR...
Computing Ichimoku Cloud...
Finished processing ticker: A
Processing ticker: AAPL
Computing SMA_20...
Computing SMA_50...
Computing SMA_200...




Computing EMA_12...
Computing EMA_26...
Computing WMA_50...
Computing RSI_14...
Computing MACD...
Computing Stochastic Oscillator...
Computing Williams %R...
Computing ROC_12...
Computing Bollinger Bands...
Computing ATR_14...
Computing Keltner Channels...
Computing Donchian Channels...
Computing OBV...
Computing Chaikin AD...
Computing MFI_14...
Computing Force Index...
Computing VWAP...
Computing ADX_14...
Computing CCI_20...
Computing Parabolic SAR...
Computing Ichimoku Cloud...
Finished processing ticker: AAPL
Processing ticker: ABBV
Computing SMA_20...
Computing SMA_50...
Computing SMA_200...
Computing EMA_12...
Computing EMA_26...
Computing WMA_50...
Computing RSI_14...
Computing MACD...
Computing Stochastic Oscillator...
Computing Williams %R...
Computing ROC_12...
Computing Bollinger Bands...
Computing ATR_14...
Computing Keltner Channels...
Computing Donchian Channels...
Computing OBV...
Computing Chaikin AD...
Computing MFI_14...
Computing Force Index...
Computing VWAP...
Com

KeyboardInterrupt: 