In [None]:
!pip install yfinance py_vollib

# Data Retrieval: 
Fetch historical data for SPY.

In [44]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import requests

In [82]:
def fetch_and_process_data(_asset):
    # Fetch data for the specified asset
    # asset = yf.Ticker(_asset)
    # hist = asset.history(period="5y")
    hist = yf.download(_asset, start='2022-01-01', end='2024-01-01')

    def compute_indicators(hist):
        # Woodie's pivot points
        hist['Pivot'] = (hist['High'] + hist['Low'] + 2 * hist['Close']) / 4
        hist['R1'] = 2 * hist['Pivot'] - hist['Low']
        hist['S1'] = 2 * hist['Pivot'] - hist['High']
        hist['R2'] = hist['Pivot'] + (hist['High'] - hist['Low'])
        hist['S2'] = hist['Pivot'] - (hist['High'] - hist['Low'])
        hist['R3'] = hist['High'] + 2 * (hist['Pivot'] - hist['Low'])
        hist['S3'] = hist['Low'] - 2 * (hist['High'] - hist['Pivot'])
        hist['R4'] = hist['Pivot'] + 3 * (hist['High'] - hist['Low'])
        hist['S4'] = hist['Pivot'] - 3 * (hist['High'] - hist['Low'])
        
        # Bollinger Bands
        rolling_mean = hist['Close'].rolling(window=20).mean()
        rolling_std = hist['Close'].rolling(window=20).std()
        hist['Bollinger_High'] = rolling_mean + (rolling_std * 2)
        hist['Bollinger_Low'] = rolling_mean - (rolling_std * 2)
        
        # MACD
        short_ema = hist['Close'].ewm(span=12, adjust=False).mean()
        long_ema = hist['Close'].ewm(span=26, adjust=False).mean()
        hist['MACD'] = short_ema - long_ema
        hist['Signal'] = hist['MACD'].ewm(span=9, adjust=False).mean()
        
        # RSI
        close_delta = hist['Close'].diff()
        up = close_delta.clip(lower=0)
        down = -1 * close_delta.clip(upper=0)
        ma_up = up.ewm(span=14, adjust=True).mean()
        ma_down = down.ewm(span=14, adjust=True).mean()
        rsi = ma_up / ma_down
        hist['RSI'] = 100 - (100 / (1 + rsi))
        
        # OBV
        hist['OBV'] = (np.sign(hist['Close'].diff()) * hist['Volume']).fillna(0).cumsum()
        
        # ATR
        high_low = hist['High'] - hist['Low']
        high_close = np.abs(hist['High'] - hist['Close'].shift())
        low_close = np.abs(hist['Low'] - hist['Close'].shift())
        ranges = pd.concat([high_low, high_close, low_close], axis=1)
        true_range = np.max(ranges, axis=1)
        hist['ATR'] = true_range.rolling(window=14).mean()
        
        # Stochastic Oscillator
        low_min = hist['Low'].rolling(window=14).min()
        high_max = hist['High'].rolling(window=14).max()
        hist['%K'] = 100 * ((hist['Close'] - low_min) / (high_max - low_min))
        hist['%D'] = hist['%K'].rolling(window=3).mean()
        
        return hist

    # Parallel execution setup
    with ThreadPoolExecutor() as executor:
        future = executor.submit(compute_indicators, hist)
        result = future.result()

    # Clean up data
    result.dropna(inplace=True)

    return result

# Example usage
spy_data = fetch_and_process_data("SPY")
# print(spy_data.tail())  # Show the last few rows to inspect the added indicators
spy_data

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Pivot,R1,S1,R2,...,S4,Bollinger_High,Bollinger_Low,MACD,Signal,RSI,OBV,ATR,%K,%D
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-31,441.239990,450.279999,439.809998,449.910004,436.099365,152251400,447.477501,455.145004,444.675003,457.947502,...,416.067497,486.624909,425.071091,-9.501387,-8.534214,52.627653,-7.999154e+08,10.920000,55.587324,38.660039
2022-02-01,450.679993,453.630005,446.940002,452.950012,439.046051,123155400,451.617508,456.295013,449.605011,458.307510,...,431.547501,483.625731,425.594270,-8.290981,-8.485567,56.805728,-6.767600e+08,10.840714,61.384441,52.459951
2022-02-02,455.500000,458.119995,453.049988,457.350006,443.310974,117361000,456.467499,459.885010,454.815002,461.537506,...,441.257477,480.589074,426.610929,-6.897176,-8.167889,62.351079,-5.593990e+08,10.905712,70.203376,62.391714
2022-02-03,450.950012,452.970001,445.709991,446.600006,432.890991,118024400,447.970001,450.230011,442.970001,455.230011,...,426.189972,478.735862,426.286142,-6.584113,-7.851134,45.781900,-6.774234e+08,11.062855,58.290106,63.292641
2022-02-04,446.350006,452.779999,443.829987,448.700012,434.926483,118454400,448.502502,453.175018,444.225006,457.452515,...,421.652466,476.783654,426.314350,-6.096282,-7.500164,48.845958,-5.589690e+08,11.331427,71.275550,66.589677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,473.859985,475.380005,471.700012,473.649994,473.649994,67126600,473.595001,475.489990,471.809998,477.274994,...,462.555023,478.348117,448.186887,7.412221,7.247574,67.343667,-2.393208e+09,4.287861,89.578507,79.790076
2023-12-26,474.070007,476.579987,473.989990,475.649994,475.649994,55387000,475.467491,476.944992,474.354996,478.057487,...,467.697502,479.776882,448.875120,7.460252,7.290110,70.447375,-2.337821e+09,4.302861,95.824009,90.193641
2023-12-27,475.440002,476.660004,474.890015,476.510010,476.510010,68000300,476.142509,477.395004,475.625015,477.912498,...,470.832542,481.105129,449.704875,7.481471,7.328382,71.778205,-2.269821e+09,4.105717,99.263653,94.888723
2023-12-28,476.880005,477.549988,476.260010,476.690002,476.690002,77158100,476.797501,477.334991,476.045013,478.087479,...,472.927567,482.117967,450.900038,7.427195,7.348145,72.081811,-2.192663e+09,3.902145,95.771949,96.953204


In [86]:
import numpy as np
import pandas as pd
import yfinance as yf

def fetch_and_process_data(_asset):
    # Fetch data for the specified asset
    hist = yf.download(_asset, start='2022-01-01')

    # Indicator calculations as defined earlier
    def bollinger_bands(data, window=20, num_std=2):
        rolling_mean = data['Close'].rolling(window=window).mean()
        rolling_std = data['Close'].rolling(window=window).std()
        data['Bollinger_High'] = rolling_mean + (rolling_std * num_std)
        data['Bollinger_Low'] = rolling_mean - (rolling_std * num_std)
        return data

    def macd(data, short_window=12, long_window=26, signal_window=9):
        short_ema = data['Close'].ewm(span=short_window, adjust=False).mean()
        long_ema = data['Close'].ewm(span=long_window, adjust=False).mean()
        data['MACD'] = short_ema - long_ema
        data['Signal'] = data['MACD'].ewm(span=signal_window, adjust=False).mean()
        return data

    def rsi(data, periods=14, ema=True):
        close_delta = data['Close'].diff()
        up = close_delta.clip(lower=0)
        down = -1 * close_delta.clip(upper=0)
        
        if ema:
            ma_up = up.ewm(com=periods - 1, adjust=True, min_periods=periods).mean()
            ma_down = down.ewm(com=periods - 1, adjust=True, min_periods=periods).mean()
        else:
            ma_up = up.rolling(window=periods, adjust=False).mean()
            ma_down = down.rolling(window=periods, adjust=False).mean()
        
        rsi = ma_up / ma_down
        data['RSI'] = 100 - (100 / (1 + rsi))
        return data
        
    def obv(data):
        """Calculate On-Balance Volume."""
        obv = (np.sign(data['Close'].diff()) * data['Volume']).fillna(0).cumsum()
        data['OBV'] = obv
        return data

    def atr(data, window=14):
        """Calculate Average True Range (ATR)."""
        high_low = data['High'] - data['Low']
        high_close = np.abs(data['High'] - data['Close'].shift())
        low_close = np.abs(data['Low'] - data['Close'].shift())
        ranges = pd.concat([high_low, high_close, low_close], axis=1)
        true_range = np.max(ranges, axis=1)
        data['ATR'] = true_range.rolling(window=window).mean()
        return data

    def woodie_pivots(data):
        # Calculate Woodie's pivot points
        data['Pivot'] = (data['High'] + data['Low'] + 2 * data['Close']) / 4
        data['R1'] = 2 * data['Pivot'] - data['Low']
        data['S1'] = 2 * data['Pivot'] - data['High']
        data['R2'] = data['Pivot'] + (data['High'] - data['Low'])
        data['S2'] = data['Pivot'] - (data['High'] - data['Low'])
        data['R3'] = data['High'] + 2 * (data['Pivot'] - data['Low'])
        data['S3'] = data['Low'] - 2 * (data['High'] - data['Pivot'])
        data['R4'] = data['Pivot'] + 3 * (data['High'] - data['Low'])
        data['S4'] = data['Pivot'] - 3 * (data['High'] - data['Low'])
        return data

    # Apply each indicator function to the data
    hist = bollinger_bands(hist)
    hist = macd(hist)
    hist = rsi(hist)
    hist = woodie_pivots(hist)
    hist = obv(hist)
    hist = atr(hist)
    # Repeat for other indicators as necessary...

    # Note: No explicit parallel processing applied here due to sequential dependency of calculations on data.

    # Ensure all NaN values created by indicators are handled appropriately
    hist.dropna(inplace=True)

    return hist

# Example usage
spy_data = fetch_and_process_data("SPY")
# print(spy_data.tail())  # Display the last few rows to verify the outcome
spy_data

CPU times: total: 0 ns
Wall time: 0 ns
[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Bollinger_High,Bollinger_Low,MACD,Signal,...,R1,S1,R2,S2,R3,S3,R4,S4,OBV,ATR
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-01-31,441.239990,450.279999,439.809998,449.910004,436.099426,152251400,486.624909,425.071091,-9.501387,-8.534214,...,455.145004,444.675003,457.947502,437.007500,465.615005,434.205002,478.887505,416.067497,-7.999154e+08,10.920000
2022-02-01,450.679993,453.630005,446.940002,452.950012,439.046082,123155400,483.625731,425.594270,-8.290981,-8.485567,...,456.295013,449.605011,458.307510,444.927505,462.985016,442.915009,471.687515,431.547501,-6.767600e+08,10.840714
2022-02-02,455.500000,458.119995,453.049988,457.350006,443.311005,117361000,480.589074,426.610929,-6.897176,-8.167889,...,459.885010,454.815002,461.537506,451.397491,464.955017,449.744995,471.677521,441.257477,-5.593990e+08,10.905712
2022-02-03,450.950012,452.970001,445.709991,446.600006,432.890961,118024400,478.735862,426.286142,-6.584113,-7.851134,...,450.230011,442.970001,455.230011,440.709991,457.490021,435.709991,469.750031,426.189972,-6.774234e+08,11.062855
2022-02-04,446.350006,452.779999,443.829987,448.700012,434.926514,118454400,476.783654,426.314350,-6.096282,-7.500164,...,453.175018,444.225006,457.452515,439.552490,462.125031,435.274994,475.352539,421.652466,-5.589690e+08,11.331427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-13,494.529999,497.089996,490.720001,494.079987,494.079987,113099200,504.900087,473.914919,6.587714,6.426473,...,497.264984,490.894989,500.362488,487.622498,503.634979,484.524994,513.102478,474.882507,-2.166675e+09,4.411427
2024-02-14,496.790009,499.070007,494.399994,498.570007,498.570007,68387800,504.459707,476.983298,6.468158,6.434810,...,500.905014,496.235001,502.322517,492.982491,505.575027,491.564987,511.662544,483.642464,-2.098287e+09,4.559287
2024-02-15,499.290009,502.200012,498.799988,502.010010,502.010010,61683000,504.884360,479.110647,6.575193,6.462887,...,503.710022,500.309998,504.655029,497.854980,507.110046,496.909973,511.455078,491.054932,-2.036604e+09,4.634288
2024-02-16,501.700012,502.869995,498.750000,499.510010,499.510010,75461200,505.325914,480.377095,6.384692,6.447248,...,501.570007,497.450012,504.279999,496.040009,505.690002,493.330017,512.519989,487.800018,-2.112065e+09,4.625002


# Check Data For any errors:

In [81]:
from pandas.tseries.holiday import USFederalHolidayCalendar

def check_data_errors(data):
    errors = []

    # Check for missing values
    if data.isnull().values.any():
        errors.append("Issue: Data contains missing values.")
    
    # Check for duplicate dates
    if data.index.duplicated().any():
        errors.append("Issue: Data contains duplicate dates.")
    
    # Outliers in price data
    z_scores = np.abs((data['Close'] - data['Close'].mean()) / data['Close'].std())
    if z_scores[z_scores > 3].any():
        errors.append("Issue: Data contains potential outliers in 'Close' prices.")
    
    # Volume checks
    if (data['Volume'] == 0).any():
        errors.append("Issue: Data contains days with zero volume.")
    if ((data['Volume'].diff() / data['Volume']).abs() > 5).any():
        errors.append("Issue: Data contains unexpected spikes in volume.")
    
    # Continuity of dates, excluding weekends and public holidays
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=data.index.min(), end=data.index.max())
    business_days = pd.date_range(start=data.index.min(), end=data.index.max(), freq='B')
    business_days = business_days[~business_days.isin(holidays)]  # Exclude holidays
    
    missing_dates = business_days.difference(data.index).tolist()
    if missing_dates:
        formatted_dates = ', '.join([d.strftime('%Y-%m-%d') for d in missing_dates])
        errors.append(f"Issue: Data might be missing trading days: {formatted_dates}")

    return errors

# Example usage
spy_data = fetch_and_process_data("SPY")  # Assuming this function returns data with DateTimeIndex
errors = check_data_errors(spy_data)
if errors:
    for error in errors:
        print(error)
else:
    print("No issues detected in the data.")


[*********************100%%**********************]  1 of 1 completed
Issue: Data might be missing trading days: 2022-04-15, 2023-04-07


The error message indicating missing trading days for specific dates such as April 19, 2019, April 10, 2020, April 2, 2021, April 15, 2022, and April 7, 2023, highlights dates that are actually Good Friday. In the United States, the stock market (NYSE, NASDAQ) is closed on Good Friday, which is not a federal holiday and therefore not included in the USFederalHolidayCalendar. This explains why these dates were flagged as missing trading days by the previous function.

To address this and accurately reflect the trading calendar, we need to manually account for Good Friday and potentially other market-specific closures not covered by the federal holiday calendar. Here's an updated version of the function that checks for missing trading days, now including an adjustment for Good Friday and a more general approach to handling non-trading days:

Frankly, this is good enough for 

# Data Splitting: 
Split your data into training and testing sets. Ensure that this split respects the time series nature of the data (i.e., no future data is used to predict past values).



# Feature Preparation: 
Prepare the dataset for machine learning, backtesting, etc.