In [None]:
!pip install yfinance py_vollib

# Data Retrieval: 
Fetch historical data for SPY.

In [22]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import numpy as np
import yfinance as yf
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import requests

In [65]:
def fetch_and_process_data(_asset):
    # Fetch data for the specified asset
    hist = yf.download(_asset, start='2022-01-01')

    # Indicator calculations as defined earlier
    def bollinger_bands(data, window=20, num_std=2):
        rolling_mean = data['Close'].rolling(window=window).mean()
        rolling_std = data['Close'].rolling(window=window).std()
        data['Bollinger_High'] = rolling_mean + (rolling_std * num_std)
        data['Bollinger_Low'] = rolling_mean - (rolling_std * num_std)
        return data

    def macd(data, short_window=12, long_window=26, signal_window=9):
        short_ema = data['Close'].ewm(span=short_window, adjust=False).mean()
        long_ema = data['Close'].ewm(span=long_window, adjust=False).mean()
        data['MACD'] = short_ema - long_ema
        data['Signal'] = data['MACD'].ewm(span=signal_window, adjust=False).mean()
        return data

    def rsi(data, periods=14, ema=True):
        close_delta = data['Close'].diff()
        up = close_delta.clip(lower=0)
        down = -1 * close_delta.clip(upper=0)
        
        if ema:
            ma_up = up.ewm(com=periods - 1, adjust=True, min_periods=periods).mean()
            ma_down = down.ewm(com=periods - 1, adjust=True, min_periods=periods).mean()
        else:
            ma_up = up.rolling(window=periods, adjust=False).mean()
            ma_down = down.rolling(window=periods, adjust=False).mean()
        
        rsi = ma_up / ma_down
        data['RSI'] = 100 - (100 / (1 + rsi))
        return data
        
    def obv(data):
        """Calculate On-Balance Volume."""
        obv = (np.sign(data['Close'].diff()) * data['Volume']).fillna(0).cumsum()
        data['OBV'] = obv
        return data

    def atr(data, window=14):
        """Calculate Average True Range (ATR)."""
        high_low = data['High'] - data['Low']
        high_close = np.abs(data['High'] - data['Close'].shift())
        low_close = np.abs(data['Low'] - data['Close'].shift())
        ranges = pd.concat([high_low, high_close, low_close], axis=1)
        true_range = np.max(ranges, axis=1)
        data['ATR'] = true_range.rolling(window=window).mean()
        return data

    def woodie_pivots(data):
        # Calculate Woodie's pivot points
        data['Pivot'] = (data['High'] + data['Low'] + 2 * data['Close']) / 4
        data['R1'] = 2 * data['Pivot'] - data['Low']
        data['S1'] = 2 * data['Pivot'] - data['High']
        data['R2'] = data['Pivot'] + (data['High'] - data['Low'])
        data['S2'] = data['Pivot'] - (data['High'] - data['Low'])
        data['R3'] = data['High'] + 2 * (data['Pivot'] - data['Low'])
        data['S3'] = data['Low'] - 2 * (data['High'] - data['Pivot'])
        data['R4'] = data['Pivot'] + 3 * (data['High'] - data['Low'])
        data['S4'] = data['Pivot'] - 3 * (data['High'] - data['Low'])
        return data

    # Apply each indicator function to the data
    hist = bollinger_bands(hist)
    hist = macd(hist)
    hist = rsi(hist)
    hist = woodie_pivots(hist)
    hist = obv(hist)
    hist = atr(hist)
    # Repeat for other indicators as necessary...

    # Note: No explicit parallel processing applied here due to sequential dependency of calculations on data.

    # Ensure all NaN values created by indicators are handled appropriately
    hist.dropna(inplace=True)

    return hist

# Example usage
spy_data = fetch_and_process_data("SPY")
print(spy_data.tail())  # Display the last few rows to verify the outcome
# spy_data

[*********************100%%**********************]  1 of 1 completed
                  Open        High         Low       Close   Adj Close  \
Date                                                                     
2024-02-13  494.529999  497.089996  490.720001  494.079987  494.079987   
2024-02-14  496.790009  499.070007  494.399994  498.570007  498.570007   
2024-02-15  499.290009  502.200012  498.799988  502.010010  502.010010   
2024-02-16  501.700012  502.869995  498.750000  499.510010  499.510010   
2024-02-20  497.720001  498.410004  494.459991  496.760010  496.760010   

               Volume  Bollinger_High  Bollinger_Low      MACD    Signal  ...  \
Date                                                                      ...   
2024-02-13  113099200      504.900087     473.914919  6.587714  6.426473  ...   
2024-02-14   68387800      504.459707     476.983298  6.468158  6.434810  ...   
2024-02-15   61683000      504.884360     479.110647  6.575193  6.462887  ...   
2024-02

# Check Data For any errors:

In [35]:
from pandas.tseries.holiday import USFederalHolidayCalendar

def check_data_errors(data):
    errors = []

    # Check for missing values
    if data.isnull().values.any():
        errors.append("Issue: Data contains missing values.")
    
    # Check for duplicate dates
    if data.index.duplicated().any():
        errors.append("Issue: Data contains duplicate dates.")
    
    # Outliers in price data
    z_scores = np.abs((data['Close'] - data['Close'].mean()) / data['Close'].std())
    if z_scores[z_scores > 3].any():
        errors.append("Issue: Data contains potential outliers in 'Close' prices.")
    
    # Volume checks
    if (data['Volume'] == 0).any():
        errors.append("Issue: Data contains days with zero volume.")
    if ((data['Volume'].diff() / data['Volume']).abs() > 5).any():
        errors.append("Issue: Data contains unexpected spikes in volume.")
    
    # Continuity of dates, excluding weekends and public holidays
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays(start=data.index.min(), end=data.index.max())
    business_days = pd.date_range(start=data.index.min(), end=data.index.max(), freq='B')
    business_days = business_days[~business_days.isin(holidays)]  # Exclude holidays
    
    missing_dates = business_days.difference(data.index).tolist()
    if missing_dates:
        formatted_dates = ', '.join([d.strftime('%Y-%m-%d') for d in missing_dates])
        errors.append(f"Issue: Data might be missing trading days: {formatted_dates}")

    return errors

# Example usage
spy_data = fetch_and_process_data("SPY")  # Assuming this function returns data with DateTimeIndex
errors = check_data_errors(spy_data)
if errors:
    for error in errors:
        print(error)
else:
    print("No issues detected in the data.")


[*********************100%%**********************]  1 of 1 completed
Issue: Data might be missing trading days: 2022-04-15, 2023-04-07


The error message indicating missing trading days for specific dates such as April 19, 2019, April 10, 2020, April 2, 2021, April 15, 2022, and April 7, 2023, highlights dates that are actually Good Friday. In the United States, the stock market (NYSE, NASDAQ) is closed on Good Friday, which is not a federal holiday and therefore not included in the USFederalHolidayCalendar. This explains why these dates were flagged as missing trading days by the previous function.

To address this and accurately reflect the trading calendar, we need to manually account for Good Friday and potentially other market-specific closures not covered by the federal holiday calendar. Here's an updated version of the function that checks for missing trading days, now including an adjustment for Good Friday and a more general approach to handling non-trading days:

Frankly, this is good enough for 

# Feature Importance

In [71]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Ensure 'spy_data' is a DataFrame loaded with required data
# Assuming spy_data is already defined and contains the necessary columns

# Define the column names
columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Bollinger_High', 'Bollinger_Low', 'MACD', 'Signal', 'RSI', 
           'Pivot', 'R1', 'S1', 'R2', 'S2', 'R3', 'S3', 'R4', 'S4', 'OBV', 'ATR']

# Check if all columns are in the DataFrame, otherwise print a warning
missing_columns = [col for col in columns if col not in spy_data.columns]
if missing_columns:
    print(f"Warning: Missing columns {missing_columns} in the DataFrame. Please ensure all indicators are calculated before this step.")
else:
    # Select features and target, including the new indicators
    features = spy_data[columns]

    # Shift the 'Close' column to the next day for prediction
    target = spy_data['Close'].shift(-1)

    # Drop the last row from features and target to remove the NaN values from shifting
    features = features.iloc[:-1]
    target = target.iloc[:-1]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    # Train the model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Get feature importances
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(X_train.shape[1]):
        print(f"{f + 1}. feature {X_train.columns[indices[f]]} ({importances[indices[f]]:.4f})")


Feature ranking:
1. feature Close (0.1713)
2. feature Pivot (0.1344)
3. feature S1 (0.1285)
4. feature Low (0.1182)
5. feature S2 (0.1163)
6. feature R1 (0.0996)
7. feature S3 (0.0806)
8. feature High (0.0465)
9. feature R2 (0.0321)
10. feature Open (0.0234)
11. feature R3 (0.0223)
12. feature Bollinger_Low (0.0092)
13. feature S4 (0.0067)
14. feature Bollinger_High (0.0019)
15. feature MACD (0.0017)
16. feature Signal (0.0014)
17. feature Volume (0.0013)
18. feature ATR (0.0013)
19. feature RSI (0.0012)
20. feature OBV (0.0012)
21. feature R4 (0.0009)


# Credit Spread Pricing
Here we want to use R1-4 and S1-S4 to find the nearest 50 cent spread.
We'll then try to determine the probablity of each expiring.

In [73]:
import pandas as pd
import numpy as np

def round_to_nearest_50_cents(value):
    """Round the value to the nearest 50 cents."""
    return np.round(value * 2, 0) / 2

def generate_credit_spreads(data):
    # Initialize a list to hold the spreads for each day
    spreads = []
    
    # Iterate through each row in the DataFrame
    for index, row in data.iterrows():
        # Initialize a dictionary for the current day's spreads
        day_spreads = {
            'Date': index,
            'R1-R2 Put Spread': None,
            'R1-R3 Put Spread': None,
            'R1-R4 Put Spread': None,
            'S1-S2 Call Spread': None,
            'S1-S3 Call Spread': None,
            'S1-S4 Call Spread': None,
        }
        
        # Calculate the put credit spreads for each pair of resistances
        for i in range(2, 5):
            sell_strike = round_to_nearest_50_cents(row['R1'])
            buy_strike = round_to_nearest_50_cents(row[f'R{i}'])
            day_spreads[f'R1-R{i} Put Spread'] = (sell_strike, buy_strike)
        
        # Calculate the call credit spreads for each pair of supports
        for i in range(2, 5):
            sell_strike = round_to_nearest_50_cents(row['S1'])
            buy_strike = round_to_nearest_50_cents(row[f'S{i}'])
            day_spreads[f'S1-S{i} Call Spread'] = (sell_strike, buy_strike)
        
        # Add the current day's spreads to the list
        spreads.append(day_spreads)
    
    # Convert the list of spreads into a DataFrame for easier viewing
    spreads_df = pd.DataFrame(spreads)
    spreads_df.set_index('Date', inplace=True)
    return spreads_df

# Assuming spy_data is already defined and contains the necessary columns
# Generate the credit spreads
credit_spreads = generate_credit_spreads(spy_data)
print(credit_spreads)
credit_spreads

           R1-R2 Put Spread R1-R3 Put Spread R1-R4 Put Spread  \
Date                                                            
2022-01-31   (455.0, 458.0)   (455.0, 465.5)   (455.0, 479.0)   
2022-02-01   (456.5, 458.5)   (456.5, 463.0)   (456.5, 471.5)   
2022-02-02   (460.0, 461.5)   (460.0, 465.0)   (460.0, 471.5)   
2022-02-03   (450.0, 455.0)   (450.0, 457.5)   (450.0, 470.0)   
2022-02-04   (453.0, 457.5)   (453.0, 462.0)   (453.0, 475.5)   
...                     ...              ...              ...   
2024-02-13   (497.5, 500.5)   (497.5, 503.5)   (497.5, 513.0)   
2024-02-14   (501.0, 502.5)   (501.0, 505.5)   (501.0, 511.5)   
2024-02-15   (503.5, 504.5)   (503.5, 507.0)   (503.5, 511.5)   
2024-02-16   (501.5, 504.5)   (501.5, 505.5)   (501.5, 512.5)   
2024-02-20   (498.5, 500.5)   (498.5, 502.5)   (498.5, 508.5)   

           S1-S2 Call Spread S1-S3 Call Spread S1-S4 Call Spread  
Date                                                              
2022-01-31    (444.5

Unnamed: 0_level_0,R1-R2 Put Spread,R1-R3 Put Spread,R1-R4 Put Spread,S1-S2 Call Spread,S1-S3 Call Spread,S1-S4 Call Spread
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-01-31,"(455.0, 458.0)","(455.0, 465.5)","(455.0, 479.0)","(444.5, 437.0)","(444.5, 434.0)","(444.5, 416.0)"
2022-02-01,"(456.5, 458.5)","(456.5, 463.0)","(456.5, 471.5)","(449.5, 445.0)","(449.5, 443.0)","(449.5, 431.5)"
2022-02-02,"(460.0, 461.5)","(460.0, 465.0)","(460.0, 471.5)","(455.0, 451.5)","(455.0, 449.5)","(455.0, 441.5)"
2022-02-03,"(450.0, 455.0)","(450.0, 457.5)","(450.0, 470.0)","(443.0, 440.5)","(443.0, 435.5)","(443.0, 426.0)"
2022-02-04,"(453.0, 457.5)","(453.0, 462.0)","(453.0, 475.5)","(444.0, 439.5)","(444.0, 435.5)","(444.0, 421.5)"
...,...,...,...,...,...,...
2024-02-13,"(497.5, 500.5)","(497.5, 503.5)","(497.5, 513.0)","(491.0, 487.5)","(491.0, 484.5)","(491.0, 475.0)"
2024-02-14,"(501.0, 502.5)","(501.0, 505.5)","(501.0, 511.5)","(496.0, 493.0)","(496.0, 491.5)","(496.0, 483.5)"
2024-02-15,"(503.5, 504.5)","(503.5, 507.0)","(503.5, 511.5)","(500.5, 498.0)","(500.5, 497.0)","(500.5, 491.0)"
2024-02-16,"(501.5, 504.5)","(501.5, 505.5)","(501.5, 512.5)","(497.5, 496.0)","(497.5, 493.5)","(497.5, 488.0)"


# Probability of a Credit Spread expiring worthless


In [None]:
from scipy.stats import norm