In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor



In [2]:
# Define the path to the Downloads folder and the file
downloads_folder = os.path.expanduser("~/Desktop")
file_name = "aaHistoricalData_1726248252859.csv" 
file_path = os.path.join(downloads_folder, file_name)

# Read the CSV file into a DataFrame
data = pd.read_csv(file_path)

# Display the first few rows of the DataFrame
print(data.head(3))


         Date  Close/Last    Volume    Open    High     Low
0  09/12/2024      559.09  51892740  555.01  559.40  552.74
1  09/11/2024      554.42  75248610  548.70  555.36  539.96
2  09/10/2024      548.79  36394580  548.36  549.15  543.38


In [3]:
# Add new columns with the given calculations
data['High_minus_Open'] = (data['High'] - data['Open']) / ((data['Open'] + data['Close/Last']) / 2)
data['Open_minus_Low'] = (data['Open'] - data['Low']) / ((data['Open'] + data['Close/Last']) / 2)
data['intraday_up_MINUS_intraday_down_ratio'] = data['High_minus_Open'] - data['Open_minus_Low']
data['intraday_change'] = (data['Close/Last'] - data['Open']) / ((data['Open'] + data['Close/Last']) / 2)
data['day_change'] = (data['Close/Last'] - data['Close/Last'].shift(1)) / data['Close/Last'].shift(1)


data['intraday_range'] = (data['High'] - data['Low']) / ((data['Open'] + data['Close/Last']) / 2)
data['opening_gaps'] = (data['Open'] - data['Close/Last'].shift(1)) / ((data['Open'] + data['Close/Last']) / 2)
data['net_Intraday_Movement_Around_Open'] = ((data['High'] - data['Open']) / ((data['Open'] + data['Close/Last']) / 2)) - ((data['Open'] - data['Low']) / ((data['Open'] + data['Close/Last']) / 2))

# Create moving average columns
data['MA_5'] = data['Open'].rolling(window=5).mean()
data['MA_14'] = data['Open'].rolling(window=14).mean()
data['MA_50'] = data['Open'].rolling(window=50).mean()

# Create lagged features comparing today's open to these moving averages
data['lag_5'] = (data['Open'] > data['MA_5']).astype(int)
data['lag_14'] = (data['Open'] > data['MA_14']).astype(int)
data['lag_50'] = (data['Open'] > data['MA_50']).astype(int)



data['MA_Crossover_Signal_5_14'] = (data['MA_5'] > data['MA_14']).astype(int)

# Create a binary indicator for whether yesterday's close was above or below the 5-day MA
data['yesterday_close_above_MA_5'] = (data['Close/Last'].shift(1) > data['MA_5']).astype(int)



In [8]:
print(data.columns)

Index(['Date', 'Close/Last', 'Volume', 'Open', 'High', 'Low',
       'High_minus_Open', 'Open_minus_Low',
       'intraday_up_MINUS_intraday_down_ratio', 'intraday_change',
       'day_change', 'intraday_range', 'opening_gaps',
       'net_Intraday_Movement_Around_Open', 'MA_5', 'MA_14', 'MA_50', 'lag_5',
       'lag_14', 'lag_50', 'MA_Crossover_Signal_5_14',
       'yesterday_close_above_MA_5', 'OBV', 'OBV_pct_change', 'SPY_pct_change',
       'OBV_SPY_pct_diff', 'OBV_SPY_metric', 'OBV_SPY_window_length',
       'Original_Index'],
      dtype='object')


In [9]:
data['OBV'] = 0

# Initialize the first value of OBV
data.loc[0, 'OBV'] = data.loc[0, 'Volume']

# Calculate OBV
for i in range(1, len(data)):
    if data.loc[i, 'Close/Last'] > data.loc[i-1, 'Close/Last']:
        data.loc[i, 'OBV'] = data.loc[i-1, 'OBV'] + data.loc[i, 'Volume']
    elif data.loc[i, 'Close/Last'] < data.loc[i-1, 'Close/Last']:
        data.loc[i, 'OBV'] = data.loc[i-1, 'OBV'] - data.loc[i, 'Volume']
    else:
        data.loc[i, 'OBV'] = data.loc[i-1, 'OBV']

# Display the first few rows to check the new column
print(data.head(1))
print(''' Creates the OBV''')

         Date  Close/Last    Volume    Open   High     Low  High_minus_Open  \
0  09/12/2024      559.09  51892740  555.01  559.4  552.74         0.007881   

   Open_minus_Low  intraday_up_MINUS_intraday_down_ratio  intraday_change  \
0        0.004075                               0.003806         0.007324   

   ...  lag_50  MA_Crossover_Signal_5_14  yesterday_close_above_MA_5  \
0  ...       0                         0                           0   

        OBV  OBV_pct_change  SPY_pct_change  OBV_SPY_pct_diff  OBV_SPY_metric  \
0  51892740             NaN             NaN               NaN             0.0   

   OBV_SPY_window_length  Original_Index  
0                    5.0             0.0  

[1 rows x 29 columns]
 Creates the OBV


In [5]:
# "OBV_PCT_CHANGE IS ON A DIFFERENT SCALE AS THE "SPY_PCT_CHANGE"  BECAUSE OBV IS A CUMULATIVE FIGURE AND THE SPY_PCT_CHANGE IS NOT. MAKING THE WHOLE THING...
# ...NONSENSICAL... THE PERCENTAGE CHANGES OF THE OBV WILL BECOME MINISCULE AS EACH DAY'S VOLUME BECOMES INSUBSTANTIAL WHEN THE OBV SNOWBALLS TO A 
# HIGH VALUE OR LOW VALUE
# Use z-value of the spy change or the volume change
# OR use an obv rolling average and take the aveage as apercentage of that 

data['OBV_pct_change'] = data['OBV'].pct_change() * 100
data['SPY_pct_change'] = data['day_change'] * 100
data['OBV_SPY_pct_diff'] = data['OBV_pct_change'] - data['SPY_pct_change']

def calculate_obv_spy_metric_backward(data, initial_window_size=5, threshold=15):
    data['OBV_SPY_metric'] = np.nan  # Initialize the new independent variable column
    data['OBV_SPY_window_length'] = np.nan  # Initialize column for window length
    data['Original_Index'] = np.nan  # Initialize column for index
    
    for i in range(len(data) - initial_window_size):
        max_diff = 0
        j = initial_window_size
        
        while i - j >= 0:
            # Calculate rolling sum of differences (backward-looking)
            current_window = data['OBV_SPY_pct_diff'].iloc[i-j:i].sum()
            abs_current_diff = abs(current_window)
            
            if abs_current_diff > max_diff:
                max_diff = abs(current_window)
            
            retracement = ((max_diff - abs_current_diff) / max_diff) * 100 if max_diff != 0 else 0
            
            if retracement >= threshold:
                break
            
            j += 1
        
        # After the loop, assign the max difference and window length
        data.at[i, 'OBV_SPY_metric'] = max_diff
        data.at[i, 'OBV_SPY_window_length'] = j
        data.at[i, 'Original_Index'] = i  # Assign the index value
    
    return data

# Apply the function to the DataFrame
data = calculate_obv_spy_metric_backward(data)

# Drop rows with NaN values in OBV_SPY_metric
data.dropna(subset=['OBV_SPY_metric'], inplace=True)

print(data[['Original_Index', 'Date', 'OBV_pct_change', 'SPY_pct_change', 'OBV_SPY_pct_diff', 'OBV_SPY_metric', 'OBV_SPY_window_length']].head(2))
print('''OBV_SPY_metric is the cumulative percentage change between the OBV and SPY since it last had a 15% retracement, OBV_SPY_window_length is the length of the window
    in days, and Index is the original index.''')
print(" as it's currently coded it will stop when teh threshold is met... not somethign that can be graphed")


   Original_Index        Date  OBV_pct_change  SPY_pct_change  \
0             0.0  09/12/2024             NaN             NaN   
1             1.0  09/11/2024     -145.007972       -0.835286   

   OBV_SPY_pct_diff  OBV_SPY_metric  OBV_SPY_window_length  
0               NaN             0.0                    5.0  
1       -144.172686             0.0                    5.0  
OBV_SPY_metric is the cumulative percentage change between the OBV and SPY since it last had a 15% retracement, OBV_SPY_window_length is the length of the window
    in days, and Index is the original index.
It will record the INDEX to make sure that future data maniupulations don't force us to lose the order



In [6]:
# Filter the DataFrame for 'OBV_SPY_window_length' values over 20
filtered_data = data[data['OBV_SPY_window_length'] > 20]

# Verify filtered data row count
print(f"Initial filtered set has {filtered_data.shape[0]} rows.")  # Expect 45 rows

# Sort the filtered DataFrame by the 'Original_Index' column
sorted_filtered_data = filtered_data.sort_values(by='Original_Index')

# Verify sorted row count
print(f"After sorting, the DataFrame has {sorted_filtered_data.shape[0]} rows.")  # Expect 45 rows

# Calculate the percentage change between successive 'Original_Index' values
sorted_filtered_data['Percent_Change'] = sorted_filtered_data['Original_Index'].pct_change() * 100

# Drop NaNs resulting from pct_change calculation
sorted_filtered_data.dropna(subset=['Percent_Change'], inplace=True)

# Verify row count after dropping NaNs
print(f"After dropping NaNs, the DataFrame has {sorted_filtered_data.shape[0]} rows.")  # Should print 44

# Initialize a column to mark new OBV signals using .shift()
sorted_filtered_data['New_OBV_Trend'] = (abs(sorted_filtered_data['Percent_Change']) > 2).astype(int)

# Ensure the first value is 1
sorted_filtered_data.at[sorted_filtered_data.index[0], 'New_OBV_Trend'] = 1

# Debugging Output After creating New_OBV_Trend
print(f"Row count after creating New_OBV_Trend: {sorted_filtered_data.shape[0]}")  # Should remain consistent at 44

# Display the sorted filtered DataFrame and the count
print(sorted_filtered_data[['Original_Index', 'Date', 'OBV_pct_change', 'SPY_pct_change', 'OBV_SPY_pct_diff', 'OBV_SPY_metric', 'OBV_SPY_window_length', 
                            'Percent_Change', 'New_OBV_Trend']].head(2))

# Find the maximum value in the 'New_OBV_Trend' column
max_new_obv_trend = sorted_filtered_data['New_OBV_Trend'].max()
print(f"Maximum value in the 'New_OBV_Trend' column: {max_new_obv_trend}")


Initial filtered set has 45 rows.
After sorting, the DataFrame has 45 rows.
After dropping NaNs, the DataFrame has 44 rows.
Row count after creating New_OBV_Trend: 44
     Original_Index        Date  OBV_pct_change  SPY_pct_change  \
219           219.0  10/27/2023        3.362385       -1.181453   
220           220.0  10/26/2023       -3.488999        0.455342   

     OBV_SPY_pct_diff  OBV_SPY_metric  OBV_SPY_window_length  Percent_Change  \
219          4.543838       50.267462                   25.0        0.458716   
220         -3.944342       54.811300                   27.0        0.456621   

     New_OBV_Trend  
219              1  
220              0  
Maximum value in the 'New_OBV_Trend' column: 1


In [7]:
# Filter rows with a value of "1" in the New_OBV_Trend column
obv_signals = sorted_filtered_data[sorted_filtered_data['New_OBV_Trend'] == 1]

# Display the filtered rows with their corresponding original index values
print(obv_signals[['Original_Index', 'Date', 'OBV_pct_change', 'SPY_pct_change', 'OBV_SPY_pct_diff', 'OBV_SPY_metric',
                   'OBV_SPY_window_length', 'Percent_Change', 'New_OBV_Trend']].head(2))


     Original_Index        Date  OBV_pct_change  SPY_pct_change  \
219           219.0  10/27/2023        3.362385       -1.181453   
733           733.0  10/12/2021        3.456831       -0.358472   

     OBV_SPY_pct_diff  OBV_SPY_metric  OBV_SPY_window_length  Percent_Change  \
219          4.543838       50.267462                   25.0        0.458716   
733          3.815303       78.963190                   34.0      230.180180   

     New_OBV_Trend  
219              1  
733              1  
