In [1]:
# We are going to:

# Calc features
# - read data for each ticker
#   - resample to be hourly with forward fill
#   - remove the weekends and non-trading hours
# - calc technical indicator features.
#     - for date 
#         - for each stock
#             - we calculate a technical indicators using hourly data up until the end of the previous day
#         - for each indicator we combine the values for all stocks
#             - we rank this and split into n equal buckets and label > this is a single feature
#     
#     for d dates, s stocks and i indicators and b buckets then we should have a feature array of size
#         d x i with b discrete labels
# 
# This means that the feature is combined across the stocks which increases the number of data points to 
# d x s for each indicator approx 720 (144 weeks, ~3 years) * 126 * 65 = 5,896,800

# Calc targets
# We will calculate the return from the start of day at time t, until the end of day at time t+n
# We can decide to trade every day or selected days of the weekday_data
# # - read data for each ticker
#     - resample the data to be hourly with backward fill
#     - remove the weekends
#     - get the first price on each entry day of the week
#     - get the last price on the exit day, n days later
#     - calculate the return between the entry and exit times, store this for each stock
# - for each date 
#     - rank the returns across all stocks
#     - split into buckets of number b
#     - label the buckets, these are the targets
# 
# For each of the d dates, for each of s stocks, there will be a single target comprising one of b different labels. 
# The target array will be of size d x s x 1, approx 720 x 126 x 1 = 90,720

In [1]:
from datetime import datetime
import pandas as pd
from sklearn.model_selection import train_test_split
from SMB.data import read_config, write_targets_per_stock_by_date, read_targets_per_stock_by_date

from SMB.targets import get_targets
from SMB.calc_features import calc_all_stock_indicators, calc_daily_indicators
import matplotlib as plt
import logging


# Set up logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[logging.FileHandler("app.log"),
                              logging.StreamHandler()])

config = read_config('base')

2024-11-12 10:32:16,511 - INFO - Reading config: base


In [2]:
config['tickers'] = config['tickers'][0:4]
targets_per_stock_by_date = get_targets(config)
print(targets_per_stock_by_date)
print(targets_per_stock_by_date.shape)

2024-11-12 10:32:17,405 - INFO - Reading prices file: /Users/stephanie/src/N-LASR-model/data/raw_prices/AAPL_1hr_historical_data_final.csv
2024-11-12 10:32:17,458 - INFO - Reading prices file: /Users/stephanie/src/N-LASR-model/data/raw_prices/ADBE_1hr_historical_data_final.csv
2024-11-12 10:32:17,493 - INFO - Reading prices file: /Users/stephanie/src/N-LASR-model/data/raw_prices/AIG_1hr_historical_data_final.csv
2024-11-12 10:32:17,527 - INFO - Reading prices file: /Users/stephanie/src/N-LASR-model/data/raw_prices/ALL_1hr_historical_data_final.csv


(4839, 8)
(4839, 8)
                       open    high     low   close     volume  barCount
date                                                                    
2021-12-27 14:30:00  177.09  178.47  177.07  177.29  9193301.0     41654
2021-12-27 15:00:00  177.30  178.47  177.13  178.46  9712500.0     46532
2021-12-27 15:30:00  178.46  178.87  178.17  178.84        0.0         0
2021-12-27 16:00:00  178.46  178.87  178.17  178.84  6990437.0     30320
2021-12-27 16:30:00  178.84  179.37  178.78  179.25        0.0         0
...                     ...     ...     ...     ...        ...       ...
2022-01-05 16:30:00  178.80  179.25  178.25  178.35        0.0         0
2022-01-05 17:00:00  178.80  179.25  178.25  178.35  5759086.0     27012
2022-01-05 17:30:00  178.33  178.55  177.67  178.04        0.0         0
2022-01-05 18:00:00  178.33  178.55  177.67  178.04  5833227.0     28047
2022-01-05 18:30:00  178.05  178.72  176.10  176.50        0.0         0

[100 rows x 6 columns]
open   

In [3]:
targets_per_stock_by_date_file_name = write_targets_per_stock_by_date(config['name'], targets_per_stock_by_date)
# print(targets_per_stock_by_date_file_name)

2024-11-12 10:32:36,535 - INFO - Writing model file to: /Users/stephanie/src/N-LASR-model/data/model/base/targets_per_stock_by_date_2024-11-12T10:32:36.pkl


In [4]:

t = read_targets_per_stock_by_date(config['name'], targets_per_stock_by_date_file_name)


2024-11-12 10:32:39,257 - INFO - Reading model file: /Users/stephanie/src/N-LASR-model/data/model/base/targets_per_stock_by_date_2024-11-12T10:32:36.pkl


In [3]:
from datetime import datetime

timestamp = datetime.now().replace(microsecond=0).isoformat()
print(timestamp)

2024-11-12T08:22:22


In [2]:
X = targets_per_stock_by_date
y = targets_per_stock_by_date

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=config['test_size'], 
    random_state=config['random_state'], 
    shuffle=config['shuffle']
)

KeyError: 'random_state'

In [47]:
print(daily_weekday_data.shape)
ta = calc_daily_indicators(daily_weekday_data)
print(ta.shape)
print(ta.dtypes)

(693, 6)
(693, 66)
adx               float64
alma              float64
ao                float64
atr               float64
bias              float64
                   ...   
variance          float64
vortex_VTXP_14    float64
vortex_VTXM_14    float64
wma               float64
willr             float64
Length: 66, dtype: object


In [50]:
ta_sum = pd.concat([ta, ta])
print(ta_sum.shape)

(1386, 66)


In [3]:
all_stock_indicators = calc_all_stock_indicators(data_weekdays_daily)
print(all_stock_indicators.shape)
all_stock_indicators.to_pickle('models_directory'+ 'all_stock_indicators.pkl')

  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indicators['percent_return'] = calc_mean(ta.percent_return(p.close))
  indi

(1006, 8316)


In [2]:
all_stock_indicators = pd.read_pickle('models_directory'+ 'all_stock_indicators.pkl')

In [3]:
# NB the indicators are calculated using the close of the day!
all_stock_indicators.index = all_stock_indicators.index + pd.DateOffset(days=1)

In [24]:
s = all_stock_indicators.iloc[100:101]
rsi_values = s.xs('ema', level=1, axis=1)
print(rsi_values.head)

<bound method NDFrame.head of                 AAPL      ADBE       AIG      ALL       AMD      AMAT  \
date                                                                    
2022-04-06  0.009004  0.012457  0.003535  0.00573  0.062131  0.079398   

                AMGN      AMZN       AON      APD  ...       USB        V  \
date                                               ...                      
2022-04-06 -0.006948  0.011727 -0.000523 -0.00663  ...  0.025371 -0.00616   

               VRTX       WBA      WDAY       WFC       WMT       YUM  \
date                                                                    
2022-04-06 -0.01419  0.041283  0.019501  0.033363 -0.007966  0.014777   

                 ZBH        ZM  
date                            
2022-04-06 -0.013072  0.009136  

[1 rows x 126 columns]>


In [18]:
# axis = 0 by row
# ranked_rsi = rsi_values.rank(axis=1, method='first', ascending=True)
ranked_rsi = rsi_values.rank("columns", method='first', ascending=False)
              # .fillna(0).astype(int))
# ranked_rsi = rsi_values.assign(**rsi_values.iloc[:, 0:].rank(axis=1, method='first', ascending=False).astype(int))

print(ranked_rsi.shape)
print(ranked_rsi.head)

(20, 126)
<bound method NDFrame.head of             AAPL  ADBE    AIG    ALL   AMD  AMAT   AMGN  AMZN    AON    APD  \
date                                                                          
2022-04-06  71.0  58.0   89.0   83.0   2.0   1.0  108.0  59.0   96.0  107.0   
2022-04-07  65.0  53.0   69.0   97.0  11.0   4.0  117.0  49.0  105.0   92.0   
2022-04-08  68.0  69.0   72.0  104.0  12.0  24.0  106.0  37.0   98.0   88.0   
2022-04-09   1.0   2.0    3.0    4.0   5.0   6.0    7.0   8.0    9.0   10.0   
2022-04-10   1.0   2.0    3.0    4.0   5.0   6.0    7.0   8.0    9.0   10.0   
2022-04-11  57.0  45.0   86.0  101.0   9.0  25.0  112.0  30.0   90.0   83.0   
2022-04-12  34.0  65.0   86.0  117.0   2.0   7.0   99.0  17.0  114.0   88.0   
2022-04-13  65.0  32.0   84.0  105.0   4.0  12.0  110.0  18.0   90.0   88.0   
2022-04-14  78.0  43.0   69.0   77.0   9.0  13.0  113.0  50.0   71.0   74.0   
2022-04-15   1.0   2.0    3.0    4.0   5.0   6.0    7.0   8.0    9.0   10.0   
2022-04-16  

In [28]:
def binned_sum_func(row):
    bins, _ = pd.qcut(row, q=5, labels=[1,2,3,4,5], retbins=True, duplicates='drop')
    return bins.value_counts()

print(rsi_values.shape)
rsi_values.dropna(axis=0)
print(rsi_values.shape)
binned_sum = rsi_values.apply(binned_sum_func, axis=1).fillna(0).astype(int)
print(binned_sum)

(1, 126)
(1, 126)
2022-04-06 00:00:00   1   2   3   4   5
date                                   
2022-04-06           26  25  25  25  25
