In [111]:
import numpy as np
import pandas as pd
import pandas_ta as ta
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt

# all the names of my downloaded tickers
tickers = ["AAPL", "ADBE", "AIG", "ALL", "AMD", "AMAT", "AMGN", "AMZN", "AON",
    "APD", "AVGO", "AXP", "BAC", "BABA", "BAX", "BBY", "BDX", "BIIB", "BLK",
    "BMY", "BK", "CB", "C", "CHD", "CI", "CINF", "CL", "CLX", "CMG",
    "CRM", "COST", "CRWD", "CSCO", "CVS", "DDOG", "DG", "DHR", "DOCU", "DPZ",
    "DRI", "EW", "ECL", "F", "FDX", "FMC", "GM", "GILD", "GIS", "GOOG",
    "GS", "HD", "HOG", "HUBS", "IBM", "IFF", "ILMN", "INTC", "INTU", "ISRG",
    "JNJ", "JPM", "K", "KHC", "KR", "LNC", "LOW", "LULU", "MA", "MDB",
    "MCD", "MDT", "META", "MKC", "MMM", "MSFT", "MU", "NET", "NFLX", "NOW",
    "NVDA", "OKTA", "ORCL", "PEP", "PG", "PGR", "PFE", "PLTR", "PNC", "PPG",
    "PRU", "PYPL", "REGN", "ROKU", "RNG", "RL", "SBUX", "SCHW", "SHOP", "SKX",
    "SNOW", "SPGI", "SPY", "SQ", "SYK", "T", "TEAM", "TMO", "TRV", "TSCO",
    "TSLA", "TXN", "TROW", "TWLO", "UA", "UAA", "UNH", "USB", "V", "VRTX",
    "WBA", "WDAY", "WFC", "WMT", "YUM", "ZBH", "ZM"]

# just to make sure that data is ordered correctly
def order_data(df):
    df = df.sort_values(by='date')
    return df

common_dates = None

for ticker in tickers:
    ticker_data = pd.read_csv(f'{ticker}_1hr_historical_data_final.csv')
    ticker_data['date'] = pd.to_datetime(ticker_data['date'], errors='coerce')

    # Get ordered data
    ticker_data_ordered = order_data(ticker_data)

    # For ease of testing
    ticker_data_truncated = ticker_data_ordered[(ticker_data_ordered['date'] >= ticker_data_ordered['date'].max() - timedelta(weeks=70))].copy()

    # If common_dates is None, initialize it with the first ticker's dates
    if common_dates is None:
        common_dates = set(ticker_data_truncated['date'])
    else:
        # Keep only the dates that are common across tickers
        common_dates &= set(ticker_data_truncated['date'])

# Now, filter each ticker's data by the common dates
ticker_arrays = []

for ticker in tickers:
    ticker_data = pd.read_csv(f'{ticker}_1hr_historical_data_final.csv')
    ticker_data['date'] = pd.to_datetime(ticker_data['date'], errors='coerce')

    # Order data
    ticker_data_ordered = order_data(ticker_data)

    # For ease of testing
    ticker_data_truncated = ticker_data_ordered[(ticker_data_ordered['date'] >= ticker_data_ordered['date'].max() - timedelta(weeks=30))].copy()

    # Filter by common dates
    ticker_data_filtered = ticker_data_truncated[ticker_data_truncated['date'].isin(common_dates)]

    # Convert to numpy
    ticker_array_half = ticker_data_filtered.to_numpy()

    # Create empty dataframe for day and hour
    ticker_data_filtered_date = pd.DataFrame()
    ticker_data_filtered_date.loc[:, 'Day'] = ticker_data_filtered['date'].dt.dayofweek
    ticker_data_filtered_date.loc[:, 'Hr'] = ticker_data_filtered['date'].dt.hour

    ticker_array_datetime = ticker_data_filtered_date.to_numpy()

    # Combine open, high, low, close, vol df with date hour df
    total_ticker_array = np.concatenate((ticker_array_half[:, 1:-2], ticker_array_datetime), axis=1)

    # Append to the list
    ticker_arrays.append(total_ticker_array)
# for each 2D array, the x 'rows' are my stocks
# the y 'columns' are the time step
# z is each calculate column/data column, i.e. open,hi,lo,close,vol, day,hr etc.
len_initial_matrix = len(tickers) + 2

ticker_matrix = np.stack(ticker_arrays, axis=0)
print(ticker_matrix.shape)

(126, 2337, 7)


In [4]:
# from github, just removes deprecated warnings etc.
pd.set_option('future.no_silent_downcasting', True)

# massive function to calculate all technical indicators
def add_indicators(matrix, k):
    
    rolling_window = 14
    
    open_col = pd.Series(matrix[k, :, 0])
    high_col = pd.Series(matrix[k, :, 1])
    low_col = pd.Series(matrix[k, :, 2])
    close_col = pd.Series(matrix[k, :, 3])
    volume_col = pd.Series(matrix[k, :, 4])

    # Calculate indicators for the matrix
    fisher = ta.fisher(high_col, low_col).iloc[:, 0].astype('float64')
    dpo = ta.dpo(close_col)
    corr = close_col.rolling(window=14).corr(close_col.shift(1))
    adx = ta.adx(high_col, low_col, close_col)
    rsi = ta.rsi(close_col)
    uo = ta.uo(high_col, low_col, close_col)
    willr = ta.willr(high_col, low_col, close_col)
    kc_upper = ta.kc(high_col, low_col, close_col)
    kama = ta.kama(close_col)
    vtx = ta.vortex(high_col, low_col, close_col)
    cmf = ta.cmf(high_col, low_col, close_col, volume_col)
    kst = ta.kst(close_col)
    eom = ta.eom(high_col, low_col, close_col, volume_col)
    bop = ta.bop(open_col, high_col, low_col, close_col)
    cci = ta.cci(high_col, low_col, close_col)
    cmo = ta.cmo(close_col)
    efi = (ta.efi(close_col, volume_col) - open_col) / open_col
    eri = (ta.eri(high_col, low_col, close_col) - open_col) / open_col
    ema = (ta.ema(close_col) - open_col) / open_col
    hma = (ta.hma(close_col) - open_col) / open_col
    linreg = (ta.linreg(close_col) - open_col) / open_col
    slope = (ta.slope(close_col) - open_col) / open_col
    median = (ta.median(close_col) - open_col) / open_col
    macd = (ta.macd(close_col).iloc[:, 0] - open_col) / open_col
    mom = (ta.mom(close_col) - open_col) / open_col
    obv = (ta.obv(close_col, volume_col) - open_col) / open_col
    pvo = (ta.pvo(volume_col).iloc[:, 0] - open_col) / open_col
    roc = (ta.roc(close_col) - open_col) / open_col
    pvt = (ta.pvt(close_col, volume_col) - open_col) / open_col
    sma = (ta.sma(close_col) - open_col) / open_col
    stddev = (ta.stdev(close_col) - open_col) / open_col
    supertrend = (ta.supertrend(high_col, low_col, close_col).iloc[:, 0] - open_col) / open_col
    t3 = (ta.t3(close_col) - open_col) / open_col
    trima = (ta.trima(close_col) - open_col) / open_col
    true_range = (ta.true_range(high_col, low_col, close_col) - open_col) / open_col
    wma = (ta.wma(close_col) - open_col) / open_col
    tema = (ta.tema(close_col) - open_col) / open_col
    nvi = (ta.nvi(close_col, volume_col) - open_col) / open_col
    nvi = nvi.infer_objects()
    pvi = (ta.pvi(close_col, volume_col) - open_col) / open_col
    pvi = pvi.infer_objects()
    chop = (ta.chop(high_col, low_col, close_col) - open_col) / open_col
    atr = (ta.atr(high_col, low_col, close_col) - open_col) / open_col
    ao = (ta.ao(high_col, low_col) - open_col) / open_col
    dm = (ta.dm(high_col, low_col).iloc[:, 0] - open_col) / open_col
    dema = (ta.dema(close_col) - open_col) / open_col
    bias = (ta.bias(close_col) - open_col) / open_col
    cfo = (ta.cfo(close_col) - open_col) / open_col
    cti = (ta.cti(close_col) - open_col) / open_col
    inertia = (ta.inertia(close_col) - open_col) / open_col
    inertia = inertia.infer_objects()
    ppo = (ta.ppo(close_col).iloc[:, 0] - open_col) / open_col
    trix = (ta.trix(close_col).iloc[:, 0] - open_col) / open_col
    alma = (ta.alma(close_col) - open_col) / open_col
    fwma = (ta.fwma(close_col) - open_col) / open_col
    hilo = (ta.hilo(high_col, low_col, close_col).iloc[:, 0] - open_col) / open_col
    hl2 = (ta.hl2(high_col, low_col) - open_col) / open_col
    hwma = (ta.hwma(close_col) - open_col) / open_col
    midpoint = (ta.midpoint(close_col) - open_col) / open_col
    midprice = (ta.midprice(high_col, low_col) - open_col) / open_col
    pwma = (ta.pwma(close_col) - open_col) / open_col
    sinwma = (ta.sinwma(close_col) - open_col) / open_col
    percent_return = ta.percent_return(close_col)
    percent_return = percent_return.infer_objects()  # Add this only if it triggers the same warning   # Percent return doesn't need rolling mean
    kurtosis = (ta.kurtosis(close_col) - open_col) / open_col
    mad = (ta.mad(close_col) - open_col) / open_col
    skew = (ta.skew(close_col) - open_col) / open_col
    variance = (ta.variance(close_col) - open_col) / open_col

    new_columns = [
        fisher - fisher.rolling(rolling_window).mean(),
        dpo - dpo.rolling(rolling_window).mean(),
        corr - corr.rolling(rolling_window).mean(),
        adx['ADX_14'] - adx['ADX_14'].rolling(rolling_window).mean(),
        rsi - rsi.rolling(rolling_window).mean(),
        uo - uo.rolling(rolling_window).mean(),
        willr - willr.rolling(rolling_window).mean(),
        kc_upper - kc_upper.rolling(rolling_window).mean(),
        kama - kama.rolling(rolling_window).mean(),
        vtx - vtx.rolling(rolling_window).mean(),
        cmf - cmf.rolling(rolling_window).mean(),
        kst - kst.rolling(rolling_window).mean(),
        eom - eom.rolling(rolling_window).mean(),
        bop - bop.rolling(rolling_window).mean(),
        cci - cci.rolling(rolling_window).mean(),
        cmo - cmo.rolling(rolling_window).mean(),
        efi - efi.rolling(rolling_window).mean(),
        ema - ema.rolling(rolling_window).mean(),
        hma - hma.rolling(rolling_window).mean(),
        linreg - linreg.rolling(rolling_window).mean(),
        slope - slope.rolling(rolling_window).mean(),
        median - median.rolling(rolling_window).mean(),
        macd - macd.rolling(rolling_window).mean(),
        mom - mom.rolling(rolling_window).mean(),
        obv - obv.rolling(rolling_window).mean(),
        pvo - pvo.rolling(rolling_window).mean(),
        roc - roc.rolling(rolling_window).mean(),
        pvt - pvt.rolling(rolling_window).mean(),
        sma - sma.rolling(rolling_window).mean(),
        stddev - stddev.rolling(rolling_window).mean(),
        supertrend - supertrend.rolling(rolling_window).mean(),
        t3 - t3.rolling(rolling_window).mean(),
        trima - trima.rolling(rolling_window).mean(),
        true_range - true_range.rolling(rolling_window).mean(),
        wma - wma.rolling(rolling_window).mean(),
        kurtosis - kurtosis.rolling(rolling_window).mean(),
        mad - mad.rolling(rolling_window).mean(),
        skew - skew.rolling(rolling_window).mean(),
        variance - variance.rolling(rolling_window).mean(),
        tema - tema.rolling(rolling_window).mean(),
        nvi - nvi.rolling(rolling_window).mean(),
        pvi - pvi.rolling(rolling_window).mean(),
        chop - chop.rolling(rolling_window).mean(),
        atr - atr.rolling(rolling_window).mean(),
        ao - ao.rolling(rolling_window).mean(),
        dm - dm.rolling(rolling_window).mean(),
        dema - dema.rolling(rolling_window).mean(),
        bias - bias.rolling(rolling_window).mean(),
        cfo - cfo.rolling(rolling_window).mean(),
        cti - cti.rolling(rolling_window).mean(),
        inertia - inertia.rolling(rolling_window).mean(),
        ppo - ppo.rolling(rolling_window).mean(),
        trix - trix.rolling(rolling_window).mean(),
        alma - alma.rolling(rolling_window).mean(),
        fwma - fwma.rolling(rolling_window).mean(),
        hilo - hilo.rolling(rolling_window).mean(),
        hl2 - hl2.rolling(rolling_window).mean(),
        hwma - hwma.rolling(rolling_window).mean(),
        midpoint - midpoint.rolling(rolling_window).mean(),
        midprice - midprice.rolling(rolling_window).mean(),
        pwma - pwma.rolling(rolling_window).mean(),
        sinwma - sinwma.rolling(rolling_window).mean(),
        percent_return - percent_return.rolling(rolling_window).mean()  # No rolling mean needed
    ]

    # Stack the columns side by side and return
    indicator_matrix = np.column_stack(new_columns)

    return indicator_matrix

indicator_arrays = []

for r in range(ticker_matrix.shape[0]):
    ind_per_stock = add_indicators(ticker_matrix, r)
    indicator_arrays.append(ind_per_stock)
    num_of_indicators = ind_per_stock.shape[0]

#create new matrix for only indicators for each ticker
indicator_matrix =  np.stack(indicator_arrays, axis = 0)
print(indicator_matrix.shape)


(126, 1013, 67)


In [8]:
num_stocks = len(tickers)

#cros rank and scale rankings of indicators
def cross_rank(matrix):
    ranked_matrix = np.zeros(matrix.shape)

    for j in range(matrix.shape[1]): 
        
        for k in range(matrix.shape[2]):
            
            values_at_time_j = matrix[:, j, k]
            
            ranked_values = pd.Series(values_at_time_j).rank() / num_stocks

            ranked_values = ranked_values.fillna(0)
            
            ranked_matrix[:, j, k] = ranked_values.to_numpy()

    return ranked_matrix

ranked_indicator_mat = cross_rank(indicator_matrix)
print(ranked_indicator_mat.shape)


(126, 1013, 67)


In [10]:
#bin all ranked indicator values into quintiles
def quantiler(matrix):
    quantiled_matrix = np.zeros(matrix.shape)

    for j in range(matrix.shape[1]): 
        
        for k in range(matrix.shape[2]):

            values_at_time_j = matrix[:, j, k]
            values_series = pd.Series(values_at_time_j)

            quantiled_vals = pd.cut(
                values_series,
                bins=[0, 1/5, 2/5, 3/5, 4/5, 1], 
                labels=[1, 2, 3, 4, 5], 
                include_lowest=True
            ).astype(int)
            
            quantiled_matrix[:, j, k] = quantiled_vals.to_numpy()

    return quantiled_matrix

quantiled_indicator_mat = quantiler(ranked_indicator_mat)
print(quantiled_indicator_mat.shape)

(126, 1013, 67)


In [12]:
#combined quantiles values and initial price values/date values (required for later label calculations and such)
quantiled_stocks_and_vals = np.concatenate((ticker_matrix ,quantiled_indicator_mat), axis = 2)
print(quantiled_stocks_and_vals.shape)
#columns on z axis 0-4 are my open,hi,lo,cl,vol, 5-6 are day, hr, rest are inds 
#cols 0-5 on x are my stocks
#cols on y are timestep

(126, 1013, 74)


In [14]:
# Apply initial filter to select only Mondays (day=0, hour=14) and Fridays (day=4, hour=20)
filter = (
    ((quantiled_stocks_and_vals[:, :, 5] == 0) & (quantiled_stocks_and_vals[:, :, 6] == 14)) |
    ((quantiled_stocks_and_vals[:, :, 5] == 4) & (quantiled_stocks_and_vals[:, :, 6] == 20))
)

mondays_fridays_matrix = quantiled_stocks_and_vals[filter]
#have to reshape arrays after 
mondays_fridays_matrix = mondays_fridays_matrix.reshape(quantiled_stocks_and_vals.shape[0], -1, quantiled_stocks_and_vals.shape[2])

# Remove initial row if it starts with a Friday, or last row if it ends with a Monday
if mondays_fridays_matrix[0, 0, 5] == 4:
    cleaned_matrix = mondays_fridays_matrix[:, 1:, :]
elif mondays_fridays_matrix[0, -1, 5] == 0:
    cleaned_matrix = mondays_fridays_matrix[:, :-1, :]
else:
    cleaned_matrix = mondays_fridays_matrix

# Filter out consecutive Fridays
rows_to_keep_fri = [True]
for i in range(1, cleaned_matrix.shape[1]):
    if cleaned_matrix[0, i, 5] == 4 and cleaned_matrix[0, i - 1, 5] == 4:
        rows_to_keep_fri.append(False)
    else:
        rows_to_keep_fri.append(True)

filtered_matrix_initial = cleaned_matrix[:, rows_to_keep_fri, :]

# Ensure rows_to_keep_mon matches the exact number of rows
rows_to_keep_mon = [True]  # Start with the first row always being kept
for i in range(1, filtered_matrix_initial.shape[1]):
    if filtered_matrix_initial[0, i, 5] == 0 and filtered_matrix_initial[0, i - 1, 5] == 0:
        rows_to_keep_mon.append(False)
    else:
        rows_to_keep_mon.append(True)

# Ensure the boolean mask length matches the dimension of filtered_matrix_initial
if len(rows_to_keep_mon) != filtered_matrix_initial.shape[1]:
    rows_to_keep_mon = rows_to_keep_mon[:filtered_matrix_initial.shape[1]]

# Apply the final filtering
filtered_matrix_final = filtered_matrix_initial[:, rows_to_keep_mon, :]

print(filtered_matrix_final.shape)


(126, 50, 74)


In [16]:
def calc_FR(matrix):
    FR_mats = []
    for i in range(matrix.shape[0]):
        forward_return = []
        for j in range(int((matrix.shape[1])/2)):
            forward_return.append(((matrix[i, 2*j+1, 0] - matrix[i, 2*j, 3])/matrix[i, 2*j, 3]) * 100)
            forward_return.append(0)
        
        FR_mats.append(np.array(forward_return).reshape(-1, 1))
    new_mat = np.stack(FR_mats)
    
    matrix_FR = np.concatenate((matrix, new_mat), axis = 2)

    return matrix_FR

matrix_with_FR = calc_FR(filtered_matrix_final)
print(matrix_with_FR.shape)

(126, 50, 75)


In [36]:
def remove_fridays(matrix):
    #5th column is day index
    mask = matrix[:, :, 5] != 4
    
    cleaned_matrix = matrix[mask]
    
    cleaned_matrix = cleaned_matrix.reshape(matrix.shape[0], -1, matrix.shape[2])
    
    return cleaned_matrix

monday_matrix = remove_fridays(matrix_with_FR)
print(monday_matrix.shape)

(126, 25, 75)


In [34]:
#labelling thresholds
top_threshold = int(0.3 * num_stocks)
bottom_threshold = int(0.7 * num_stocks)

#ranking forward return, then labelling 1 or -1 or 0
def rank_label_FR(matrix): 
    labeled_matrix = np.zeros((matrix.shape[0], matrix.shape[1], 1)) 
    for j in range(matrix.shape[1]): 
        values_at_time_j = matrix[:, j, -1]

        ranked_values = pd.Series(values_at_time_j).rank()
    
        labeled_values = ranked_values.apply(
            lambda x: 1 if x <= top_threshold else (-1 if x > bottom_threshold else 0))
    
        labeled_matrix[:, j, -1] = labeled_values.to_numpy()

    return labeled_matrix

#concat my monday matrix with my label columns, delete top two y (date) rows, this gets rid of first two mondays which will have nan
#values from indicator calcs
labeled_matrix = np.concatenate((monday_matrix, rank_label_FR(monday_matrix)), axis = 2)[:, 2:, :]

print(labeled_matrix.shape)

(126, 23, 76)


In [56]:
def filter_labels(matrix):
    filtered_labeled_matrix = []
    for i in range(matrix.shape[0]):
        mask = matrix[i, :, -1] != 0
        
        filtered_matrix = matrix[i, mask, :]

        filtered_labeled_matrix.append(filtered_matrix)

    total_filtered_matrix = np.concatenate(filtered_labeled_matrix, axis = 0)
    
    return total_filtered_matrix

# 7 onwars means that we only include quantile vals for each stock and fr return labels
#unfortunately second to last row is raw fr vals, as such just need to remmber to ignore them later
training_mat = filter_labels(labeled_matrix)[:, 7:]
#result mat is 2D, no need for 3D as do not need to differentiate when training until doing cross market 
print(training_mat)

[[1.0 2.0 4.0 ... 2.0 4.112669388721219 -1.0]
 [3.0 1.0 4.0 ... 1.0 -5.537421977896116 1.0]
 [1.0 5.0 1.0 ... 5.0 5.8051258633483735 -1.0]
 ...
 [1.0 3.0 2.0 ... 5.0 -4.31405187959495 1.0]
 [5.0 4.0 5.0 ... 4.0 -1.7080291970802945 1.0]
 [5.0 3.0 2.0 ... 4.0 2.1973160300840506 -1.0]]


In [76]:
X_ = training_mat[:, :-2]
y = training_mat[:, -1]

y = y.astype(int)
X = X.astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

ada = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=80,
    algorithm='SAMME',
    learning_rate=1,
    random_state=42
)

ada.fit(X_train, y_train)

y_pred = ada.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Print full classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Underperformer (-1)', 'Overperformer (1)']))


Accuracy: 0.65

Classification Report:
                     precision    recall  f1-score   support

Underperformer (-1)       0.63      0.68      0.65       424
  Overperformer (1)       0.66      0.61      0.64       439

           accuracy                           0.65       863
          macro avg       0.65      0.65      0.65       863
       weighted avg       0.65      0.65      0.65       863

