$\textbf{1. Settings and Data Load}$

In [117]:
import torch
import pandas as pd
import numpy as np
import pandas_ta as ta
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

To properly load the data, make sure the data is in the form of 
1. Date
2. Price
3. Open
4. High
5. Low
6. Vol. 

In [118]:
filename = "QQQ Historical Data.csv"
positive_cutoff = 0.1 # Discriminate bewtween "Neutral” and "Positive“
negative_cutoff = -0.1 # Discriminate bewtween "Neutral” and "Negative”
window_size = 15 # Number of consecutive trading days that will be considered



try:
    raw_data = pd.read_csv(filename)
except Exception as e:
    print("Unable to open the file")
    print(f"The error is: {e}")
if 'Unnamed: 0' in raw_data.columns:
    raw_data = raw_data.drop(columns=['Unnamed: 0'])

df = raw_data
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
returns = []
for i in range(len(df)):
    if i + 5 < len(df):
        future_prices = df.loc[i+1:i+3, 'Price']
        current_price = df.loc[i, 'Price']
        avg_return = ((future_prices - current_price) / current_price).mean() * 100
        returns.append(avg_return)
    else:
        returns.append(None)

df['Expected Return'] = returns

df = df.dropna().reset_index(drop=True)

def categorize_return(r):
    if pd.isna(r):
        return None
    elif r > positive_cutoff:
        return 1
    elif r < negative_cutoff:
        return 3
    else:
        return 2

df['Expected Move'] = df['Expected Return'].apply(categorize_return)

# df['1'] = df['Expected Return'].where(df['Expected Move'] == 1).dropna()
# df['2'] = df['Expected Return'].where(df['Expected Move'] == 2).dropna()
# df['3'] = df['Expected Return'].where(df['Expected Move'] == 3).dropna()
# print(len(df['1'].dropna()), len(df['2'].dropna()), len(df['3'].dropna()))

df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')
df['Weekday'] = df['Date'].dt.weekday + 1

df['Close'] = df['Price']

# Add the technical indicators here
df['RSI'] = ta.rsi(df['Close'], length=14)
df['HMA_20'] = ta.hma(df['Close'], length=20)
df['HMA_60'] = ta.hma(df['Close'], length=60)

macd = ta.macd(df['Close'])
df = pd.concat([df, macd], axis=1)

stoch = ta.stoch(df['High'], df['Low'], df['Close'])
df = pd.concat([df, stoch], axis=1)

df['KDJ_J'] = 3 * df['STOCHk_14_3_3'] - 2 * df['STOCHd_14_3_3']
stochrsi = ta.stochrsi(df['Close'])
df = pd.concat([df, stochrsi], axis=1)


def convert_volume(val):
    if isinstance(val, str):
        val = val.replace(',', '').strip()
        if val.endswith('M'):
            return float(val[:-1]) * 1_000_000
        elif val.endswith('K'):
            return float(val[:-1]) * 1_000
        elif val == '-':
            return None
    try:
        return float(val)
    except:
        return None

df['Vol.'] = df['Vol.'].apply(convert_volume)
df['Change %'] = df['Change %'].str.replace('%', '', regex=False).astype(float) / 100
df['Open'] = df['Open'] / df['Price'] * (1+df['Change %'])
df['High'] = df['High'] / df['Price'] * (1+df['Change %'])
df['Low'] = df['Low'] / df['Price'] * (1+df['Change %'])
df['HMA_20'] = df['HMA_20'] / df['Price']
df['HMA_60'] = df['HMA_60'] / df['Price']

print(df.tail(1))
df = df.dropna().reset_index(drop=True)

X = []
Y = []
#print(df.columns.tolist())

features = [ 'Open', 'High', 'Low', 'Vol.', 'Change %', 'Weekday', 'Close', 'RSI', 'HMA_20', 'HMA_60', 'MACD_12_26_9', 'MACDh_12_26_9', 'MACDs_12_26_9', 'STOCHk_14_3_3', 'STOCHd_14_3_3', 'KDJ_J', 'STOCHRSIk_14_14_3_3', 'STOCHRSId_14_14_3_3']
for i in range(window_size, len(df)):
    x_window = df.iloc[i - window_size:i][features].values
    y_label = df.iloc[i-1]['Expected Move']

    X.append(x_window)
    Y.append(y_label)

X = np.array(X)
Y = np.array(Y)

print(X.shape, Y.shape)

           Date   Price      Open      High       Low        Vol.  Change %  \
4993 2024-11-15  496.57  0.988723  0.989489  0.972111  51460000.0   -0.0238   

      Expected Return  Expected Move  Weekday  ...    HMA_20    HMA_60  \
4993         1.137134              1        5  ...  1.034713  1.018613   

      MACD_12_26_9  MACDh_12_26_9  MACDs_12_26_9  STOCHk_14_3_3  \
4993         5.421       0.053046       5.367953      69.389465   

      STOCHd_14_3_3      KDJ_J  STOCHRSIk_14_14_3_3  STOCHRSId_14_14_3_3  
4993      82.595036  42.978322            59.235984            79.304008  

[1 rows x 22 columns]
(4914, 15, 18) (4914,)


In [119]:
X_flat = X.reshape((X.shape[0], -1))

X_train, X_test, y_train, y_test = train_test_split(X_flat, Y, test_size=0.2, random_state=42)

$\textbf{2. XGBoost Prediction}$

In [120]:
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=3,  # Assuming classes are 1, 2, 3. XGBoost expects 0-based indexing
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=3
)

# IMPORTANT: convert y to zero-based labels (i.e., 1→0, 2→1, 3→2)
y_train_adj = y_train - 1
y_test_adj = y_test - 1

model.fit(X_train, y_train_adj)

# Step 4: Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test_adj, y_pred, target_names=["Positive", "Netural", "Negative"]))
print(confusion_matrix(y_test_adj, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

    Positive       0.59      0.77      0.67       532
     Netural       1.00      0.02      0.03        61
    Negative       0.52      0.39      0.45       390

    accuracy                           0.57       983
   macro avg       0.71      0.39      0.38       983
weighted avg       0.59      0.57      0.54       983

[[411   0 121]
 [ 44   1  16]
 [239   0 151]]
