In [7]:
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb    # <-- XGBoost
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
warnings.filterwarnings("ignore")

%matplotlib inline

In [8]:
# Load the AAPL price history CSV from the current working directory.
df = pd.read_csv(filepath_or_buffer='AAPL.csv')

In [9]:
# Inspect the column names exactly as they were read from the CSV.
df.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Brand_Name',
       'Ticker', 'Industry_Tag', 'Country', 'Dividends', 'Stock Splits'],
      dtype='object')

In [10]:
# Normalise headers to snake_case: lower-case, spaces replaced by underscores
# (e.g. 'Stock Splits' -> 'stock_splits').
df.columns = [name.lower().replace(' ', '_') for name in df.columns]

In [11]:
# Keep only the first row for every 'date' value, then renumber rows from 0.
is_repeat_date = df.duplicated(subset='date', keep='first')
df = df.loc[~is_repeat_date].reset_index(drop=True)

In [12]:
# Preview the first rows after the header clean-up and de-duplication.
df.head()

Unnamed: 0,date,open,high,low,close,volume,brand_name,ticker,industry_tag,country,dividends,stock_splits
0,2010-01-04 05:00:00+00:00,6.469951,6.502387,6.438122,6.487534,493729600.0,apple,AAPL,technology,usa,0.0,0.0
1,2010-01-05 05:00:00+00:00,6.50542,6.535431,6.464496,6.498751,601904800.0,apple,AAPL,technology,usa,0.0,0.0
2,2010-01-06 05:00:00+00:00,6.49875,6.524518,6.38871,6.395379,552160000.0,apple,AAPL,technology,usa,0.0,0.0
3,2010-01-07 05:00:00+00:00,6.419024,6.426603,6.337176,6.383556,477131200.0,apple,AAPL,technology,usa,0.0,0.0
4,2010-01-08 05:00:00+00:00,6.375067,6.426602,6.337478,6.425995,447610800.0,apple,AAPL,technology,usa,0.0,0.0


In [13]:
# Chronological 60/20/20 split into train / validation / test.
# The rows are daily prices, so a shuffled split would place future
# observations in the training set and scramble the row order that
# shift()/rolling() features depend on. shuffle=False keeps the original order
# (assumes the CSV rows are already sorted by date -- TODO confirm).
df_full_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, shuffle=False)

In [14]:
# Row counts of the train / validation / test partitions.
tuple(map(len, (df_train, df_val, df_test)))

(2335, 779, 779)

In [15]:
# Give every partition a fresh 0..n-1 index, discarding the old row labels.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [16]:
def add_target(df, horizon=5):
    """Add the forward-return column and the binary up/down label to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"Close"`` column. The frame is modified in place.
    horizon : int, default 5
        Number of rows to look ahead when computing the forward return.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with two extra columns:

        * ``"target_return_{horizon}d"`` (``"target_return_5d"`` by default):
          ``Close[t + horizon] / Close[t] - 1``.
        * ``"target"``: 1 when that return is strictly positive, else 0.

    Notes
    -----
    The last ``horizon`` rows have no future close, so their return is NaN and
    ``NaN > 0`` evaluates to False: they receive ``target == 0``. Callers should
    drop rows whose return column is NaN before training.
    """
    return_col = f"target_return_{horizon}d"
    df[return_col] = df["Close"].shift(-horizon) / df["Close"] - 1
    df["target"] = (df[return_col] > 0).astype(int)
    return df

In [17]:
# ---------- FEATURE ENGINEERING ----------
def add_price_action_features(df):
    """Append return, volatility, range, RSI and trendline columns to ``df``.

    Reads the ``"Close"``, ``"High"`` and ``"Low"`` columns and writes, in this
    order: ``return_1d``, ``return_3d``, ``return_5d``, ``volatility_10``,
    ``atr``, ``atr_14``, ``rsi``, ``range``, followed by whatever
    ``add_trendline_breakout_features`` adds. ``df`` is modified in place and
    the result of the trendline helper is returned.

    Note that ``atr`` here is the plain per-row ``High - Low`` spread (the same
    value as ``range``), and ``atr_14`` is its 14-row rolling mean.
    """
    close = df["Close"]

    # Percentage change of the close over 1, 3 and 5 rows.
    return_windows = {"return_1d": 1, "return_3d": 3, "return_5d": 5}
    for column, periods in return_windows.items():
        df[column] = close.pct_change(periods)

    # Rolling standard deviation of the 1-row return.
    df["volatility_10"] = df["return_1d"].rolling(10).std()

    # High-low spread and its 14-row average.
    df["atr"] = df["High"] - df["Low"]
    df["atr_14"] = df["atr"].rolling(14).mean()

    df["rsi"] = compute_rsi(close, 14)

    df["range"] = df["High"] - df["Low"]

    # Trendline breakout features
    return add_trendline_breakout_features(df)

In [18]:
# ---------- RSI ----------
def compute_rsi(series, period=14):
    """Relative Strength Index built from simple rolling means.

    Parameters
    ----------
    series : pandas.Series
        Price series.
    period : int, default 14
        Window length for averaging the up-moves and the down-moves.

    Returns
    -------
    pandas.Series
        ``100 - 100 / (1 + mean_up / mean_down)``, aligned with ``series``.
        Leading values are NaN until a full window of row-to-row changes is
        available.
    """
    change = series.diff()

    def rolling_avg(values):
        # Plain (unweighted) mean over the trailing `period` rows.
        return values.rolling(period).mean()

    mean_up = rolling_avg(change.clip(lower=0))      # positive moves, negatives -> 0
    mean_down = rolling_avg(-change.clip(upper=0))   # magnitude of negative moves

    relative_strength = mean_up / mean_down
    rsi = 100 - (100 / (1 + relative_strength))
    return rsi


In [19]:
# ---------- TRENDLINE BREAKOUTS ----------
def _trendline_value(pivot_pos, prices, i, lookback, n_pivots=3):
    """Value at row ``i`` of the line fitted through the latest pivots before ``i``.

    ``pivot_pos`` holds the sorted row positions of the pivots. Only pivots in
    ``[i - lookback, i)`` are considered; returns None when fewer than
    ``n_pivots`` of them exist.
    """
    start = np.searchsorted(pivot_pos, i - lookback, side="left")
    stop = np.searchsorted(pivot_pos, i, side="left")
    if stop - start < n_pivots:
        return None
    x = pivot_pos[stop - n_pivots:stop]
    slope, intercept = np.polyfit(x, prices[x], 1)
    return slope * i + intercept


def add_trendline_breakout_features(df, lookback=80):
    """Add swing-point flags and trendline-breakout features to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``"High"``, ``"Low"`` and ``"Close"``. Modified in place.
        Rows are addressed by position, so any index is accepted.
    lookback : int, default 80
        Number of preceding rows searched for swing points when building the
        trendlines for a given row.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with these columns added:

        * ``swing_high`` / ``swing_low``: 1 where High (Low) is strictly above
          (below) both neighbouring rows, else 0.
        * ``trendline_down_break``: 1 where Close is above the straight line
          fitted through the last three swing highs in the lookback window.
        * ``trendline_up_break``: 1 where Close is below the line fitted
          through the last three swing lows in the lookback window.
        * ``breakout_strength``: distance between Close and the broken line;
          if both breaks occur on one row the swing-low distance is kept.

    The first ``lookback`` rows always keep the zero defaults.
    """
    df["swing_high"] = ((df["High"] > df["High"].shift(1)) &
                        (df["High"] > df["High"].shift(-1))).astype(int)

    df["swing_low"] = ((df["Low"] < df["Low"].shift(1)) &
                       (df["Low"] < df["Low"].shift(-1))).astype(int)

    # Work on positional NumPy arrays: this avoids mixing iloc positions with
    # loc labels (which breaks on non-default indexes) and the per-row
    # DataFrame query/assignment overhead.
    high = df["High"].to_numpy(dtype=float)
    low = df["Low"].to_numpy(dtype=float)
    close = df["Close"].to_numpy(dtype=float)
    n_rows = len(df)

    high_pos = np.flatnonzero(df["swing_high"].to_numpy())
    low_pos = np.flatnonzero(df["swing_low"].to_numpy())

    up_break = np.zeros(n_rows, dtype=np.int64)
    down_break = np.zeros(n_rows, dtype=np.int64)
    strength = np.zeros(n_rows, dtype=float)

    for i in range(lookback, n_rows):
        # Line through the last 3 swing highs; a close above it is a break.
        line = _trendline_value(high_pos, high, i, lookback)
        if line is not None and close[i] > line:
            down_break[i] = 1
            strength[i] = close[i] - line

        # Line through the last 3 swing lows; a close below it is a break.
        line = _trendline_value(low_pos, low, i, lookback)
        if line is not None and close[i] < line:
            up_break[i] = 1
            strength[i] = line - close[i]

    df["trendline_up_break"] = up_break
    df["trendline_down_break"] = down_break
    df["breakout_strength"] = strength

    return df


In [20]:
# ---------- FULL MODEL PIPELINE ----------
def run_pipeline(df, random_state=42):
    """Build features and labels from ``df`` and benchmark three classifiers.

    Parameters
    ----------
    df : pandas.DataFrame
        Price history with close/high/low columns, named either ``"Close"``,
        ``"High"``, ``"Low"`` or their lower-case equivalents. The caller's
        frame is not modified.
    random_state : int, default 42
        Seed passed to the XGBoost and random-forest models so repeated runs
        give the same scores.

    Returns
    -------
    dict
        ``{model_name: {"accuracy": float, "auc": float}}`` for "XGBoost",
        "Random Forest" and "Logistic Regression", evaluated on the last 25%
        of the rows (chronological hold-out, no shuffling).
    """
    # The feature helpers index the capitalised names. Map lower-case variants
    # onto them so frames whose headers were normalised to snake_case still
    # work. rename() returns a new frame, so the in-place helpers below never
    # touch the caller's DataFrame.
    required = ("Close", "High", "Low")
    aliases = {
        name.lower(): name
        for name in required
        if name not in df.columns and name.lower() in df.columns
    }
    df = df.rename(columns=aliases)

    df = add_price_action_features(df)
    df = add_target(df)

    features = [
        "return_1d", "return_3d", "return_5d",
        "volatility_10", "atr_14", "rsi",
        "range",
        "trendline_up_break", "trendline_down_break", "breakout_strength"
    ]

    # Only the model inputs and the forward return have to be present; a bare
    # dropna() would also discard rows over gaps in unrelated columns.
    df = df.dropna(subset=features + ["target_return_5d"])

    X = df[features]
    y = df["target"]

    # shuffle=False: train on the earliest 75% of rows, test on the latest 25%.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, shuffle=False
    )

    # MODELS
    models = {
        "XGBoost": xgb.XGBClassifier(
            max_depth=4,
            learning_rate=0.08,
            n_estimators=300,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=random_state
        ),
        "Random Forest": RandomForestClassifier(
            n_estimators=500,
            max_depth=6,
            random_state=random_state
        ),
        "Logistic Regression": LogisticRegression(max_iter=300)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        # ROC AUC measures ranking quality, so score it on the positive-class
        # probability rather than on the thresholded 0/1 predictions.
        proba = model.predict_proba(X_test)[:, 1]

        results[name] = {
            "accuracy": accuracy_score(y_test, pred),
            "auc": roc_auc_score(y_test, proba),
        }

    return results