In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import xgboost as xgb
import joblib
import os

In [3]:
df = pd.read_csv('../data/all_stocks_5yr.csv')

In [4]:
df.head(10)

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL
5,2013-02-15,13.93,14.61,13.93,14.5,15628000,AAL
6,2013-02-19,14.33,14.56,14.08,14.26,11354400,AAL
7,2013-02-20,14.17,14.26,13.15,13.33,14725200,AAL
8,2013-02-21,13.62,13.95,12.9,13.37,11922100,AAL
9,2013-02-22,13.57,13.6,13.21,13.57,6071400,AAL


In [5]:
df.describe()

Unnamed: 0,open,high,low,close,volume
count,619029.0,619032.0,619032.0,619040.0,619040.0
mean,83.023334,83.778311,82.256096,83.043763,4321823.0
std,97.378769,98.207519,96.507421,97.389748,8693610.0
min,1.62,1.69,1.5,1.59,0.0
25%,40.22,40.62,39.83,40.245,1070320.0
50%,62.59,63.15,62.02,62.62,2082094.0
75%,94.37,95.18,93.54,94.41,4284509.0
max,2044.0,2067.99,2035.11,2049.0,618237600.0


In [6]:
df.isna().sum()

date       0
open      11
high       8
low        8
close      0
volume     0
Name       0
dtype: int64

In [7]:
# now i will parse the date and sort by data and ticker

df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['Name', 'date']).reset_index(drop=True)

In [8]:
# handling missing OHLC values

num_cols = ['open', 'high', 'low', 'close', 'volume']

df[num_cols] = (
    df.groupby('Name', group_keys=False)[num_cols]
      .transform(lambda x: x.ffill().bfill())
)

In [9]:
df.isna().sum()

date      0
open      0
high      0
low       0
close     0
volume    0
Name      0
dtype: int64

In [10]:
# Create useful features 
# Daily % change
df['pct_change'] = (
    df.groupby('Name')['close']
      .transform(lambda x: x.pct_change())
)

# 7-day and 21-day moving averages
df['ma_7'] = (
    df.groupby('Name')['close']
      .transform(lambda x: x.rolling(window=7, min_periods=1).mean())
)
df['ma_21'] = (
    df.groupby('Name')['close']
      .transform(lambda x: x.rolling(window=21, min_periods=1).mean())
)

# Rolling volatility (standard deviation over 7 days)
df['volatility_7'] = (
    df.groupby('Name')['close']
      .transform(lambda x: x.rolling(window=7, min_periods=1).std())
)

# Replace any remaining NaN in engineered features
df = df.fillna(0)

In [11]:
# Relative Strength Index (RSI)
def compute_rsi(series, window=14):
    delta = series.diff()
    gain = delta.clip(lower=0)
    loss = -1 * delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()
    rs = avg_gain / (avg_loss + 1e-9)  # avoid division by zero
    rsi = 100 - (100 / (1 + rs))
    return rsi

df['RSI_14'] = df.groupby('Name')['close'].transform(compute_rsi)

# Momentum: difference between close and moving averages
df['momentum_7'] = df['close'] - df['ma_7']
df['momentum_21'] = df['close'] - df['ma_21']

# Moving Average crossover signal (continuous)
df['ma_diff'] = df['ma_7'] - df['ma_21']

# Volume-based feature
df['vol_ratio_20'] = df['volume'] / df.groupby('Name')['volume'].transform(lambda x: x.rolling(20, min_periods=1).mean())

# Drop any rows with NaN from indicators
df = df.dropna()


In [12]:
df.head()

Unnamed: 0,date,open,high,low,close,volume,Name,pct_change,ma_7,ma_21,volatility_7,RSI_14,momentum_7,momentum_21,ma_diff,vol_ratio_20
1,2013-02-11,45.17,45.18,44.45,44.6,2915405,A,-0.010648,44.84,44.84,0.339411,0.0,-0.24,-0.24,0.0,1.230087
2,2013-02-12,44.81,44.95,44.5,44.62,2373731,A,0.000448,44.766667,44.766667,0.271539,4.0,-0.146667,-0.146667,0.0,1.001026
3,2013-02-13,44.81,45.24,44.68,44.75,2052338,A,0.002913,44.7625,44.7625,0.221867,23.809524,-0.0125,-0.0125,0.0,0.895608
4,2013-02-14,44.72,44.78,44.36,44.58,3826245,A,-0.003799,44.726,44.726,0.208758,18.75,-0.146,-0.146,0.0,1.472485
5,2013-02-15,43.48,44.24,42.21,42.25,14657315,A,-0.052266,44.313333,44.313333,1.027923,4.792332,-2.063333,-2.063333,0.0,3.180635


In [13]:
# Define target: up = 1, down = 0
# Binary target: 1 if next day close > today, else 0
df['target'] = (df['close'].shift(-1) > df['close']).astype(int)
df.dropna(inplace=True)

# next; Drop the last row (because target is NaN there after shift)
df.dropna(inplace=True)

# next — Select features
feature_cols = ['pct_change', 'ma_7', 'ma_21', 'volatility_7', 'volume', 
                'RSI_14', 'momentum_7', 'momentum_21', 'ma_diff', 'vol_ratio_20']

X = df[feature_cols]
y = df['target']

# Chronological train-test split (80/20)
split_point = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_point], X.iloc[split_point:]
y_train, y_test = y.iloc[:split_point], y.iloc[split_point:]

print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")
print(f"Positive class ratio in train: {y_train.mean():.2f}")

Train set: (494828, 10), Test set: (123707, 10)
Positive class ratio in train: 0.52


In [14]:
# initialize and train the random forest model

xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

In [15]:
# train the model

xgb_model.fit(X_train, y_train)
print("XGBoost training complete!")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost training complete!


In [16]:
# making predictions on the test set
y_pred = xgb_model.predict(X_test)

In [17]:
# model evaluation

# Accuracy and F1 score
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print(f"Test Accuracy: {acc:.4f}")
print(f"Test F1 Score: {f1:.4f}")

Test Accuracy: 0.5273
Test F1 Score: 0.6273


In [20]:
# Save XGBoost model
# Create folder one level up
os.makedirs('../model', exist_ok=True)

# Save model in that folder
model_path = '../model/xgb_stock_model.pkl'
xgb_model.save_model("../model/xgb_stock_model.json")
print(f"Model saved to '{model_path}'")

Model saved to '../model/xgb_stock_model.pkl'
