In [23]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import joblib
import os

In [3]:
df = pd.read_csv("../data/all_stocks_5yr.csv")

df.head()

Unnamed: 0,date,open,high,low,close,volume,Name
0,2013-02-08,15.07,15.12,14.63,14.75,8407500,AAL
1,2013-02-11,14.89,15.01,14.26,14.46,8882000,AAL
2,2013-02-12,14.45,14.51,14.1,14.27,8126000,AAL
3,2013-02-13,14.3,14.94,14.25,14.66,10259500,AAL
4,2013-02-14,14.94,14.96,13.16,13.99,31879900,AAL


In [5]:
df['ma7'] = df.groupby('Name')['close'].transform(lambda x: x.rolling(7).mean())
df['ma21'] = df.groupby('Name')['close'].transform(lambda x: x.rolling(21).mean())

In [6]:
# Volatility (7-day rolling std)
df['volatility7'] = df.groupby('Name')['close'].transform(lambda x: x.rolling(7).std())

# RSI (14-day)
def compute_rsi(series, period=14):
    delta = series.diff()
    gain = np.where(delta > 0, delta, 0)
    loss = np.where(delta < 0, -delta, 0)
    avg_gain = pd.Series(gain).rolling(period).mean()
    avg_loss = pd.Series(loss).rolling(period).mean()
    rs = avg_gain / avg_loss
    return 100 - (100 / (1 + rs))

In [7]:
df['rsi14'] = df.groupby('Name')['close'].transform(compute_rsi)

# Momentum
df['momentum7'] = df.groupby('Name')['close'].transform(lambda x: x.diff(7))
df['momentum21'] = df.groupby('Name')['close'].transform(lambda x: x.diff(21))

# MA difference
df['ma_diff'] = df['ma7'] - df['ma21']

# Volume ratio (vs 7-day average)
df['vol_ratio'] = df.groupby('Name')['volume'].transform(lambda x: x / x.rolling(7).mean())

# Drop rows with NaNs caused by rolling calculations
df = df.dropna().reset_index(drop=True)

df.head()

Unnamed: 0,date,open,high,low,close,volume,Name,ma7,ma21,volatility7,rsi14,momentum7,momentum21,ma_diff,vol_ratio
0,2013-03-12,15.14,15.6,14.95,15.5,8999100,AAL,14.698571,14.096667,0.57258,83.180428,1.89,0.75,0.601905,0.972515
1,2013-03-13,15.54,16.2,15.48,15.91,11380000,AAL,14.985714,14.165714,0.608245,84.89011,2.01,1.45,0.82,1.171829
2,2013-03-14,15.98,16.36,15.93,16.25,8383300,AAL,15.3,14.26,0.612536,85.449735,2.2,1.98,1.04,0.854363
3,2013-03-15,16.45,16.54,15.88,15.98,17667700,AAL,15.501429,14.322857,0.56224,92.285714,1.41,1.32,1.178571,1.691593
4,2013-03-18,15.8,16.33,15.71,16.29,6514100,AAL,15.711429,14.432381,0.539365,92.436975,1.47,2.3,1.279048,0.646793


In [9]:
# Features & target
features = ['open', 'high', 'low', 'volume', 'ma7', 'ma21', 'volatility7', 
            'rsi14', 'momentum7', 'momentum21', 'ma_diff', 'vol_ratio']
target = 'close'

X = df[features]
y = df[target]


In [10]:
# Split by time (not random shuffle)
split_index = int(len(df) * 0.8)
X_train, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Train model
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

In [14]:
model.fit(X_train_scaled, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [18]:
# Evaluate
y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Test RMSE: {rmse:.4f}")

Test RMSE: 0.7993


In [24]:
# Ensure model directory exists
os.makedirs("../model", exist_ok=True)

# Save model and scaler
joblib.dump(model, "../model/stock_price_xgb.pkl")
joblib.dump(scaler, "../model/scaler.pkl")

print("Model and scaler saved in /model folder")

Model and scaler saved in /model folder
