APS1052 Final Project

The focus if this project is the seed model from the book Machine Learning and Data Science Blueprints for Finance. The goal this project is to use regression modelto predict the future price of an ETF/Stock. 

# 1.Data Preprocessing


In [1]:
# Basic Tools
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [None]:
# Feature Engineering
import talib

# Feature Selection
from sklearn.feature_selection import SelectKBest, chi2, f_regression

In [3]:
# Machine Learning
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
import warnings
warnings.filterwarnings("ignore")

## Load data

Load the data (stocks)

In [5]:
# stocks
AAPL = pd.read_csv('data/stock/AAPL.csv')
AMD = pd.read_csv('data/stock/AMD.csv')
AMZN = pd.read_csv('data/stock/AMZN.csv')
GOOGL = pd.read_csv('data/stock/GOOGL.csv')
INTC = pd.read_csv('data/stock/INTC.csv')
MSFT = pd.read_csv('data/stock/MSFT.csv')
NVDA = pd.read_csv('data/stock/NVDA.csv')

stocks = [AAPL, AMD, AMZN, GOOGL, INTC, MSFT, NVDA]

# Use TSLA as testing etf to determine the effective of our model
TSLA = pd.read_csv('data/stock/TSLA.csv')

In [6]:
AAPL

Unnamed: 0,Date,Close,Volume,Open,High,Low
0,03/07/2025,$239.07,46273570,$235.105,$241.37,$234.76
1,03/06/2025,$235.33,45170420,$234.435,$237.86,$233.1581
2,03/05/2025,$235.74,47227640,$235.42,$236.55,$229.23
3,03/04/2025,$235.93,53798060,$237.705,$240.07,$234.68
4,03/03/2025,$238.03,47183990,$241.79,$244.0272,$236.112
5,02/28/2025,$241.84,56833360,$236.95,$242.09,$230.20
6,02/27/2025,$237.30,41153640,$239.41,$242.46,$237.06
7,02/26/2025,$240.36,44433560,$244.33,$244.98,$239.13
8,02/25/2025,$247.04,48013270,$248.00,$250.00,$244.91
9,02/24/2025,$247.10,51326400,$244.925,$248.86,$244.42


In [7]:
print(AAPL.dtypes)

Date      object
Close     object
Volume     int64
Open      object
High      object
Low       object
dtype: object


## Clean Data

In [None]:
def clean(df):
    new_df = pd.DataFrame()

    new_df["Date"] = pd.to_datetime(df.iloc[:, 0])

    columns_to_clean = ["Open", "High", "Low", "Close", "Volume"]

    for col in columns_to_clean:
        new_df[col] = df[col].astype(str).str.replace(r'[\$,]', '', regex=True).astype(float)

    return new_df

for index, stock in enumerate(stocks):
    stocks[index] = clean(stock)

TSLA = clean(TSLA)


In [9]:
TSLA

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,2025-03-07,259.32,266.2499,250.73,262.67,102369600.0
1,2025-03-06,272.06,272.65,260.02,263.45,98451570.0
2,2025-03-05,272.92,279.55,267.71,279.1,94042910.0
3,2025-03-04,270.93,284.35,261.8401,272.04,126706600.0
4,2025-03-03,300.34,303.94,277.3,284.65,115551400.0
5,2025-02-28,279.5,293.88,273.6,292.98,115697000.0
6,2025-02-27,291.16,297.23,280.88,281.95,101748200.0
7,2025-02-26,303.715,309.0,288.04,290.8,100118300.0
8,2025-02-25,327.025,328.89,297.2512,302.8,134228800.0
9,2025-02-24,338.14,342.3973,324.7,330.53,76052320.0


In [10]:
print(TSLA.dtypes)

Date      datetime64[ns]
Open             float64
High             float64
Low              float64
Close            float64
Volume           float64
dtype: object


# 2. Feature Engineering

In [None]:
import talib as ta

def get_indicators(df, indicators):
    if indicators is None:
        indicators = ['MACD', 'MFI', 'SMA', 'MOM', 'ROC', 'RSI', 'ATR', 'BBANDS']

    timeperiod = 14  # Standard time period for indicators

    # Create a copy to avoid modifying the original DataFrame
    df = df.copy()

    # Moving Average Convergence Divergence (MACD)
    if 'MACD' in indicators:
        macd, macdsignal, macdhist = ta.MACD(df['Close'].values, fastperiod=12, slowperiod=26, signalperiod=9)
        df['MACD_Diff'] = macd
        df['MACD_Signal'] = macdsignal
        df['MACD_Hist'] = macdhist

    # Money Flow Index (MFI)
    if 'MFI' in indicators:
        df['MFI'] = ta.MFI(df['High'].values, df['Low'].values, df['Close'].values, df['Volume'].values, timeperiod=timeperiod)

    # Simple Moving Average (SMA)
    if 'SMA' in indicators:
        df['SMA_20'] = ta.SMA(df['Close'].values, timeperiod=20)
        df['SMA_50'] = ta.SMA(df['Close'].values, timeperiod=50)

    # Momentum Indicator (MOM)
    if 'MOM' in indicators:
        df['MOM'] = ta.MOM(df['Close'].values, timeperiod=timeperiod)

    # Rate of Change (ROC)
    if 'ROC' in indicators:
        df['ROC'] = ta.ROC(df['Close'].values, timeperiod=timeperiod)

    # Relative Strength Index (RSI)
    if 'RSI' in indicators:
        df['RSI'] = ta.RSI(df['Close'].values, timeperiod=timeperiod)

    # Average True Range (ATR)
    if 'ATR' in indicators:
        df['ATR'] = ta.ATR(df['High'].values, df['Low'].values, df['Close'].values, timeperiod=timeperiod)

    # Bollinger Bands (BBANDS)
    if 'BBANDS' in indicators:
        df['Upper_BB'], df['Middle_BB'], df['Lower_BB'] = ta.BBANDS(df['Close'].values, timeperiod=20)

    # Fill NaN values generated by TA-Lib
    df.fillna(method='bfill', inplace=True)  # Backfill missing values

    return df


# 3. Feature Selection

In [None]:
train_dfs = []
for stock in stocks:
    stock_ind = get_indicators(stock, None)
    
    # Create target: next day’s closing price.
    stock_ind['Target'] = stock_ind['Close'].shift(-1)
    
    # Drop the last row (which has no target) and reset index.
    stock_ind = stock_ind.dropna().reset_index(drop=True)
    train_dfs.append(stock_ind)

# Combine training data from all stocks into one DataFrame.
train_data = pd.concat(train_dfs, ignore_index=True)


# Define features: use all columns except Date and Target.
all_features = train_data.columns.difference(['Date', 'Target'])
X_train = train_data[all_features]
y_train = train_data['Target']

Combined training data shape: (10060, 20)
Selected features: ['ATR', 'Close', 'High', 'Low', 'Lower_BB', 'Middle_BB', 'Open', 'SMA_20', 'SMA_50', 'Upper_BB']


In [None]:
# Use SelectKBest with f_regression to select top 10 features.
selector = SelectKBest(score_func=f_regression, k=10)
X_train_selected = selector.fit_transform(X_train, y_train)
selected_features = X_train.columns[selector.get_support()]
print("Selected features:", list(selected_features))

X_train = X_train[selected_features]


# 4. Model Selection

In [14]:
# It is often beneficial to scale features for regression.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Define a set of candidate regression models.

models = {
    'LinearRegression': LinearRegression(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'RandomForest': RandomForestRegressor(random_state=42),
    'GradientBoosting': GradientBoostingRegressor(random_state=42),
    'AdaBoost': AdaBoostRegressor(random_state=42),
    'ExtraTrees': ExtraTreesRegressor(random_state=42),
    'XGB': XGBRegressor(random_state=42, objective='reg:squarederror')
}

# Use 5-fold cross validation to evaluate each model (using R^2 score).
cv = KFold(n_splits=5, shuffle=True, random_state=42)

model_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='r2')
    model_scores[name] = scores.mean()
    print(f"{name}: Average R^2 = {scores.mean():.4f}")

# Select the best model based on the highest average R^2.
best_model_name = max(model_scores, key=model_scores.get)
best_model = models[best_model_name]
print(f"\nBest model selected: {best_model_name} with R^2 = {model_scores[best_model_name]:.4f}")

# Fit the chosen model on the entire training set.
best_model.fit(X_train_scaled, y_train)

LinearRegression: Average R^2 = 0.9995
Lasso: Average R^2 = 0.9989
ElasticNet: Average R^2 = 0.9925
RandomForest: Average R^2 = 0.9994
GradientBoosting: Average R^2 = 0.9994
AdaBoost: Average R^2 = 0.9962
ExtraTrees: Average R^2 = 0.9994
XGB: Average R^2 = 0.9993

Best model selected: LinearRegression with R^2 = 0.9995


# 5. Model Evaluation on TSLA

In [None]:
TSLA_ind = get_indicators(TSLA, indicators=None)
TSLA_ind['Target'] = TSLA_ind['Close'].shift(-1)
TSLA_ind = TSLA_ind.dropna().reset_index(drop=True)

# Select the same features for TSLA.
X_test = TSLA_ind[selected_features]
y_test = TSLA_ind['Target']

# Scale TSLA features using the same scaler.
X_test_scaled = scaler.transform(X_test)

# Predict on TSLA test data.
y_pred = best_model.predict(X_test_scaled)

# Calculate evaluation metrics.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("\nTSLA Test Set Evaluation Metrics:")
print(f"MAE: {mae:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R^2: {r2:.4f}")

ValueError: Found array with 0 sample(s) (shape=(0, 10)) while a minimum of 1 is required by StandardScaler.

# 6. Trading Strategy & Backtesting