<a href="https://colab.research.google.com/github/Gavinedelen/stock-classifier/blob/main/Binary_Stock_Movement_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Stock Movement Classifier**

##**Load Data**

In [None]:
!pip install yfinance --upgrade
!pip install ta

import yfinance as yf
import pandas as pd
import ta

print(yf.__version__)

Collecting yfinance
  Downloading yfinance-0.2.64-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading yfinance-0.2.64-py2.py3-none-any.whl (119 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.4/119.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yfinance
  Attempting uninstall: yfinance
    Found existing installation: yfinance 0.2.63
    Uninstalling yfinance-0.2.63:
      Successfully uninstalled yfinance-0.2.63
Successfully installed yfinance-0.2.64
Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=744a27f32b46e45088e19feb2bbdf8c737c3fca77b6a8c44f0cd65a7fd011d93
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installi

## **Install Features**

In [None]:
# Basing Model off of 8 total tickers, 2 from each of the following sectors:
# Technology (Apple, Microsoft), Finance (JP Morgan Chase, Goldman Sachs), Consumer Goods (Coca Cola, Proctor and Gamble), and Energy (ExxonMobil, Chevron)
# Downloading data from the companies' last five years
tickers = ["AAPL", "MSFT", "JPM", "GS", "KO", "PG", "XOM", "CVX"]
total_data = []


for ticker in tickers:
  df = yf.download(ticker, start="2020-01-01", end="2024-01-01")
  df["Ticker"] = ticker
  df["Date"] = df.index
  df.reset_index(drop=True)
  if isinstance(df.columns, pd.MultiIndex):
    df.columns = [col[0] if isinstance(col, tuple) else col for col in df.columns]
  df["SMA_5"] = df["Close"].rolling(window=5).mean()  # 5-day simple moving average
  df["SMA_10"] = df["Close"].rolling(window=10).mean()  # 10-day simple moving average
  df["Volatility"] = df["Close"].rolling(window=5).std()  # 5-day volatility
  df["Return"] = df["Close"].pct_change()  # Daily return
  df["Momentum"] = df["Close"] - df["Close"].shift(10) # Price Momentum
  close_series = df["Close"].squeeze()
  df["RSI_14"] = ta.momentum.RSIIndicator(close=close_series, window=14).rsi() # RSI (over/undervalued based on buyer trends)
  bb = ta.volatility.BollingerBands(close=close_series, window=20, window_dev=2) #Bollinger Bands (Same as RSI based on 20 day AVG and Standard Deviation)
  df["BB_upper"] = bb.bollinger_hband() # Upper Band (Mean +2SD)
  df["BB_lower"] = bb.bollinger_lband() # Lower Band (Mean -2SD)
  df["BB_width"] = df["BB_upper"] - df["BB_lower"] # Range of expected movement
  df["Volume_SMA_5"] = df["Volume"].rolling(window=5).mean() # 5-day simple moving average for volume
  volume_series = df["Volume"].squeeze()
  average_volume_series = df["Volume_SMA_5"].squeeze()
  df["Volume_Ratio"] = volume_series/average_volume_series #Volume today compared to last five
  df["DayOfWeek"] = pd.to_datetime(df["Date"]).dt.dayofweek # Accounts for daily trends
  df["Month"] = pd.to_datetime(df["Date"]).dt.month #Accounts for Monthly Trends
  df["Quarter"] = pd.to_datetime(df["Date"]).dt.quarter #Accounts for Quarterly Trends
  df["Target"] = (df["Close"].shift(-10) > df["Close"]).astype(int) # Label for output of Model
  df = df.dropna() # Drops rows with missing values
  total_data.append(df) #Combines features and all 8 tickers into one data frame


data = pd.concat(total_data, axis=0).reset_index(drop=True) # Keeps date as a column
data.head()


  df = yf.download(ticker, start="2020-01-01", end="2024-01-01")
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start="2020-01-01", end="2024-01-01")
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start="2020-01-01", end="2024-01-01")
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start="2020-01-01", end="2024-01-01")
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start="2020-01-01", end="2024-01-01")
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start="2020-01-01", end="2024-01-01")
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start="2020-01-01", end="2024-01-01")
[*********************100%***********************]  1 of 1 completed
  df = yf.download(ticker, start="2020-01-01", end="2024-01-01

Unnamed: 0,Close,High,Low,Open,Volume,Ticker,Date,SMA_5,SMA_10,Volatility,...,RSI_14,BB_upper,BB_lower,BB_width,Volume_SMA_5,Volume_Ratio,DayOfWeek,Month,Quarter,Target
0,78.307678,78.360872,77.069728,77.502529,126743200,AAPL,2020-01-30,77.041199,76.903621,1.503905,...,66.350107,79.389449,71.503525,7.885923,162736800.0,0.778823,3,1,1,1
1,74.835625,78.019961,74.540644,77.596833,199588400,AAPL,2020-01-31,76.615654,76.765079,1.802774,...,48.053169,79.295632,71.81882,7.476812,173346960.0,1.151381,4,1,1,1
2,74.630096,75.797926,73.072985,73.5759,173788400,AAPL,2020-02-03,76.601633,76.521599,1.82157,...,47.223012,79.072402,72.313578,6.758825,175716640.0,0.989026,0,2,1,1
3,77.093895,77.284909,75.831765,76.237966,136616400,AAPL,2020-02-04,76.657724,76.576725,1.833999,...,56.847227,79.012982,72.833604,6.179378,170593120.0,0.800832,1,2,1,1
4,77.722557,78.522873,77.118089,78.223051,118826800,AAPL,2020-02-05,76.51797,76.667396,1.686722,...,58.906394,78.853603,73.550544,5.303059,151112640.0,0.786346,2,2,1,0


## **Split Data into Training and Testing**

In [None]:
from sklearn.model_selection import train_test_split

feature_cols = ["SMA_5", "SMA_10", "Volatility", "Momentum", "Return", "RSI_14", "BB_width", "Volume_Ratio", "DayOfWeek", "Month"]


def split_by_ticker(data, feature_cols, target_col="Target", test_size=0.2, drop_ticker=False):
    train_parts = []
    test_parts = []

    tickers = data["Ticker"].unique()

    for ticker in tickers:
        ticker_data = data[data["Ticker"] == ticker].sort_values("Date")

        split_idx = int((1 - test_size) * len(ticker_data))
        train = ticker_data.iloc[:split_idx]
        test = ticker_data.iloc[split_idx:]

        train_parts.append(train)
        test_parts.append(test)

    train_data = pd.concat(train_parts).reset_index(drop=True)
    test_data = pd.concat(test_parts).reset_index(drop=True)

    X_train = train_data[feature_cols].copy()
    Y_train = train_data[target_col].copy()
    X_test = test_data[feature_cols].copy()
    Y_test = test_data[target_col].copy()

    if drop_ticker and "Ticker" in X_train.columns:
        X_train = X_train.drop(columns=["Ticker"])
        X_test = X_test.drop(columns=["Ticker"])

    return X_train, X_test, Y_train, Y_test, test_data

X_train, X_test, Y_train, Y_test, test_data = split_by_ticker(data, feature_cols)






## **Find XGB Parameters**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 2],
    'min_child_weight': [1, 3, 5]
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

random_search = RandomizedSearchCV(
    xgb,
    param_distributions=param_grid,
    scoring='accuracy',
    n_iter=25,  # try more for better results
    cv=3,
    verbose=1,
    random_state=42
)

random_search.fit(X_train, Y_train)
print("Best Parameters:", random_search.best_params_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best Parameters: {'subsample': 0.6, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 5, 'learning_rate': 0.01, 'gamma': 2, 'colsample_bytree': 1.0}


## **Train Models**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

!pip install xgboost

# Logistic Regression
log_model = LogisticRegression(max_iter=10000)
log_model.fit(X_train, Y_train)
log_pred = log_model.predict(X_test)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, Y_train)
rf_pred = rf_model.predict(X_test)

# XGBoost
xgb_model = XGBClassifier(max_depth=5, n_estimators=300, min_child_weight=5, learning_rate=0.01, subsample=0.6, gamma=2, colsample_bytree=1.0, use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, Y_train)
xgb_pred = xgb_model.predict(X_test)



Parameters: { "use_label_encoder" } are not used.



## **Evaluate Models**

In [None]:
def evaluate_model(name, Y_true, Y_pred):
    print(f"\n--- {name} ---")
    print("Accuracy:", accuracy_score(Y_true, Y_pred))
    print("Confusion Matrix:\n", confusion_matrix(Y_true, Y_pred))
    print("Classification Report:\n", classification_report(Y_true, Y_pred))

evaluate_model("Logistic Regression", Y_test, log_pred)
evaluate_model("Random Forest", Y_test, rf_pred)
evaluate_model("XGBoost", Y_test, xgb_pred)


--- Logistic Regression ---
Accuracy: 0.5195707070707071
Confusion Matrix:
 [[100 592]
 [169 723]]
Classification Report:
               precision    recall  f1-score   support

           0       0.37      0.14      0.21       692
           1       0.55      0.81      0.66       892

    accuracy                           0.52      1584
   macro avg       0.46      0.48      0.43      1584
weighted avg       0.47      0.52      0.46      1584


--- Random Forest ---
Accuracy: 0.5978535353535354
Confusion Matrix:
 [[364 328]
 [309 583]]
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.53      0.53       692
           1       0.64      0.65      0.65       892

    accuracy                           0.60      1584
   macro avg       0.59      0.59      0.59      1584
weighted avg       0.60      0.60      0.60      1584


--- XGBoost ---
Accuracy: 0.6369949494949495
Confusion Matrix:
 [[335 357]
 [218 674]]
Classification R

##**Results by Ticker**

In [None]:
Y_pred = xgb_model.predict(X_test)

results = test_data.copy()
results["Prediction"] = Y_pred
results["Actual"] = Y_test.values

ticker_accuracies = results.groupby("Ticker").apply(
    lambda group: (group["Prediction"] == group["Actual"]).mean()
)

print(ticker_accuracies)

Ticker
AAPL    0.575758
CVX     0.560606
GS      0.676768
JPM     0.681818
KO      0.686869
MSFT    0.671717
PG      0.742424
XOM     0.500000
dtype: float64


  ticker_accuracies = results.groupby("Ticker").apply(
