# Machine Learning Stock Market Analyzer
Actual trading scripts will be later implemented (potentially through a seperate program). As of right now, this program just aims to predict future prices.

Potential Limitations/Bottlenecks: Yahoo API support for small intervals & rate limiting for large datasets

#### Necessary Packages


In [43]:
# -- Data Collection -- 
import pandas as pd
import yfinance as yf
import matplotlib as plt
import numpy as np

# -- Data Cleaning & Preparation --
#//Necessary Libraries Here, will add as needed

# -- Shallow ML Models --

# Testing Accuracy
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time

# Simple Linear Regression (Predicting Values)
from sklearn.linear_model import LinearRegression

# SGD Regression (Predicting Values)
from sklearn.linear_model import SGDRegressor

# Ridge Regression (Predicting Values, assuming <100k samples)
from sklearn.kernel_ridge import KernelRidge

# SVR Regression (Predicting Values, kernel dependant on sample size)
from sklearn.svm import SVR

# SGD Classifier (Classifying movement, assuming >100k samples)
from sklearn.linear_model import SGDClassifier #(Be careful with feature scaling)

# Kernel Approximation (Classifying movement, assuming >100k samples)
from sklearn.kernel_approximation import RBFSampler
from sklearn.kernel_approximation import PolynomialCountSketch #*

# Linear SVC (Classifying movement, assuming <100k samples)
from sklearn import svm

# KNeighbors Classification (Classifying Movement, assuming <100k samples)
from sklearn.neighbors import KNeighborsClassifier #Could also add regressor

#Decision Trees (Predicting Price & Classifying Movement, Good Visual)
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# -- Deep ML Model --
#Will utilize tensorflow, will need to learn more about first
#Recurrent Neural Network & Convolutional Neural Network Hybrid
#Temporal Convolutional Networks
#Long Short-Term Memory Networks (Little Data)
#Gated Recurrent Unit (GRU) Networks (More Data)


#### Basic Data Collection

In [50]:
ticker_symbol = 'AAPL' #Placeholder for now
t_period = "max" #Time Period to test, For testing purposes
t_interval = "1d" #Intervals to collect data for, For testing purposes

ticker = yf.Ticker(ticker_symbol)
historical_data = ticker.history(period=t_period, interval=t_interval, actions=False, auto_adjust=True)
print(historical_data) #Testing

                                 Open        High         Low       Close  \
Date                                                                        
1980-12-12 00:00:00-05:00    0.099192    0.099623    0.099192    0.099192   
1980-12-15 00:00:00-05:00    0.094448    0.094448    0.094017    0.094017   
1980-12-16 00:00:00-05:00    0.087548    0.087548    0.087117    0.087117   
1980-12-17 00:00:00-05:00    0.089273    0.089704    0.089273    0.089273   
1980-12-18 00:00:00-05:00    0.091861    0.092292    0.091861    0.091861   
...                               ...         ...         ...         ...   
2024-06-07 00:00:00-04:00  194.649994  196.940002  194.139999  196.889999   
2024-06-10 00:00:00-04:00  196.899994  197.300003  192.149994  193.119995   
2024-06-11 00:00:00-04:00  193.649994  207.160004  193.630005  207.149994   
2024-06-12 00:00:00-04:00  207.369995  220.199997  206.899994  213.070007   
2024-06-13 00:00:00-04:00  214.779999  216.750000  211.600006  214.240005   

#### Reformatting the Dataframe

Shifting to include previous high, previous low, etc.

In [52]:
if len(historical_data.columns) == 5:
    # Renaming Columns 
    historical_data['prev_open'] = historical_data['Open']
    historical_data['prev_close'] = historical_data['Close']
    historical_data.columns = ['open', 'prev_high', 'prev_low', 'prev_volume', 'Close', 'prev_open', 'prev_close']

    #Creating a dummy row for shifting
    empty_row = pd.DataFrame([None] * len(historical_data.columns)).T #Creating an empty row to concatonate
    empty_row.columns = historical_data.columns
    historical_data = pd.concat([historical_data, empty_row], ignore_index=True)
    
    #Shifting down all statistics for previous days
    historical_data['prev_high'] = historical_data['prev_high'].shift(1)
    historical_data['prev_low'] = historical_data['prev_low'].shift(1)
    historical_data['prev_close'] = historical_data['prev_close'].shift(1)
    historical_data['prev_volume'] = historical_data['prev_volume'].shift(1)
    historical_data['prev_open'] = historical_data['prev_open'].shift(1)
    
    #Creating a new df with the most recent statistics
    most_recent = historical_data.iloc[-1:].copy()
    #Replacing unknown statistics with signal value
    most_recent.iloc[-1, most_recent.columns.get_loc('open')] = 999999
    most_recent.iloc[-1, most_recent.columns.get_loc('Close')] = 999999
    
    
    #Dropping all empty columns
    historical_data.replace(['NA', 'NaN', None], np.nan, inplace=True)
    historical_data = historical_data.dropna()
    

print(historical_data)

             open   prev_high    prev_low  prev_volume        Close  \
1        0.094448    0.099623    0.099192     0.099192  175884800.0   
2        0.087548    0.094448    0.094017     0.094017  105728000.0   
3        0.089273    0.087548    0.087117     0.087117   86441600.0   
4        0.091861    0.089704    0.089273     0.089273   73449600.0   
5        0.097467    0.092292    0.091861     0.091861   48630400.0   
...           ...         ...         ...          ...          ...   
10962  194.649994  196.500000  194.169998   194.479996   53103900.0   
10963  196.899994  196.940002  194.139999   196.889999   97262100.0   
10964  193.649994  197.300003  192.149994   193.119995  172373300.0   
10965  207.369995  207.160004  193.630005   207.149994  198134300.0   
10966  214.779999  220.199997  206.899994   213.070007   96416123.0   

        prev_open  prev_close  
1        0.099192    0.099192  
2        0.094448    0.094017  
3        0.087548    0.087117  
4        0.089273  

Converting to numpy array for increased efficiency. Although most models do this internally anyways, explicitly converting can reduce overhead

In [55]:
x_df = historical_data[['open', 'prev_high', 'prev_low', 'prev_volume', 'prev_open', 'prev_close']]
y_df = historical_data[['Close']] #Target Value to be Specified

x = x_df.to_numpy()
target = y_df.to_numpy()
print(x)
print(target)

[[9.44484613e-02 9.96233438e-02 9.91920978e-02 9.91920978e-02
  9.91920978e-02 9.91920978e-02]
 [8.75477574e-02 9.44484613e-02 9.40172151e-02 9.40172151e-02
  9.44484613e-02 9.40172151e-02]
 [8.92727226e-02 8.75477574e-02 8.71165171e-02 8.71165171e-02
  8.75477574e-02 8.71165171e-02]
 ...
 [1.93649994e+02 1.97300003e+02 1.92149994e+02 1.93119995e+02
  1.96899994e+02 1.93119995e+02]
 [2.07369995e+02 2.07160004e+02 1.93630005e+02 2.07149994e+02
  1.93649994e+02 2.07149994e+02]
 [2.14779999e+02 2.20199997e+02 2.06899994e+02 2.13070007e+02
  2.07369995e+02 2.13070007e+02]]
[[1.7588480e+08]
 [1.0572800e+08]
 [8.6441600e+07]
 ...
 [1.7237330e+08]
 [1.9813430e+08]
 [9.6416123e+07]]


#### Getting X to Predict
Here, we will gather the most recent information for us to predict. This feature will need to be implemented in the future.

In [None]:
# Get Info Here (We have most_recent to work with previous values)

## Shallow Models - Regression

#### Benchmarking Functions

In [59]:
#Splitting data into consistent training and testing sets
rand_state: int = 42 #Seeding the random state, getting equal splits
x_train, x_test, y_train, y_test = train_test_split(x, target, test_size=0.2, random_state=42)
def test_model(y_pred):
    # start_time = time.time()
    # end_time = time.time()
    # elasped_time = end_time - start_time
    # print(f"Duration of Execution: {elasped_time} seconds")
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')
    print(f'R-Squared: {r2}')

#### Simple Linear Regression Model

In [60]:
#Fitting/Training the model
lg_model = LinearRegression()
lg_model.fit(x_train, y_train)
#Getting & Evaluating Results
y_pred = lg_model.predict(x_test)
test_model(y_pred)

Mean Squared Error: 1.1002368609756437e+17
R-Squared: 0.06351280862250042


#### SGD Regression Model

In [None]:
sgd_model = SGDRegressor()