# Machine Learning Stock Market Analyzer
Actual trading scripts will be later implemented (potentially through a seperate program). As of right now, this program just aims to predict future prices.

Potential Limitations/Bottlenecks: Yahoo API support for small intervals & rate limiting for large datasets

#### Necessary Packages


In [17]:
# -- Data Collection -- 
import pandas as pd
import yfinance as yf
import matplotlib as plt
import numpy as np

# -- Data Cleaning & Preparation --
#//Necessary Libraries Here, will add as needed

# -- Shallow ML Models --

# Testing Accuracy
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import time

# Simple Linear Regression (Predicting Values)
from sklearn.linear_model import LinearRegression

# SGD Regression (Predicting Values)
from sklearn.linear_model import SGDRegressor

# Ridge Regression (Predicting Values, assuming <100k samples)
from sklearn.kernel_ridge import KernelRidge

# SVR Regression (Predicting Values, kernel dependant on sample size)
from sklearn.svm import SVR

# SGD Classifier (Classifying movement, assuming >100k samples)
from sklearn.linear_model import SGDClassifier #(Be careful with feature scaling)

# Kernel Approximation (Classifying movement, assuming >100k samples)
from sklearn.kernel_approximation import RBFSampler
from sklearn.kernel_approximation import PolynomialCountSketch #*

# Linear SVC (Classifying movement, assuming <100k samples)
from sklearn import svm

# KNeighbors Classification (Classifying Movement, assuming <100k samples)
from sklearn.neighbors import KNeighborsClassifier #Could also add regressor

#Decision Trees (Predicting Price & Classifying Movement, Good Visual)
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# -- Deep ML Model --
#Will utilize tensorflow, will need to learn more about first
#Recurrent Neural Network & Convolutional Neural Network Hybrid
#Temporal Convolutional Networks
#Long Short-Term Memory Networks (Little Data)
#Gated Recurrent Unit (GRU) Networks (More Data)


#### Basic Data Collection

In [25]:
ticker_symbol = 'AAPL' #Placeholder for now
t_period = "max" #Time Period to test, For testing purposes
t_interval = "1d" #Intervals to collect data for, For testing purposes

ticker = yf.Ticker(ticker_symbol)
historical_data = ticker.history(period=t_period, interval=t_interval, actions=False, auto_adjust=True)
print(historical_data) #Testing

                                 Open        High         Low       Close  \
Date                                                                        
1980-12-12 00:00:00-05:00    0.099058    0.099488    0.099058    0.099058   
1980-12-15 00:00:00-05:00    0.094321    0.094321    0.093890    0.093890   
1980-12-16 00:00:00-05:00    0.087429    0.087429    0.086999    0.086999   
1980-12-17 00:00:00-05:00    0.089152    0.089582    0.089152    0.089152   
1980-12-18 00:00:00-05:00    0.091737    0.092167    0.091737    0.091737   
...                               ...         ...         ...         ...   
2024-06-05 00:00:00-04:00  195.399994  196.899994  194.869995  195.869995   
2024-06-06 00:00:00-04:00  195.690002  196.500000  194.169998  194.479996   
2024-06-07 00:00:00-04:00  194.649994  196.940002  194.139999  196.889999   
2024-06-10 00:00:00-04:00  196.899994  197.300003  192.149994  193.119995   
2024-06-12 00:00:00-04:00  207.360001  220.199997  206.910004  213.070007   

#### Reformatting the Dataframe

Shifting to include previous high, previous low, etc.

In [27]:
if len(historical_data.columns) == 5:
    # Renaming Columns 
    historical_data['prev_open'] = historical_data['Open']
    historical_data['prev_close'] = historical_data['Close']
    historical_data.columns = ['open', 'prev_high', 'prev_low', 'prev_volume', 'Close', 'prev_open', 'prev_close']

    #Creating a dummy row for shifting
    empty_row = pd.DataFrame([None] * len(historical_data.columns)).T #Creating an empty row to concatonate
    empty_row.columns = historical_data.columns
    historical_data = pd.concat([historical_data, empty_row], ignore_index=True)
    #Shifting down all statistics for previous days
    historical_data['prev_high'] = historical_data['prev_high'].shift(1)
    historical_data['prev_low'] = historical_data['prev_low'].shift(1)
    historical_data['prev_close'] = historical_data['prev_close'].shift(1)
    historical_data['prev_volume'] = historical_data['prev_volume'].shift(1)
    historical_data['prev_open'] = historical_data['prev_open'].shift(1)
    #Dropping the newly formed empty columns
    historical_data = historical_data.dropna()

print(historical_data)

             open   prev_high    prev_low  prev_volume      Close   prev_open  \
1        0.094321    0.099488    0.099058     0.099058  175884800    0.099058   
2        0.087429    0.094321    0.093890     0.093890  105728000    0.094321   
3        0.089152    0.087429    0.086999     0.086999   86441600    0.087429   
4        0.091737    0.089582    0.089152     0.089152   73449600    0.089152   
5        0.097335    0.092167    0.091737     0.091737   48630400    0.091737   
...           ...         ...         ...          ...        ...         ...   
10960  195.399994  195.320007  193.029999   194.350006   54156800  194.639999   
10961  195.690002  196.899994  194.869995   195.869995   41181800  195.399994   
10962  194.649994  196.500000  194.169998   194.479996   53103900  195.690002   
10963  196.899994  196.940002  194.139999   196.889999   97262100  194.649994   
10964  207.360001  197.300003  192.149994   193.119995  194031540  196.899994   

       prev_close  
1      

Converting to numpy array for increased efficiency. Although most models do this internally anyways, can reduce overhead

In [29]:
x_df = historical_data[['open', 'prev_high', 'prev_low', 'prev_volume', 'prev_open', 'prev_close']]
y_df = historical_data[['Close']] #Target Value to be Specified

x = x_df.to_numpy
target = y_df.to_numpy

## Shallow Models - Regression

#### Benchmarking Functions

In [None]:
def test_model(): #Should add explicit return type when finished
    start_time = time.time()
    end_time = time.time()
    elasped_time = end_time - start_time
    print(f"Duration of Execution: {elasped_time} seconds")
    return 'UNFINISHED'

#### Simple Linear Regression Model