Final project for SAT 4650

Machine Learning to Predict Future Stock Prices

Matthew Jeffrey

Spring 2023

LEGAL DISCLAIMER:
This program is not intended to make accurate or correct predictions of future stock prices. Do not use this program to predict stock prices with the intent to actually buy or sell stocks. If you do so, you will be doing it at your own risk and will lose all your money.

In [None]:
pip install xgboost

In [None]:
import time
import pandas as pd
import glob
import numpy as np
import xgboost as xgb

In [None]:
# Ask which set of stocks to analyse. The answer is interpreted later as:
# '0' -> NASDAQ, '1' -> NYSE, '2' -> both, anything else -> a ticker symbol.
print("This program will help identify the best stocks to buy\n")
print("Type 0 for NASDAQ, 1 for NYSE, 2 for Both, or type a specific ticker in the format of: AAPL\nPress enter to confirm your choice")
# Strip surrounding whitespace so an answer such as "0 " still selects the
# menu option instead of being looked up as a ticker.
stocks = input("\nEnter your choice: ").strip()

In [None]:
# Ask for the forecast horizon. `days` stays a string because later code both
# converts it with int() and concatenates it into a printed header.
print("How many days ahead would you like to predict?\n")
days = input("Enter your choice: ").strip()
# Fail fast on a non-numeric answer rather than after all data has been loaded.
try:
    int(days)
except ValueError:
    raise ValueError(f"Number of days must be a whole number, got {days!r}") from None

In [None]:
# Build the training DataFrame `df` from the choice stored in `stocks`.
# NOTE(review): the "path\to\..." folders look like placeholders for the local
# data directories -- confirm before running.
NASDAQ_DIR = r"path\to\NASDQ data"
NYSE_DIR = r"path\to\NYSE data"
# Folders to load for each menu choice; any other answer is a ticker symbol.
EXCHANGE_CHOICES = {
    '0': (NASDAQ_DIR,),
    '1': (NYSE_DIR,),
    '2': (NASDAQ_DIR, NYSE_DIR),
}

folders = EXCHANGE_CHOICES.get(stocks)
if folders is not None:
    # load_folder_data returns None when a folder yields no data; drop those
    # so the failure is reported here with a clear message.
    frames = []
    for folder in folders:
        frame = load_folder_data(folder + r"\**\*.txt")
        if frame is not None:
            frames.append(frame)
    if not frames:
        raise FileNotFoundError("No stock data found in: " + ", ".join(folders))
    df = pd.concat(frames, ignore_index=True)
else:
    # Look for "<ticker>.us.txt" anywhere below the NASDAQ folder first, then
    # below the NYSE folder; the first match wins.
    filename = stocks + ".us.txt"
    matches = []
    for folder in (NASDAQ_DIR, NYSE_DIR):
        matches = glob.glob(folder + "\\**\\" + filename, recursive=True)
        if matches:
            break
    if not matches:
        raise FileNotFoundError(f"Ticker not found: {stocks}")
    file = matches[0]
    df = pd.read_csv(file, sep=',')

In [None]:
# Forecast the close price of every ticker in `df`, `days` days ahead.
predictions = predict_stock_price(df=df, days=days)
# Print up to the ten best performers; the full ranking is the cell's value.
get_top_performers(results=predictions, num=10)

In [None]:
# Meant for the single-ticker case: forecast each day offset in the range and
# list the offsets ordered by predicted percent change, highest first.
when_to_sell(df=df, days=days)

In [None]:
def load_folder_data(path):
    """Load every CSV file matching a glob pattern into one DataFrame.

    Args:
        path: Glob pattern. The search is recursive, so ``**`` also matches
            nested directories.

    Returns:
        One DataFrame holding the rows of all readable files with a fresh
        0..n-1 index, or ``None`` when no file produced any data.
    """
    frames = []
    # glob yields matches in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the combined frame reproducible.
    for file in sorted(glob.glob(path, recursive=True)):
        try:
            frames.append(pd.read_csv(file, sep=','))
        except pd.errors.EmptyDataError:
            # A completely empty file is reported and skipped, not fatal.
            print(f"No data found in file {file}")
    if not frames:
        return None
    return pd.concat(frames, ignore_index=True)

In [None]:
def predict_stock_price(df, days):
    """Fit one XGBoost regressor per ticker and forecast its close price.

    Args:
        df: Price data with the columns ``<TICKER>``, ``<DATE>`` (YYYYMMDD),
            ``<TIME>`` (HHMMSS) and ``<CLOSE>``. The caller's frame is not
            modified.
        days: Forecast horizon in days (anything ``int()`` accepts). It is
            added to the latest timestamp found anywhere in ``df``.

    Returns:
        A list of ``(ticker, predicted_close, percent_change)`` tuples, one
        per ticker. ``percent_change`` is relative to the last ``<CLOSE>`` row
        of that ticker.
    """
    # The first row is dropped, exactly as before.
    # NOTE(review): the reason for skipping it is not evident here -- confirm.
    # .copy() makes the column assignments below act on an independent frame
    # rather than on a slice of the caller's data.
    data = df.iloc[1:].copy()

    # Zero-pad the time so a value whose leading zeros were lost (e.g. 93000
    # for 09:30:00 read as an integer) still fits the fixed-width format.
    date_text = data['<DATE>'].astype(str)
    time_text = data['<TIME>'].astype(str).str.zfill(6)
    data['datetime'] = pd.to_datetime(date_text + time_text, format='%Y%m%d%H%M%S')
    data = data.set_index(['<TICKER>', 'datetime']).drop(columns=['<DATE>', '<TIME>'])

    # Calendar features derived from the timestamp level of the index.
    stamps = data.index.get_level_values('datetime')
    data['year'] = stamps.year
    data['month'] = stamps.month
    data['day'] = stamps.day
    data['hour'] = stamps.hour
    feature_columns = ['year', 'month', 'day', 'hour']

    # Prediction point: latest timestamp across all tickers plus the horizon.
    target = stamps.max() + pd.Timedelta(days=int(days))
    target_features = np.array([[target.year, target.month, target.day, target.hour]])

    # Train a separate model on each ticker's history and query it once.
    results = []
    for ticker, group in data.groupby(level='<TICKER>'):
        model = xgb.XGBRegressor()
        model.fit(group[feature_columns].values, group['<CLOSE>'].values)
        predicted = model.predict(target_features)[0]
        # .iloc is explicitly positional; plain [-1] on a label-indexed Series
        # depends on a deprecated fallback.
        last_close = group['<CLOSE>'].iloc[-1]
        percent_increase = (predicted - last_close) / last_close * 100
        results.append((ticker, predicted, percent_increase))
    return results

In [None]:
def when_to_sell(df, days):
    """Rank day offsets by the predicted percent change of the first ticker.

    Args:
        df: Price data, passed unchanged to ``predict_stock_price``.
        days: Number of day offsets to evaluate (anything ``int()`` accepts).

    Returns:
        A list of ``(ticker, predicted_close, percent_change, offset)`` tuples
        sorted by ``percent_change``, highest first. Only the first result of
        each prediction run is used.
    """
    # NOTE(review): offsets run from 0 to days - 1, so the day `days` itself
    # is not evaluated -- confirm this is intended.
    forecasts = []
    for offset in range(int(days)):
        first = predict_stock_price(df, offset)[0]
        forecasts.append((first[0], first[1], first[2], offset))
    forecasts.sort(key=lambda row: row[2], reverse=True)
    return forecasts

In [None]:
def get_top_performers(results, num, horizon_days=None):
    """Print the best-performing tickers and return the full ranking.

    Args:
        results: Iterable of ``(ticker, predicted_price, percent_change)``
            tuples.
        num: Maximum number of rows to print. Fewer rows are printed when
            fewer results are available.
        horizon_days: Forecast horizon shown in the header. When omitted, the
            module-level ``days`` variable is used, as it was before this
            parameter existed.

    Returns:
        Every result whose ticker contains neither ``-`` nor ``_``, sorted by
        percent change, highest first (not only the printed rows).
    """
    if horizon_days is None:
        # Backward-compatible fallback to the notebook-level variable.
        horizon_days = days
    filtered_results = [row for row in results if '-' not in row[0] and '_' not in row[0]]
    sorted_results = sorted(filtered_results, key=lambda row: row[2], reverse=True)
    # The f-string accepts a string or a number for the horizon.
    print(f"Ticker  \tPercent Change({horizon_days} days)")
    # Slicing instead of indexing avoids an IndexError when there are fewer
    # than `num` results (e.g. a single-ticker run).
    for ticker, _price, percent_change in sorted_results[:max(num, 0)]:
        print(f"{ticker}  \t{percent_change:.2f}%")
    return sorted_results