# Database population Script

## Populating the DB

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import ta.momentum
import ta.trend
import yfinance as yf
import sklearn as sk
import sklearn.preprocessing
import os
import sys
import time
from IPython.display import clear_output

In [3]:
NUM_TICKERS = 50
# Get the data for the top companies in the S&P 500
data = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
table = data[0]
# Get the top 50 companies and assigning them a unique id
tickers = {table.index[i]: table['Symbol'][i] for i in range(NUM_TICKERS)}

In [None]:
# Download the data for each company
data_pds = {}

# if data.csv does not exist, download the data
if not os.path.exists("data.csv"):
    print("Downloading data...")
    for ticker_id, ticker in tickers.items():
        clear_output(wait=True)
        sys.stdout.write(f"\rDownloading data for {ticker} ({ticker_id + 1}/{NUM_TICKERS})")
        time.sleep(0.1)
        data = yf.download(ticker, progress=False)
        data = data.reset_index()
        data.columns = data.columns.droplevel(1)
        data = data.rename_axis("Index")

        # Add the ticker ID to the data
        data["Ticker_ID"] = ticker_id

        # turning raw date into year, month, day columns
        data['Year'] = data['Date'].dt.year
        data['Month'] = data['Date'].dt.month
        data['Day'] = data['Date'].dt.day

        close_data = data['Close'].squeeze()
        data['SMA_20'] = ta.trend.sma_indicator(close_data, window=20) # 20 day simple moving average
        data['RSI_14'] = ta.momentum.rsi(close_data, window=14) # 14 day RSI

        bbands = ta.volatility.BollingerBands(close_data, window=20) # 20 day Bollinger Bands
        data["BB_Upper"] = bbands.bollinger_hband() # upper band
        data["BB_Middle"] = bbands.bollinger_mavg() # middle band
        data["BB_Lower"] = bbands.bollinger_lband() # lower band

        macd = ta.trend.MACD(close_data) # MACD
        data["MACD"] = macd.macd() # MACD line
        data["Signal"] = macd.macd_signal() # Signal line

        data["ATR"] = ta.volatility.average_true_range(data['High'].squeeze(), data['Low'].squeeze(), close_data, window=14) # 14 day ATR

        data_pds[ticker] = data.dropna()

    # combine the data
    data = pd.concat(data_pds.values())
    data.to_csv("data.csv")
else:
    print("Data already downloaded.")

In [None]:
data = pd.read_csv("data.csv")
data.drop(columns=["Index"], inplace=True)
data

In [None]:
# normalize the data
scaler = sk.preprocessing.MinMaxScaler()
columns_to_normalize = data.columns.drop(['Date', 'Ticker_ID', 'Year', 'Month', 'Day'])
data[columns_to_normalize] = scaler.fit_transform(data[columns_to_normalize])

# save to csv
data.to_csv("data.csv")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Train a linear regression model for each company
models = {}
for ticker, df in data_pds.items():
    X = df[['Close','MA_10', 'MA_50']] #use the rolling average
    y = df['Close'].shift(-1).dropna() #predict the next day's closing price
    X = X[:-1]

    # Check if there are enough samples to split
    if len(X) < 2:
        print(f"Not enough data for {ticker}")
        continue

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model = LinearRegression()
    model.fit(X_train, y_train)

    models[ticker] = model







Not enough data for AMTM


In [20]:
#Ask the user to input a stock to predict the price for, and the specific date

stock = 'AAPL'
date = '2020-03-19'

if stock in data_pds:
    df = data_pds[stock]
    X = df[['Close','MA_10', 'MA_50']]
    X = X[:-1]
    y = df['Close'].shift(-1).dropna()

    model = models[stock]
    prediction = model.predict([X.loc[date]])
    actual = y.loc[date]

    print(f"Predicted price for {stock} on {date}: {prediction[0]}\n")
    print(f"Actual price for {stock} on {date}: {actual}")



Predicted price for AAPL on 2020-03-19: [61.32029176]

Actual price for AAPL on 2020-03-19: Ticker
AAPL    57.310001
Name: 2020-03-19 00:00:00, dtype: float64
