# Stock Market Prediction Model of the Magnificent 7

#pip install the dependencies from requirements.txt. This may take up to 2 minutes.

In [69]:
!python3 -m pip install -r requirements.txt


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


### Importing our libraries after the pip install

In [70]:

#python data manipulation
import pandas as pd
import numpy as np

#data visualization tools, EDA
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

#mysql and Cassandra
import mysql.connector                         
from cassandra.cluster import Cluster          

#ARIMA
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm

#LSTM Neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf

# our finance data
import yfinance as yf



## Extracting the historical data.


 ### The historical data runs from January 1st, 2020 to yesterday (March 23rd, 2025); we can adjust the end date so that it always tracks the most recent day. We extracted the 7 companies and start with the first 7 features from our project proposal: date/time, open price, low price, high price, close price, adjusted close price, and trading volume.

In [75]:
def extract_data(tickers, start_date="2020-01-01", end_date="2025-03-23", interval="1d"):
    #importing the yfinance api data, making sure the autoadjust is off because it overrides the close value if its on
    df = yf.download(tickers, start=start_date, end=end_date, interval=interval, auto_adjust=False)

    #tidying the data to make it better for analysis and transformation later
    df.columns = df.columns.swaplevel(0, 1)
    df = df.sort_index(axis=1, level=0)
    df_flat = df.stack(level=0, future_stack=True).reset_index()  
    df_flat.rename(columns={'level_0': 'Date'}, inplace=True)
    if 'Adj Close' in df_flat.columns:
        df_flat.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)
    expected_cols = ['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume']
    existing_cols = [col for col in expected_cols if col in df_flat.columns]

    return df_flat[existing_cols]


In [81]:
# Ticker symbols of the seven companies whose price history we request from yfinance
tickers = [
    "AAPL",
    "MSFT",
    "GOOGL",
    "AMZN",
    "META",
    "NVDA",
    "TSLA",
]

# Download with the default date range and interval, then show the tidy result
df_raw = extract_data(tickers=tickers)
display(df_raw)

[*********************100%***********************]  7 of 7 completed


Price,Date,Ticker,Open,High,Low,Close,Adj_Close,Volume
0,2020-01-02,AAPL,74.059998,75.150002,73.797501,75.087502,72.716057,135480400
1,2020-01-02,AMZN,93.750000,94.900497,93.207497,94.900497,94.900497,80580000
2,2020-01-02,GOOGL,67.420502,68.433998,67.324501,68.433998,68.108368,27278000
3,2020-01-02,META,206.750000,209.789993,206.270004,209.779999,208.795929,12077100
4,2020-01-02,MSFT,158.779999,160.729996,158.330002,160.619995,153.323273,22622100
...,...,...,...,...,...,...,...,...
9179,2025-03-21,GOOGL,161.210007,164.240005,160.889999,163.990005,163.990005,36625800
9180,2025-03-21,META,583.419983,597.539978,580.950012,596.250000,596.250000,25015900
9181,2025-03-21,MSFT,383.220001,391.739990,382.799988,391.260010,391.260010,39675900
9182,2025-03-21,NVDA,116.940002,117.989998,115.419998,117.699997,117.699997,266498500


In [85]:
# Sanity check: list the column names of the tidy frame
print(list(df_raw.columns))


['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume']


## Transform the data.

### We are going to transform the tidy dataframe by cleaning it and doing some feature engineering. We will clean the data by taking care of any null values: if a value is missing for a day, we can use backward and forward filling; otherwise we will drop the row or use the average of the past few days for that column. Then we want to feature engineer and add the following fields to our table:
1. Simple Moving Average (SMA)
2. Exponential Moving Average (EMA)
3. Relative Strength Index (RSI)
4. Bollinger Bands
5. MACD (Moving Average Convergence Divergence)
6. On-Balance Volume (OBV)
7. Volatility (ATR - Average True Range)
