# Stock Market Prediction Model of the Magnificient Seven 

#pip install the dependencies from requirements. May take up to 12 minutes

In [18]:
!python3 -m pip install -r requirements.txt


Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


### importing our libraries after pipinstall

In [19]:

#python data manipulation
import pandas as pd
import numpy as np

#data visualizaiton tools, EDA
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

#mysql and Cassandra
import mysql.connector                         
from cassandra.cluster import Cluster          

#ARIMA
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm

#LSTM Neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf

# our finance data
import yfinance as yf



## Extracting the Historical Data.


 ### The historical aspect of the data is starting from January 1st 2020 to yesterday (april 6th), We can adjust this to most recent always. We extracted the 7 companies and start with first 7 features from our project proposal: Date/time, open price, low price, high price, adjusted close price, trading volume

In [20]:
def extract_data(tickers, start_date="2020-01-01", end_date="2025-04-06", interval="1d"):
    # importing the yfinance api data, making sure the autoadjust is off because it overrides the close value if its on
    df = yf.download(tickers, start=start_date, end=end_date, interval=interval, auto_adjust=False)
    # tidying the data to make it better for analysis and transformation later
    df.columns = df.columns.swaplevel(0, 1)
    df = df.sort_index(axis=1, level=0)
    df_flat = df.stack(level=0, future_stack=True).reset_index()  
    df_flat.rename(columns={'level_0': 'Date'}, inplace=True)
    # renaming 'Adj Close' to 'Adj_Close' for consistency
    if 'Adj Close' in df_flat.columns:
        df_flat.rename(columns={'Adj Close': 'Adj_Close'}, inplace=True)
    # adding the previous day's close price per ticker
    df_flat['Previous_Close'] = df_flat.groupby('Ticker')['Close'].shift(1)
    # selecting the columns we expect and need
    expected_cols = ['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj_Close', 'Previous_Close', 'Volume']
    existing_cols = [col for col in expected_cols if col in df_flat.columns]
    return df_flat[existing_cols]


In [21]:
#giving yfinance a list of companies to return a list of data of. AND displaying the df
tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "META", "NVDA", "TSLA"]
df_raw = extract_data(tickers)
display(df_raw)

[*********************100%***********************]  7 of 7 completed


Price,Date,Ticker,Open,High,Low,Close,Adj_Close,Previous_Close,Volume
0,2020-01-02,AAPL,74.059998,75.150002,73.797501,75.087502,72.716072,,135480400
1,2020-01-02,AMZN,93.750000,94.900497,93.207497,94.900497,94.900497,,80580000
2,2020-01-02,GOOGL,67.420502,68.433998,67.324501,68.433998,68.108376,,27278000
3,2020-01-02,META,206.750000,209.789993,206.270004,209.779999,208.795929,,12077100
4,2020-01-02,MSFT,158.779999,160.729996,158.330002,160.619995,153.323257,,22622100
...,...,...,...,...,...,...,...,...,...
9249,2025-04-04,GOOGL,148.009995,151.070007,145.380005,145.600006,145.600006,150.720001,62259500
9250,2025-04-04,META,506.619995,518.000000,494.200012,504.730011,504.730011,531.619995,38589800
9251,2025-04-04,MSFT,364.130005,374.589996,359.480011,359.839996,359.839996,373.109985,49209900
9252,2025-04-04,NVDA,98.910004,100.129997,92.110001,94.309998,94.309998,101.800003,532273800


## Historical Data: Data Cleaning: 
### We will clean the data by taking care of any null values. We can use back and forward filling if there is a value missing from a float/int value. Otherwise we will drop the value if its in the date or Ticker/ or use the average of the past few days for the column. Additionally if there are are any duplicate records for a company and a speciifc date, one should be kept while rest dropped.

In [22]:
nans = df_raw.isna().sum()
print(nans)
def cleaning(df):
    # Step 1: Drop duplicates (keep the first entry for each Ticker-Date pair)
    df = df.drop_duplicates(subset=['Date', 'Ticker'], keep='first')

    # Step 2: Define numeric columns to clean (excluding 'Date' and 'Ticker')
    numeric_cols = ['Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume', 'Previous_Close']
    
    # Step 3: Fill missing values for numeric columns
    for col in numeric_cols:
        # Forward fill and backward fill first
        df[col] = df.groupby('Ticker')[col].transform(lambda x: x.ffill().bfill())
        
        # Fill remaining NaNs with rolling mean (3-day window)
        df[col] = df.groupby('Ticker')[col].transform(
            lambda x: x.fillna(x.rolling(window=3, min_periods=1).mean())
        )

    # Step 4: Ensure 'Previous_Close' is properly filled (first entry has no previous data)
    df['Previous_Close'] = df.groupby('Ticker')['Previous_Close'].ffill()

    # Step 5: Drop rows with nulls in 'Date' or 'Ticker' (shouldn't exist, but safety check)
    df = df.dropna(subset=['Date', 'Ticker'])

    return df
# Apply cleaning
df_cleaned = cleaning(df_raw)

# Verify results
print("Null values after cleaning:")
print(df_cleaned.isna().sum())
display(df_cleaned)

Price
Date              0
Ticker            0
Open              0
High              0
Low               0
Close             0
Adj_Close         0
Previous_Close    7
Volume            0
dtype: int64
Null values after cleaning:
Price
Date              0
Ticker            0
Open              0
High              0
Low               0
Close             0
Adj_Close         0
Previous_Close    0
Volume            0
dtype: int64


Price,Date,Ticker,Open,High,Low,Close,Adj_Close,Previous_Close,Volume
0,2020-01-02,AAPL,74.059998,75.150002,73.797501,75.087502,72.716072,75.087502,135480400
1,2020-01-02,AMZN,93.750000,94.900497,93.207497,94.900497,94.900497,94.900497,80580000
2,2020-01-02,GOOGL,67.420502,68.433998,67.324501,68.433998,68.108376,68.433998,27278000
3,2020-01-02,META,206.750000,209.789993,206.270004,209.779999,208.795929,209.779999,12077100
4,2020-01-02,MSFT,158.779999,160.729996,158.330002,160.619995,153.323257,160.619995,22622100
...,...,...,...,...,...,...,...,...,...
9249,2025-04-04,GOOGL,148.009995,151.070007,145.380005,145.600006,145.600006,150.720001,62259500
9250,2025-04-04,META,506.619995,518.000000,494.200012,504.730011,504.730011,531.619995,38589800
9251,2025-04-04,MSFT,364.130005,374.589996,359.480011,359.839996,359.839996,373.109985,49209900
9252,2025-04-04,NVDA,98.910004,100.129997,92.110001,94.309998,94.309998,101.800003,532273800


## Historical Data, Feature Engineering:

### We are going to be transforming the tidy dataframe by doing some feature engineering. We will create the following fields into our table by manipulating the prexisting data from df_cleaned: 
1. Simple Moving Average (SMA)
2. Exponential Moving Average (EMA)
3. Relative Strength Index (RSI)
4. Bollinger Bands
5. MACD (Moving Average Convergence Divergence)
6. On-Balance Volume (OBV)
7. Volatility (ATR - Average True Range)


In [23]:
def transform(df):
    # Ensure data is sorted by Ticker and Date
    df = df.sort_values(['Ticker', 'Date']).reset_index(drop=True)
    
    # 1. Simple Moving Average (SMA)
    #uses 20 periods of data (NaN till 19 row)
    df['SMA_20'] = df.groupby('Ticker')['Close'].transform(lambda x: x.rolling(window=20).mean())
    
    # 2. Exponential Moving Average (EMA)
    df['EMA_20'] = df.groupby('Ticker')['Close'].transform(lambda x: x.ewm(span=20, adjust=False).mean())
    
    # 3. Relative Strength Index (RSI)
    #uses 14 periods of data (NaN till 13 row)
    def compute_rsi(data, period=14):
        delta = data.diff()
        gain = delta.where(delta > 0, 0)
        loss = -delta.where(delta < 0, 0)
        avg_gain = gain.rolling(window=period).mean()
        avg_loss = loss.rolling(window=period).mean()
        rs = avg_gain / avg_loss
        return 100 - (100 / (1 + rs))
    df['RSI_14'] = df.groupby('Ticker')['Close'].transform(compute_rsi)
    
    # 4. Bollinger Bands
    #NaN till we get SMA values (19th row)
    df['BB_Middle'] = df['SMA_20']
    df['BB_Std'] = df.groupby('Ticker')['Close'].transform(lambda x: x.rolling(window=20).std())
    df['BB_Upper'] = df['BB_Middle'] + 2 * df['BB_Std']
    df['BB_Lower'] = df['BB_Middle'] - 2 * df['BB_Std']
    
    # 5. MACD
    def compute_macd(data):
        ema12 = data.ewm(span=12, adjust=False).mean()
        ema26 = data.ewm(span=26, adjust=False).mean()
        return ema12 - ema26
    df['MACD'] = df.groupby('Ticker')['Close'].transform(compute_macd)
    df['MACD_Signal'] = df.groupby('Ticker')['MACD'].transform(lambda x: x.ewm(span=9, adjust=False).mean())
    
    # 6. OBV (Fixed)
    def compute_obv(group):
        group['OBV'] = np.sign(group['Close'].diff()).replace(0, 1) * group['Volume']
        return group['OBV'].cumsum()
    df['OBV'] = df.groupby('Ticker').apply(compute_obv).reset_index(drop=True)
    
    # 7. ATR (Fixed)
    def compute_atr(group):
        high_low = group['High'] - group['Low']
        high_close = (group['High'] - group['Close'].shift()).abs()
        low_close = (group['Low'] - group['Close'].shift()).abs()
        tr = pd.concat([high_low, high_close, low_close], axis=1).max(axis=1)
        return tr.rolling(14).mean()
    df['ATR_14'] = df.groupby('Ticker').apply(compute_atr).reset_index(drop=True)
    
    # Forward fill missing values created by rolling windows
    # df = df.groupby('Ticker').ffill()
    
    return df

# Apply transformation to cleaned data
df_transformed = transform(df_cleaned)
display(df_transformed[['Date', 'Ticker', 'SMA_20', 'EMA_20', 'RSI_14', 'BB_Upper', 'BB_Lower', 'MACD','OBV','ATR_14']].head(n=20)) #only shows APPL b/c all other tickers appear later. 
#add in 'OBV' and 'ATR_14' when fixed 

#to filter a specific ticker: 
display(df_transformed[df_transformed['Ticker'] == 'MSFT'].iloc[20:25])

#to shuffle/show random rows: 
display(df_transformed.sample(10))  # Show random rows

# count = len(df_transformed)
# print(count)


  df['OBV'] = df.groupby('Ticker').apply(compute_obv).reset_index(drop=True)
  df['ATR_14'] = df.groupby('Ticker').apply(compute_atr).reset_index(drop=True)


Price,Date,Ticker,SMA_20,EMA_20,RSI_14,BB_Upper,BB_Lower,MACD,OBV,ATR_14
0,2020-01-02,AAPL,,75.087502,,,,0.0,,
1,2020-01-03,AAPL,,75.017977,,,,-0.058234,-146322800.0,
2,2020-01-06,AAPL,,75.011503,,,,-0.05593,-27935600.0,
3,2020-01-07,AAPL,,74.972075,,,,-0.081607,-136807600.0,
4,2020-01-08,AAPL,,75.050687,,,,-0.005068,-4728400.0,
5,2020-01-09,AAPL,,75.275145,,,,0.183389,165380000.0,
6,2020-01-10,AAPL,,75.494893,,,,0.342911,306024800.0,
7,2020-01-13,AAPL,,75.85157,,,,0.596207,427556800.0,
8,2020-01-14,AAPL,,76.072372,,,,0.702507,265602400.0,
9,2020-01-15,AAPL,,76.240241,,,,0.751062,143678800.0,


Price,Date,Ticker,Open,High,Low,Close,Adj_Close,Previous_Close,Volume,SMA_20,EMA_20,RSI_14,BB_Middle,BB_Std,BB_Upper,BB_Lower,MACD,MACD_Signal,OBV,ATR_14
5308,2020-01-31,MSFT,172.210007,172.399994,169.580002,170.229996,162.496674,172.779999,36142700,164.167999,165.244291,65.892017,164.167999,3.935323,172.038644,156.297353,2.175122,1.508417,68410900.0,2.845717
5309,2020-02-03,MSFT,170.429993,174.5,170.399994,174.380005,166.458191,170.229996,30107000,164.955999,166.114359,68.389658,164.955999,4.324559,173.605117,156.306881,2.627071,1.732148,98517900.0,3.004289
5310,2020-02-04,MSFT,177.139999,180.639999,176.309998,180.119995,171.937378,174.380005,36433300,166.010499,167.448229,75.869974,166.010499,5.271194,176.552887,155.46811,3.409114,2.067541,134951200.0,3.317145
5311,2020-02-05,MSFT,184.029999,184.199997,178.410004,179.899994,171.727386,180.119995,39186300,167.126498,168.634112,74.631681,167.126498,5.734751,178.596,155.656997,3.965426,2.447118,95764900.0,3.601431
5312,2020-02-06,MSFT,180.970001,183.820007,180.059998,183.630005,175.287964,179.899994,27751400,168.303499,170.062292,75.172992,168.303499,6.569494,181.442486,155.164511,4.653644,2.888423,123516300.0,3.662859


Price,Date,Ticker,Open,High,Low,Close,Adj_Close,Previous_Close,Volume,SMA_20,EMA_20,RSI_14,BB_Middle,BB_Std,BB_Upper,BB_Lower,MACD,MACD_Signal,OBV,ATR_14
392,2021-07-23,AAPL,147.550003,148.720001,146.919998,148.559998,145.57428,146.800003,71447400,142.843501,142.797781,65.704883,142.843501,4.843734,152.530969,133.156032,3.936812,4.032972,1032163000.0,3.245714
4922,2023-10-19,META,319.880005,321.890015,311.75,312.809998,311.342621,316.970001,18709200,310.806996,312.854413,58.447311,310.806996,9.973957,330.75491,290.859082,4.908208,4.793492,719899900.0,8.710713
7124,2022-01-14,NVDA,26.299999,27.197001,26.209999,26.941999,26.894793,26.575001,395832000,28.65365,28.454384,27.278918,28.65365,1.265231,31.184112,26.123188,-0.663585,-0.412368,25794140000.0,1.368786
5649,2021-06-09,MSFT,253.809998,255.529999,253.210007,253.589996,245.763763,252.570007,17937600,247.867999,249.68933,68.478675,247.867999,4.037457,255.942913,239.793084,0.829244,0.116312,729477100.0,3.737143
6010,2022-11-11,MSFT,242.990005,247.990005,241.929993,247.110001,241.951767,242.979996,34620200,233.500499,233.336516,49.929747,233.500499,9.75226,253.005018,213.99598,-1.744307,-3.885935,474029900.0,8.972146
1035,2024-02-13,AAPL,185.770004,186.210007,183.509995,185.039993,184.170349,187.149994,56529500,189.282999,188.385101,29.033685,189.282999,3.564718,196.412435,182.153563,-0.606863,-0.298439,1545419000.0,3.30857
4968,2023-12-26,META,354.98999,356.980011,353.450012,354.829987,353.165497,353.390015,9898600,335.358,339.286615,80.521268,335.358,12.140882,359.639764,311.076236,7.482646,5.303784,758489700.0,8.118567
8843,2023-08-16,TSLA,228.020004,233.970001,225.380005,225.600006,225.600006,232.960007,112484500,253.46,250.303705,28.575486,253.46,12.155679,277.771357,229.148642,-7.19233,-3.218737,14730040000.0,8.842854
4503,2022-02-17,META,214.020004,217.5,207.160004,207.710007,206.735657,216.539993,38747500,261.039001,253.178617,21.825929,261.039001,43.201927,347.442856,174.635146,-28.691385,-24.764103,151685400.0,15.802145
8429,2021-12-21,TSLA,305.623322,313.166656,295.373322,312.843323,312.843323,299.980011,71517900,342.211501,333.322923,31.646601,342.211501,25.237808,392.687117,291.735885,-11.222511,-5.651948,12066820000.0,18.589999
