# Stock Market Prediction Model of the Magnificent Seven

Install the dependencies listed in `requirements.txt` with pip. This may take up to 2 minutes.

In [160]:
!python3 -m pip install -r requirements.txt


Defaulting to user installation because normal site-packages is not writeable
Collecting finnhub-python
  Downloading finnhub_python-2.4.23-py3-none-any.whl (11 kB)
Installing collected packages: finnhub-python
Successfully installed finnhub-python-2.4.23
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m


### Importing our libraries after the pip install

In [282]:

#python data manipulation
import pandas as pd
import numpy as np

#data visualizaiton tools, EDA
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

#mysql and Cassandra
import mysql.connector                         
from cassandra.cluster import Cluster          

#ARIMA
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm

#LSTM Neural network
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf

# our finance data
import yfinance as yf



## Extracting the Historical Data.


### The historical data runs from January 1st, 2020 through March 24th, 2025 (the last trading day returned for the default end date of 2025-03-25); the end date can be adjusted so the data always reaches the most recent day. We extract the 7 companies and start with the first 7 features from our project proposal: date/time, open price, high price, low price, close price, adjusted close price, and trading volume.

In [283]:
def extract_data(tickers, start_date="2020-01-01", end_date="2025-03-25", interval="1d"):
    """Download price history with yfinance and reshape it to one row per date and ticker.

    Parameters
    ----------
    tickers
        Passed straight through to ``yf.download``.
    start_date, end_date, interval
        Forwarded to ``yf.download`` as ``start``, ``end`` and ``interval``.

    Returns
    -------
    pandas.DataFrame
        Whichever of Date, Ticker, Open, High, Low, Close, Adj_Close and Volume
        exist after reshaping, in that order.
    """
    # auto_adjust stays off: per the original note, turning it on overrides the Close value
    prices = yf.download(
        tickers,
        start=start_date,
        end=end_date,
        interval=interval,
        auto_adjust=False,
    )

    # Swap the two column levels, then order the columns by the new outer level.
    prices.columns = prices.columns.swaplevel(0, 1)
    prices = prices.sort_index(axis=1, level=0)

    # Move the outer column level into the rows and flatten the index into plain columns.
    tidy = prices.stack(level=0, future_stack=True).reset_index()
    # rename() ignores keys that are not present, so both renames can share one call
    tidy = tidy.rename(columns={'level_0': 'Date', 'Adj Close': 'Adj_Close'})

    wanted = ('Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume')
    return tidy[[name for name in wanted if name in tidy.columns]]


In [284]:
# The seven companies to request from yfinance; fetch their history and show the frame.
tickers = [
    "AAPL", "MSFT", "GOOGL", "AMZN",
    "META", "NVDA", "TSLA",
]
df_raw = extract_data(tickers)
display(df_raw)

[*********************100%***********************]  7 of 7 completed


Price,Date,Ticker,Open,High,Low,Close,Adj_Close,Volume
0,2020-01-02,AAPL,74.059998,75.150002,73.797501,75.087502,72.716072,135480400
1,2020-01-02,AMZN,93.750000,94.900497,93.207497,94.900497,94.900497,80580000
2,2020-01-02,GOOGL,67.420502,68.433998,67.324501,68.433998,68.108368,27278000
3,2020-01-02,META,206.750000,209.789993,206.270004,209.779999,208.795929,12077100
4,2020-01-02,MSFT,158.779999,160.729996,158.330002,160.619995,153.323273,22622100
...,...,...,...,...,...,...,...,...
9186,2025-03-24,GOOGL,167.070007,168.320007,165.139999,167.679993,167.679993,30849700
9187,2025-03-24,META,614.969971,622.539978,612.200012,618.849976,618.849976,15722400
9188,2025-03-24,MSFT,395.399994,395.399994,389.809998,393.079987,393.079987,20985700
9189,2025-03-24,NVDA,119.879997,122.220001,119.339996,121.410004,121.410004,227701000


## Historical Data: Data Cleaning: 
### We will clean the data by handling any null values. If a float/int value is missing, we can fill it by forward and backward filling, or use the average of the past few days for that column. If the missing value is in the Date or Ticker column, we will drop the record. Additionally, if there are any duplicate records for a company on a specific date, one should be kept and the rest dropped.

In [164]:
def cleaning(df):
    """Return a cleaned copy of a tidy price table (one row per Date and Ticker).

    Steps:
      1. Drop rows whose ``Date`` or ``Ticker`` is missing.
      2. When both key columns exist, keep only the first record for each
         (Date, Ticker) pair and order the rows by those keys.
      3. Fill missing numeric values by forward-filling and then back-filling,
         separately for each ticker when a ``Ticker`` column exists.

    The input frame is never modified. A numeric column that is entirely
    missing for a ticker stays missing, since there is nothing to fill from.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean; key columns that are absent are simply skipped.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with a fresh 0..n-1 index.
    """
    key_cols = [col for col in ('Date', 'Ticker') if col in df.columns]
    cleaned = df.copy()

    if key_cols:
        cleaned = cleaned.dropna(subset=key_cols)
    if len(key_cols) == 2:
        # De-duplicate only on the full (Date, Ticker) key; sorting by date makes
        # the forward-fill below take the previous day's value.
        cleaned = cleaned.drop_duplicates(subset=key_cols, keep='first').sort_values(key_cols)
    # Unique index so the filled columns align back onto the frame row by row.
    cleaned = cleaned.reset_index(drop=True)

    numeric_cols = [
        col for col in cleaned.select_dtypes(include='number').columns
        if col not in key_cols
    ]
    if numeric_cols:
        if 'Ticker' in cleaned.columns:
            # Fill within each ticker so one company's values never leak into another's.
            cleaned[numeric_cols] = cleaned.groupby('Ticker', sort=False)[numeric_cols].ffill()
            cleaned[numeric_cols] = cleaned.groupby('Ticker', sort=False)[numeric_cols].bfill()
        else:
            cleaned[numeric_cols] = cleaned[numeric_cols].ffill().bfill()

    return cleaned

## Historical Data, Feature Engineering:

### We are going to transform the tidy dataframe with some feature engineering. We will add the following fields to our table by manipulating the pre-existing data from df_cleaned:
1. Simple Moving Average (SMA)
2. Exponential Moving Average (EMA)
3. Relative Strength Index (RSI)
4. Bollinger Bands
5. MACD (Moving Average Convergence Divergence)
6. On-Balance Volume (OBV)
7. Volatility (ATR - Average True Range)


In [165]:
def transform(df):
    """Feature-engineering step; currently a placeholder that returns ``df`` unchanged.

    TODO: none of the indicators planned for this stage (SMA, EMA, RSI,
    Bollinger Bands, MACD, OBV, ATR) are computed yet.
    """
    return df

## Loading into MySQL with historical data 

## Extracting Real time data
### Using the Finnhub API to get real-time stock market data, filling the Cassandra database every 10 seconds until 60 seconds have elapsed, which gives us the past minute of data

In [285]:
from datetime import datetime
import pandas as pd
import finnhub

# Read the Finnhub API key from the environment instead of committing it to the notebook.
# NOTE: the key that used to be hard-coded in this cell has been exposed and should be rotated.
import os

FINNHUB_API_KEY = os.environ.get("FINNHUB_API_KEY")
if not FINNHUB_API_KEY:
    raise RuntimeError("Set the FINNHUB_API_KEY environment variable before running this cell.")
finnhub_client = finnhub.Client(api_key=FINNHUB_API_KEY)

tickers = ["AAPL", "MSFT", "GOOGL", "AMZN", "META", "NVDA", "TSLA"]

def get_realtime_quotes():
    """Fetch one Finnhub quote per symbol in the module-level ``tickers`` list.

    Returns
    -------
    pandas.DataFrame
        One row per symbol with the columns Date, Ticker, Open, High, Low,
        Close, Adj_Close and Volume.
    """
    def _as_record(symbol):
        # Map the quote fields 't', 'o', 'h', 'l', 'c' onto the historical column names.
        quote = finnhub_client.quote(symbol)
        return {
            "Date": pd.to_datetime(quote['t'], unit='s'),  # 't' is read as seconds
            "Ticker": symbol,
            "Open": quote['o'],
            "High": quote['h'],
            "Low": quote['l'],
            "Close": quote['c'],
            "Adj_Close": quote['c'],  # same value as Close
            "Volume": None,           # no volume is taken from the quote
        }

    return pd.DataFrame([_as_record(symbol) for symbol in tickers])


## Loading into Cassandra with Real time data

In [286]:
from cassandra.cluster import Cluster

# Connect to the local Cassandra node exactly once. If the connection fails, release
# the cluster, report the error and stop the cell: every statement below needs `session`.
cluster = Cluster(['localhost'])
try:
    session = cluster.connect()
except Exception as e:
    cluster.shutdown()
    print("Connection error:", e)
    raise
print("Cassandra connection established.")

# Create the "stock_data" keyspace (if missing) and switch to it; it holds the
# real_time_quotes table that is filled from the Finnhub API.
session.execute("""
    CREATE KEYSPACE IF NOT EXISTS stock_data 
    WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
""")
session.set_keyspace('stock_data')

# Schema of the CQL table. The primary key is (ticker, date), so writing a row with
# the same ticker and date again replaces the earlier row rather than adding one.
session.execute("""
    CREATE TABLE IF NOT EXISTS real_time_quotes (
    ticker TEXT,
    date TIMESTAMP,
    open DOUBLE,
    high DOUBLE,
    low DOUBLE,
    close DOUBLE,
    adj_close DOUBLE,
    volume BIGINT,
    PRIMARY KEY (ticker, date)
);
""")


Cassandra connection established.


<cassandra.cluster.ResultSet at 0x32aacbf40>

In [287]:
from cassandra.query import PreparedStatement
import time
import os
# Store a snapshot from the API in Cassandra under the columns
# ticker, date, open, high, low, close, adj_close and volume.
def insert_to_cassandra(df):
    """Insert every row of ``df`` into ``real_time_quotes`` through the module-level ``session``.

    ``df`` must provide the columns Ticker, Date, Open, High, Low, Close,
    Adj_Close and Volume; a missing Volume is stored as 0.
    """
    insert_cql = """
    INSERT INTO real_time_quotes (ticker, date, open, high, low, close, adj_close, volume)
    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
    """
    price_columns = ('Open', 'High', 'Low', 'Close', 'Adj_Close')

    for _, quote in df.iterrows():
        ticker = quote['Ticker']
        stamp = quote['Date'].to_pydatetime()
        prices = [quote[column] for column in price_columns]

        # Fall back to 0 when the snapshot carries no volume.
        if pd.notna(quote['Volume']):
            volume = quote['Volume']
        else:
            volume = 0

        session.execute(insert_cql, (ticker, stamp, *prices, volume))

#making a backup in case the cassandra server acts up
def backup_to_csv(df, path='realtime_backup.csv'):
    """Append ``df`` to the CSV file at ``path``.

    The header row is written only when the file has no content yet, i.e. when
    it does not exist or exists but is empty.
    """
    # An existing zero-byte file still needs a header, otherwise the CSV has none at all.
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    df.to_csv(path, mode='a', index=False, header=needs_header)



In [288]:
# Pull a snapshot of the stock quotes every 10 seconds, 6 times in total.
N_SNAPSHOTS = 6
POLL_INTERVAL_SECONDS = 10

for snapshot_no in range(N_SNAPSHOTS):
    df_hour = get_realtime_quotes()
    # Back up first: the CSV is the safety net for a failing Cassandra insert,
    # so it has to be written before the insert gets a chance to raise.
    backup_to_csv(df_hour)
    insert_to_cassandra(df_hour)
    print(" Test snapshot inserted and backed up.")
    # Nothing follows the final snapshot, so do not wait after it.
    if snapshot_no < N_SNAPSHOTS - 1:
        time.sleep(POLL_INTERVAL_SECONDS)


 Test snapshot inserted and backed up.
 Test snapshot inserted and backed up.
 Test snapshot inserted and backed up.
 Test snapshot inserted and backed up.
 Test snapshot inserted and backed up.
 Test snapshot inserted and backed up.


In [290]:
# Read the whole real_time_quotes table back and load it into a pandas DataFrame
# for data manipulation.
query = "SELECT * FROM stock_data.real_time_quotes"
rows = session.execute(query)
df_cassandra = pd.DataFrame(
    rows,
)
display(df_cassandra)

## Real Time Data: Data Cleaning 


## Real Time Data: Feature Engineering 

## Exploratory Data Analysis

## Machine Learning
### ARIMA for MySQL (historical) and LSTM for Cassandra (real time)