# Stage 01: Problem Framing & Scoping
Goal: Enable individual investors to analyze NVDA's historical price trends and predict future ones using open data and reproducible analytics.

### Data Dictionary:
+ Date: Trading day
+ Open: Opening price
+ High: Highest price
+ Low: Lowest price
+ Close: Closing price
+ Volume: Trading volume

### Methodology:

+ Acquire daily OHLCV data for NVDA (2023-2025)
+ Clean, analyze, and model the data for trend detection and prediction

# Stage 02: Tooling Setup

In [36]:
# Environment & Setup Quick Check
from pathlib import Path
import os, sys
from dotenv import load_dotenv

# Load a .env file if one is present in (or above) the working directory.
load_dotenv()
print(".env loaded (if present)")

# Project directories, both derived from the current working directory.
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT / "data"
print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_DIR:", DATA_DIR)

# Report only whether the key is available; its value is never printed.
API_KEY = os.getenv("API_KEY")
print("API_KEY is set" if API_KEY else "API_KEY is not set")

# requirements.txt is looked up relative to the current working directory:
# missing, present-with-content, and present-but-empty are reported separately.
if not os.path.exists('requirements.txt'):
    print("text non-exist")
elif os.path.getsize('requirements.txt') > 0:
    print("text exist & is non-empty ")
else:
    print("text exist but empty")


.env loaded (if present)
PROJECT_ROOT: c:\Users\noven\Desktop\Github Fre-5040 Bootcamp\bootcamp_Junkuang_Lai\Bootcamp_Junkuang_Lai\project\notebooks
DATA_DIR: c:\Users\noven\Desktop\Github Fre-5040 Bootcamp\bootcamp_Junkuang_Lai\Bootcamp_Junkuang_Lai\project\notebooks\data
API_KEY is set
text exist & is non-empty 


# Stage 03: Python Fundamentals

In [None]:
from pathlib import Path
import pandas as pd

# Load dataset: the raw NVDA price file lives under <project root>/data/raw,
# where the project root is taken to be the parent of the working directory.
project_root = Path.cwd().parent
data_path = project_root / "data" / "raw" / "NVDA-Starter Data 2021-2025 Daily.csv"

print("Looking for data file at:", data_path)
print("File exists:", data_path.exists())

# If the path above is wrong, read_csv raises FileNotFoundError here.
df = pd.read_csv(data_path)

# Dataset quick check
print("Dataset loading:\n", df)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would add a stray "None" line.
df.info()
print(df.head())
print("Summary Statistics:\n", df.describe())

Looking for data file at: c:\Users\noven\Desktop\Github Fre-5040 Bootcamp\bootcamp_Junkuang_Lai\Bootcamp_Junkuang_Lai\project\data\raw\NVDA-Starter Data 2021-2025 Daily.csv
File exists: True
Dataset loading:
            time     open       high        low    close
0    2023-04-25   27.078   27.24700   26.22500   26.241
1    2023-04-26   27.002   27.33000   26.70510   26.956
2    2023-04-27   27.363   27.49500   26.62501   27.226
3    2023-04-28   27.225   27.75800   27.07148   27.749
4    2023-05-01   27.840   29.05844   27.78000   28.910
..          ...      ...        ...        ...      ...
579  2025-08-15  181.880  181.90000  178.04170  180.450
580  2025-08-18  180.600  182.93750  180.59000  182.010
581  2025-08-19  182.430  182.50000  175.49000  175.640
582  2025-08-20  175.165  176.00000  168.80100  175.400
583  2025-08-21  174.850  176.90000  173.81000  174.980

[584 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 584 entries, 0 to 583
Data columns (total 5 c

In [None]:
# src\utils demo
import sys

# Make the project's src/ folder importable. The entry is added only when it
# is missing, so re-running this cell does not pile up duplicate sys.path
# entries.
src_path = Path.cwd().parent / "src"
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from utils import calc_mean_std_logged

# utils example: call the helper on a small list and print what it returns
print(calc_mean_std_logged([1, 2, 3, 4, 5]))

Function calc_mean_std_logged called at 2025-08-23 09:42:46.217032
Function calc_mean_std_logged called at 2025-08-23 09:42:46.217559
(np.float64(3.0), np.float64(1.4142135623730951))


In [57]:
# Basic shape of the loaded dataset: column labels and row count.
print("Column names:", list(df.columns))
print("Number of rows:", df.shape[0])

def df_Summary(frame=None, group_col=None, value_col=None):
    """Summarise one column of a DataFrame per group.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Table to summarise. When omitted, the notebook-level ``df`` is used,
        so the original zero-argument call keeps working.
    group_col : hashable, optional
        Column to group by. Defaults to the first column of the table.
    value_col : hashable, optional
        Column to aggregate. Defaults to the second column of the table.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``group_col`` value, with two-level columns
        ``(value_col, 'mean')``, ``(value_col, 'sum')`` and
        ``(value_col, 'count')``.
    """
    # Fall back to the global only when no table is passed explicitly.
    data = df if frame is None else frame
    if group_col is None:
        group_col = data.columns[0]
    if value_col is None:
        value_col = data.columns[1]
    return data.groupby(group_col).agg({
        value_col: ['mean', 'sum', 'count']
    })

# Quick check example: show the first ten groups of the summary table.
summary_preview = df_Summary().head(10)
print(summary_preview)


Column names: ['time', 'open', 'high', 'low', 'close']
Number of rows: 584
               open               
               mean      sum count
time                              
2023-04-25  27.0780  27.0780     1
2023-04-26  27.0020  27.0020     1
2023-04-27  27.3630  27.3630     1
2023-04-28  27.2250  27.2250     1
2023-05-01  27.8400  27.8400     1
2023-05-02  28.6800  28.6800     1
2023-05-03  27.8400  27.8400     1
2023-05-04  27.6510  27.6510     1
2023-05-05  27.8255  27.8255     1
2023-05-08  28.5220  28.5220     1


In [None]:
# Write the descriptive statistics to their own file under data/processed so
# the loaded dataset itself is left untouched.
project_root = Path.cwd().parent
processed_dir = project_root.joinpath("data", "processed")
processed_dir.mkdir(parents=True, exist_ok=True)

summary_stats = df.describe()
summary_path = processed_dir / "summary.csv"
summary_stats.to_csv(summary_path)
print(f"Summary stats saved to {summary_path}")

Summary stats saved to c:\Users\noven\Desktop\Github Fre-5040 Bootcamp\bootcamp_Junkuang_Lai\Bootcamp_Junkuang_Lai\project\data\processed\summary.csv


# Stage 04: Data Acquisition and Ingestion

+ The NVDA dataset for this project was already downloaded during the homework, so the script below only demonstrates the API ingestion step required by the milestone.

In [None]:
# Download stock data from yfinance
import yfinance as yf

SYMBOL = 'NVDA'

# Six months of daily bars. With auto_adjust=True the adjusted prices are
# expected in 'Close' (no separate 'Adj Close'); the 'Adj Close' lookup below
# is kept as a fallback in case the installed yfinance behaves differently.
df_api = yf.download(SYMBOL, period='6mo', interval='1d', auto_adjust=True).reset_index()

# NOTE(review): yfinance appears to report failed downloads by logging and
# returning an empty frame rather than raising -- fail loudly here instead of
# carrying an empty table into later stages.
if df_api.empty:
    raise ValueError(f"yfinance returned no rows for {SYMBOL}")

# Some yfinance releases return two-level (field, ticker) columns even for a
# single ticker; keep only the field level so plain labels can be used below.
if df_api.columns.nlevels > 1:
    df_api.columns = df_api.columns.get_level_values(0)

# Use 'Adj Close' if present, else use 'Close'
price_col = next((col for col in ('Adj Close', 'Close') if col in df_api.columns), None)
if price_col is None:
    raise ValueError("No 'Adj Close' or 'Close' column found in yfinance data")

# Keep only the date and the chosen price, under the notebook's column names.
df_api = df_api[['Date', price_col]].rename(
    columns={'Date': 'date', price_col: 'adj_close'}
)

# Quick check
df_api.head()


[*********************100%***********************]  1 of 1 completed


Unnamed: 0,date,adj_close
0,2025-02-24,130.258972
1,2025-02-25,126.609558
2,2025-02-26,131.25882
3,2025-02-27,120.130608
4,2025-02-28,124.899834


# Stage 05: Data Storage