In [2]:
from dotenv import load_dotenv
import os

load_dotenv()  # loads variables from .env file
print('ALPHAVANTAGE_API_KEY loaded?', bool(os.getenv('ALPHAVANTAGE_API_KEY')))


ALPHAVANTAGE_API_KEY loaded? True


In [2]:
import os
import pathlib
import datetime as dt
import requests
import pandas as pd
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Create raw data folder
RAW = pathlib.Path('data/raw')
RAW.mkdir(parents=True, exist_ok=True)

# Load API key from .env
load_dotenv()
print('ALPHAVANTAGE_API_KEY loaded?', bool(os.getenv('ALPHAVANTAGE_API_KEY')))


ALPHAVANTAGE_API_KEY loaded? True


In [3]:
def ts():
    return dt.datetime.now().strftime('%Y%m%d-%H%M%S')

def save_csv(df: pd.DataFrame, prefix: str, **meta):
    mid = '_'.join([f"{k}-{v}" for k,v in meta.items()])
    path = RAW / f"{prefix}_{mid}_{ts()}.csv"
    df.to_csv(path, index=False)
    print('Saved', path)
    return path

def validate(df: pd.DataFrame, required):
    missing = [c for c in required if c not in df.columns]
    return {'missing': missing, 'shape': df.shape, 'na_total': int(df.isna().sum().sum())}


In [4]:
SYMBOL = 'AAPL'
USE_ALPHA = bool(os.getenv('ALPHAVANTAGE_API_KEY'))

if USE_ALPHA:
    url = 'https://www.alphavantage.co/query'
    params = {
        'function': 'TIME_SERIES_DAILY_ADJUSTED',
        'symbol': SYMBOL,
        'outputsize': 'compact',
        'apikey': os.getenv('ALPHAVANTAGE_API_KEY')
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    js = r.json()
    
    # Check Time Series key
    ts_keys = [k for k in js if 'Time Series' in k]
    if not ts_keys:
        print("No Time Series data found in Alpha Vantage response. Switching to yfinance fallback.")
        USE_ALPHA = False
    
if not USE_ALPHA:
    import yfinance as yf
    df_api = yf.download(SYMBOL, period='3mo', interval='1d').reset_index()[['Date','Adj Close']]
    df_api.columns = ['date','adj_close']

# Convert types
df_api['date'] = pd.to_datetime(df_api['date'])
df_api['adj_close'] = pd.to_numeric(df_api['adj_close'])

# Validate and save
v_api = validate(df_api, ['date','adj_close'])
print('API Validation:', v_api)
_ = save_csv(df_api.sort_values('date'), prefix='api', source='yfinance' if not USE_ALPHA else 'alpha', symbol=SYMBOL)


No Time Series data found in Alpha Vantage response. Switching to yfinance fallback.


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['AAPL']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


API Validation: {'missing': [], 'shape': (0, 2), 'na_total': 0}
Saved data/raw/api_source-yfinance_symbol-AAPL_20250820-233154.csv


In [5]:
import requests
from bs4 import BeautifulSoup

SCRAPE_URL = 'https://www.nasdaq.com/market-activity/stocks/screener'  # example public page
headers = {'User-Agent':'AFE-Homework/1.0'}

try:
    resp = requests.get(SCRAPE_URL, headers=headers, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Extract table rows
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    # Filter out empty rows
    rows = [r for r in rows if r]
    header, *data = rows

    # Create DataFrame
    df_scrape = pd.DataFrame(data, columns=header)
except Exception as e:
    print('Scrape failed, using inline demo table:', e)
    html = '<table><tr><th>Ticker</th><th>Price</th></tr><tr><td>AAA</td><td>101.2</td></tr></table>'
    soup = BeautifulSoup(html, 'html.parser')
    rows = [[c.get_text(strip=True) for c in tr.find_all(['th','td'])] for tr in soup.find_all('tr')]
    header, *data = [r for r in rows if r]
    df_scrape = pd.DataFrame(data, columns=header)

# Convert numeric columns
if 'Price' in df_scrape.columns:
    df_scrape['Price'] = pd.to_numeric(df_scrape['Price'], errors='coerce')

# Validate
v_scrape = validate(df_scrape, list(df_scrape.columns))
print('Scrape Validation:', v_scrape)

# Save CSV
_ = save_csv(df_scrape, prefix='scrape', site='nasdaq_table', table='markets')


Scrape failed, using inline demo table: 1 columns passed, passed data had 6 columns
Scrape Validation: {'missing': [], 'shape': (1, 2), 'na_total': 0}
Saved data/raw/scrape_site-nasdaq_table_table-markets_20250820-233432.csv


## Step 5: Documentation, Assumptions & Risks

### Data Sources
1. **API:** Alpha Vantage / Yahoo Finance fallback (`AAPL` daily adjusted close prices)
2. **Scrape:** Example market table (fallback inline HTML table used due to live scrape failure)

### Parameters
- Symbol: `AAPL`
- API: Alpha Vantage (key from `.env`) or `yfinance` fallback
- Scrape URL: placeholder `https://example.com/markets-table`
- CSV files saved to: `data/raw/`

### Validation Logic
- API data:
  - Required columns: `date`, `adj_close`
  - Check for missing columns, total NA values, shape
- Scrape data:
  - Required columns: all columns in table
  - Check for missing columns, total NA values, shape

### Assumptions & Risks
- API fallback ensures workflow works if Alpha Vantage premium endpoint is unavailable
- Scrape fallback ensures reproducibility even if the webpage is unreachable
- Rate limits on `yfinance` may prevent fetching recent data immediately
- CSV files are timestamped for versioning


In [6]:
# Confirm .env is not committed and show what API key is loaded
import os
from dotenv import load_dotenv

load_dotenv()
print("ALPHAVANTAGE_API_KEY loaded?", bool(os.getenv('ALPHAVANTAGE_API_KEY')))

# List all CSVs saved in data/raw/ for reference
import pathlib

RAW = pathlib.Path('data/raw')
for f in sorted(RAW.glob('*.csv')):
    print(f.name)


ALPHAVANTAGE_API_KEY loaded? True
api_source-yfinance_symbol-AAPL_20250820-232435.csv
api_source-yfinance_symbol-AAPL_20250820-233154.csv
scrape_site-nasdaq_table-markets_20250820-232436.csv
scrape_site-nasdaq_table_table-markets_20250820-233432.csv
