In [20]:
pip install lxml

Collecting lxml
  Downloading lxml-6.0.0-cp313-cp313-win_amd64.whl.metadata (6.8 kB)
Downloading lxml-6.0.0-cp313-cp313-win_amd64.whl (4.0 MB)
   ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
   ---------- ----------------------------- 1.0/4.0 MB 20.7 MB/s eta 0:00:01
   ---------------------------------------- 4.0/4.0 MB 28.1 MB/s eta 0:00:00
Installing collected packages: lxml
Successfully installed lxml-6.0.0
Note: you may need to restart the kernel to use updated packages.


In [21]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from datetime import datetime


In [22]:
load_dotenv()
api_key = os.getenv("API_KEY")
print("API Key Loaded:", bool(api_key))


API Key Loaded: True


In [23]:
# Params
ticker = "AAPL"
url = "https://www.alphavantage.co/query"
params = {
    "function": "TIME_SERIES_DAILY",
    "symbol": ticker,
    "apikey": api_key,
    "outputsize": "compact"
}

# Request
response = requests.get(url, params=params)
data = response.json()

# Parse
df_api = pd.DataFrame.from_dict(data["Time Series (Daily)"], orient="index")
df_api.index = pd.to_datetime(df_api.index)
df_api = df_api.astype(float).reset_index().rename(columns={"index":"date"})

# Validate
print(df_api.shape)
print(df_api.isna().sum())

# Save
timestamp = datetime.now().strftime("%Y%m%d-%H%M")
fname = f"D:/bootcamp_Jinay_Jain/homework/homework4/data/raw/api_alphavantage_{ticker}_{timestamp}.csv"
df_api.to_csv(fname, index=False)


(100, 6)
date         0
1. open      0
2. high      0
3. low       0
4. close     0
5. volume    0
dtype: int64


In [24]:
url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
html = requests.get(url).text
soup = BeautifulSoup(html, "html.parser")

# Parse first table
table = soup.find("table", {"id": "constituents"})
df_scrape = pd.read_html(str(table))[0]

# Validate
print(df_scrape.head())
print(df_scrape.isna().sum())

# Save
timestamp = datetime.now().strftime("%Y%m%d-%H%M")
fname = f"D:/bootcamp_Jinay_Jain/homework/homework4/data/raw/scrape_wikipedia_sp500_{timestamp}.csv"
df_scrape.to_csv(fname, index=False)


  Symbol             Security             GICS Sector  \
0    MMM                   3M             Industrials   
1    AOS          A. O. Smith             Industrials   
2    ABT  Abbott Laboratories             Health Care   
3   ABBV               AbbVie             Health Care   
4    ACN            Accenture  Information Technology   

                GICS Sub-Industry    Headquarters Location  Date added  \
0        Industrial Conglomerates    Saint Paul, Minnesota  1957-03-04   
1               Building Products     Milwaukee, Wisconsin  2017-07-26   
2           Health Care Equipment  North Chicago, Illinois  1957-03-04   
3                   Biotechnology  North Chicago, Illinois  2012-12-31   
4  IT Consulting & Other Services          Dublin, Ireland  2011-07-06   

       CIK      Founded  
0    66740         1902  
1    91142         1916  
2     1800         1888  
3  1551152  2013 (1888)  
4  1467373         1989  
Symbol                   0
Security                 0
GI

  df_scrape = pd.read_html(str(table))[0]


### Data Sources
- API: Alpha Vantage (TIME_SERIES_DAILY) — params: symbol=AAPL, outputsize=compact
- Scrape: Wikipedia S&P 500 Companies List

### Validation Logic
- API: ensured dates parsed, numeric types converted, NA counts checked
- Scrape: verified text/numeric cols, NA counts checked

### Assumptions & Risks
- API limited to 5 calls/min (risk: throttling)
- Wikipedia structure may change (risk: scraping fails)
- Dates parsed as UTC (risk: time zone differences)

### Notes
- `.env` file stores API key and is excluded from GitHub.
