In [17]:
# Install the third-party packages used by this notebook into the running kernel's environment.
%pip install pandas numpy matplotlib seaborn statsmodels requests openpyxl yfinance kaggle python-dotenv

1384.97s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Install the beaapi client from a local wheel file.
# NOTE(review): the wheel path is absolute and machine-specific; adjust it before running elsewhere.
%pip install /home/pinagm/beaapi-0.0.2-py3-none-any.whl

Processing /home/pinagm/beaapi-0.0.2-py3-none-any.whl
beaapi is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Note: you may need to restart the kernel to use updated packages.


## Getting the ETF Fund data from State Street Global Advisors

In [19]:
import requests as req
import pandas as pd
import os
import glob
import subprocess
import zipfile
import  yfinance as yf
from typing import List
import beaapi as bea
from dotenv import load_dotenv
from datetime import datetime
# Load environment variables via python-dotenv; the BEA API key is read from os.environ further down.
load_dotenv()


True

In [4]:
def fetch_and_process_etf_data(etf_list, timeout=30):
    """Download the SSGA daily-holdings workbook for each ETF and return the holdings tables.

    For every ticker the workbook is fetched from ssga.com, saved as
    ``SSGA Data/<ticker>-<MM-DD-YYYY>.xlsx`` and parsed with pandas
    (columns A:H, header on the fifth row). The disclaimer text that follows
    the holdings and any trailing blank rows are removed.

    Parameters
    ----------
    etf_list : iterable of str
        ETF tickers (any case); they are lower-cased to build the URL.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    list of pandas.DataFrame
        One holdings table per ticker, in the same order as ``etf_list``.

    Raises
    ------
    requests.exceptions.RequestException
        If a download times out or the server answers with an HTTP error.
        Failing here avoids saving an error page under an ``.xlsx`` name.
    """
    subfolder = 'SSGA Data'
    os.makedirs(subfolder, exist_ok=True)

    # The disclaimer block begins with this sentence. Matching on the prefix
    # (instead of the full paragraph) still works if the rest is reworded.
    disclaimer_prefix = "Past performance is not a reliable indicator of future performance."

    # One date stamp for the whole batch so all files of a run share a suffix.
    current_date = datetime.now().strftime('%m-%d-%Y')

    ssga_df_list = []
    for etf in (symbol.lower() for symbol in etf_list):
        url = f'https://www.ssga.com/library-content/products/fund-data/etfs/us/holdings-daily-us-en-{etf}.xlsx'
        response = req.get(url, timeout=timeout)
        response.raise_for_status()

        file_path = os.path.join(subfolder, f'{etf}-{current_date}.xlsx')
        with open(file_path, 'wb') as file:
            file.write(response.content)
        df = pd.read_excel(file_path, skiprows=4, header=0, usecols="A:H")

        # Cut the table at the first disclaimer row (positional slice, so the
        # disclaimer row itself and everything after it are dropped).
        is_disclaimer = df['Name'].astype(str).str.startswith(disclaimer_prefix).to_numpy()
        if is_disclaimer.any():
            df = df.iloc[:int(is_disclaimer.argmax())]

        # Strip every fully blank row left at the bottom; the length check
        # keeps an empty table from raising IndexError on iloc[-1].
        while len(df) and df.iloc[-1].isna().all():
            df = df.iloc[:-1]

        ssga_df_list.append(df)

    return ssga_df_list

# Smoke test: fetch the holdings of four sector ETFs and print the last rows of each table.
etf_list = ['XLI', 'XLK', 'XLE', 'XLB']
ssga_df_list = fetch_and_process_etf_data(etf_list)
for df in ssga_df_list:
    print(df.tail())

                             Name  Ticker Identifier    SEDOL    Weight  \
76           GENERAC HOLDINGS INC    GNRC  368736104  B6197Q2  0.195883   
77   HUNTINGTON INGALLS INDUSTRIE     HII  446413106  B40SSC9  0.185974   
78                      US DOLLAR       -  999USDZ92        -  0.025554   
79  SSI US GOV MONEY MARKET CLASS       -  924QSGII3        -  0.018963   
80        XAI EMINI INDUSTR MAR25  AIXH25  ADI2SGVK5        - -0.002176   

   Sector  Shares Held Local Currency  
76      -    303522.00            USD  
77      -    200836.00            USD  
78      -   5408113.03            USD  
79      -   4013194.45            USD  
80      -     25800.00            USD  
                             Name  Ticker Identifier    SEDOL    Weight  \
68             ENPHASE ENERGY INC    ENPH  29355A107  B65SQW4  0.095184   
69                      US DOLLAR       -  999USDZ92        -  0.038426   
70  SSI US GOV MONEY MARKET CLASS       -  924QSGII3        -  0.027552   
71       

In [10]:
def fetch_bea_data(api_key, years, timeout=60):
    """Request the GDPbyIndustry dataset from the BEA API and return the decoded JSON.

    Parameters
    ----------
    api_key : str
        BEA API user id, sent as ``UserID``.
    years : str, int or iterable
        Years to request, either as a comma-separated string
        (``"2020,2021"``), a single year, or an iterable of years.
        Surrounding whitespace in each entry is stripped before sending.
    timeout : float, optional
        Seconds to wait for the HTTP response.

    Returns
    -------
    dict or None
        The decoded response, or ``None`` (after printing a message) when the
        request fails, the body is not the expected JSON, or the API reports
        an error.
    """
    if isinstance(years, (str, int)):
        year_parts = str(years).split(",")
    else:
        year_parts = [str(year) for year in years]
    # Drop blanks and stray spaces such as the one in "2023, 2024".
    year_param = ",".join(part.strip() for part in year_parts if part.strip())

    base_url = "https://apps.bea.gov/api/data/"
    params = {
        "UserID": api_key,
        "method": "GetData",
        "datasetname": "GDPbyIndustry",
        "Frequency": "A,Q",
        "Year": year_param,
        "Industry": "ALL",
        "TableID": "ALL",
        "ResultFormat": "JSON"
    }
    try:
        response = req.get(base_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except req.exceptions.RequestException as e:
        print(f"API request failed: {e}")
        return None
    except ValueError as e:
        # Some requests versions raise a plain ValueError for a non-JSON body.
        print(f"API request failed: response was not valid JSON ({e})")
        return None

    payload = data.get('BEAAPI') if isinstance(data, dict) else None
    if not isinstance(payload, dict):
        print("API request failed: response does not contain a 'BEAAPI' object.")
        return None

    error = payload.get('Error')
    # NOTE(review): some BEA responses appear to nest the error under
    # 'Results' instead of directly under 'BEAAPI'; check both places.
    if error is None and isinstance(payload.get('Results'), dict):
        error = payload['Results'].get('Error')
    if error is not None:
        if isinstance(error, dict):
            error_code = error.get('APIErrorCode', 'unknown')
            error_description = error.get('APIErrorDescription', 'no description provided')
        else:
            error_code, error_description = 'unknown', str(error)
        print(f"API request failed with error code {error_code}: {error_description}")
        return None
    return data

def process_bea_data(data):
    """Reshape a BEA GDP-by-industry JSON payload into a wide industry-by-period table.

    Parameters
    ----------
    data : dict or None
        Decoded API response. The rows are read from
        ``data['BEAAPI']['Results']``, which may be a list (its first element
        is used) or a dict, and must hold a ``'Data'`` list of row dicts.

    Returns
    -------
    pandas.DataFrame or None
        One row per (Industry, IndustryDescription) and one column per
        ``<Year>Q<Quarter>`` label holding the summed ``DataValue``; rows for
        the aggregate/input descriptions listed below are removed. ``None``
        (after printing a message) when the payload has no usable rows.
    """
    payload = data.get('BEAAPI') if isinstance(data, dict) else None
    if not isinstance(payload, dict) or 'Results' not in payload:
        print("Error: 'Results' key not found in the API response.")
        return None

    results = payload['Results']
    if isinstance(results, list):
        first_result = results[0] if results else None
    else:
        first_result = results
    nested_data = first_result.get('Data') if isinstance(first_result, dict) else None
    if not nested_data:
        print("Error: 'Data' key not found in the API response.")
        return None

    # The API spells the column "IndustrYDescription"; normalise it.
    bea_df = pd.DataFrame(nested_data).rename(columns={"IndustrYDescription": "IndustryDescription"})

    # Values arrive as text. Remove thousands separators and turn anything
    # that still is not a number into NaN instead of aborting the whole run.
    bea_df['DataValue'] = pd.to_numeric(
        bea_df['DataValue'].astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    )

    # Pivot the data to have years and quarters as columns.
    # NOTE(review): annual rows produce labels such as "2020Q2020" (visible in
    # the notebook output), and aggfunc='sum' adds up every row sharing an
    # (industry, period) key - with TableID "ALL" that presumably mixes
    # several tables; confirm this is intended.
    bea_df['YearQuarter'] = bea_df['Year'].astype(str) + 'Q' + bea_df['Quarter'].astype(str)
    pivot_df = bea_df.pivot_table(
        index=['Industry', 'IndustryDescription'],
        columns='YearQuarter',
        values='DataValue',
        aggfunc='sum',
    ).reset_index()

    # Filter out unwanted industry descriptions
    unwanted_descriptions = [
        "Taxes on production and imports less subsidies",
        "Energy inputs",
        "Intermediate inputs",
        "Materials inputs",
        "Purchased-services inputs",
        "Value added",
        "Compensation of employees",
        "Gross operating surplus"
    ]
    filtered_df = pivot_df[~pivot_df['IndustryDescription'].isin(unwanted_descriptions)]

    return filtered_df

def save_bea_data(filtered_df):
    """Write the table to a date-stamped workbook inside 'BEA Data' and return the file path.

    The folder is created when missing; the file name carries today's date
    in MM-DD-YYYY form and the frame is written without its index.
    """
    current_date = datetime.now().strftime('%m-%d-%Y')
    target_dir = 'BEA Data'
    os.makedirs(target_dir, exist_ok=True)

    workbook_path = os.path.join(target_dir, f"bea-gdp-by-industry-raw-{current_date}.xlsx")
    filtered_df.to_excel(workbook_path, index=False)
    return workbook_path

def map_sectors(filtered_df):
    """Label each row with a market sector and keep only the four focus sectors.

    A "Sector" column is derived from "IndustryDescription" on a copy of the
    input (the caller's frame is left untouched). Descriptions that are not
    listed below are labelled "Other" and therefore dropped from the result.
    """
    industries_by_sector = {
        "Technology": [
            "Computer and electronic products",
            "Computer systems design and related services",
            "Data processing, internet publishing, and other information services",
            "Information-communications-technology-producing industries"
        ],
        "Materials": [
            "Agriculture, forestry, fishing, and hunting",
            "Farms",
            "Forestry, fishing, and related activities",
            "Mining",
            "Mining, except oil and gas",
            "Support activities for mining",
            "Wood products",
            "Paper products",
            "Chemical products",
            "Plastics and rubber products",
            "Nonmetallic mineral products",
            "Primary metals",
            "Fabricated metal products"
        ],
        "Energy": [
            "Oil and gas extraction",
            "Petroleum and coal products",
            "Pipeline transportation"
        ],
        "Industrials": [
            "Construction",
            "Machinery",
            "Electrical equipment, appliances, and components",
            "Other transportation equipment",
            "Miscellaneous manufacturing",
            "Durable goods",
            "Wholesale trade",
            "Rail transportation",
            "Water transportation",
            "Truck transportation",
            "Transit and ground passenger transportation",
            "Other transportation and support activities",
            "Transportation and warehousing",
            "Warehousing and storage",
            "Waste management and remediation services"
        ]
    }

    # Invert the mapping to description -> sector. setdefault keeps the first
    # sector that lists a description, matching a top-to-bottom scan.
    sector_of = {}
    for sector_name, descriptions in industries_by_sector.items():
        for description in descriptions:
            sector_of.setdefault(description, sector_name)

    tagged = filtered_df.copy()
    tagged["Sector"] = [sector_of.get(description, "Other") for description in tagged["IndustryDescription"]]

    wanted_sectors = ["Technology", "Materials", "Energy", "Industrials"]
    return tagged[tagged["Sector"].isin(wanted_sectors)]

def fetch_bea_gdp_by_industry(api_key, years):
    """Run the BEA pipeline end to end: fetch, reshape, save, tag sectors, save again.

    Returns the sector-filtered DataFrame, or None when the download or the
    processing step yields nothing. Two workbooks are written under
    'BEA Data': the raw pivot (via save_bea_data) and the sector-filtered table.
    """
    raw_payload = fetch_bea_data(api_key, years)
    if raw_payload is None:
        return None

    pivoted = process_bea_data(raw_payload)
    if pivoted is None:
        print("Error: Processed data is None.")
        return None

    # save_bea_data also creates the 'BEA Data' folder used for the second file.
    save_bea_data(pivoted)
    sector_df = map_sectors(pivoted)

    current_date = datetime.now().strftime('%m-%d-%Y')
    output_path = os.path.join('BEA Data', f"bea-gdp-by-industry-filtered-{current_date}.xlsx")
    sector_df.to_excel(output_path, index=False)
    return sector_df

# Fetch BEA GDP-by-industry data for the study years and preview the sector-filtered table.
# The API key comes from the environment (variable "beakey").
api_key = os.environ.get("beakey")
# Comma-separated with no spaces, so every year is sent to the API exactly as written.
years = "2020,2021,2022,2023,2024"
if not api_key:
    # Without a key the request cannot succeed; say so instead of sending it.
    print("Error: environment variable 'beakey' is not set; skipping the BEA download.")
    bea_df = None
else:
    bea_df = fetch_bea_gdp_by_industry(api_key, years)
if bea_df is not None:
    print(bea_df.tail())

YearQuarter Industry                                IndustryDescription  \
394             48TW                     Transportation and warehousing   
404              493                            Warehousing and storage   
451              514  Data processing, internet publishing, and othe...   
559             5415       Computer systems design and related services   
602              562          Waste management and remediation services   

YearQuarter  2020Q2020    2020QI   2020QII  2020QIII   2020QIV  2021Q2021  \
394           7881.345  8036.309  5907.210  7264.389  7595.658   9350.114   
404           2134.056  1338.051  1240.265  1570.607  1506.619   2378.808   
451           4786.393  3599.149  3500.392  4012.804  4217.588   5781.148   
559           4695.484  3919.720  3784.263  3775.772  4042.613   5194.543   
602           2083.641  1357.995  1169.754  1400.315  1396.236   2285.658   

YearQuarter    2021QI   2021QII  ...   2022QIV  2023Q2023    2023QI   2023QII  \
394  

In [11]:
def setup_kaggle_data():
    """Make sure the Kaggle stock-history CSV is available under 'Kaggle Data'.

    Nothing is downloaded when the folder already holds a ``.csv`` file.
    Otherwise the dataset archive is fetched with the Kaggle CLI into the
    current working directory, extracted into 'Kaggle Data', and every
    extracted regular file that is not a ``.csv`` is deleted.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``kaggle`` command exits with a non-zero status.
    """
    # Step 1: Ensure the "Kaggle Data" directory exists
    kaggle_data_dir = "Kaggle Data"
    os.makedirs(kaggle_data_dir, exist_ok=True)

    # Step 2: Nothing to do if a .csv file is already present
    if any(filename.endswith('.csv') for filename in os.listdir(kaggle_data_dir)):
        return

    # Step 3: Download the dataset using the Kaggle CLI
    dataset = "jakewright/9000-tickers-of-stock-market-data-full-history"
    zip_filename = dataset.split('/')[-1] + ".zip"
    subprocess.run(["kaggle", "datasets", "download", "-d", dataset], check=True)

    # Step 4: Unzip the archive; remove it afterwards even when extraction
    # fails so a broken archive does not linger in the working directory.
    try:
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(kaggle_data_dir)
    finally:
        if os.path.exists(zip_filename):
            os.remove(zip_filename)

    # Step 5: Remove extracted files that are not .csv files. Directories are
    # left in place because os.remove() raises when given a directory.
    for filename in os.listdir(kaggle_data_dir):
        entry_path = os.path.join(kaggle_data_dir, filename)
        if os.path.isfile(entry_path) and not filename.endswith('.csv'):
            os.remove(entry_path)

# Make sure the Kaggle CSV is present locally (downloaded only when missing)
setup_kaggle_data()

# Read the full price history and turn the 'Date' strings into timestamps
kag_df = pd.read_csv('Kaggle Data/all_stock_data.csv')
kag_df['Date'] = pd.to_datetime(kag_df['Date'])

# Keep rows dated 2020-01-01 through 2023-12-31 (both bounds inclusive)
start_date = '2020-01-01'
end_date = '2023-12-31'
in_study_window = kag_df['Date'].between(start_date, end_date)
filtered_kag_df = kag_df[in_study_window]

# Preview the first rows of the filtered frame
filtered_kag_df.head()

Dataset URL: https://www.kaggle.com/datasets/jakewright/9000-tickers-of-stock-market-data-full-history
License(s): other
Downloading 9000-tickers-of-stock-market-data-full-history.zip to /home/pinagm/dev/Udacity_DataAnalyst_NanoDegree/D502 - Capstone Project


100%|██████████| 1.79G/1.79G [00:27<00:00, 70.8MB/s]





Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Dividends,Stock Splits
24919429,2020-01-02,HOFV,232.320007,232.539993,231.660004,231.660004,11459.0,0.0,0.0
24919430,2020-01-02,CTSH,61.055629,61.222259,60.457717,60.692959,2234500.0,0.0,0.0
24919431,2020-01-02,AZUL,43.610001,44.080002,43.150002,43.59,532300.0,0.0,0.0
24919432,2020-01-02,FNCTF,12.707134,12.707134,12.707134,12.707134,100.0,0.0,0.0
24919433,2020-01-02,BLE,13.720488,13.720488,13.621064,13.657218,54700.0,0.0,0.0


In [12]:
# Show the last rows to check the end of the filtered date range.
filtered_kag_df.tail()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Dividends,Stock Splits
32724160,2023-12-29,RTO,28.459999,28.620001,28.26,28.610001,620800.0,0.0,0.0
32724161,2023-12-29,SHZNY,41.02,41.02,41.02,41.02,0.0,0.0,0.0
32724162,2023-12-29,ONMD,0.745,0.9,0.745,0.81,70100.0,0.0,0.0
32724163,2023-12-29,KMPR,48.349998,49.150002,48.150002,48.669998,312300.0,0.0,0.0
32724164,2023-12-29,SBT,5.85,5.87,5.77,5.77,14200.0,0.0,0.0


In [15]:
def ensure_directory_exists(directory: str) -> None:
    """Create `directory` (including parents) if it is missing and report what happened."""
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return

    os.makedirs(directory)
    print(f"Directory '{directory}' created.")

def download_data(tickers: List[str], start_date: str, end_date: str, interval: str) -> pd.DataFrame:
    """Fetch price history for `tickers` through `yf.download`.

    `start_date`, `end_date` and `interval` are forwarded as the `start`,
    `end` and `interval` arguments. Columns are grouped by ticker, prices are
    auto-adjusted, and threaded download with a progress bar is enabled.
    """
    request_options = dict(
        tickers=tickers,
        start=start_date,
        end=end_date,
        interval=interval,
        group_by="ticker",
        auto_adjust=True,
        threads=True,
        progress=True,
    )
    return yf.download(**request_options)

def filter_columns(df: pd.DataFrame, tickers: List[str], fields: List[str]) -> pd.DataFrame:
    """Return a copy of `df` holding only the (ticker, field) columns that exist.

    Columns are ordered ticker by ticker, then by field within a ticker;
    requested pairs that are absent from `df` are skipped silently.
    """
    available = df.columns
    selected = []
    for symbol in tickers:
        for field in fields:
            key = (symbol, field)
            if key in available:
                selected.append(key)
    return df[selected].copy()

def add_change_columns(df: pd.DataFrame, tickers: List[str]) -> pd.DataFrame:
    """Add a (ticker, "Change") column equal to High minus Low for each ticker.

    The frame is modified in place and also returned. Tickers lacking either
    the "High" or the "Low" column are left untouched.
    """
    for symbol in tickers:
        high_key, low_key = (symbol, "High"), (symbol, "Low")
        if high_key not in df.columns or low_key not in df.columns:
            continue
        df[(symbol, "Change")] = df[high_key].sub(df[low_key])
    return df

def process_change_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the per-ticker "Change" columns into a flat table.

    The result has one `<ticker>_Change` column per ticker, preceded by the
    former index as a regular column; an unnamed index (which pandas calls
    "index" after the reset) is renamed to "Date".
    """
    change_only = df.loc[:, (slice(None), "Change")].copy()

    # Flatten the two-level header: drop the field level, then suffix the ticker.
    ticker_level = change_only.columns.droplevel(1)
    change_only.columns = [f"{ticker}_Change" for ticker in ticker_level]

    flat = change_only.reset_index()
    if flat.columns[0] == "index":
        flat = flat.rename(columns={"index": "Date"})
    return flat

def save_to_csv(df: pd.DataFrame, path: str) -> None:
    """Write `df` to `path` as CSV without the index and print a confirmation line."""
    df.to_csv(path, index=False)
    confirmation = f"Data with only 'Change' columns saved to: {path}"
    print(confirmation)

# Yahoo Finance flow: quarterly High-Low range ("Change") for the four sector ETFs
data_dir = "Yahoo Data"
ensure_directory_exists(data_dir)
csv_path = os.path.join(data_dir, "sector_quarterly_only_change.csv")

# Download 3-month bars for the study window
tickers = ["XLI", "XLK", "XLE", "XLB"]
df_full = download_data(tickers, "2020-01-01", "2024-01-01", "3mo")

# Keep High/Low per ticker, derive the Change columns, then flatten to one column per ticker
df_filtered = add_change_columns(filter_columns(df_full, tickers, ["High", "Low"]), tickers)
df_only_change = process_change_columns(df_filtered)

# Persist the flat table and preview it
save_to_csv(df_only_change, csv_path)
print(df_only_change.head())

[*********************100%***********************]  4 of 4 completed

Directory 'Yahoo Data' created.
Data with only 'Change' columns saved to: Yahoo Data/sector_quarterly_only_change.csv
        Date  XLI_Change  XLK_Change  XLE_Change  XLB_Change
0 2020-01-01   34.623135   33.295942   30.822577   21.908651
1 2020-04-01   19.467889   28.695256   16.112842   16.252516
2 2020-07-01   12.474146   23.360491    8.296762   10.461685
3 2020-10-01   14.514662   21.075830   12.746975   10.079117
4 2021-01-01   13.851598   14.060702   14.423070    9.381535





In [None]:
import pandas as pd
import glob

# Step 1: Read the SSGA Excel files and extract the top 10 stocks by weight.
# The folder is given relative to the working directory, the same 'SSGA Data'
# location the download step writes to, so the cell is not tied to one machine.
ssga_files = glob.glob(os.path.join('SSGA Data', '*.xlsx'))
if not ssga_files:
    # Fail loudly: with no workbooks the filter below would silently empty the frame.
    raise FileNotFoundError("No SSGA holdings workbooks found in 'SSGA Data'; run the SSGA download cell first.")

top_tickers = []
for file in ssga_files:
    df = pd.read_excel(file, skiprows=4)  # the first four rows are skipped; the header is on the fifth
    top_10 = df.nlargest(10, 'Weight')['Ticker'].dropna().tolist()
    top_tickers.extend(top_10)

# Remove duplicates and the '-' placeholder that the holdings files use for
# rows without a ticker (e.g. the cash and money-market lines).
top_tickers = sorted(str(ticker) for ticker in set(top_tickers) if ticker != '-')

# Step 2 and 3: Keep only the extracted tickers within 2020-01-01 .. 2023-12-31 (inclusive)
start_date = '2020-01-01'
end_date = '2023-12-31'
filtered_kag_df = filtered_kag_df[
    filtered_kag_df['Ticker'].isin(top_tickers)
    & filtered_kag_df['Date'].between(start_date, end_date)
]

# Save the result next to the raw Kaggle data
filtered_kag_df.to_csv(os.path.join('Kaggle Data', 'filtered_stock_data.csv'), index=False)

# Display the first few rows of the filtered DataFrame (last expression, so it renders)
filtered_kag_df.head()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Dividends,Stock Splits
24919794,2020-01-02,COP,60.402057,60.772169,60.004188,60.568607,4122800.0,0.0,0.0
24919827,2020-01-02,BA,326.606765,331.378393,325.761816,331.348572,4544400.0,0.0,0.0
24919990,2020-01-02,RTX,88.948579,91.015768,88.948579,90.968246,4451584.0,0.0,0.0
24920232,2020-01-02,KMI,18.292551,18.292551,18.068736,18.111778,10373100.0,0.0,0.0
24920271,2020-01-02,AVGO,16.922052,17.090574,16.816065,17.084743,20329000.0,0.0,0.0
