In [25]:
# Install the third-party packages this notebook imports (%pip targets the running kernel's environment).
# NOTE(review): versions are unpinned; pin them (pkg==x.y.z) so a fresh run reproduces the same results.
%pip install pandas numpy matplotlib seaborn statsmodels requests openpyxl yfinance kaggle

Collecting kaggle
  Downloading kaggle-1.6.17.tar.gz (82 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tqdm (from kaggle)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting bleach (from kaggle)
  Downloading bleach-6.2.0-py3-none-any.whl.metadata (30 kB)
Collecting webencodings (from bleach->kaggle)
  Downloading webencodings-0.5.1-py2.py3-none-any.whl.metadata (2.1 kB)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading bleach-6.2.0-py3-none-any.whl (163 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Do

## Getting daily ETF holdings data from State Street Global Advisors (SSGA)

In [28]:
import requests as req
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import subprocess
import zipfile
import json
import  yfinance as yf
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import statsmodels.graphics.api as smg
import statsmodels.tsa.api as smt
import kaggle
from datetime import datetime


In [None]:
# Marker row in the downloaded sheet: everything from the first row whose
# 'Name' equals this exact text (trailing space included) is discarded.
_SSGA_DISCLAIMER = (
    "Past performance is not a reliable indicator of future performance. "
    "Investment return and principal value will fluctuate, so you may have a gain or loss when shares are sold. "
    "Current performance may be higher or lower than that quoted. "
    "All results are historical and assume the reinvestment of dividends and capital gains. "
    "Visit www.ssga.com for most recent month-end performance. "
)


def _trim_holdings_footer(holdings):
    """Drop the disclaimer row (and everything after it) plus one trailing all-NaN row."""
    is_disclaimer = (holdings['Name'] == _SSGA_DISCLAIMER).to_numpy()
    if is_disclaimer.any():
        # argmax on a boolean array gives the position of the first True.
        holdings = holdings.iloc[:is_disclaimer.argmax()]

    # Guard against an empty frame: iloc[-1] would raise IndexError.
    if not holdings.empty and holdings.iloc[-1].isna().all():
        holdings = holdings.iloc[:-1]

    return holdings


def fetch_and_process_etf_data(etf_list):
    """Download the daily holdings workbook for each ETF and return the cleaned tables.

    For every ticker the workbook is fetched from ssga.com, saved under
    'SSGA Data/<ticker>-<MM-DD-YYYY>.xlsx', and read back with the first four
    rows skipped and only columns A:H kept. The disclaimer footer and a
    trailing blank row are removed.

    Args:
        etf_list: Iterable of ETF ticker strings (case-insensitive; they are
            lower-cased to build the download URL).

    Returns:
        A list of DataFrames, one per ticker, in the same order as `etf_list`.

    Raises:
        requests.exceptions.RequestException: If a download fails, times out,
            or returns an HTTP error status.
    """
    subfolder = 'SSGA Data'
    os.makedirs(subfolder, exist_ok=True)

    # Same stamp for every file written by this call.
    current_date = datetime.now().strftime('%m-%d-%Y')

    ssga_df_list = []

    for etf in (ticker.lower() for ticker in etf_list):
        url = f'https://www.ssga.com/library-content/products/fund-data/etfs/us/holdings-daily-us-en-{etf}.xlsx'
        response = req.get(url, timeout=30)
        # Fail here rather than saving an HTML error page as an .xlsx file
        # and getting a confusing parser error from read_excel below.
        response.raise_for_status()

        file_path = os.path.join(subfolder, f'{etf}-{current_date}.xlsx')
        with open(file_path, 'wb') as file:
            file.write(response.content)

        df = pd.read_excel(file_path, skiprows=4, header=0, usecols="A:H")
        ssga_df_list.append(_trim_holdings_footer(df))

    return ssga_df_list

# Example usage: fetch the holdings tables for four tickers and print the
# last rows of each returned DataFrame (one printout per ticker, in list order).
etf_list = ['XLI', 'XLK', 'XLE', 'XLB']
ssga_df_list = fetch_and_process_etf_data(etf_list)
for df in ssga_df_list:
    print(df.tail())

                             Name  Ticker Identifier    SEDOL    Weight  \
76              SMITH (A.O.) CORP     AOS  831865209  2816023  0.186860   
77   HUNTINGTON INGALLS INDUSTRIE     HII  446413106  B40SSC9  0.164458   
78  SSI US GOV MONEY MARKET CLASS       -  924QSGII3        -  0.026875   
79                      US DOLLAR       -  999USDZ92        -  0.003086   
80        XAI EMINI INDUSTR MAR25  AIXH25  ADI2SGVK5        - -0.000239   

   Sector  Shares Held Local Currency  
76      -    617461.00            USD  
77      -    204036.00            USD  
78      -   5931422.87            USD  
79      -    680999.90            USD  
80      -     10000.00            USD  
                             Name  Ticker Identifier    SEDOL    Weight  \
67         SKYWORKS SOLUTIONS INC    SWKS  83088M102  2961053  0.111133   
68             ENPHASE ENERGY INC    ENPH  29355A107  B65SQW4  0.092398   
69  SSI US GOV MONEY MARKET CLASS       -  924QSGII3        -  0.082990   
70       

In [None]:
# Focus sectors and the BEA industry descriptions assigned to each of them.
_BEA_SECTOR_MAP = {
    "Technology": [
        "Computer and electronic products",
        "Computer systems design and related services",
        "Data processing, internet publishing, and other information services",
        "Information-communications-technology-producing industries"
    ],
    "Materials": [
        "Agriculture, forestry, fishing, and hunting",
        "Farms",
        "Forestry, fishing, and related activities",
        "Mining",
        "Mining, except oil and gas",
        "Support activities for mining",
        "Wood products",
        "Paper products",
        "Chemical products",
        "Plastics and rubber products",
        "Nonmetallic mineral products",
        "Primary metals",
        "Fabricated metal products"
    ],
    "Energy": [
        "Oil and gas extraction",
        "Petroleum and coal products",
        "Pipeline transportation"
    ],
    "Industrials": [
        "Construction",
        "Machinery",
        "Electrical equipment, appliances, and components",
        "Other transportation equipment",
        "Miscellaneous manufacturing",
        "Durable goods",
        "Wholesale trade",
        "Rail transportation",
        "Water transportation",
        "Truck transportation",
        "Transit and ground passenger transportation",
        "Other transportation and support activities",
        "Transportation and warehousing",
        "Warehousing and storage",
        "Waste management and remediation services"
    ]
}

# Aggregate rows that are components of value added rather than industries.
_BEA_NON_INDUSTRY_ROWS = [
    "Taxes on production and imports less subsidies",
    "Compensation of employees",
    "Gross operating surplus"
]


def _extract_bea_records(payload):
    """Return the list of data records from a decoded BEA response, or None if there are none."""
    beaapi = payload.get('BEAAPI') if isinstance(payload, dict) else None
    results = beaapi.get('Results') if isinstance(beaapi, dict) else None
    # 'Results' may be a list of result objects; the first one is used.
    if isinstance(results, list):
        results = results[0] if results else None
    if not isinstance(results, dict):
        return None
    return results.get('Data') or None


def fetch_bea_gdp_by_industry(api_key, years):
    """Fetch annual BEA GDP-by-industry data and reduce it to four focus sectors.

    The values returned by the API are summed per industry across ALL
    requested years, the value-added component rows are removed, and each
    remaining industry is labelled with a sector from `_BEA_SECTOR_MAP`.
    Industries that are not in the map are dropped from the returned frame.

    Side effects: writes two Excel files into 'BEA Data/', stamped with
    today's date: the per-industry totals ('...-raw-...') and the
    sector-filtered result ('...-filtered-...').

    Args:
        api_key: BEA API user ID.
        years: Comma-separated years as a string, e.g. "2020,2021".

    Returns:
        DataFrame with columns Industry, IndustryDescription, DataValue and
        Sector, or None if the request fails or the response holds no data.
    """
    base_url = "https://apps.bea.gov/api/data/"

    params = {
        "UserID": api_key,
        "method": "GetData",
        "datasetname": "GDPbyIndustry",
        "Frequency": "A",           # Annual data (use "Q" for quarterly)
        "Year": years,
        "Industry": "ALL",
        "TableID": "6",             # Per the original author: value added by industry -- TODO confirm
        "ResultFormat": "JSON"
    }

    try:
        response = req.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        payload = response.json()
    except (req.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"API request failed: {e}")
        return None

    records = _extract_bea_records(payload)
    if records is None:
        # Report it the same way as a transport failure instead of raising
        # an opaque KeyError further down.
        print("API request failed: response contained no data records "
              "(check the API key and request parameters)")
        return None

    # The code expects the response to spell this key 'IndustrYDescription'.
    bea_df = pd.DataFrame(records).rename(
        columns={"IndustrYDescription": "IndustryDescription"}
    )

    # Strip thousands separators before converting so values such as
    # "1,234.5" do not break the float conversion.
    bea_df['DataValue'] = (
        bea_df['DataValue'].astype(str).str.replace(',', '', regex=False).astype(float)
    )

    # One total per industry, summed over every requested year.
    grouped_df = bea_df.groupby(
        ['Industry', 'IndustryDescription'], as_index=False
    )['DataValue'].sum()

    filtered_df = grouped_df[
        ~grouped_df['IndustryDescription'].isin(_BEA_NON_INDUSTRY_ROWS)
    ]

    subfolder = 'BEA Data'
    os.makedirs(subfolder, exist_ok=True)
    current_date = datetime.now().strftime('%m-%d-%Y')

    file_path = os.path.join(subfolder, f"bea-gdp-by-industry-raw-{current_date}.xlsx")
    filtered_df.to_excel(file_path, index=False)

    # Invert the sector map once; setdefault keeps the first sector listed
    # for a description, matching a first-match scan of the map.
    sector_by_description = {}
    for sector, descriptions in _BEA_SECTOR_MAP.items():
        for description in descriptions:
            sector_by_description.setdefault(description, sector)

    labelled_df = filtered_df.assign(
        Sector=filtered_df["IndustryDescription"].map(sector_by_description).fillna("Other")
    )

    # Keep only the focus sectors; everything labelled "Other" is dropped.
    df_filtered = labelled_df[labelled_df["Sector"].isin(list(_BEA_SECTOR_MAP))]

    filtered_file_path = os.path.join(subfolder, f"bea-gdp-by-industry-filtered-{current_date}.xlsx")
    df_filtered.to_excel(filtered_file_path, index=False)

    return df_filtered

# Example usage
# The BEA API key is a credential, so it is read from the environment rather
# than stored in the notebook. Any key that was previously committed in this
# cell should be treated as exposed and rotated.
api_key = os.environ.get("BEA_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the BEA_API_KEY environment variable to your BEA API key before running this cell."
    )
years = "2020,2021,2022,2023"
bea_df = fetch_bea_gdp_by_industry(api_key, years)
if bea_df is not None:
    print(bea_df.tail())

    Industry                           IndustryDescription  DataValue  \
242     5411                                Legal services     1326.5   
249     5415  Computer systems design and related services     1781.9   
254       55       Management of companies and enterprises     1831.7   
260      561           Administrative and support services     2827.9   
267      562     Waste management and remediation services      285.1   

          Sector  
242  Industrials  
249   Technology  
254  Industrials  
260  Industrials  
267  Industrials  


In [None]:
def setup_kaggle_data():
    """Ensure the Kaggle stock-history CSV is present in the 'Kaggle Data' directory.

    If the directory already contains any '.csv' file nothing is downloaded.
    Otherwise the dataset archive is fetched into the current working
    directory with the `kaggle` CLI, extracted into 'Kaggle Data', and every
    extracted regular file that is not a '.csv' is deleted. The downloaded
    archive is removed afterwards, even when extraction fails.

    Raises:
        subprocess.CalledProcessError: If the `kaggle` CLI exits with a
            non-zero status.
        FileNotFoundError: If the `kaggle` executable is not on PATH.
    """
    kaggle_data_dir = "Kaggle Data"
    os.makedirs(kaggle_data_dir, exist_ok=True)

    # Nothing to do when a CSV is already in place.
    if any(name.endswith('.csv') for name in os.listdir(kaggle_data_dir)):
        return

    dataset = "jakewright/9000-tickers-of-stock-market-data-full-history"
    # The CLI names the archive after the dataset slug.
    zip_filename = dataset.split('/')[-1] + ".zip"

    subprocess.run(["kaggle", "datasets", "download", "-d", dataset], check=True)

    try:
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(kaggle_data_dir)
    finally:
        # Do not leave the archive behind if extraction raises.
        if os.path.exists(zip_filename):
            os.remove(zip_filename)

    # Delete extracted non-CSV files. Directories are skipped because
    # os.remove raises on them.
    for filename in os.listdir(kaggle_data_dir):
        path = os.path.join(kaggle_data_dir, filename)
        if os.path.isfile(path) and not filename.endswith('.csv'):
            os.remove(path)

# Make sure the Kaggle CSV is on disk before reading it
setup_kaggle_data()

# Read the full stock history
kag_df = pd.read_csv('Kaggle Data/all_stock_data.csv')

# Parse 'Date' so it can be compared against the window bounds
kag_df['Date'] = pd.to_datetime(kag_df['Date'])

# Keep rows dated 2020-01-01 through 2023-12-31 (both bounds inclusive)
start_date = '2020-01-01'
end_date = '2023-12-31'
filtered_kag_df = kag_df.loc[kag_df['Date'].between(start_date, end_date)]

# Preview the first rows of the windowed frame
filtered_kag_df.head()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Dividends,Stock Splits
24919429,2020-01-02,HOFV,232.320007,232.539993,231.660004,231.660004,11459.0,0.0,0.0
24919430,2020-01-02,CTSH,61.055629,61.222259,60.457717,60.692959,2234500.0,0.0,0.0
24919431,2020-01-02,AZUL,43.610001,44.080002,43.150002,43.59,532300.0,0.0,0.0
24919432,2020-01-02,FNCTF,12.707134,12.707134,12.707134,12.707134,100.0,0.0,0.0
24919433,2020-01-02,BLE,13.720488,13.720488,13.621064,13.657218,54700.0,0.0,0.0


In [34]:
# Show the last rows of the date-filtered frame to check the upper end of the window.
filtered_kag_df.tail()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Dividends,Stock Splits
32724160,2023-12-29,RTO,28.459999,28.620001,28.26,28.610001,620800.0,0.0,0.0
32724161,2023-12-29,SHZNY,41.02,41.02,41.02,41.02,0.0,0.0,0.0
32724162,2023-12-29,ONMD,0.745,0.9,0.745,0.81,70100.0,0.0,0.0
32724163,2023-12-29,KMPR,48.349998,49.150002,48.150002,48.669998,312300.0,0.0,0.0
32724164,2023-12-29,SBT,5.85,5.87,5.77,5.77,14200.0,0.0,0.0
