In [1]:
%pip install pandas numpy matplotlib seaborn statsmodels requests openpyxl yfinance kaggle python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install /home/pinagm/beaapi-0.0.2-py3-none-any.whl

Processing /home/pinagm/beaapi-0.0.2-py3-none-any.whl
beaapi is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.
Note: you may need to restart the kernel to use updated packages.


## Getting the ETF Fund data from State Street Global Advisors

In [3]:
import requests as req
import pandas as pd
import os
import subprocess
import zipfile
import  yfinance as yf
from typing import List
import beaapi as bea
from dotenv import load_dotenv
from datetime import datetime
# Load environment variables (such as the BEA API key) from a local .env file
load_dotenv()


True

In [4]:
def fetch_and_process_etf_data(etf_list):
    """Download SSGA daily-holdings workbooks and return one cleaned DataFrame per ETF.

    For every ticker in ``etf_list`` the holdings workbook is downloaded from
    ssga.com, saved under ``SSGA Data/<ticker>-<MM-DD-YYYY>.xlsx`` and read back
    with pandas (columns A:H, header on the fifth row).  The trailing disclaimer
    row and everything after it are dropped, as is a final all-NaN row.

    Parameters
    ----------
    etf_list : iterable of str
        ETF tickers; they are lower-cased before being placed in the URL.

    Returns
    -------
    list of pandas.DataFrame
        One frame per ticker, in the same order as ``etf_list``.

    Raises
    ------
    requests.exceptions.RequestException
        If a download fails, times out, or returns an HTTP error status.  Nothing
        is written to disk for that ticker in this case.
    """
    subfolder = 'SSGA Data'
    os.makedirs(subfolder, exist_ok=True)

    # One timestamp for the whole run so every file of a batch carries the same date.
    current_date = datetime.now().strftime('%m-%d-%Y')

    # Exact text of the footer row that marks the end of the holdings table.
    disclaimer = "Past performance is not a reliable indicator of future performance. Investment return and principal value will fluctuate, so you may have a gain or loss when shares are sold. Current performance may be higher or lower than that quoted. All results are historical and assume the reinvestment of dividends and capital gains. Visit www.ssga.com for most recent month-end performance. "

    ssga_df_list = []

    for etf in (symbol.lower() for symbol in etf_list):
        url = f'https://www.ssga.com/library-content/products/fund-data/etfs/us/holdings-daily-us-en-{etf}.xlsx'
        # A timeout keeps the notebook from hanging forever on a stalled connection.
        response = req.get(url, timeout=30)
        # Fail here instead of saving an HTML error page with an .xlsx extension.
        response.raise_for_status()

        file_path = os.path.join(subfolder, f'{etf}-{current_date}.xlsx')
        with open(file_path, 'wb') as file:
            file.write(response.content)
        df = pd.read_excel(file_path, skiprows=4, header=0, usecols="A:H")

        drop_index = df[df['Name'] == disclaimer].index
        if not drop_index.empty:
            df = df[:drop_index[0]]

        # Guard against an empty table: iloc[-1] would raise IndexError.
        if not df.empty and df.iloc[-1].isna().all():
            df = df[:-1]

        ssga_df_list.append(df)

    return ssga_df_list

# Example usage
# Download the current holdings workbook for each ticker below; the result is a
# list with one DataFrame per ETF, in the same order as etf_list.
etf_list = ['XLI', 'XLK', 'XLE', 'XLB']
ssga_df_list = fetch_and_process_etf_data(etf_list)
# Print the last rows of each holdings table as a quick check that the trailing
# disclaimer / blank rows were trimmed.
for df in ssga_df_list:
    print(df.tail())

                             Name  Ticker Identifier    SEDOL    Weight  \
76              SMITH (A.O.) CORP     AOS  831865209  2816023  0.186860   
77   HUNTINGTON INGALLS INDUSTRIE     HII  446413106  B40SSC9  0.164458   
78  SSI US GOV MONEY MARKET CLASS       -  924QSGII3        -  0.026875   
79                      US DOLLAR       -  999USDZ92        -  0.003086   
80        XAI EMINI INDUSTR MAR25  AIXH25  ADI2SGVK5        - -0.000239   

   Sector  Shares Held Local Currency  
76      -    617461.00            USD  
77      -    204036.00            USD  
78      -   5931422.87            USD  
79      -    680999.90            USD  
80      -     10000.00            USD  
                             Name  Ticker Identifier    SEDOL    Weight  \
67         SKYWORKS SOLUTIONS INC    SWKS  83088M102  2961053  0.111133   
68             ENPHASE ENERGY INC    ENPH  29355A107  B65SQW4  0.092398   
69  SSI US GOV MONEY MARKET CLASS       -  924QSGII3        -  0.082990   
70       

In [19]:
def fetch_bea_data(api_key, years):
    """Request GDP-by-industry data from the BEA API and return the decoded JSON.

    The request asks for annual and quarterly frequencies ("A,Q"), every
    industry and every table of the ``GDPbyIndustry`` dataset.

    Parameters
    ----------
    api_key : str
        BEA API key, sent as the ``UserID`` parameter.
    years : str
        Value for the ``Year`` parameter (for example a comma-separated list).

    Returns
    -------
    dict or None
        The decoded JSON payload, or ``None`` when the request fails, the body
        is not valid JSON, the payload has no ``BEAAPI`` object, or the API
        reports an error.  A message is printed in every failure case.
    """
    base_url = "https://apps.bea.gov/api/data/"
    params = {
        "UserID": api_key,
        "method": "GetData",
        "datasetname": "GDPbyIndustry",
        "Frequency": "A,Q",
        "Year": years,
        "Industry": "ALL",
        "TableID": "ALL",
        "ResultFormat": "JSON"
    }
    try:
        # Timeout so a stalled connection cannot block the notebook indefinitely.
        response = req.get(base_url, params=params, timeout=60)
        response.raise_for_status()
        data = response.json()
    except req.exceptions.RequestException as e:
        print(f"API request failed: {e}")
        return None
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON body.
        print(f"API request failed: response was not valid JSON ({e})")
        return None

    bea_root = data.get('BEAAPI') if isinstance(data, dict) else None
    if not isinstance(bea_root, dict):
        print("API request failed: 'BEAAPI' key not found in the API response.")
        return None

    # The error object is looked up at the top level first and then inside
    # 'Results'.  NOTE(review): the nested location is based on the BEA user
    # guide examples - confirm against a live error response.
    error = bea_root.get('Error')
    if error is None and isinstance(bea_root.get('Results'), dict):
        error = bea_root['Results'].get('Error')

    if error is not None:
        if isinstance(error, dict):
            error_code = error.get('APIErrorCode')
            error_description = error.get('APIErrorDescription')
        else:
            error_code = None
            error_description = error
        print(f"API request failed with error code {error_code}: {error_description}")
        return None
    return data

def process_bea_data(data):
    """Turn a BEA GDP-by-industry payload into per-industry DataValue totals.

    The ``Data`` records are read from ``data['BEAAPI']['Results']`` (the first
    element when ``Results`` is a list, the object itself when it is a dict),
    the misspelled ``IndustrYDescription`` column is renamed, ``DataValue`` is
    converted to numbers, and the values are summed per
    ``(Industry, IndustryDescription)``.  Three value-added component rows
    (taxes, compensation, gross operating surplus) are removed afterwards.

    NOTE(review): the sum runs over every record in the payload, so rows from
    different tables, years and frequencies are added together - confirm that
    this total is the intended quantity.

    Parameters
    ----------
    data : dict
        Decoded JSON payload as returned by the BEA API.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``Industry``, ``IndustryDescription`` and ``DataValue``; ``None``
        (with a printed message) when the payload has no usable results.
    """
    bea_root = data.get('BEAAPI') if isinstance(data, dict) else None
    if not isinstance(bea_root, dict) or 'Results' not in bea_root:
        print("Error: 'Results' key not found in the API response.")
        return None

    results = bea_root['Results']
    # 'Results' may be a one-element list or a plain object; read the records
    # directly instead of round-tripping through a throwaway DataFrame.
    first_result = results[0] if isinstance(results, list) and results else results
    nested_data = first_result.get("Data") if isinstance(first_result, dict) else None
    if not nested_data:
        print("Error: 'Data' key not found or empty in the API response.")
        return None

    bea_df = pd.DataFrame(nested_data).rename(
        columns={"IndustrYDescription": "IndustryDescription"}
    )
    # Strip thousands separators and coerce anything non-numeric to NaN so one
    # odd cell does not abort the whole run; NaN is ignored by the sum below.
    bea_df['DataValue'] = pd.to_numeric(
        bea_df['DataValue'].astype(str).str.replace(',', '', regex=False),
        errors='coerce',
    )
    grouped_df = bea_df.groupby(['Industry', 'IndustryDescription'], as_index=False)['DataValue'].sum()
    filtered_df = grouped_df[~grouped_df['IndustryDescription'].isin([
        "Taxes on production and imports less subsidies",
        "Compensation of employees",
        "Gross operating surplus"
    ])]
    return filtered_df

def save_bea_data(filtered_df):
    """Write the aggregated BEA table to a date-stamped workbook in 'BEA Data'.

    The folder is created when missing.  The file is named
    ``bea-gdp-by-industry-raw-<MM-DD-YYYY>.xlsx`` and written without the
    DataFrame index.

    Returns
    -------
    str
        Path of the workbook that was written.
    """
    current_date = datetime.now().strftime('%m-%d-%Y')
    file_path = os.path.join('BEA Data', f"bea-gdp-by-industry-raw-{current_date}.xlsx")
    # The target folder is the directory part of the path built above.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    filtered_df.to_excel(file_path, index=False)
    return file_path

def map_sectors(filtered_df):
    """Tag each industry row with a sector and keep only the four focus sectors.

    A ``Sector`` column is derived from ``IndustryDescription`` through the
    lookup table below; descriptions that are not listed become "Other" and are
    dropped from the result.  The input frame is left untouched.

    Returns
    -------
    pandas.DataFrame
        The rows whose sector is Technology, Materials, Energy or Industrials,
        with the extra ``Sector`` column.
    """
    sector_map = {
        "Technology": [
            "Computer and electronic products",
            "Computer systems design and related services",
            "Data processing, internet publishing, and other information services",
            "Information-communications-technology-producing industries"
        ],
        "Materials": [
            "Agriculture, forestry, fishing, and hunting",
            "Farms",
            "Forestry, fishing, and related activities",
            "Mining",
            "Mining, except oil and gas",
            "Support activities for mining",
            "Wood products",
            "Paper products",
            "Chemical products",
            "Plastics and rubber products",
            "Nonmetallic mineral products",
            "Primary metals",
            "Fabricated metal products"
        ],
        "Energy": [
            "Oil and gas extraction",
            "Petroleum and coal products",
            "Pipeline transportation"
        ],
        "Industrials": [
            "Construction",
            "Machinery",
            "Electrical equipment, appliances, and components",
            "Other transportation equipment",
            "Miscellaneous manufacturing",
            "Durable goods",
            "Wholesale trade",
            "Rail transportation",
            "Water transportation",
            "Truck transportation",
            "Transit and ground passenger transportation",
            "Other transportation and support activities",
            "Transportation and warehousing",
            "Warehousing and storage",
            "Waste management and remediation services"
        ]
    }

    # Invert the table once: description -> sector.  setdefault keeps the first
    # sector that lists a description, matching a top-to-bottom scan.
    sector_by_description = {}
    for sector, descriptions in sector_map.items():
        for description in descriptions:
            sector_by_description.setdefault(description, sector)

    labelled = filtered_df.assign(
        Sector=filtered_df["IndustryDescription"].apply(
            lambda description: sector_by_description.get(description, "Other")
        )
    )
    focus_sectors = ["Technology", "Materials", "Energy", "Industrials"]
    return labelled[labelled["Sector"].isin(focus_sectors)]

def fetch_bea_gdp_by_industry(api_key, years):
    """Run the BEA pipeline: fetch, aggregate, save, tag sectors, save again.

    The aggregated table is written by ``save_bea_data``; the sector-filtered
    table is written to ``BEA Data/bea-gdp-by-industry-filtered-<MM-DD-YYYY>.xlsx``.

    Returns
    -------
    pandas.DataFrame or None
        The sector-filtered table, or ``None`` when fetching or processing
        produced nothing.
    """
    raw_payload = fetch_bea_data(api_key, years)
    if raw_payload is None:
        return None

    industry_totals = process_bea_data(raw_payload)
    if industry_totals is None:
        print("Error: Processed data is None.")
        return None

    # Persist the unfiltered totals first, then the sector-tagged subset.
    save_bea_data(industry_totals)
    df_filtered = map_sectors(industry_totals)

    current_date = datetime.now().strftime('%m-%d-%Y')
    filtered_file_path = os.path.join('BEA Data', f"bea-gdp-by-industry-filtered-{current_date}.xlsx")
    df_filtered.to_excel(filtered_file_path, index=False)
    return df_filtered

# Fetch, aggregate and sector-tag the BEA GDP-by-industry data, then preview it.
# The API key is read from the "beakey" environment variable.
api_key = os.environ.get("beakey")
# Comma-separated years with no embedded spaces, matching the Year string
# passed to bea.get_data in the next cell.
years = "2020,2021,2022,2023,2024"
bea_df = fetch_bea_gdp_by_industry(api_key, years)
if bea_df is not None:
    print(bea_df.tail())

    Industry                                IndustryDescription   DataValue  \
394     48TW                     Transportation and warehousing  208578.810   
404      493                            Warehousing and storage   40099.181   
451      514  Data processing, internet publishing, and othe...  117225.096   
559     5415       Computer systems design and related services  108399.211   
602      562          Waste management and remediation services   37627.182   

          Sector  
394  Industrials  
404  Industrials  
451   Technology  
559   Technology  
602  Industrials  


In [26]:
# Same GDPbyIndustry dataset as the previous cell, queried through the beaapi
# client instead of a raw HTTP request.  Relies on api_key from the previous cell.
# gdp holds the dataset's parameter list; it is only used by the commented-out
# display call below.
gdp = bea.get_parameter_list(api_key, "GDPbyIndustry")
# display(gdp)
# Annual and quarterly values for 2020-2024, all tables and all industries.
tbl = bea.get_data(api_key, datasetname="GDPbyIndustry", Frequency="A,Q", Year="2020,2021,2022,2023,2024", TableID="ALL", Industry="ALL")
# Optional exploration of the metadata attached to the result (kept disabled):
# display(tbl.set_index(tbl.attrs['index_cols']).head(2))
# print('Extra detail keys:' + str(tbl.attrs['detail'].keys()))
# print("Let's look at some interesting ones.")
# print('Statistic: ' + tbl.attrs['detail']['Statistic'])
# print("Notes corresponding to NoteRef:")
# display(tbl.attrs['detail']['Notes'].head())
# Show the table reshaped by beaapi's wide-format helper; in the rendered output
# each row is a (TableID, Industry, description) and each column is one period.
display(bea.to_wide_vars_in_rows(tbl))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NoteRef,"(A, 2020, 2020)","(A, 2021, 2021)","(A, 2022, 2022)","(A, 2023, 2023)","(Q, 2020, I)","(Q, 2020, II)","(Q, 2020, III)","(Q, 2020, IV)","(Q, 2021, I)",...,"(Q, 2022, II)","(Q, 2022, III)","(Q, 2022, IV)","(Q, 2023, I)","(Q, 2023, II)","(Q, 2023, III)","(Q, 2023, IV)","(Q, 2024, I)","(Q, 2024, II)","(Q, 2024, III)"
TableID,Industry,IndustrYDescription,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,11,"Agriculture, forestry, fishing, and hunting",1,162.9,228.6,290.0,274.2,175.7,131.1,160.4,184.4,196.4,...,289.3,298.1,309.3,299.4,280.1,269.4,247.8,235.4,243.0,251.0
1,111CA,Farms,1,119.0,183.6,241.9,222.3,131.6,90.9,115.9,137.7,151.7,...,242.9,249.6,259.1,247.8,227.7,217.4,196.4,184.6,193.4,203.2
1,113FF,"Forestry, fishing, and related activities",1,43.9,45.0,48.1,51.9,44.1,40.2,44.5,46.6,44.6,...,46.5,48.5,50.2,51.7,52.4,52.0,51.4,50.7,49.6,47.8
1,21,Mining,1,201.9,331.9,460.6,411.8,241.4,146.8,195.4,223.8,279.9,...,503.6,488.2,435.0,406.6,388.1,424.1,428.3,399.2,404.4,390.9
1,211,Oil and gas extraction,1,103.3,221.4,319.1,257.0,132.1,53.6,99.6,127.8,179.7,...,363.5,342.0,286.9,248.5,231.2,273.6,274.8,246.5,251.9,239.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,ORE,Other real estate,209,756.5,885.4,901.0,936.3,763.7,700.7,752.0,809.8,825.7,...,888.4,901.9,896.7,921.0,933.9,942.4,948.1,969.0,963.5,947.5
209,PGOOD,Private goods-producing industries<sup>2</sup>,"209;209.2.A,Q",4457.6,4499.5,4537.2,4635.1,4662.3,4200.1,4448.2,4519.7,4586.8,...,4522.0,4557.4,4562.9,4609.9,4632.2,4646.2,4652.1,4708.1,4690.5,4688.8
209,PROF,Professional and business services,209,1510.0,1577.8,1685.5,1672.9,1599.4,1429.1,1459.8,1551.7,1548.7,...,1695.9,1707.6,1688.6,1707.2,1675.4,1645.2,1663.7,1662.6,1677.1,1700.9
209,PSERV,Private services-producing industries<sup>3</sup>,"209;209.3.A,Q",8764.0,9827.6,10430.2,10552.4,9115.1,8126.2,8697.0,9117.6,9447.0,...,10469.1,10523.4,10419.8,10550.4,10519.7,10552.2,10587.1,10702.6,10714.7,10836.0


In [8]:
def setup_kaggle_data():
    """Make sure the Kaggle stock dataset is available as CSV in 'Kaggle Data'.

    If the folder already contains at least one ``.csv`` file nothing is
    downloaded.  Otherwise the dataset archive is fetched with the Kaggle CLI
    into the current working directory, extracted into 'Kaggle Data', and every
    regular file in that folder that is not a ``.csv`` is deleted.  The
    downloaded archive is removed afterwards, including when a step fails.

    Raises
    ------
    subprocess.CalledProcessError
        If the Kaggle CLI exits with a non-zero status.
    FileNotFoundError
        If the ``kaggle`` executable or the expected archive is missing.
    zipfile.BadZipFile
        If the downloaded archive cannot be read.
    """
    # Step 1: Ensure the "Kaggle Data" directory exists
    kaggle_data_dir = "Kaggle Data"
    os.makedirs(kaggle_data_dir, exist_ok=True)

    # Step 2: Nothing to do when a .csv file is already present
    if any(filename.endswith('.csv') for filename in os.listdir(kaggle_data_dir)):
        return

    dataset = "jakewright/9000-tickers-of-stock-market-data-full-history"
    zip_filename = dataset.split('/')[-1] + ".zip"
    try:
        # Step 3: Download the dataset using the Kaggle CLI
        subprocess.run(["kaggle", "datasets", "download", "-d", dataset], check=True)

        # Step 4: Unzip the downloaded file
        with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
            zip_ref.extractall(kaggle_data_dir)

        # Step 5: Remove regular files that are not .csv.  Directories are left
        # in place because os.remove cannot delete them and would abort the cell.
        for filename in os.listdir(kaggle_data_dir):
            entry_path = os.path.join(kaggle_data_dir, filename)
            if os.path.isfile(entry_path) and not filename.endswith('.csv'):
                os.remove(entry_path)
    finally:
        # Always clean up the archive, including a partial one left by a failure.
        if os.path.exists(zip_filename):
            os.remove(zip_filename)

# Make sure the Kaggle CSV is on disk before reading it
setup_kaggle_data()

# Read the full price history and parse the 'Date' column as datetimes
kag_df = pd.read_csv('Kaggle Data/all_stock_data.csv')
kag_df['Date'] = pd.to_datetime(kag_df['Date'])

# Keep only rows dated 2020-01-01 through 2023-12-31 (both ends inclusive)
start_date = '2020-01-01'
end_date = '2023-12-31'
in_window = kag_df['Date'].between(start_date, end_date)
filtered_kag_df = kag_df[in_window]

# Preview the first rows of the filtered frame
filtered_kag_df.head()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Dividends,Stock Splits
24919429,2020-01-02,HOFV,232.320007,232.539993,231.660004,231.660004,11459.0,0.0,0.0
24919430,2020-01-02,CTSH,61.055629,61.222259,60.457717,60.692959,2234500.0,0.0,0.0
24919431,2020-01-02,AZUL,43.610001,44.080002,43.150002,43.59,532300.0,0.0,0.0
24919432,2020-01-02,FNCTF,12.707134,12.707134,12.707134,12.707134,100.0,0.0,0.0
24919433,2020-01-02,BLE,13.720488,13.720488,13.621064,13.657218,54700.0,0.0,0.0


In [34]:
# Inspect the last rows to confirm the upper bound of the date filter (end_date) was applied.
filtered_kag_df.tail()

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Dividends,Stock Splits
32724160,2023-12-29,RTO,28.459999,28.620001,28.26,28.610001,620800.0,0.0,0.0
32724161,2023-12-29,SHZNY,41.02,41.02,41.02,41.02,0.0,0.0,0.0
32724162,2023-12-29,ONMD,0.745,0.9,0.745,0.81,70100.0,0.0,0.0
32724163,2023-12-29,KMPR,48.349998,49.150002,48.150002,48.669998,312300.0,0.0,0.0
32724164,2023-12-29,SBT,5.85,5.87,5.77,5.77,14200.0,0.0,0.0


In [None]:
def ensure_directory_exists(directory: str) -> None:
    """Create ``directory`` (including parents) if it is missing and report what happened.

    Prints one line saying whether the directory was created or already existed.
    """
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return

    os.makedirs(directory)
    print(f"Directory '{directory}' created.")

def download_data(tickers: List[str], start_date: str, end_date: str, interval: str) -> pd.DataFrame:
    """Download price history for ``tickers`` through ``yf.download``.

    The caller chooses the symbols, date range and bar interval; the remaining
    options are fixed: columns grouped by ticker, auto-adjust on, threaded
    download, progress bar shown.

    Returns whatever ``yf.download`` returns for these options.
    """
    request_options = dict(
        tickers=tickers,
        start=start_date,
        end=end_date,
        interval=interval,
        group_by="ticker",
        auto_adjust=True,
        threads=True,
        progress=True,
    )
    return yf.download(**request_options)

def filter_columns(df: pd.DataFrame, tickers: List[str], fields: List[str]) -> pd.DataFrame:
    """Return a copy of ``df`` limited to the ``(ticker, field)`` columns that exist.

    Columns are ordered ticker by ticker (in ``tickers`` order) and, within a
    ticker, in ``fields`` order.  Requested pairs missing from ``df`` are skipped.
    """
    available = df.columns
    keep_cols = []
    for symbol in tickers:
        for field in fields:
            column_key = (symbol, field)
            if column_key in available:
                keep_cols.append(column_key)
    return df[keep_cols].copy()

def add_change_columns(df: pd.DataFrame, tickers: List[str] = None) -> pd.DataFrame:
    """Add a ``(ticker, "Change")`` column equal to High minus Low for each ticker.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``(ticker, field)`` columns.  It is modified in place.
    tickers : list of str, optional
        Tickers to process.  When omitted, every distinct first-level column
        label in ``df`` is used, so the function can be called with the frame
        alone.  Tickers lacking a "High" or "Low" column are skipped.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.
    """
    if tickers is None:
        # Snapshot the labels before adding columns; dict.fromkeys de-duplicates
        # while keeping first-seen order.
        tickers = list(dict.fromkeys(df.columns.get_level_values(0)))

    for t in tickers:
        high_key = (t, "High")
        low_key = (t, "Low")
        if high_key in df.columns and low_key in df.columns:
            df[(t, "Change")] = df[high_key] - df[low_key]
    return df

def process_change_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Extract the per-ticker "Change" columns into a flat table.

    Every ``(ticker, "Change")`` column is selected and renamed to
    ``<ticker>_Change``; the index is then moved into the first column.  If that
    first column comes out named "index" (an unnamed index), it is renamed to
    "Date".
    """
    change_only = df.loc[:, (slice(None), "Change")].copy()
    ticker_labels = change_only.columns.droplevel(1)
    change_only.columns = [f"{ticker}_Change" for ticker in ticker_labels]

    flat = change_only.reset_index()
    if flat.columns[0] == "index":
        flat = flat.rename(columns={"index": "Date"})
    return flat

def save_to_csv(df: pd.DataFrame, path: str) -> None:
    """Write ``df`` to ``path`` as CSV without the index, then print a confirmation line."""
    df.to_csv(path_or_buf=path, index=False)
    print(f"Data with only 'Change' columns saved to: {path}")

# Main execution flow: download 3-month bars for the four ETFs, compute the
# High-minus-Low range per period, and save it as a flat CSV.
data_dir = "Yahoo Data"
ensure_directory_exists(data_dir)

tickers = ["XLI", "XLK", "XLE", "XLB"]
df_full = download_data(tickers, "2020-01-01", "2024-01-01", "3mo")

df_filtered = filter_columns(df_full, tickers, ["High", "Low"])
# Pass the ticker list explicitly so a Change column is computed for exactly
# these symbols.
df_filtered = add_change_columns(df_filtered, tickers)

df_only_change = process_change_columns(df_filtered)

csv_path = os.path.join(data_dir, "sector_quarterly_only_change.csv")
save_to_csv(df_only_change, csv_path)

print(df_only_change.head())

[*********************100%***********************]  4 of 4 completed

Directory 'Yahoo Data' created.
Data with only 'Change' columns saved to: Yahoo Data/sector_quarterly_only_change.csv
        Date  XLI_Change  XLK_Change  XLE_Change  XLB_Change
0 2020-01-01   34.623139   33.295932   30.822575   21.908651
1 2020-04-01   19.467893   28.695256   16.112837   16.252515
2 2020-07-01   12.474152   23.360486    8.296763   10.461685
3 2020-10-01   14.514658   21.075830   12.746976   10.079116
4 2021-01-01   13.851598   14.060707   14.423068    9.381533



