In [1]:
# data
import pandas as pd
import numpy as np

# api calls
import requests
import time
import os
from dotenv import load_dotenv

In [2]:
def industries_List(keywords):
    """Select S&P 500 companies whose GICS sub-industry matches any keyword.

    The constituent table is read from the Wikipedia "List of S&P 500
    companies" page (first table on the page) on every call, so network
    access is required.

    Parameters
    ----------
    keywords : str or iterable of str
        Search terms matched case-insensitively against the
        'GICS Sub-Industry' column. Terms are combined with ``|`` and
        interpreted as a regular expression, so regex metacharacters keep
        their special meaning. A single string is treated as one keyword.
        Blank entries are ignored; with no usable keyword nothing is selected.

    Returns
    -------
    tuple[list, pandas.DataFrame]
        The matching ticker symbols (from the 'Symbol' column) and the
        matching rows of the table.
    """
    # A bare string would be joined character by character ("s|o|f|t|..."),
    # matching almost every row, so wrap it in a list first.
    if isinstance(keywords, str):
        keywords = [keywords]

    # An empty alternative in the pattern matches every row; drop blank terms.
    usable_keywords = [kw for kw in keywords if kw and kw.strip()]

    sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

    if usable_keywords:
        mask = sp500['GICS Sub-Industry'].str.contains(
            '|'.join(usable_keywords), case=False, na=False
        )
    else:
        # No keyword to search for: select nothing rather than everything.
        mask = pd.Series(False, index=sp500.index)

    companies = sp500[mask]
    companiesTickers = companies['Symbol'].tolist()
    return companiesTickers, companies

In [3]:
# Column layout of the revenue table produced by get_Revenue.
_REVENUE_COLUMNS = ['Ticker', 'Fiscal Date', 'Revenue (USD)']


def _parse_revenue(raw_value):
    """Return the revenue field as an int, or None when it is missing or unparseable."""
    # NOTE(review): the API appears to send the literal string "None" for
    # missing figures; it is truthy, so it must be filtered explicitly — confirm.
    if not raw_value or raw_value == 'None':
        return None
    try:
        return int(raw_value)
    except (TypeError, ValueError):
        return None


def _fetch_annual_revenue(symbol, api_key, base_url, timeout):
    """Request one ticker's INCOME_STATEMENT and return its annual revenue rows."""
    params = {
        'function': 'INCOME_STATEMENT',
        'symbol': symbol,
        'apikey': api_key
    }
    response = requests.get(base_url, params=params, timeout=timeout)
    data = response.json()

    reports = data.get('annualReports') if isinstance(data, dict) else None
    if reports is None:
        print(f"No annualReports found for {symbol}.")
        # NOTE(review): these look like the keys the API uses to explain
        # throttling / invalid calls — confirm against the API documentation.
        if isinstance(data, dict):
            for key in ('Note', 'Information', 'Error Message'):
                if key in data:
                    print(f"  API {key}: {data[key]}")
        return []

    rows = []
    for report in reports:
        revenue = _parse_revenue(report.get('totalRevenue'))
        if revenue is None:
            # Skip just this report; one bad value must not discard the rest.
            continue
        rows.append({
            'Ticker': symbol,
            'Fiscal Date': report.get('fiscalDateEnding'),
            'Revenue (USD)': revenue
        })
    return rows


def _merge_into_master(new_rows, master_file):
    """Write new_rows into master_file, de-duplicated on ticker + fiscal date."""
    if os.path.exists(master_file):
        master_df = pd.read_csv(master_file)
        combined = pd.concat([master_df, new_rows], ignore_index=True, sort=False)
        # On a (Ticker, Fiscal Date) clash the freshly fetched row wins.
        combined = combined.drop_duplicates(subset=["Ticker", "Fiscal Date"], keep="last")
    else:
        combined = new_rows
    combined.to_csv(master_file, index=False)


def get_Revenue(companyTickers, outputFile, master_file, request_delay=12, timeout=30):
    """Fetch annual total revenue per ticker from Alpha Vantage and cache it as CSV.

    If ``outputFile`` already exists it is read and returned without any
    network access (and without touching ``master_file``). Otherwise every
    ticker is queried through the INCOME_STATEMENT function, the rows are
    written to ``outputFile`` and merged into ``master_file``.

    Failures for a single ticker are printed and skipped (best effort). When
    nothing at all could be retrieved, no file is written, so a failed run is
    not cached as if it were a result.

    Parameters
    ----------
    companyTickers : iterable of str
        Ticker symbols to query.
    outputFile : str
        CSV path used as the cache for this particular fetch.
    master_file : str
        CSV path accumulating rows across fetches; duplicates on
        ('Ticker', 'Fiscal Date') keep the most recently fetched row.
    request_delay : float, default 12
        Seconds to wait between consecutive requests.
        NOTE(review): presumably sized for the API's rate limit — confirm
        against your plan's quota.
    timeout : float, default 30
        Per-request timeout in seconds passed to ``requests.get``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Ticker', 'Fiscal Date', 'Revenue (USD)'. Empty (with those
        columns) when the API key is missing or no data was retrieved.
    """
    load_dotenv()
    api_key = os.getenv("ALPHA_VANTAGE_API_KEY")
    if os.path.exists(outputFile):
        print(f"The data you are trying to retrieve already exists: {outputFile}, Skipping data fetch")
        return pd.read_csv(outputFile)

    if not api_key:
        # Without a key every request would fail after a long, pointless wait.
        print("ALPHA_VANTAGE_API_KEY is not set; skipping data fetch")
        return pd.DataFrame(columns=_REVENUE_COLUMNS)

    base_url = 'https://www.alphavantage.co/query'
    revenue_rows = []
    for position, symbol in enumerate(companyTickers):
        # Pause between requests even after a failure, but not before the
        # first request or after the last one.
        if position and request_delay:
            time.sleep(request_delay)
        try:
            revenue_rows.extend(_fetch_annual_revenue(symbol, api_key, base_url, timeout))
        except Exception as e:
            print(f"Error fetching data for {symbol}: {e}")

    df = pd.DataFrame(revenue_rows, columns=_REVENUE_COLUMNS)
    if df.empty:
        # Writing an empty CSV would make the next call treat it as a cache hit.
        print(f"No revenue data retrieved; not writing {outputFile}")
        return df

    df.to_csv(outputFile, index=False)
    print(f"Saved to {outputFile}")

    _merge_into_master(df, master_file)
    print(f"Appended data → {master_file}")

    return df

In [4]:
def Market_Share_Calculations(dataset: pd.DataFrame) -> pd.DataFrame:
    """Add per-year revenue totals and each row's share of that total.

    Works on a copy, so ``dataset`` itself is not modified. The input must
    provide the columns 'Fiscal Date' and 'Revenue (USD)'.

    Returns a frame with three extra columns:
      * 'Fiscal Year'   - calendar year taken from 'Fiscal Date'
      * 'Total Revenue' - sum of 'Revenue (USD)' over all rows of that year
      * 'Market Share'  - 'Revenue (USD)' divided by 'Total Revenue'
    """
    fiscal_years = pd.to_datetime(dataset["Fiscal Date"]).dt.year

    with_year = dataset.copy()
    with_year["Fiscal Year"] = fiscal_years

    # Named Series indexed by fiscal year; the name becomes the merged column.
    yearly_totals = with_year.groupby("Fiscal Year")["Revenue (USD)"].sum()
    yearly_totals = yearly_totals.rename("Total Revenue")

    result = with_year.merge(yearly_totals, on="Fiscal Year")
    result["Market Share"] = result["Revenue (USD)"] / result["Total Revenue"]
    return result

In [5]:
def StreamLitDataPrep(df, output_file):
    """Reduce ``df`` to the ticker / fiscal year / market share columns and save it.

    The three-column copy is written to ``output_file`` as CSV without the
    index and is also returned. ``df`` itself is not modified.
    """
    slim = df[["Ticker", "Fiscal Year", "Market Share"]].copy()
    slim.to_csv(output_file, index=False)
    return slim