In [1]:
#Get Data
#Need Ticker, returns from past 5 years, current price, beta, ESG score, and sector
#Need to get data from Yahoo Finance, ESG, and Sector

import yfinance as yf
import requests
import json
import pandas as pd
import numpy as np
import os
import time
import datetime
# %pip install yesg
import yesg


In [2]:
import pandas as pd
import yfinance as yf
import yesg


def get_sp500_tickers():
    # Get S&P 500 tickers from Wikipedia
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    sp500 = pd.read_html(url)
    sp500 = sp500[0]
    return sp500["Symbol"].tolist()


def get_sp500_data():
    # Get S&P 500 tickers
    sp500 = get_sp500_tickers()

    # Filter out any tickers that may not be valid
    valid_tickers = []
    for ticker in sp500:
        try:
            yf.Ticker(ticker).info
            valid_tickers.append(ticker)
        except:
            pass

    # Initialize lists to store data
    rows = []

    # Fetch data for valid tickers
    for ticker in valid_tickers:
        # Fetch data for each ticker
        ticker_data = yf.Ticker(ticker)

        # Get historical price data
        history = ticker_data.history(period="5y")

        # Get beta and sector
        info = ticker_data.info
        beta = info.get("beta", None)
        sector = info.get("sector", None)

        # Append each row with date, ticker, price, beta, sector, and ESG score
        for date, row in history.iterrows():
            rows.append({
                "Date": date,
                "Ticker": ticker,
                "Price": row["Close"],
                "Beta": beta,
                "Sector": sector,
            })

    # Create DataFrame
    df = pd.DataFrame(rows)

    return df

# Example usage
sp500_df = get_sp500_data()


BRK.B: No data found, symbol may be delisted
BF.B: No price data found, symbol may be delisted (period=5y)


In [3]:
sp500_df

Unnamed: 0,Date,Ticker,Price,Beta,Sector
0,2019-04-26 00:00:00-04:00,MMM,124.518021,1.035,Industrials
1,2019-04-29 00:00:00-04:00,MMM,123.569542,1.035,Industrials
2,2019-04-30 00:00:00-04:00,MMM,123.114761,1.035,Industrials
3,2019-05-01 00:00:00-04:00,MMM,120.879982,1.035,Industrials
4,2019-05-02 00:00:00-04:00,MMM,120.022476,1.035,Industrials
...,...,...,...,...,...
624144,2024-04-22 00:00:00-04:00,ZTS,145.539993,0.848,Healthcare
624145,2024-04-23 00:00:00-04:00,ZTS,149.559998,0.848,Healthcare
624146,2024-04-24 00:00:00-04:00,ZTS,150.880005,0.848,Healthcare
624147,2024-04-25 00:00:00-04:00,ZTS,153.360001,0.848,Healthcare


In [4]:
#calculate returns for each calendar year
sp500_df['Year'] = sp500_df['Date'].dt.year
sp500_df['Year'] = sp500_df['Year'].astype(int)
sp500_df['Year'] = sp500_df['Year'].astype(str)
#calculate the returns per calendary year
sp500_df['Return'] = sp500_df.groupby(['Ticker', 'Year'])['Price'].pct_change()
sp500_df

#calculate the cumulative return for each ticker per year
sp500_df['Cumulative_Return'] = sp500_df.groupby(['Ticker', 'Year'])['Return'].transform(lambda x: (1 + x).cumprod() - 1)


In [5]:
sp500_df

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return
0,2019-04-26 00:00:00-04:00,MMM,124.518021,1.035,Industrials,2019,,
1,2019-04-29 00:00:00-04:00,MMM,123.569542,1.035,Industrials,2019,-0.007617,-0.007617
2,2019-04-30 00:00:00-04:00,MMM,123.114761,1.035,Industrials,2019,-0.003680,-0.011270
3,2019-05-01 00:00:00-04:00,MMM,120.879982,1.035,Industrials,2019,-0.018152,-0.029217
4,2019-05-02 00:00:00-04:00,MMM,120.022476,1.035,Industrials,2019,-0.007094,-0.036104
...,...,...,...,...,...,...,...,...
624144,2024-04-22 00:00:00-04:00,ZTS,145.539993,0.848,Healthcare,2024,-0.006553,-0.255807
624145,2024-04-23 00:00:00-04:00,ZTS,149.559998,0.848,Healthcare,2024,0.027621,-0.235252
624146,2024-04-24 00:00:00-04:00,ZTS,150.880005,0.848,Healthcare,2024,0.008826,-0.228502
624147,2024-04-25 00:00:00-04:00,ZTS,153.360001,0.848,Healthcare,2024,0.016437,-0.215821


In [6]:
sp500_df.groupby(['Ticker', 'Year'])['Cumulative_Return'].last()

Ticker  Year
A       2019    0.109050
        2020    0.387500
        2021    0.351167
        2022   -0.037356
        2023   -0.066648
                  ...   
ZTS     2020    0.240854
        2021    0.499936
        2022   -0.369204
        2023    0.356284
        2024   -0.194626
Name: Cumulative_Return, Length: 2979, dtype: float64

In [7]:
#get the standard deviation of the returns for each ticker per year
sp500_df['Firm-Year_Standard_Deviation'] = sp500_df.groupby(['Ticker', 'Year'])['Return'].transform(lambda x: x.std())   
sp500_df

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return,Firm-Year_Standard_Deviation
0,2019-04-26 00:00:00-04:00,MMM,124.518021,1.035,Industrials,2019,,,0.014075
1,2019-04-29 00:00:00-04:00,MMM,123.569542,1.035,Industrials,2019,-0.007617,-0.007617,0.014075
2,2019-04-30 00:00:00-04:00,MMM,123.114761,1.035,Industrials,2019,-0.003680,-0.011270,0.014075
3,2019-05-01 00:00:00-04:00,MMM,120.879982,1.035,Industrials,2019,-0.018152,-0.029217,0.014075
4,2019-05-02 00:00:00-04:00,MMM,120.022476,1.035,Industrials,2019,-0.007094,-0.036104,0.014075
...,...,...,...,...,...,...,...,...,...
624144,2024-04-22 00:00:00-04:00,ZTS,145.539993,0.848,Healthcare,2024,-0.006553,-0.255807,0.018994
624145,2024-04-23 00:00:00-04:00,ZTS,149.559998,0.848,Healthcare,2024,0.027621,-0.235252,0.018994
624146,2024-04-24 00:00:00-04:00,ZTS,150.880005,0.848,Healthcare,2024,0.008826,-0.228502,0.018994
624147,2024-04-25 00:00:00-04:00,ZTS,153.360001,0.848,Healthcare,2024,0.016437,-0.215821,0.018994


In [11]:
missing_values = sp500_df.isna().sum()
print(missing_values)

Date                                0
Ticker                              0
Price                               0
Beta                            42351
Sector                             20
Year                                0
Return                           2979
Cumulative_Return                2979
Firm-Year_Standard_Deviation        0
dtype: int64


In [None]:
#Append ESG Information

In [None]:
#Define portfolio theme lists for each theme

In [12]:
#Subset main data frame into smaller data frames for each year
def calc_ret(stock_returns_list, ticker, start, end) :
    filing_date = filing_dates[ticker]
    start_date = pd.to_datetime(filing_date,format='%m-%d-%Y') + pd.offsets.BDay(start)
    end_date = pd.to_datetime(filing_date,format='%m-%d-%Y') + pd.offsets.BDay(end)
    stock_returns_ticker = stock_returns_list[(stock_returns_list['ticker'] == ticker) &
                                              (stock_returns_list['date'] >= start_date) & 
                                              (stock_returns_list['date'] <= end_date)]
    buy_hold_return = (1 + stock_returns_ticker['ret']).prod() - 1
    return buy_hold_return


    #essentially integrate and modify this for just the subsetting part.

#modify to subset by year instead of calculate returns