In [1]:
#Get Data
#Need Ticker, returns from past 5 years, current price, beta, ESG score, and sector
#Need to get data from Yahoo Finance, ESG, and Sector

import yfinance as yf
import requests
import json
import pandas as pd
import numpy as np
import os
import time
import datetime
# %pip install yesg
import yesg


In [2]:

def get_sp500_tickers():
    # Get S&P 500 tickers from Wikipedia
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    sp500 = pd.read_html(url)
    sp500 = sp500[0]
    return sp500["Symbol"].tolist()


def get_sp500_data():
    # Get S&P 500 tickers
    sp500 = get_sp500_tickers()

    # Filter out any tickers that may not be valid
    valid_tickers = []
    for ticker in sp500:
        try:
            yf.Ticker(ticker).info
            valid_tickers.append(ticker)
        except:
            pass

    # Initialize lists to store data
    rows = []

    # Fetch data for valid tickers
    for ticker in valid_tickers:
        # Fetch data for each ticker
        ticker_data = yf.Ticker(ticker)

        # Get historical price data
        history = ticker_data.history(period="5y")

        # Get beta and sector
        info = ticker_data.info
        beta = info.get("beta", None)
        sector = info.get("sector", None)

        # Append each row with date, ticker, price, beta, sector, and ESG score
        for date, row in history.iterrows():
            rows.append({
                "Date": date,
                "Ticker": ticker,
                "Price": row["Close"],
                "Beta": beta,
                "Sector": sector,
            })

    # Create DataFrame
    df = pd.DataFrame(rows)

    return df

# Example usage
sp500_df = get_sp500_data()


BRK.B: No data found, symbol may be delisted
BF.B: No price data found, symbol may be delisted (period=5y)


In [3]:
sp500_df

Unnamed: 0,Date,Ticker,Price,Beta,Sector
0,2019-04-29 00:00:00-04:00,MMM,140.476929,1.035,Industrials
1,2019-04-30 00:00:00-04:00,MMM,139.959946,1.035,Industrials
2,2019-05-01 00:00:00-04:00,MMM,137.419403,1.035,Industrials
3,2019-05-02 00:00:00-04:00,MMM,136.444534,1.035,Industrials
4,2019-05-03 00:00:00-04:00,MMM,136.791672,1.035,Industrials
...,...,...,...,...,...
623654,2024-04-22 00:00:00-04:00,ZTS,145.539993,0.848,Healthcare
623655,2024-04-23 00:00:00-04:00,ZTS,149.559998,0.848,Healthcare
623656,2024-04-24 00:00:00-04:00,ZTS,150.880005,0.848,Healthcare
623657,2024-04-25 00:00:00-04:00,ZTS,153.360001,0.848,Healthcare


In [4]:
#calculate returns for each calendar year
sp500_df['Year'] = sp500_df['Date'].dt.year
sp500_df['Year'] = sp500_df['Year'].astype(int)
sp500_df['Year'] = sp500_df['Year'].astype(str)
#calculate the returns per calendary year
sp500_df['Return'] = sp500_df.groupby(['Ticker', 'Year'])['Price'].pct_change()
sp500_df

#calculate the cumulative return for each ticker per year
sp500_df['Cumulative_Return'] = sp500_df.groupby(['Ticker', 'Year'])['Return'].transform(lambda x: (1 + x).cumprod() - 1)


In [5]:
sp500_df

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return
0,2019-04-29 00:00:00-04:00,MMM,140.476929,1.035,Industrials,2019,,
1,2019-04-30 00:00:00-04:00,MMM,139.959946,1.035,Industrials,2019,-0.003680,-0.003680
2,2019-05-01 00:00:00-04:00,MMM,137.419403,1.035,Industrials,2019,-0.018152,-0.021765
3,2019-05-02 00:00:00-04:00,MMM,136.444534,1.035,Industrials,2019,-0.007094,-0.028705
4,2019-05-03 00:00:00-04:00,MMM,136.791672,1.035,Industrials,2019,0.002544,-0.026234
...,...,...,...,...,...,...,...,...
623654,2024-04-22 00:00:00-04:00,ZTS,145.539993,0.848,Healthcare,2024,-0.006553,-0.259602
623655,2024-04-23 00:00:00-04:00,ZTS,149.559998,0.848,Healthcare,2024,0.027621,-0.239151
623656,2024-04-24 00:00:00-04:00,ZTS,150.880005,0.848,Healthcare,2024,0.008826,-0.232436
623657,2024-04-25 00:00:00-04:00,ZTS,153.360001,0.848,Healthcare,2024,0.016437,-0.219820


In [6]:
sp500_df.groupby(['Ticker', 'Year'])['Cumulative_Return'].last()

Ticker  Year
A       2019    0.110341
        2020    0.387500
        2021    0.351167
        2022   -0.043648
        2023   -0.073380
                  ...   
ZTS     2020    0.240854
        2021    0.499937
        2022   -0.372793
        2023    0.344024
        2024   -0.194078
Name: Cumulative_Return, Length: 2979, dtype: float64

In [7]:
#get the standard deviation of the returns for each ticker per year
sp500_df['Firm-Year_Standard_Deviation'] = sp500_df.groupby(['Ticker', 'Year'])['Return'].transform(lambda x: x.std())   
sp500_df

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return,Firm-Year_Standard_Deviation
0,2019-04-29 00:00:00-04:00,MMM,140.476929,1.035,Industrials,2019,,,0.014105
1,2019-04-30 00:00:00-04:00,MMM,139.959946,1.035,Industrials,2019,-0.003680,-0.003680,0.014105
2,2019-05-01 00:00:00-04:00,MMM,137.419403,1.035,Industrials,2019,-0.018152,-0.021765,0.014105
3,2019-05-02 00:00:00-04:00,MMM,136.444534,1.035,Industrials,2019,-0.007094,-0.028705,0.014105
4,2019-05-03 00:00:00-04:00,MMM,136.791672,1.035,Industrials,2019,0.002544,-0.026234,0.014105
...,...,...,...,...,...,...,...,...,...
623654,2024-04-22 00:00:00-04:00,ZTS,145.539993,0.848,Healthcare,2024,-0.006553,-0.259602,0.019098
623655,2024-04-23 00:00:00-04:00,ZTS,149.559998,0.848,Healthcare,2024,0.027621,-0.239151,0.019098
623656,2024-04-24 00:00:00-04:00,ZTS,150.880005,0.848,Healthcare,2024,0.008826,-0.232436,0.019098
623657,2024-04-25 00:00:00-04:00,ZTS,153.360001,0.848,Healthcare,2024,0.016437,-0.219820,0.019098


In [8]:
missing_values = sp500_df.isna().sum()
print(missing_values)

Date                               0
Ticker                             0
Price                              0
Beta                            3289
Sector                             0
Year                               0
Return                          2979
Cumulative_Return               2979
Firm-Year_Standard_Deviation       0
dtype: int64


In [9]:
#Append ESG Information

In [23]:
esg_df = pd.read_csv("esg_scores.csv")

sp500_df['Year'] = sp500_df['Year'].astype(int) 
esg_df['year'] = esg_df['year'].astype(int)

sp500_df_lim23 = sp500_df[sp500_df['Year'] <= 2023]

sp500_df_w_scores = sp500_df_lim23.merge( 
                                    esg_df,
                                    left_on=['Year','Ticker'],
                                    right_on=['year','Company_Symbol'],
                                    how='left'
)

sp500_df_w_scores.drop(columns=['year','Company_Symbol'], inplace=True)
sp500_df_w_scores = sp500_df_w_scores.dropna(subset=['Total-Score'])
sp500_df_w_scores 

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return,Firm-Year_Standard_Deviation,Total-Score,E-Score,S-Score,G-Score
1178,2019-04-29 00:00:00-04:00,AOS,53.028980,1.253,Industrials,2019,,,0.017354,46.630833,43.87,41.02,58.906667
1179,2019-04-30 00:00:00-04:00,AOS,49.950428,1.253,Industrials,2019,-0.058054,-0.058054,0.017354,46.630833,43.87,41.02,58.906667
1180,2019-05-01 00:00:00-04:00,AOS,49.883919,1.253,Industrials,2019,-0.001332,-0.059308,0.017354,46.630833,43.87,41.02,58.906667
1181,2019-05-02 00:00:00-04:00,AOS,49.902924,1.253,Industrials,2019,0.000381,-0.058950,0.017354,46.630833,43.87,41.02,58.906667
1182,2019-05-03 00:00:00-04:00,AOS,50.967110,1.253,Industrials,2019,0.021325,-0.038882,0.017354,46.630833,43.87,41.02,58.906667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
583195,2023-12-22 00:00:00-05:00,ZTS,194.979996,0.848,Healthcare,2023,0.001644,0.327749,0.015758,18.770000,3.24,6.81,8.720000
583196,2023-12-26 00:00:00-05:00,ZTS,195.500000,0.848,Healthcare,2023,0.002667,0.331290,0.015758,18.770000,3.24,6.81,8.720000
583197,2023-12-27 00:00:00-05:00,ZTS,196.899994,0.848,Healthcare,2023,0.007161,0.340824,0.015758,18.770000,3.24,6.81,8.720000
583198,2023-12-28 00:00:00-05:00,ZTS,197.160004,0.848,Healthcare,2023,0.001321,0.342594,0.015758,18.770000,3.24,6.81,8.720000


In [25]:
#sp500_df_w_scores[sp500_df_w_scores['Ticker'] == 'GOOGL']
# proof of concept

In [None]:
#Define portfolio theme lists for each theme

In [28]:
#Subset main data frame into smaller data frames for each year
def subset_year(stock_returns_list, start, end) :
    stock_returns_ticker = stock_returns_list[(stock_returns_list['Year'] >= start) & 
                                              (stock_returns_list['Year'] <= end)]
    return stock_returns_ticker


    #essentially integrate and modify this for just the subsetting part.

#modify to subset by year instead of calculate returns
def subset_by_year(df, year):
    df_subset = df[df['Year'] == year]
    return df_subset




In [None]:
sp500_df['Sector'].unique()

In [29]:
subset_year(sp500_df,2019,2023)
#proof of concept

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return,Firm-Year_Standard_Deviation
0,2019-04-29 00:00:00-04:00,MMM,140.476929,1.035,Industrials,2019,,,0.014105
1,2019-04-30 00:00:00-04:00,MMM,139.959946,1.035,Industrials,2019,-0.003680,-0.003680,0.014105
2,2019-05-01 00:00:00-04:00,MMM,137.419403,1.035,Industrials,2019,-0.018152,-0.021765,0.014105
3,2019-05-02 00:00:00-04:00,MMM,136.444534,1.035,Industrials,2019,-0.007094,-0.028705,0.014105
4,2019-05-03 00:00:00-04:00,MMM,136.791672,1.035,Industrials,2019,0.002544,-0.026234,0.014105
...,...,...,...,...,...,...,...,...,...
623573,2023-12-22 00:00:00-05:00,ZTS,194.979996,0.848,Healthcare,2023,0.001644,0.327749,0.015758
623574,2023-12-26 00:00:00-05:00,ZTS,195.500000,0.848,Healthcare,2023,0.002667,0.331290,0.015758
623575,2023-12-27 00:00:00-05:00,ZTS,196.899994,0.848,Healthcare,2023,0.007161,0.340824,0.015758
623576,2023-12-28 00:00:00-05:00,ZTS,197.160004,0.848,Healthcare,2023,0.001321,0.342594,0.015758
