In [19]:
#Get Data
#Need Ticker, returns from past 5 years, current price, beta, ESG score, and sector
#Need to get data from Yahoo Finance, ESG, and Sector

import yfinance as yf
import requests
import json
import pandas as pd
import numpy as np
import os
import time
import datetime
import pandas_datareader as pdr
from pandas_datareader import DataReader
# %pip install yesg
import yesg


In [7]:

def get_sp500_tickers():
    # Get S&P 500 tickers from Wikipedia
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    sp500 = pd.read_html(url)
    sp500 = sp500[0]
    return sp500["Symbol"].tolist()


def get_sp500_data():
    # Get S&P 500 tickers
    sp500 = get_sp500_tickers()

    # Filter out any tickers that may not be valid
    valid_tickers = []
    for ticker in sp500:
        try:
            yf.Ticker(ticker).info
            valid_tickers.append(ticker)
        except:
            pass

    # Initialize lists to store data
    rows = []

    # Fetch data for valid tickers
    for ticker in valid_tickers:
        # Fetch data for each ticker
        ticker_data = yf.Ticker(ticker)

        # Get historical price data
        history = ticker_data.history(period="5y")

        # Get beta and sector
        info = ticker_data.info
        beta = info.get("beta", None)
        sector = info.get("sector", None)

        # Append each row with date, ticker, price, beta, sector, and ESG score
        for date, row in history.iterrows():
            rows.append({
                "Date": date,
                "Ticker": ticker,
                "Price": row["Close"],
                "Beta": beta,
                "Sector": sector,
            })

    # Create DataFrame
    df = pd.DataFrame(rows)

    return df

# Example usage
sp500_df = get_sp500_data()


BRK.B: No data found, symbol may be delisted
BF.B: No price data found, symbol may be delisted (period=5y)


In [10]:
sp500_df['Year'] = sp500_df['Date'].dt.year
sp500_df['Year'] = sp500_df['Year'].astype(int)

In [11]:
esg_df = pd.read_csv("esg_scores.csv")

sp500_df['Year'] = sp500_df['Year'].astype(int) 
esg_df['year'] = esg_df['year'].astype(int)

sp500_df_lim23 = sp500_df[sp500_df['Year'] <= 2023]

sp500_df_w_scores = sp500_df_lim23.merge( 
                                    esg_df,
                                    left_on=['Year','Ticker'],
                                    right_on=['year','Company_Symbol'],
                                    how='left',
                                    validate="m:1"
)

sp500_df_w_scores.drop(columns=['year','Company_Symbol'], inplace=True)
sp500_df_w_scores = sp500_df_w_scores.dropna(subset=['Total-Score'])
sp500_df_w_scores 

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Total-Score,E-Score,S-Score,G-Score
1178,2019-04-29 00:00:00-04:00,AOS,51.099239,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
1179,2019-04-30 00:00:00-04:00,AOS,48.132710,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
1180,2019-05-01 00:00:00-04:00,AOS,48.068623,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
1181,2019-05-02 00:00:00-04:00,AOS,48.086941,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
1182,2019-05-03 00:00:00-04:00,AOS,49.112408,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
...,...,...,...,...,...,...,...,...,...,...
583200,2023-12-22 00:00:00-05:00,ZTS,193.985657,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000
583201,2023-12-26 00:00:00-05:00,ZTS,194.503006,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000
583202,2023-12-27 00:00:00-05:00,ZTS,195.895859,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000
583203,2023-12-28 00:00:00-05:00,ZTS,196.154541,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000


In [16]:
first_date = pd.to_datetime(sp500_df_w_scores.iloc[0]['Date']).date()
last_date = pd.to_datetime(sp500_df_w_scores.iloc[-1]['Date']).date()

print("First date:", first_date)
print("Last date:", last_date)

First date: 2019-04-29
Last date: 2023-12-29


In [21]:
risk_free_rate = DataReader("IRLTLT01USM156N", "fred", start= first_date, end= last_date)
#risk_free_rate

Unnamed: 0_level_0,IRLTLT01USM156N
DATE,Unnamed: 1_level_1
2019-05-01,2.4
2019-06-01,2.07
2019-07-01,2.06
2019-08-01,1.63
2019-09-01,1.7
2019-10-01,1.71
2019-11-01,1.81
2019-12-01,1.86
2020-01-01,1.76
2020-02-01,1.5


In [4]:
sp500_df

Unnamed: 0,Date,Ticker,Price,Beta,Sector
0,2019-04-29 00:00:00-04:00,MMM,123.569542,1.035,Industrials
1,2019-04-30 00:00:00-04:00,MMM,123.114777,1.035,Industrials
2,2019-05-01 00:00:00-04:00,MMM,120.879997,1.035,Industrials
3,2019-05-02 00:00:00-04:00,MMM,120.022461,1.035,Industrials
4,2019-05-03 00:00:00-04:00,MMM,120.327812,1.035,Industrials
...,...,...,...,...,...
623658,2024-04-22 00:00:00-04:00,ZTS,145.539993,0.848,Healthcare
623659,2024-04-23 00:00:00-04:00,ZTS,149.559998,0.848,Healthcare
623660,2024-04-24 00:00:00-04:00,ZTS,150.880005,0.848,Healthcare
623661,2024-04-25 00:00:00-04:00,ZTS,153.360001,0.848,Healthcare


In [5]:
#calculate returns for each calendar year
sp500_df['Year'] = sp500_df['Date'].dt.year
sp500_df['Year'] = sp500_df['Year'].astype(int)
sp500_df['Year'] = sp500_df['Year'].astype(str)
#calculate the returns per calendary year
sp500_df['Return'] = sp500_df.groupby(['Ticker', 'Year'])['Price'].pct_change()
sp500_df

#calculate the cumulative return for each ticker per year
sp500_df['Cumulative_Return'] = sp500_df.groupby(['Ticker', 'Year'])['Return'].transform(lambda x: (1 + x).cumprod() - 1)


  sp500_df['Return'] = sp500_df.groupby(['Ticker', 'Year'])['Price'].pct_change()


In [6]:
sp500_df

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return
0,2019-04-29 00:00:00-04:00,MMM,123.569542,1.035,Industrials,2019,,
1,2019-04-30 00:00:00-04:00,MMM,123.114777,1.035,Industrials,2019,-0.003680,-0.003680
2,2019-05-01 00:00:00-04:00,MMM,120.879997,1.035,Industrials,2019,-0.018152,-0.021765
3,2019-05-02 00:00:00-04:00,MMM,120.022461,1.035,Industrials,2019,-0.007094,-0.028705
4,2019-05-03 00:00:00-04:00,MMM,120.327812,1.035,Industrials,2019,0.002544,-0.026234
...,...,...,...,...,...,...,...,...
623658,2024-04-22 00:00:00-04:00,ZTS,145.539993,0.848,Healthcare,2024,-0.006553,-0.259602
623659,2024-04-23 00:00:00-04:00,ZTS,149.559998,0.848,Healthcare,2024,0.027621,-0.239151
623660,2024-04-24 00:00:00-04:00,ZTS,150.880005,0.848,Healthcare,2024,0.008826,-0.232436
623661,2024-04-25 00:00:00-04:00,ZTS,153.360001,0.848,Healthcare,2024,0.016437,-0.219820


In [7]:
sp500_df.groupby(['Ticker', 'Year'])['Cumulative_Return'].last()

Ticker  Year
A       2019    0.110341
        2020    0.387500
        2021    0.351167
        2022   -0.037355
        2023   -0.066648
                  ...   
ZTS     2020    0.240854
        2021    0.499937
        2022   -0.372793
        2023    0.344024
        2024   -0.194078
Name: Cumulative_Return, Length: 2979, dtype: float64

In [8]:
#get the standard deviation of the returns for each ticker per year
sp500_df['Firm-Year_Standard_Deviation'] = sp500_df.groupby(['Ticker', 'Year'])['Return'].transform(lambda x: x.std())   
sp500_df

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return,Firm-Year_Standard_Deviation
0,2019-04-29 00:00:00-04:00,MMM,123.569542,1.035,Industrials,2019,,,0.014105
1,2019-04-30 00:00:00-04:00,MMM,123.114777,1.035,Industrials,2019,-0.003680,-0.003680,0.014105
2,2019-05-01 00:00:00-04:00,MMM,120.879997,1.035,Industrials,2019,-0.018152,-0.021765,0.014105
3,2019-05-02 00:00:00-04:00,MMM,120.022461,1.035,Industrials,2019,-0.007094,-0.028705,0.014105
4,2019-05-03 00:00:00-04:00,MMM,120.327812,1.035,Industrials,2019,0.002544,-0.026234,0.014105
...,...,...,...,...,...,...,...,...,...
623658,2024-04-22 00:00:00-04:00,ZTS,145.539993,0.848,Healthcare,2024,-0.006553,-0.259602,0.019098
623659,2024-04-23 00:00:00-04:00,ZTS,149.559998,0.848,Healthcare,2024,0.027621,-0.239151,0.019098
623660,2024-04-24 00:00:00-04:00,ZTS,150.880005,0.848,Healthcare,2024,0.008826,-0.232436,0.019098
623661,2024-04-25 00:00:00-04:00,ZTS,153.360001,0.848,Healthcare,2024,0.016437,-0.219820,0.019098


In [9]:
missing_values = sp500_df.isna().sum()
print(missing_values)

Date                               0
Ticker                             0
Price                              4
Beta                             771
Sector                             0
Year                               0
Return                          2983
Cumulative_Return               2983
Firm-Year_Standard_Deviation       0
dtype: int64


In [10]:
#Append ESG Information

In [19]:
esg_df = pd.read_csv("esg_scores.csv")

sp500_df['Year'] = sp500_df['Year'].astype(int) 
esg_df['year'] = esg_df['year'].astype(int)

sp500_df_lim23 = sp500_df[sp500_df['Year'] <= 2023]

sp500_df_w_scores = sp500_df_lim23.merge( 
                                    esg_df,
                                    left_on=['Year','Ticker'],
                                    right_on=['year','Company_Symbol'],
                                    how='left',
                                    validate="m:1"
)

sp500_df_w_scores.drop(columns=['year','Company_Symbol'], inplace=True)
sp500_df_w_scores = sp500_df_w_scores.dropna(subset=['Total-Score'])
sp500_df_w_scores 

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return,Firm-Year_Standard_Deviation,Total-Score,E-Score,S-Score,G-Score
1178,2019-04-29 00:00:00-04:00,AOS,51.099239,1.253,Industrials,2019,,,0.017354,46.630833,43.87,41.02,58.906667
1179,2019-04-30 00:00:00-04:00,AOS,48.132717,1.253,Industrials,2019,-0.058054,-0.058054,0.017354,46.630833,43.87,41.02,58.906667
1180,2019-05-01 00:00:00-04:00,AOS,48.068626,1.253,Industrials,2019,-0.001332,-0.059308,0.017354,46.630833,43.87,41.02,58.906667
1181,2019-05-02 00:00:00-04:00,AOS,48.086937,1.253,Industrials,2019,0.000381,-0.058950,0.017354,46.630833,43.87,41.02,58.906667
1182,2019-05-03 00:00:00-04:00,AOS,49.112400,1.253,Industrials,2019,0.021325,-0.038882,0.017354,46.630833,43.87,41.02,58.906667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
583199,2023-12-22 00:00:00-05:00,ZTS,194.979996,0.848,Healthcare,2023,0.001644,0.327749,0.015758,18.770000,3.24,6.81,8.720000
583200,2023-12-26 00:00:00-05:00,ZTS,195.500000,0.848,Healthcare,2023,0.002667,0.331290,0.015758,18.770000,3.24,6.81,8.720000
583201,2023-12-27 00:00:00-05:00,ZTS,196.899994,0.848,Healthcare,2023,0.007161,0.340824,0.015758,18.770000,3.24,6.81,8.720000
583202,2023-12-28 00:00:00-05:00,ZTS,197.160004,0.848,Healthcare,2023,0.001321,0.342594,0.015758,18.770000,3.24,6.81,8.720000


In [12]:
sp500_df_w_scores[sp500_df_w_scores['Ticker'] == 'GOOGL']
# proof of concept

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return,Firm-Year_Standard_Deviation,Total-Score,E-Score,S-Score,G-Score
21972,2019-04-29 00:00:00-04:00,GOOGL,64.809998,1.054,Communication Services,2019,,,0.015611,60.63,69.905833,62.031667,40.625
21973,2019-04-30 00:00:00-04:00,GOOGL,59.948002,1.054,Communication Services,2019,-0.075019,-0.075019,0.015611,60.63,69.905833,62.031667,40.625
21974,2019-05-01 00:00:00-04:00,GOOGL,58.666000,1.054,Communication Services,2019,-0.021385,-0.094800,0.015611,60.63,69.905833,62.031667,40.625
21975,2019-05-02 00:00:00-04:00,GOOGL,58.325500,1.054,Communication Services,2019,-0.005804,-0.100054,0.015611,60.63,69.905833,62.031667,40.625
21976,2019-05-03 00:00:00-04:00,GOOGL,59.477501,1.054,Communication Services,2019,0.019751,-0.082279,0.015611,60.63,69.905833,62.031667,40.625
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23145,2023-12-22 00:00:00-05:00,GOOGL,141.490005,1.054,Communication Services,2023,0.007620,0.587635,0.019149,24.22,1.560000,11.160000,11.510
23146,2023-12-26 00:00:00-05:00,GOOGL,141.520004,1.054,Communication Services,2023,0.000212,0.587971,0.019149,24.22,1.560000,11.160000,11.510
23147,2023-12-27 00:00:00-05:00,GOOGL,140.369995,1.054,Communication Services,2023,-0.008126,0.575067,0.019149,24.22,1.560000,11.160000,11.510
23148,2023-12-28 00:00:00-05:00,GOOGL,140.229996,1.054,Communication Services,2023,-0.000997,0.573496,0.019149,24.22,1.560000,11.160000,11.510


In [13]:
#Define portfolio theme lists for each theme

In [14]:
#Subset main data frame into smaller data frames for each year
def subset_year(stock_returns_list, start, end) :
    stock_returns_ticker = stock_returns_list[(stock_returns_list['Year'] >= start) & 
                                              (stock_returns_list['Year'] <= end)]
    return stock_returns_ticker


    #essentially integrate and modify this for just the subsetting part.

#modify to subset by year instead of calculate returns
def subset_by_year(df, year):
    df_subset = df[df['Year'] == year]
    return df_subset




In [15]:
sp500_df['Sector'].unique()

array(['Industrials', 'Healthcare', 'Technology', 'Utilities',
       'Financial Services', 'Basic Materials', 'Consumer Cyclical',
       'Real Estate', 'Communication Services', 'Consumer Defensive',
       'Energy'], dtype=object)

In [16]:
subset_year(sp500_df,2019,2023)
#proof of concept

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Return,Cumulative_Return,Firm-Year_Standard_Deviation
0,2019-04-29 00:00:00-04:00,MMM,123.569542,1.035,Industrials,2019,,,0.014105
1,2019-04-30 00:00:00-04:00,MMM,123.114777,1.035,Industrials,2019,-0.003680,-0.003680,0.014105
2,2019-05-01 00:00:00-04:00,MMM,120.879997,1.035,Industrials,2019,-0.018152,-0.021765,0.014105
3,2019-05-02 00:00:00-04:00,MMM,120.022461,1.035,Industrials,2019,-0.007094,-0.028705,0.014105
4,2019-05-03 00:00:00-04:00,MMM,120.327812,1.035,Industrials,2019,0.002544,-0.026234,0.014105
...,...,...,...,...,...,...,...,...,...
623577,2023-12-22 00:00:00-05:00,ZTS,194.979996,0.848,Healthcare,2023,0.001644,0.327749,0.015758
623578,2023-12-26 00:00:00-05:00,ZTS,195.500000,0.848,Healthcare,2023,0.002667,0.331290,0.015758
623579,2023-12-27 00:00:00-05:00,ZTS,196.899994,0.848,Healthcare,2023,0.007161,0.340824,0.015758
623580,2023-12-28 00:00:00-05:00,ZTS,197.160004,0.848,Healthcare,2023,0.001321,0.342594,0.015758
