In [22]:
#Get Data
#Need Ticker, returns from past 5 years, current price, beta, ESG score, and sector
#Need to get data from Yahoo Finance, ESG, and Sector
import yfinance as yf
import requests
import json
import pandas as pd
import numpy as np
import os
import time
import datetime
import pandas_datareader as pdr
from pandas_datareader import DataReader
from pypfopt import expected_returns, risk_models
# pip install PyPortfolioOpt
from itertools import combinations
import yesg


In [23]:

def get_sp500_tickers():
    # Get S&P 500 tickers from Wikipedia
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    sp500 = pd.read_html(url)
    sp500 = sp500[0]
    return sp500["Symbol"].tolist()


def get_sp500_data():
    # Get S&P 500 tickers
    sp500 = get_sp500_tickers()

    # Filter out any tickers that may not be valid
    valid_tickers = []
    for ticker in sp500:
        try:
            yf.Ticker(ticker).info
            valid_tickers.append(ticker)
        except:
            pass

    # Initialize lists to store data
    rows = []

    # Fetch data for valid tickers
    for ticker in valid_tickers:
        # Fetch data for each ticker
        ticker_data = yf.Ticker(ticker)

        # Get historical price data
        history = ticker_data.history(period="5y")

        # Get beta and sector
        info = ticker_data.info
        beta = info.get("beta", None)
        sector = info.get("sector", None)

        # Append each row with date, ticker, price, beta, sector, and ESG score
        for date, row in history.iterrows():
            rows.append({
                "Date": date,
                "Ticker": ticker,
                "Price": row["Close"],
                "Beta": beta,
                "Sector": sector,
            })

    # Create DataFrame
    df = pd.DataFrame(rows)

    return df

# Example usage
sp500_df = get_sp500_data()


BRK.B: No data found, symbol may be delisted
BF.B: No price data found, symbol may be delisted (period=5y)


In [24]:
sp500_df['Year'] = sp500_df['Date'].dt.year
sp500_df['Year'] = sp500_df['Year'].astype(int)

In [25]:
sp500_df['Ticker'].nunique()

501

In [29]:
esg_df = pd.read_csv("esg_scores.csv")

sp500_df['Year'] = sp500_df['Year'].astype(int) 
esg_df['year'] = esg_df['year'].astype(int)

sp500_df_lim23 = sp500_df[sp500_df['Year'] <= 2023]

sp500_df_w_scores = sp500_df_lim23.merge( 
                                    esg_df,
                                    left_on=['Year','Ticker'],
                                    right_on=['year','Company_Symbol'],
                                    how='left',
                                    validate="m:1"
)

sp500_df_w_scores.drop(columns=['year','Company_Symbol'], inplace=True)
sp500_df_w_scores = sp500_df_w_scores.dropna(subset=['Total-Score'])
sp500_df_w_scores = sp500_df_w_scores.reset_index(drop=True)
sp500_df_w_scores

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Total-Score,E-Score,S-Score,G-Score
0,2019-04-29 00:00:00-04:00,AOS,51.099236,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
1,2019-04-30 00:00:00-04:00,AOS,48.132706,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
2,2019-05-01 00:00:00-04:00,AOS,48.068623,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
3,2019-05-02 00:00:00-04:00,AOS,48.086941,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
4,2019-05-03 00:00:00-04:00,AOS,49.112396,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667
...,...,...,...,...,...,...,...,...,...,...
494509,2023-12-22 00:00:00-05:00,ZTS,194.979996,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000
494510,2023-12-26 00:00:00-05:00,ZTS,195.500000,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000
494511,2023-12-27 00:00:00-05:00,ZTS,196.899994,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000
494512,2023-12-28 00:00:00-05:00,ZTS,197.160004,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000


In [30]:
sp500_df_w_scores['Ticker'].nunique()

434

In [31]:
first_date = pd.to_datetime(sp500_df_w_scores.iloc[0]['Date']).date()
last_date = pd.to_datetime(sp500_df_w_scores.iloc[-1]['Date']).date()

print("First date:", first_date)
print("Last date:", last_date)

First date: 2019-04-29
Last date: 2023-12-29


In [32]:
risk_free_rate = DataReader("IRLTLT01USM156N", "fred", start= first_date, end= last_date)
risk_free_rate = risk_free_rate.iloc[-1]/100
risk_free_rate = risk_free_rate.item()
risk_free_rate

0.04019999999999999

In [33]:
prices_firm = sp500_df_w_scores[['Ticker', 'Price']]
prices_firm

Unnamed: 0,Ticker,Price
0,AOS,51.099236
1,AOS,48.132706
2,AOS,48.068623
3,AOS,48.086941
4,AOS,49.112396
...,...,...
494509,ZTS,194.979996
494510,ZTS,195.500000
494511,ZTS,196.899994
494512,ZTS,197.160004


In [34]:
grouped_data = sp500_df_w_scores.groupby('Ticker')

# Initialize empty dictionaries to store expected returns and covariance matrices for each firm
e_returns_dict = {}
cov_matrix_dict = {}

# Loop over each group (firm) and calculate expected returns and covariance matrix
for ticker, group in grouped_data:
    # Calculate expected returns using CAPM for the current firm
    e_returns_dict[ticker] = expected_returns.capm_return(group['Price'], risk_free_rate=risk_free_rate)
# Convert dictionaries to DataFrame
e_returns_df = pd.DataFrame(e_returns_dict)
e_returns_df= e_returns_df.T
e_returns_df.columns = ['Expected Returns']






In [35]:
e_returns_df

Unnamed: 0,Expected Returns
A,0.142040
AAL,-0.172779
AAPL,0.338631
ABBV,0.203620
ABT,0.093182
...,...
XOM,0.105403
XYL,0.078104
YUM,0.061124
ZBH,0.008193


In [36]:
# Calculate returns for each firm
sp500_df_w_scores['Return'] = sp500_df_w_scores.groupby('Ticker')['Price'].pct_change()

# Group the DataFrame by 'Ticker'
grouped_data = sp500_df_w_scores.groupby('Ticker')

# Get unique tickers
tickers = list(grouped_data.groups.keys())

# Initialize an empty DataFrame to store covariance matrix
cov_matrix_df = pd.DataFrame(index=tickers, columns=tickers)

# Loop over each pair of firms and calculate the covariance between their return series
for ticker1, ticker2 in combinations(tickers, 2):
    returns1 = grouped_data.get_group(ticker1)['Return'].dropna()
    returns2 = grouped_data.get_group(ticker2)['Return'].dropna()
    
    # Ensure that both return series have the same length
    min_length = min(len(returns1), len(returns2))
    returns1 = returns1[:min_length]
    returns2 = returns2[:min_length]
    
    # Calculate the covariance between the return series of ticker1 and ticker2
    covariance = np.cov(returns1, returns2)[0, 1]
    
    # Fill in the covariance matrix (since it's symmetric)
    cov_matrix_df.loc[ticker1, ticker2] = covariance
    cov_matrix_df.loc[ticker2, ticker1] = covariance

# Replace NaN values for firms compared against themselves with variance
for ticker in tickers:
    variance = np.var(grouped_data.get_group(ticker)['Return'].dropna())
    cov_matrix_df.loc[ticker, ticker] = variance

# Save the covariance matrix DataFrame as a CSV file
cov_matrix_df.to_csv('covariance_matrix_returns.csv')

# Display the covariance matrix DataFrame
print("Covariance Matrix:")
cov_matrix_df


  sp500_df_w_scores['Return'] = sp500_df_w_scores.groupby('Ticker')['Price'].pct_change()


Covariance Matrix:


Unnamed: 0,A,AAL,AAPL,ABBV,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WRK,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZTS
A,0.000354,0.000226,0.000209,0.000113,0.000191,0.000176,0.000208,0.000242,0.000242,0.000147,...,0.000018,-0.000014,-0.000047,-0.000036,-0.000028,0.000135,-0.00003,-0.000028,-0.000006,-0.000019
AAL,0.000226,0.001553,0.000249,0.000104,0.000136,0.000345,0.000259,0.000208,0.00033,0.00027,...,0.000041,-0.000038,0.000018,0.000041,0.000013,0.000345,0.000047,0.000016,0.000013,0.00001
AAPL,0.000209,0.000249,0.000417,0.000112,0.000178,0.000174,0.000236,0.000326,0.000287,0.000147,...,0.000021,-0.000025,-0.000071,-0.000051,-0.000042,0.000143,-0.000056,-0.000025,-0.000003,-0.00003
ABBV,0.000113,0.000104,0.000112,0.000258,0.00012,0.000128,0.000118,0.000115,0.000122,0.000101,...,0.000015,-0.000027,-0.000057,-0.000054,-0.000024,0.000116,-0.000042,-0.000024,-0.000012,-0.000027
ABT,0.000191,0.000136,0.000178,0.00012,0.000281,0.000149,0.000175,0.000193,0.000183,0.00012,...,-0.000009,-0.000009,-0.000061,-0.00006,-0.000027,0.00009,-0.000045,-0.000027,-0.000021,-0.00002
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
XOM,0.000135,0.000345,0.000143,0.000116,0.00009,0.000239,0.000158,0.000109,0.000204,0.000239,...,0.000019,-0.000022,-0.000049,-0.000047,-0.000027,0.00049,-0.000027,-0.000019,-0.000007,-0.000028
XYL,-0.00003,0.000047,-0.000056,-0.000042,-0.000045,-0.00005,-0.000043,-0.000055,-0.000044,-0.000022,...,0.000158,-0.000004,0.000319,0.000309,0.000151,-0.000027,0.000403,0.000176,0.0002,0.000181
YUM,-0.000028,0.000016,-0.000025,-0.000024,-0.000027,-0.000043,-0.000045,-0.000028,-0.000013,-0.000029,...,0.000032,-0.000022,0.000245,0.000271,0.000118,-0.000019,0.000176,0.000281,0.000185,0.000161
ZBH,-0.000006,0.000013,-0.000003,-0.000012,-0.000021,-0.000013,-0.000016,0.000002,0.000001,-0.0,...,0.000086,-0.000015,0.000263,0.000353,0.000103,-0.000007,0.0002,0.000185,0.000404,0.000162


In [53]:
#Subset main data frame into smaller data frames for each year
def subset_year(stock_returns_list, start, end) :
    stock_returns_ticker = stock_returns_list[(stock_returns_list['Year'] >= start) & 
                                              (stock_returns_list['Year'] <= end)]
    return stock_returns_ticker


    #essentially integrate and modify this for just the subsetting part.

#modify to subset by year instead of calculate returns
def subset_by_year(df, year):
    df_subset = df[df['Year'] == year]
    return df_subset


subset_year(sp500_df_w_scores,2019,2023)

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Total-Score,E-Score,S-Score,G-Score,Return
0,2019-04-29 00:00:00-04:00,AOS,51.099236,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,
1,2019-04-30 00:00:00-04:00,AOS,48.132706,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,-0.058054
2,2019-05-01 00:00:00-04:00,AOS,48.068623,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,-0.001331
3,2019-05-02 00:00:00-04:00,AOS,48.086941,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,0.000381
4,2019-05-03 00:00:00-04:00,AOS,49.112396,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,0.021325
...,...,...,...,...,...,...,...,...,...,...,...
494509,2023-12-22 00:00:00-05:00,ZTS,194.979996,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000,0.001644
494510,2023-12-26 00:00:00-05:00,ZTS,195.500000,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000,0.002667
494511,2023-12-27 00:00:00-05:00,ZTS,196.899994,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000,0.007161
494512,2023-12-28 00:00:00-05:00,ZTS,197.160004,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000,0.001321


In [None]:
# Theming

In [39]:
sp500_df_w_scores

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Total-Score,E-Score,S-Score,G-Score,Return
0,2019-04-29 00:00:00-04:00,AOS,51.099236,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,
1,2019-04-30 00:00:00-04:00,AOS,48.132706,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,-0.058054
2,2019-05-01 00:00:00-04:00,AOS,48.068623,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,-0.001331
3,2019-05-02 00:00:00-04:00,AOS,48.086941,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,0.000381
4,2019-05-03 00:00:00-04:00,AOS,49.112396,1.253,Industrials,2019,46.630833,43.87,41.02,58.906667,0.021325
...,...,...,...,...,...,...,...,...,...,...,...
494509,2023-12-22 00:00:00-05:00,ZTS,194.979996,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000,0.001644
494510,2023-12-26 00:00:00-05:00,ZTS,195.500000,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000,0.002667
494511,2023-12-27 00:00:00-05:00,ZTS,196.899994,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000,0.007161
494512,2023-12-28 00:00:00-05:00,ZTS,197.160004,0.848,Healthcare,2023,18.770000,3.24,6.81,8.720000,0.001321


In [41]:
chosen_sectors = []

#seemless for checkbox frontend 
# maybe this should be reworked as a toggle? AGAIN idk how the frontend works for the checkboxes
def add_sector(sector_list, new_sector) :
    if new_sector not in sector_list :
        sector_list.append(new_sector)

add_sector(chosen_sectors, "Industrials") 
add_sector(chosen_sectors, "Technology") 

chosen_sectors


#note: can combine add_sector and narrow_by_years, narrow_by_sector to one seamless function with parameters (sector_list, new_sectors (array), start, end)

['Industrials', 'Technology']

In [54]:
narrow_by_years = subset_year(sp500_df_w_scores,2019,2023)
narrow_by_sector = narrow_by_years[narrow_by_years["Sector"].isin(chosen_sectors)]
growth_subset = narrow_by_sector.sort_values(by="Return", ascending=False).dropna()

growth_subset


#doing this, now i am confused:
# we would need to average the data in order to subset, right?
# because we would need to groupby Sector or groupby Year 

# I am not sure how to continue with this in mind

# also, for firms HQ'd in PA, we would need a separate query (i.e. Bloomberg?) in order to match the data
# could upload a manual bloomberg file of all firms HQd in PA and just merge with the sp500_w_scores df for this
# otherwise, not sure if yfinance has any capabilities. At the very least, it is not in sp500_df yet

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Total-Score,E-Score,S-Score,G-Score,Return
22660,2020-06-04 00:00:00-04:00,AAL,16.719999,1.569,Industrials,2020,30.886667,11.146667,14.038333,5.701667,0.410970
22610,2020-03-24 00:00:00-04:00,AAL,13.920000,1.569,Industrials,2020,30.886667,11.146667,14.038333,5.701667,0.358049
404378,2020-08-26 00:00:00-04:00,CRM,272.320007,1.279,Technology,2020,20.873333,6.028889,9.287778,5.554444,0.260449
455228,2020-03-24 00:00:00-04:00,UAL,33.000000,1.538,Industrials,2020,30.346667,10.785000,14.038333,5.523333,0.257143
419070,2020-03-24 00:00:00-04:00,SWK,87.692635,1.329,Industrials,2020,32.225000,7.135000,16.540000,8.555000,0.253228
...,...,...,...,...,...,...,...,...,...,...,...
22606,2020-03-18 00:00:00-04:00,AAL,11.650000,1.569,Industrials,2020,30.886667,11.146667,14.038333,5.701667,-0.252246
142089,2020-03-18 00:00:00-04:00,DAL,23.319847,1.428,Industrials,2020,24.851667,8.520000,11.315000,5.021667,-0.259924
346846,2020-03-18 00:00:00-04:00,ON,8.450000,1.790,Technology,2020,25.190000,12.815000,6.340000,6.028333,-0.268398
191974,2022-11-03 00:00:00-04:00,FIS,57.180000,0.977,Technology,2022,18.130000,1.270000,11.140000,5.730000,-0.280483


In [52]:
esg_ss = narrow_by_sector[narrow_by_sector["Total-Score"] > 50].dropna()
esg_ss

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Total-Score,E-Score,S-Score,G-Score,Return
3535,2019-04-30 00:00:00-04:00,ACN,170.436157,1.181,Technology,2019,69.081667,75.860833,63.535833,67.668333,0.008057
3536,2019-05-01 00:00:00-04:00,ACN,169.167221,1.181,Technology,2019,69.081667,75.860833,63.535833,67.668333,-0.007445
3537,2019-05-02 00:00:00-04:00,ACN,167.375854,1.181,Technology,2019,69.081667,75.860833,63.535833,67.668333,-0.010589
3538,2019-05-03 00:00:00-04:00,ACN,165.127228,1.181,Technology,2019,69.081667,75.860833,63.535833,67.668333,-0.013435
3539,2019-05-06 00:00:00-04:00,ACN,164.455444,1.181,Technology,2019,69.081667,75.860833,63.535833,67.668333,-0.004068
...,...,...,...,...,...,...,...,...,...,...,...
489972,2019-12-24 00:00:00-05:00,XYL,76.626442,1.102,Industrials,2019,78.196667,85.763333,73.469167,72.683333,-0.003926
489973,2019-12-26 00:00:00-05:00,XYL,76.587448,1.102,Industrials,2019,78.196667,85.763333,73.469167,72.683333,-0.000509
489974,2019-12-27 00:00:00-05:00,XYL,76.821312,1.102,Industrials,2019,78.196667,85.763333,73.469167,72.683333,0.003054
489975,2019-12-30 00:00:00-05:00,XYL,76.723869,1.102,Industrials,2019,78.196667,85.763333,73.469167,72.683333,-0.001268


In [56]:
expensive_ss = narrow_by_sector.sort_values(by="Price",ascending=False)
expensive_ss

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Total-Score,E-Score,S-Score,G-Score,Return
75385,2023-12-18 00:00:00-05:00,AVGO,1136.908203,1.268,Technology,2023,19.980000,7.340000,6.000000,6.64,0.015278
75386,2023-12-19 00:00:00-05:00,AVGO,1134.747314,1.268,Technology,2023,19.980000,7.340000,6.000000,6.64,-0.001901
75390,2023-12-26 00:00:00-05:00,AVGO,1127.089966,1.268,Technology,2023,19.980000,7.340000,6.000000,6.64,0.008833
75388,2023-12-21 00:00:00-05:00,AVGO,1122.509521,1.268,Technology,2023,19.980000,7.340000,6.000000,6.64,0.015229
75391,2023-12-27 00:00:00-05:00,AVGO,1121.394287,1.268,Technology,2023,19.980000,7.340000,6.000000,6.64,-0.005053
...,...,...,...,...,...,...,...,...,...,...,...
223916,2020-10-28 00:00:00-04:00,HPE,8.027882,1.222,Technology,2020,17.288333,0.743333,9.886667,6.65,-0.048864
223760,2020-03-18 00:00:00-04:00,HPE,7.995320,1.222,Technology,2020,17.288333,0.743333,9.886667,6.65,0.031439
223758,2020-03-16 00:00:00-04:00,HPE,7.826604,1.222,Technology,2020,17.288333,0.743333,9.886667,6.65,-0.153144
223759,2020-03-17 00:00:00-04:00,HPE,7.751618,1.222,Technology,2020,17.288333,0.743333,9.886667,6.65,-0.009581


In [57]:
lehigh_ss = narrow_by_sector[narrow_by_sector["Ticker"].str.contains('[LEHIGH]', regex=True)]
lehigh_ss

Unnamed: 0,Date,Ticker,Price,Beta,Sector,Year,Total-Score,E-Score,S-Score,G-Score,Return
4712,2019-04-29 00:00:00-04:00,ADBE,286.140015,1.272,Technology,2019,63.565833,70.188333,58.160833,62.5425,
4713,2019-04-30 00:00:00-04:00,ADBE,289.250000,1.272,Technology,2019,63.565833,70.188333,58.160833,62.5425,0.010869
4714,2019-05-01 00:00:00-04:00,ADBE,283.350006,1.272,Technology,2019,63.565833,70.188333,58.160833,62.5425,-0.020398
4715,2019-05-02 00:00:00-04:00,ADBE,279.640015,1.272,Technology,2019,63.565833,70.188333,58.160833,62.5425,-0.013093
4716,2019-05-03 00:00:00-04:00,ADBE,285.579987,1.272,Technology,2019,63.565833,70.188333,58.160833,62.5425,0.021241
...,...,...,...,...,...,...,...,...,...,...,...
490978,2023-12-22 00:00:00-05:00,XYL,112.639999,1.102,Industrials,2023,17.085000,3.915000,8.030000,5.1350,0.008506
490979,2023-12-26 00:00:00-05:00,XYL,113.620003,1.102,Industrials,2023,17.085000,3.915000,8.030000,5.1350,0.008700
490980,2023-12-27 00:00:00-05:00,XYL,114.379997,1.102,Industrials,2023,17.085000,3.915000,8.030000,5.1350,0.006689
490981,2023-12-28 00:00:00-05:00,XYL,114.320000,1.102,Industrials,2023,17.085000,3.915000,8.030000,5.1350,-0.000525


In [16]:
#Output the expected returns to a csv file
e_returns_df.to_csv('expected_returns.csv')

#Output the the sp500 data with scores to a csv file
sp500_df_w_scores.to_csv('sp500_data_with_scores.csv')