In [2]:
from datetime import datetime
import os
import pandas as pd

pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_rows', 600)

# -*- encoding: utf-8 -*-
%matplotlib inline

In [3]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

In [4]:
def get_table(filename):
    if os.path.isfile(filename):
        df = pd.read_csv(filename, index_col='date')
        return df

In [5]:
filename = 'S&P 500 Historical Components & Changes.csv'
df = get_table(filename)
# df.head()

In [6]:
# Convert ticker column from csv to list, then sort.
df['tickers'] = df['tickers'].apply(lambda x: sorted(x.split(',')))
# df.tail()

In [7]:
# Replace SYMBOL-yyyymm with SYMBOL.
df['tickers'] = [[ticker.split('-')[0] for ticker in tickers] for tickers in df['tickers']]
# df.head()

In [8]:
# Remove duplicates in each row.
df['tickers'] = [sorted(list(set(tickers))) for tickers in df['tickers']]
df.tail()
# num = []
# for i, row in df.iterrows():
#     l = list(row['tickers'])
#     num.append(len(l))
#     print(i, len(l))
# pd.Series(num).describe()

Unnamed: 0_level_0,tickers
date,Unnamed: 1_level_1
2025-10-30,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."
2025-10-31,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."
2025-11-03,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."
2025-11-04,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."
2025-11-11,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."


In [9]:
# Ensure index is datetime
df.index = pd.to_datetime(df.index)

# Extract year from the index
df['year'] = df.index.year

# Get the first available date for each year
first_dates_per_year = df.groupby('year').apply(lambda x: x.index.min())

# Filter to keep only these first-of-year records
df_yearly = df[df.index.isin(first_dates_per_year)].copy()

# Drop the temporary year column
df_yearly = df_yearly.drop('year', axis=1)

# Display the result
print(df_yearly)

                                                      tickers
date                                                         
1996-01-02  [AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...
1997-01-02  [AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...
1998-01-02  [AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, ACV, A...
1999-01-04  [AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, ACV, A...
2000-01-03  [AABA, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, ...
2001-01-02  [A, AABA, AAMRQ, AAPL, ABI, ABKFQ, ABS, ABT, A...
2002-01-02  [A, AABA, AAMRQ, AAPL, ABC, ABI, ABKFQ, ABS, A...
2003-01-06  [A, AABA, AAMRQ, AAPL, ABC, ABI, ABKFQ, ABS, A...
2004-01-07  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABS, ABT, ACV...
2005-01-06  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABS, ABT, ACS...
2006-01-03  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABS, ABT, ACS...
2007-01-03  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABT, ACS, ADB...
2008-01-02  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABT, ACAS, AC...
2009-01-02  [A, AABA, AAPL, ABC, ABT, ACAS, ACS, ADBE, ADI...
2010-01-

  first_dates_per_year = df.groupby('year').apply(lambda x: x.index.min())


In [10]:
# Ensure datetime index and sort
df.index = pd.to_datetime(df.index)
df = df.sort_index()

# Initialize lists to store changes
additions = []
removals = []

# Iterate through consecutive dates to find changes
for i in range(1, len(df)):
    current_date = df.index[i]
    previous_date = df.index[i-1]
    
    current_tickers = set(df.iloc[i]['tickers'])
    previous_tickers = set(df.iloc[i-1]['tickers'])
    
    # Find additions (in current but not previous)
    added = list(current_tickers - previous_tickers)
    for ticker in added:
        additions.append({'date': current_date, 'ticker': ticker, 'action': 'added'})
    
    # Find removals (in previous but not current)
    removed = list(previous_tickers - current_tickers)
    for ticker in removed:
        removals.append({'date': current_date, 'ticker': ticker, 'action': 'removed'})

# Create DataFrames
df_added = pd.DataFrame(additions)
df_removed = pd.DataFrame(removals)

# Combine into one changes DataFrame
df_changes = pd.merge(df_added, df_removed, on="date",)
df_changes = df_changes.sort_values('date')

# print("All Changes:")
# print(df_changes.head(10))

# If you want separate DataFrames
# print("\nAdditions:")
# print(df_added.head())

# print("\nRemovals:")
# print(df_removed.head())

In [11]:
df_changes['quarter_num'] = df_changes['date'].dt.quarter

In [12]:
df_changes

Unnamed: 0,date,ticker_x,action_x,ticker_y,action_y,quarter_num
0,1996-01-22,CSE,added,BCO,removed,1
1,1996-02-12,BAY,added,CCB,removed,1
2,1996-03-08,FITB,added,HDLM,removed,1
3,1996-03-13,GNT,added,FBO,removed,1
4,1996-03-28,EMC,added,CYR,removed,1
...,...,...,...,...,...,...
1087,2025-09-22,EME,added,MKTX,removed,3
1086,2025-09-22,EME,added,ENPH,removed,3
1085,2025-09-22,EME,added,CZR,removed,3
1088,2025-09-22,HOOD,added,CZR,removed,3


## CODE

In [13]:
year  = 2024

In [14]:
# tickers at the first valid date of the year
tickers_at_start_of_year = df_yearly.loc[df_yearly.index.year == year, 'tickers'].values[0]

In [15]:
len(tickers_at_start_of_year)

503

In [16]:
ticker_changes = df_changes[(df_changes['date'] >= str(year)+'-01-01') & (df_changes['date'] <= str(year)+'-12-31')]
ticker_changes.head()

Unnamed: 0,date,ticker_x,action_x,ticker_y,action_y,quarter_num
1022,2024-02-01,DAY,added,CDAY,removed,1
1023,2024-03-04,DOC,added,PEAK,removed,1
1025,2024-03-18,SMCI,added,WHR,removed,1
1026,2024-03-18,DECK,added,ZION,removed,1
1024,2024-03-18,SMCI,added,ZION,removed,1


In [17]:
ticker_changes.head()

Unnamed: 0,date,ticker_x,action_x,ticker_y,action_y,quarter_num
1022,2024-02-01,DAY,added,CDAY,removed,1
1023,2024-03-04,DOC,added,PEAK,removed,1
1025,2024-03-18,SMCI,added,WHR,removed,1
1026,2024-03-18,DECK,added,ZION,removed,1
1024,2024-03-18,SMCI,added,ZION,removed,1


In [18]:
added_dict = (ticker_changes.groupby('quarter_num')['ticker_x'].apply(list).to_dict())
removed_dict = (ticker_changes.groupby('quarter_num')['ticker_y'].apply(list).to_dict())

In [19]:
added_dict

{1: ['DAY', 'DOC', 'SMCI', 'DECK', 'SMCI', 'DECK', 'CPAY'],
 2: ['SOLV',
  'SOLV',
  'GEV',
  'GEV',
  'VST',
  'GDDY',
  'GDDY',
  'GDDY',
  'KKR',
  'KKR',
  'KKR',
  'CRWD',
  'CRWD',
  'CRWD'],
 3: ['SW',
  'DELL',
  'DELL',
  'DELL',
  'ERIE',
  'ERIE',
  'ERIE',
  'PLTR',
  'PLTR',
  'PLTR'],
 4: ['TPL', 'APO', 'APO', 'APO', 'LII', 'LII', 'LII', 'WDAY', 'WDAY', 'WDAY']}

In [20]:
added_tickers_list = ticker_changes["ticker_x"].tolist()

In [21]:
wrds_tickers_list = list(set(tickers_at_start_of_year + added_tickers_list))
wrds_tickers_list.sort()
len(wrds_tickers_list)

521

In [22]:
# write tickers to a text file
with open(f'Data/sp500_tickers_{year}.txt', 'w') as f:
    for ticker in wrds_tickers_list:
        f.write(f"{ticker}\n")

In [23]:
# now get file from wrds (fundamental )
dd = pd.read_csv("sp500_wrds_2024.csv")
wrds_dwnld = dd["TICKER"].unique().tolist()

In [24]:
wrds_tickers_file = "sp500_wrds_2024.csv"
wrds_file = pd.read_csv(wrds_tickers_file)
wrds_file.head()

Unnamed: 0,gvkey,permno,adate,qdate,public_date,CAPEI,bm,evm,pe_op_basic,pe_op_dil,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,TICKER,cusip
0,12142,10104,2023-05-31,2023-11-30,2024-01-31,30.526,0.029,18.359,29.089,30.027,...,,0.171,0.0,0.0,-0.052,33.705,9.062,1.43%,ORCL,68389X10
1,12142,10104,2023-05-31,2023-11-30,2024-02-29,30.521,0.029,18.359,29.083,30.022,...,,0.171,0.0,0.0,-0.052,33.699,9.06,1.43%,ORCL,68389X10
2,12142,10104,2023-05-31,2023-11-30,2024-03-31,34.323,0.029,18.359,32.711,33.766,...,,0.171,0.0,0.0,-0.052,37.897,10.191,1.27%,ORCL,68389X10
3,12142,10104,2023-05-31,2024-02-29,2024-04-30,31.078,0.033,18.71,28.871,29.777,...,,0.17,0.0,0.0,-0.056,30.936,28.323,1.41%,ORCL,68389X10
4,12142,10104,2023-05-31,2024-02-29,2024-05-31,32.018,0.033,18.71,29.744,30.678,...,,0.17,0.0,0.0,-0.056,31.872,29.179,1.37%,ORCL,68389X10


In [25]:
quater_map = {1: str(year)+'-03-31', 2: str(year)+'-06-30', 3: str(year)+'-09-30', 4: str(year)+'-12-31'}

In [26]:
import pandas as pd

# Example: quarter end dates
quarter_map = {1: str(year)+'-03-31', 2: str(year)+'-06-30', 3: str(year)+'-09-30', 4: str(year)+'-12-31'}

# Ensure 'date' column is datetime
wrds_file['public_date'] = pd.to_datetime(wrds_file['public_date'])

for quarter in range(1,5):
    added_this_quarter = added_dict.get(quarter, [])
    removed_this_quarter = removed_dict.get(quarter, [])

    # Get the quarter start date
    if quarter == 1:
        quarter_start = pd.to_datetime(f'{year}-01-01')
    elif quarter == 2:
        quarter_start = pd.to_datetime(f'{year}-04-01')
    elif quarter == 3:
        quarter_start = pd.to_datetime(f'{year}-07-01')
    else:
        quarter_start = pd.to_datetime(f'{year}-10-01')

    # Remove tickers in added_this_quarter for months BEFORE quarter_start
    wrds_file = wrds_file[~((wrds_file['TICKER'].isin(added_this_quarter)) & (wrds_file['public_date'] < quarter_start))]
    wrds_file = wrds_file[~((wrds_file['TICKER'].isin(removed_this_quarter)) & (wrds_file['public_date'] > quarter_start))]
    # print(f"Q{quarter} {year}: Added {len(added_this_quarter)} tickers, Removed {len(removed_this_quarter)} tickers")




In [27]:
wrds_file["TICKER"].nunique()

468

In [28]:
wrds_file.to_csv(f"Data/sp500_wrds_{year}_fundamental.csv", index=False)
# after saving, use data_cleaning.ipynb to clean and merge with price data