In [76]:
from datetime import datetime
import os
import pandas as pd

pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_rows', 600)

# -*- encoding: utf-8 -*-
%matplotlib inline

In [77]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

In [78]:
def get_table(filename):
    if os.path.isfile(filename):
        df = pd.read_csv(filename, index_col='date')
        return df

In [79]:
filename = 'S&P 500 Historical Components & Changes.csv'
df = get_table(filename)
# df.head()

In [80]:
# Convert ticker column from csv to list, then sort.
df['tickers'] = df['tickers'].apply(lambda x: sorted(x.split(',')))
# df.tail()

In [81]:
# Replace SYMBOL-yyyymm with SYMBOL.
df['tickers'] = [[ticker.split('-')[0] for ticker in tickers] for tickers in df['tickers']]
# df.head()

In [82]:
# Remove duplicates in each row.
df['tickers'] = [sorted(list(set(tickers))) for tickers in df['tickers']]
df.tail()
# num = []
# for i, row in df.iterrows():
#     l = list(row['tickers'])
#     num.append(len(l))
#     print(i, len(l))
# pd.Series(num).describe()

Unnamed: 0_level_0,tickers
date,Unnamed: 1_level_1
2025-10-30,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."
2025-10-31,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."
2025-11-03,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."
2025-11-04,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."
2025-11-11,"[A, AAPL, ABBV, ABNB, ABT, ACGL, ACN, ADBE, AD..."


In [83]:
# Ensure index is datetime
df.index = pd.to_datetime(df.index)

# Extract year from the index
df['year'] = df.index.year

# Get the first available date for each year
first_dates_per_year = df.groupby('year').apply(lambda x: x.index.min())

# Filter to keep only these first-of-year records
df_yearly = df[df.index.isin(first_dates_per_year)].copy()

# Drop the temporary year column
df_yearly = df_yearly.drop('year', axis=1)

# Display the result
print(df_yearly)

                                                      tickers
date                                                         
1996-01-02  [AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...
1997-01-02  [AAL, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, A...
1998-01-02  [AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, ACV, A...
1999-01-04  [AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, ACV, A...
2000-01-03  [AABA, AAMRQ, AAPL, ABI, ABS, ABT, ABX, ACKH, ...
2001-01-02  [A, AABA, AAMRQ, AAPL, ABI, ABKFQ, ABS, ABT, A...
2002-01-02  [A, AABA, AAMRQ, AAPL, ABC, ABI, ABKFQ, ABS, A...
2003-01-06  [A, AABA, AAMRQ, AAPL, ABC, ABI, ABKFQ, ABS, A...
2004-01-07  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABS, ABT, ACV...
2005-01-06  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABS, ABT, ACS...
2006-01-03  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABS, ABT, ACS...
2007-01-03  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABT, ACS, ADB...
2008-01-02  [A, AABA, AAPL, ABC, ABI, ABKFQ, ABT, ACAS, AC...
2009-01-02  [A, AABA, AAPL, ABC, ABT, ACAS, ACS, ADBE, ADI...
2010-01-

  first_dates_per_year = df.groupby('year').apply(lambda x: x.index.min())


In [84]:
# Ensure datetime index and sort
df.index = pd.to_datetime(df.index)
df = df.sort_index()

# Initialize lists to store changes
additions = []
removals = []

# Iterate through consecutive dates to find changes
for i in range(1, len(df)):
    current_date = df.index[i]
    previous_date = df.index[i-1]
    
    current_tickers = set(df.iloc[i]['tickers'])
    previous_tickers = set(df.iloc[i-1]['tickers'])
    
    # Find additions (in current but not previous)
    added = list(current_tickers - previous_tickers)
    for ticker in added:
        additions.append({'date': current_date, 'ticker': ticker, 'action': 'added'})
    
    # Find removals (in previous but not current)
    removed = list(previous_tickers - current_tickers)
    for ticker in removed:
        removals.append({'date': current_date, 'ticker': ticker, 'action': 'removed'})

# Create DataFrames
df_added = pd.DataFrame(additions)
df_removed = pd.DataFrame(removals)

# Combine into one changes DataFrame
df_changes = pd.merge(df_added, df_removed, on="date",)
df_changes = df_changes.sort_values('date')

# print("All Changes:")
# print(df_changes.head(10))

# If you want separate DataFrames
# print("\nAdditions:")
# print(df_added.head())

# print("\nRemovals:")
# print(df_removed.head())

In [85]:
df_changes['quarter_num'] = df_changes['date'].dt.quarter

In [86]:
df_changes

Unnamed: 0,date,ticker_x,action_x,ticker_y,action_y,quarter_num
0,1996-01-22,CSE,added,BCO,removed,1
1,1996-02-12,BAY,added,CCB,removed,1
2,1996-03-08,FITB,added,HDLM,removed,1
3,1996-03-13,GNT,added,FBO,removed,1
4,1996-03-28,EMC,added,CYR,removed,1
...,...,...,...,...,...,...
1087,2025-09-22,EME,added,CZR,removed,3
1086,2025-09-22,EME,added,MKTX,removed,3
1085,2025-09-22,EME,added,ENPH,removed,3
1088,2025-09-22,HOOD,added,ENPH,removed,3


## CODE

In [87]:
year  = 2019

In [88]:
# tickers at the first valid date of the year
tickers_at_start_of_year = df_yearly.loc[df_yearly.index.year == year, 'tickers'].values[0]

In [89]:
len(tickers_at_start_of_year)

505

In [90]:
ticker_changes = df_changes[(df_changes['date'] >= str(year)+'-01-01') & (df_changes['date'] <= str(year)+'-12-31')]
ticker_changes.head()

Unnamed: 0,date,ticker_x,action_x,ticker_y,action_y,quarter_num
822,2019-01-02,FRC,added,SCG,removed,1
823,2019-01-18,TFX,added,PCG,removed,1
824,2019-02-15,ATO,added,NFX,removed,1
825,2019-02-27,WAB,added,GT,removed,1
826,2019-04-02,DOW,added,BHF,removed,2


In [91]:
ticker_changes.head()

Unnamed: 0,date,ticker_x,action_x,ticker_y,action_y,quarter_num
822,2019-01-02,FRC,added,SCG,removed,1
823,2019-01-18,TFX,added,PCG,removed,1
824,2019-02-15,ATO,added,NFX,removed,1
825,2019-02-27,WAB,added,GT,removed,1
826,2019-04-02,DOW,added,BHF,removed,2


In [92]:
added_dict = (ticker_changes.groupby('quarter_num')['ticker_x'].apply(list).to_dict())
removed_dict = (ticker_changes.groupby('quarter_num')['ticker_y'].apply(list).to_dict())

In [93]:
added_dict

{1: ['FRC', 'TFX', 'ATO', 'WAB'],
 2: ['DOW', 'LHX', 'CTVA', 'CTVA', 'DD', 'DD', 'AMCR'],
 3: ['MKTX', 'TMUS', 'GL', 'LDOS', 'LDOS', 'IEX', 'IEX', 'CDW', 'NVR'],
 4: ['LVS',
  'BKR',
  'NLOK',
  'NLOK',
  'PEAK',
  'PEAK',
  'NOW',
  'VIAC',
  'VIAC',
  'WRB',
  'WRB',
  'TFC',
  'TFC',
  'ODFL',
  'ODFL',
  'J',
  'LYV',
  'LYV',
  'LYV',
  'ZBRA',
  'STE',
  'ZBRA',
  'ZBRA',
  'STE',
  'STE']}

In [94]:
added_tickers_list = ticker_changes["ticker_x"].tolist()

In [95]:
wrds_tickers_list = list(set(tickers_at_start_of_year + added_tickers_list))
wrds_tickers_list.sort()
len(wrds_tickers_list)

533

In [96]:
# write tickers to a text file
with open(f'Data/sp500_tickers_{year}.txt', 'w') as f:
    for ticker in wrds_tickers_list:
        f.write(f"{ticker}\n")

In [98]:
# now get file from wrds (fundamental )
file_path = f"sp500_wrds_{year}.csv"
dd = pd.read_csv(file_path)
wrds_dwnld = dd["TICKER"].unique().tolist()

In [99]:
wrds_tickers_file = file_path
wrds_file = pd.read_csv(wrds_tickers_file)
wrds_file.head()

Unnamed: 0,gvkey,permno,adate,qdate,public_date,CAPEI,bm,evm,pe_op_basic,pe_op_dil,...,sale_nwc,rd_sale,adv_sale,staff_sale,accrual,ptb,PEG_trailing,divyield,TICKER,cusip
0,12142,10104,2018-05-31,2018-11-30,2019-01-31,21.045,0.173,14.981,18.199,18.535,...,0.787,0.154,0.003,0.0,-0.086,5.897,,1.51%,ORCL,68389X10
1,12142,10104,2018-05-31,2018-11-30,2019-02-28,21.841,0.173,14.981,18.888,19.236,...,0.787,0.154,0.003,0.0,-0.086,6.12,,1.46%,ORCL,68389X10
2,12142,10104,2018-05-31,2018-11-30,2019-03-31,21.429,0.173,14.981,19.46,19.819,...,0.787,0.154,0.003,0.0,-0.086,6.004,,1.42%,ORCL,68389X10
3,12142,10104,2018-05-31,2019-02-28,2019-04-30,22.139,0.132,14.442,19.903,20.417,...,0.924,0.153,0.003,0.0,-0.033,7.974,0.357,1.74%,ORCL,68389X10
4,12142,10104,2018-05-31,2019-02-28,2019-05-31,20.247,0.132,14.442,18.201,18.672,...,0.924,0.153,0.003,0.0,-0.033,7.292,0.326,1.90%,ORCL,68389X10


In [100]:
quater_map = {1: str(year)+'-03-31', 2: str(year)+'-06-30', 3: str(year)+'-09-30', 4: str(year)+'-12-31'}

In [101]:
import pandas as pd

# Example: quarter end dates
quarter_map = {1: str(year)+'-03-31', 2: str(year)+'-06-30', 3: str(year)+'-09-30', 4: str(year)+'-12-31'}

# Ensure 'date' column is datetime
wrds_file['public_date'] = pd.to_datetime(wrds_file['public_date'])

for quarter in range(1,5):
    added_this_quarter = added_dict.get(quarter, [])
    removed_this_quarter = removed_dict.get(quarter, [])

    # Get the quarter start date
    if quarter == 1:
        quarter_start = pd.to_datetime(f'{year}-01-01')
    elif quarter == 2:
        quarter_start = pd.to_datetime(f'{year}-04-01')
    elif quarter == 3:
        quarter_start = pd.to_datetime(f'{year}-07-01')
    else:
        quarter_start = pd.to_datetime(f'{year}-10-01')

    # Remove tickers in added_this_quarter for months BEFORE quarter_start
    wrds_file = wrds_file[~((wrds_file['TICKER'].isin(added_this_quarter)) & (wrds_file['public_date'] < quarter_start))]
    wrds_file = wrds_file[~((wrds_file['TICKER'].isin(removed_this_quarter)) & (wrds_file['public_date'] > quarter_start))]
    # print(f"Q{quarter} {year}: Added {len(added_this_quarter)} tickers, Removed {len(removed_this_quarter)} tickers")


In [102]:
wrds_file["TICKER"].nunique()

467

In [103]:
wrds_file.to_csv(f"Data/sp500_wrds_{year}_fundamental.csv", index=False)
# after saving, use data_cleaning.ipynb to clean and merge with price data