# Data Collection

This notebook collects historical daily price data for the current S&P 500 constituents using the **yfinance** package.

In [4]:
import datetime
import itertools
import math
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.tsa.stattools as ts
import statsmodels.tsa.vector_ar as ar
import yfinance as yf
from dateutil.relativedelta import relativedelta

Get the symbols of the current S&P 500 constituents and download their price data

In [5]:
# Read every HTML table on the Wikipedia constituents page and keep the first one.
payload = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
first_table = payload[0]

# Rewrite the two dotted tickers into the dashed form so yfinance can resolve them.
symbols = [
    ticker.replace('BRK.B', 'BRK-B').replace('BF.B', 'BF-B')
    for ticker in first_table['Symbol'].tolist()
]

symbols

['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ACN',
 'ADBE',
 'AMD',
 'AES',
 'AFL',
 'A',
 'APD',
 'ABNB',
 'AKAM',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'AON',
 'APA',
 'APO',
 'AAPL',
 'AMAT',
 'APTV',
 'ACGL',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'AXON',
 'BKR',
 'BALL',
 'BAC',
 'BAX',
 'BDX',
 'BRK-B',
 'BBY',
 'TECH',
 'BIIB',
 'BLK',
 'BX',
 'BK',
 'BA',
 'BKNG',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BRO',
 'BF-B',
 'BLDR',
 'BG',
 'BXP',
 'CHRW',
 'CDNS',
 'CZR',
 'CPT',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'COR',
 'CNC',
 'CNP',
 'CF',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CAG',
 'COP',
 'ED',
 'STZ',
 'CEG',
 'COO',


In [6]:
# Download daily historical price data for every symbol over the given date range.
# auto_adjust is passed explicitly: yfinance changed its default to True (the
# notice this call used to print), so pinning the value keeps the downloaded
# prices consistent across yfinance versions and silences that notice.
data = yf.download(
    symbols,
    start="2015-01-01",
    end="2025-01-01",
    interval="1d",
    auto_adjust=True,
)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  503 of 503 completed


Clean the data by dropping any ticker whose time series has missing values

In [7]:
# Keep only the 'Close' price field of the downloaded frame.
close_data = data['Close']

# Preview the first few rows of the closing prices.
close_data.head()

Ticker,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,37.353024,24.320433,43.1562,,36.915024,18.539352,75.312714,72.339996,44.890644,38.700104,...,100.57869,24.115885,126.880753,26.153719,59.510223,33.441811,42.833294,100.714745,77.43,40.157524
2015-01-05,36.653099,23.635283,42.344044,,36.92326,18.428413,74.041122,71.980003,44.074306,37.361,...,99.856247,24.115885,124.721245,25.856848,57.881924,31.360485,41.96302,104.471764,76.339996,39.916451
2015-01-06,36.082123,23.637512,42.134453,,36.503956,18.469618,73.507065,70.529999,43.039734,36.624504,...,99.35955,24.089167,122.060829,25.994425,57.57423,31.176058,41.447952,103.586159,75.790001,39.527023
2015-01-07,36.561012,23.968962,43.837379,,36.799919,18.577387,75.049927,71.110001,43.492352,37.175011,...,100.781853,24.162632,125.645515,26.218884,58.157578,31.421953,42.821442,106.162422,77.720001,40.342964
2015-01-08,37.656921,24.8899,44.29586,,37.556301,18.900694,76.194382,72.919998,44.260201,36.631939,...,101.933258,24.423088,127.157097,26.523008,59.125587,31.659061,43.567413,107.289536,79.379997,40.964199


In [8]:
# Drop every ticker (column) that has at least one missing price.
# The result is rebound instead of using inplace=True: close_data was sliced out
# of `data`, and mutating that slice in place is what triggers pandas'
# SettingWithCopyWarning. Rebinding also keeps this cell safe to re-run.
close_data = close_data.dropna(axis=1)

close_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  close_data.dropna(axis=1, inplace=True)


Ticker,A,AAPL,ABBV,ABT,ACGL,ACN,ADBE,ADI,ADM,ADP,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-02,37.353024,24.320433,43.1562,36.915024,18.539352,75.312714,72.339996,44.890644,38.700104,66.660553,...,100.57869,24.115885,126.880753,26.153719,59.510223,33.441811,42.833294,100.714745,77.43,40.157524
2015-01-05,36.653099,23.635283,42.344044,36.92326,18.428413,74.041122,71.980003,44.074306,37.361,66.612541,...,99.856247,24.115885,124.721245,25.856848,57.881924,31.360485,41.96302,104.471764,76.339996,39.916451
2015-01-06,36.082123,23.637512,42.134453,36.503956,18.469618,73.507065,70.529999,43.039734,36.624504,66.06031,...,99.35955,24.089167,122.060829,25.994425,57.57423,31.176058,41.447952,103.586159,75.790001,39.527023
2015-01-07,36.561012,23.968962,43.837379,36.799919,18.577387,75.049927,71.110001,43.492352,37.175011,66.636528,...,100.781853,24.162632,125.645515,26.218884,58.157578,31.421953,42.821442,106.162422,77.720001,40.342964
2015-01-08,37.656921,24.8899,44.29586,37.556301,18.900694,76.194382,72.919998,44.260201,36.631939,68.165192,...,101.933258,24.423088,127.157097,26.523008,59.125587,31.659061,43.567413,107.289536,79.379997,40.964199


In [9]:
# Persist the cleaned closing prices. The target directory is created first so
# the export also succeeds when ./data does not exist yet (e.g. a fresh checkout).
output_path = Path('./data/Data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
close_data.to_csv(output_path)

In [10]:
# The column labels of the cleaned frame are the tickers that were kept.
stock_tickers = close_data.keys()

stock_tickers

Index(['A', 'AAPL', 'ABBV', 'ABT', 'ACGL', 'ACN', 'ADBE', 'ADI', 'ADM', 'ADP',
       ...
       'WTW', 'WY', 'WYNN', 'XEL', 'XOM', 'XYL', 'YUM', 'ZBH', 'ZBRA', 'ZTS'],
      dtype='object', name='Ticker', length=470)