The output of this file is a folder with csv data files for each stock. The list of stocks comes from an external file.

Do not run all the notebook in one go: Check whether you need to download stock data for the first time or update existing stock data

## Load relevant libraries

In [5]:
# Data management
import pandas as pd

# For managing files
import os
from os import listdir
from os.path import isfile, join
import time
from datetime import date

# For webscrapping yahoo finance
import yfinance as yf # need to read what this does and how
# https://github.com/ranaroussi/yfinance

## Constants

In [7]:
# Working directory
PATH = "D:\\webScrapping\\derekBanasTutorial\\" # change accordingly
# Folder to download stocks, create one if it doesn't exist
folder = PATH + "wilshire_stocks_weekly\\"
if not os.path.exists(folder): # may return false if permission is not granted
    os.mkdir(folder)

today = str(date.today()) # today's date in year-month-day format as string
period = '5y' # Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max 
interval = '1wk' # Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo

## Define some useful functions

In [9]:
def get_column_from_csv(file, col_name):
    '''
    Parameters
    ----------
    file : string -> path to file
    col_name : string -> column name

    Returns dataframe of column from csv file
    '''
    try:
        df = pd.read_csv(file)
    except FileNotFoundError:
        print("File not found")
    else:
        return df[col_name]

def get_stock_df_from_csv(ticker):
    try:
        df = pd.read_csv(folder + ticker + '.csv', index_col=0)
        
        # Check if df has duplicate indexes
        # Sometimes yahoo data comes duplicated
        if not df.index.is_unique:
            df = df.loc[~df.index.duplicated(), :] # ~ is the "invert" or "complement" operation
            # is the bitwise complement operator in python which essentially calculates -x - 1

        # Check if df has nan in 'Close' price and delete the whole row
        df.dropna(subset=['Close'])
            
    except FileNotFoundError:
        print("File doesn't exist!")
    else:
        return df        

## Get tickers from an external file

In [4]:
# These will be saved as a list of strings
tickers = get_column_from_csv(PATH + 'Wilshire-5000-Stocks.csv', 'Ticker')
# Wilshire: index of all equities that are actively traded in the United States
ntickers = len(tickers)
print('There are: ',  ntickers, 'tickers.')

There are:  3481 tickers.


## Function to download those tickers data from Yahoo finance for the first time

In [5]:
# Some stocks may be delisted
stocks_not_downloaded = []

def save_to_csv_from_yahoo(folder, ticker, period, interval):
    '''
    folder: string -> path where to save file
    ticker: string
    period: string -> Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max 
    interval: string - > Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
    Intraday data cannot extend last 60 days

    Downloads stock data from yahoo finance and saves it in a ticker.csv file in folder
    '''
    stock = yf.Ticker(ticker)
    # yahoo finance datetimes are received as UTC.
    
    try:
        print("Get data for: ", ticker)
        # Get historical closing price data
        df = stock.history(period=period, interval=interval)
        # 5 years is enough (medium term) maybe even 2, depends what you want to do
        
        # Wait a prudent time
        time.sleep(2) # in seconds
        
        # If there is no data from Yahoo
        if df.empty:
            stocks_not_downloaded.append(ticker)
        
        # Remove a possible period in the file name with a _
        the_file = folder + ticker.replace(".", "_") + '.csv'
        print(the_file, " saved!")
        df.to_csv(the_file)
    except Exception as ex:
        stocks_not_downloaded.append(ticker)
        # err_msg.append(ex)
        print("Could not get data for " + ticker)

## Get those files in batches for the first time

In [6]:
batch = 500

# since 0 % 500 is 0, this loop begins by waiting 20 seconds
for i in range(ntickers):
    if i % batch != 0:
        save_to_csv_from_yahoo(folder, tickers[i], period, interval)
    else:
        print("Finished ", i," !!")
        print("==============================================================")
        time.sleep(20)
print("Finished All!")

Finished  0  !!
Get data for:  AA
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AA.csv  saved!
Get data for:  AAL
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AAL.csv  saved!
Get data for:  AAME
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AAME.csv  saved!
Get data for:  AAN
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AAN.csv  saved!
Get data for:  AAOI
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AAOI.csv  saved!
Get data for:  AAON
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AAON.csv  saved!
Get data for:  AAP
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AAP.csv  saved!
Get data for:  AAPL
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AAPL.csv  saved!
Get data for:  AAT
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AAT.csv  saved!
Get data for:  AAWW
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\AAWW.csv  saved!
Get data for:  AAXN
- AAXN: No data found, sym

## Delete empty files
Some times empty files or files with not enough data (< 16 rows) are downloaded, delete those

In [None]:
# Get tickers from downloaded stock files
files = [f for f in listdir(folder) if isfile(join(folder, f))] 
downloaded_tickers = [os.path.splitext(f)[0] for f in files] # split file extension. List of strings

for file in files:
    try:
        if os.path.getsize(folder+'\\'+file) < 2000: # 2 kB ~ 16 rows
            stocks_not_downloaded.append(os.path.splitext(file)[0])
            os.remove(folder+'\\'+file)
    except OSError as ose:
        print(ose)

print(len(stocks_not_downloaded), " stocks not downloaded.")
print(stocks_not_downloaded)        

## Update those downloaded files (only scrap what's necesary)
First read which tickers are in the folder and read their last date

In [11]:
files = [x for x in listdir(folder) if isfile(join(folder, x))] 
tickers = [os.path.splitext(x)[0] for x in files] # split file extension 

for file in files:
    try:
        last_date = get_column_from_csv(folder+file, "Date").tail(1).tolist()[0] # as string
        ticker = os.path.splitext(file)[0]
        stock = yf.Ticker(ticker)
        
        # Scrapping
        print('Updating: ', ticker)
        df = stock.history(interval = interval, start=last_date, end=today)

        # If there is no data from Yahoo don't try to concatenate
        if not df.empty:
            # Concatenate with previous dataframe
            df2 = pd.concat([get_stock_df_from_csv(ticker), df])

        # Wait a prudent time
        time.sleep(2) # in seconds
                
        # Remove a possible period in the file name with a _
        the_file = folder + ticker.replace(".", "_") + '.csv' # a string
        print(the_file, " saved!")
        df2.to_csv(the_file)

    except Exception as ex:
        print(ex)
        

Updating:  AA
unconverted data remains:  00:00:00
Updating:  AAL
unconverted data remains:  00:00:00
Updating:  AAME
unconverted data remains:  00:00:00
Updating:  AAN
unconverted data remains:  00:00:00
Updating:  AAOI
unconverted data remains:  00:00:00
Updating:  AAON
unconverted data remains:  00:00:00
Updating:  AAP
unconverted data remains:  00:00:00
Updating:  AAPL
unconverted data remains:  00:00:00
Updating:  AAT
unconverted data remains:  00:00:00
Updating:  AAWW
unconverted data remains:  00:00:00
Updating:  ABBV
unconverted data remains:  00:00:00
Updating:  ABC
unconverted data remains:  00:00:00
Updating:  ABCB
unconverted data remains:  00:00:00
Updating:  ABEO
unconverted data remains:  00:00:00
Updating:  ABG
unconverted data remains:  00:00:00
Updating:  ABIO
unconverted data remains:  00:00:00
Updating:  ABM
unconverted data remains:  00:00:00
Updating:  ABMD
D:\webScrapping\derekBanasTutorial\wilshire_stocks_weekly\ABMD.csv  saved!
Updating:  ABR
D:\webScrapping\der