The output of this notebook is a folder containing one CSV data file per stock. The list of stocks is read from an external file.

## Load relevant libraries

In [10]:
# Data management
import pandas as pd

# For managing files
import os
from os import listdir
from os.path import isfile, join
import time

# For web scraping Yahoo Finance
import yfinance as yf # need to read what this does and how
# https://github.com/ranaroussi/yfinance

## Constants

In [11]:
# Working directory
PATH = "D:\\webScrapping\\derekBanasTutorial\\" # change accordingly
# Folder to download stocks into. The trailing separator is kept because
# other cells build file paths by plain string concatenation with `folder`.
folder = PATH + "wilshire_stocks\\"
# makedirs also creates missing parent directories, and exist_ok=True makes
# re-running this cell harmless (no separate exists() check that could race).
os.makedirs(folder, exist_ok=True)

period = '2y' # Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max 
interval = '1d' # Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo

## Define some useful functions

In [3]:
def get_column_from_csv(file, col_name):
    '''
    Read a csv file and hand back a single column of it.

    Parameters
    ----------
    file : string -> path to the csv file
    col_name : string -> header of the column to extract

    Returns
    -------
    The column selected with df[col_name], or None (after printing
    "File not found") when the file does not exist.
    '''
    try:
        table = pd.read_csv(file)
    except FileNotFoundError:
        # Missing file: report it and signal the failure with None
        print("File not found")
        return None
    # A missing column is not handled here; the lookup error propagates
    return table[col_name]

## Get tickers from an external file

In [4]:
# get_column_from_csv returns the 'Ticker' column as a pandas Series,
# or None when the csv file is missing
tickers = get_column_from_csv(PATH + 'Wilshire-5000-Stocks.csv', 'Ticker')
# Wilshire: index of all equities that are actively traded in the United States
if tickers is None:
    # Stop here with a clear message instead of a confusing TypeError from len(None)
    raise FileNotFoundError("Ticker list not found: " + PATH + 'Wilshire-5000-Stocks.csv')
ntickers = len(tickers)
print('There are: ',  ntickers, 'tickers.')

There are:  3481 tickers.


## Function to download data for those tickers from Yahoo Finance

In [13]:
# Some stocks may be delisted; tickers that could not be downloaded are collected here
stocks_not_downloaded = []

def save_to_csv_from_yahoo(folder, ticker, period, interval, pause=2):
    '''
    Downloads stock data from yahoo finance and saves it in a ticker.csv file in folder

    folder: string -> path where to save file
    ticker: string
    period: string -> Valid periods: 1d,5d,1mo,3mo,6mo,1y,2y,5y,10y,ytd,max 
    interval: string - > Valid intervals: 1m,2m,5m,15m,30m,60m,90m,1h,1d,5d,1wk,1mo,3mo
    Intraday data cannot extend past the last 60 days
    pause: number -> seconds to wait after the request (default 2)

    Returns nothing. A ticker is appended to the module-level list
    stocks_not_downloaded, and no file is written for it, when Yahoo
    returns an empty table or when the download/save raises.
    '''
    # NOTE(review): the original author notes that yahoo finance datetimes
    # are received as UTC - confirm before relying on it.
    try:
        print("Get data for: ", ticker)
        # Creating the Ticker object is inside the try so that a bad ticker
        # value is recorded as a failure instead of aborting the whole run
        stock = yf.Ticker(ticker)
        # Get historical price data
        df = stock.history(period=period, interval=interval)

        # Wait a prudent time between requests
        time.sleep(pause) # in seconds

        # If there is no data from Yahoo, record it and do not write an empty file
        if df.empty:
            stocks_not_downloaded.append(ticker)
            print("No data returned for", ticker)
            return

        # Replace a possible period in the ticker with a _ for the file name
        the_file = os.path.join(folder, ticker.replace(".", "_") + '.csv')
        df.to_csv(the_file)
        # Report success only after the file has actually been written
        print(the_file, " saved!")
    except Exception as ex:
        stocks_not_downloaded.append(ticker)
        # str() keeps this message from raising if ticker is not a string
        print("Could not get data for " + str(ticker) + ": " + str(ex))

## Get those files in batches

In [None]:
batch = 500       # number of tickers to download between long pauses
batch_pause = 20  # seconds to wait after each completed batch

# Every ticker is downloaded. The long pause happens only after a full batch
# has completed, and is skipped after the very last ticker.
for done, ticker in enumerate(tickers, start=1):
    save_to_csv_from_yahoo(folder, ticker, period, interval)
    if done % batch == 0 and done < ntickers:
        print("Finished ", done," !!")
        print("==============================================================")
        time.sleep(batch_pause)
print("Finished All!")

## Delete empty files
Sometimes empty files, or files without enough data (< 16 rows), are downloaded; delete those.

In [16]:
# Files smaller than this are treated as empty / not enough data
MIN_FILE_BYTES = 2000 # 2 kB ~ 16 rows

# Regular files currently present in the download folder
files = [f for f in listdir(folder) if isfile(join(folder, f))] 

# Tickers whose files are kept (file name without extension). List of strings
downloaded_tickers = []

for file in files:
    ticker_name = os.path.splitext(file)[0] # split file extension
    # Full path: the bare file name would be resolved against the current
    # working directory instead of the download folder
    file_path = join(folder, file)
    try:
        if os.path.getsize(file_path) >= MIN_FILE_BYTES:
            downloaded_tickers.append(ticker_name)
            continue
        # Too small: record it once, then delete the file
        if ticker_name not in stocks_not_downloaded:
            stocks_not_downloaded.append(ticker_name)
        os.remove(file_path)
    except OSError as ose:
        print(ose)

print(len(stocks_not_downloaded), " stocks not downloaded.")
print(stocks_not_downloaded)        

0  stocks not downloaded.
[]
