In [1]:
import os
import shutil
import time

import pandas as pd
import yfinance as yf

# Cap how many DataFrame rows pandas renders in cell output.
pd.options.display.max_rows = 50

In [2]:
# Start date (ISO format) passed to yfinance when requesting price history.
FIRST_DAY = '2019-01-01'

# Ticker symbols for the DJIA universe (30 entries).
DJIA_STOCKS = '''
  AMZN AXP  AMGN AAPL BA
  CAT  CSCO CVX  GS   HD
  HON  IBM  INTC JNJ  KO
  JPM  MCD  MMM  MRK  MSFT
  NKE  PG   TRV  UNH  CRM
  VZ   V    WMT  DIS  DOW
'''.split()

# Ticker symbols for the NASDAQ-100 universe (92 entries are listed here).
NASDAQ_100_STOCKS = '''
  ADBE ABNB GOOGL GOOG AMZN AMD   AEP  AMGN ADI  ANSS
  AAPL AMAT ASML  AZN  TEAM ADSK  ADP  BKR  BIIB BKNG
  AVGO CDNS CHTR  CTAS CSCO CTSH  CMCSA CEG CPRT CSGP
  COST CRWD CSX   DDOG DXCM FANG  DLTR EA   EXC  FAST
  FTNT GILD GFS   HON  IDXX ILMN  INTC INTU ISRG KDP
  KLAC LRCX LIN   LULU MAR  MRVL  MELI META MCHP MU
  MSFT MRNA MDLZ  MNST NFLX NVDA  NXPI ORLY ODFL ON
  PCAR PANW PAYX  PYPL PEP  PDD   QCOM REGN ROST SIRI
  SBUX SNPS TSLA  KHC  TMUS VRSK  VRTX WBA  WBD  WDAY
  XEL  ZS
'''.split()

# Universe of symbols processed by the rest of the notebook.
# To use both indices instead (several tickers appear in both lists):
#   symbols = list(set(DJIA_STOCKS).union(NASDAQ_100_STOCKS))
# NOTE(review): set iteration order is not stable across runs; wrap in sorted()
# if a reproducible row order matters.
symbols = DJIA_STOCKS

We will populate the `market-modeling/data/` directory as follows:
```
\data
 |  \popular_stocks (this will be the name of the Market we create)
 |   |  \growth_timeseries
 |   |   |  TCKR1.csv (the timeseries for stock TCKR1)
 |   |   |  TCKR2.csv (the timeseries for stock TCKR2)
 |   |   |   ...
 |   |  asset_metadata.tsv (a metadata table with one row per ticker; each row has: symbol (ID), name, static covariates, hierarchy data)
``` 

In [3]:
# Output locations; ROOT_DATA_DIR is resolved relative to the working directory.
ROOT_DATA_DIR = '../data/'
mkt_dir_name = 'popular_stocks'
mkt_root_dir = os.path.join(ROOT_DATA_DIR, mkt_dir_name)
timeseries_root_dir = os.path.join(mkt_root_dir, 'growth_timeseries')

# Rebuild the market directory from scratch so files left over from a previous
# run (e.g. for symbols no longer in the list) do not linger.
if os.path.exists(mkt_root_dir):
  shutil.rmtree(mkt_root_dir)
# makedirs creates every missing level in one call, including ROOT_DATA_DIR
# itself; plain os.mkdir raises FileNotFoundError when the parent is absent
# (e.g. on a fresh checkout without a data/ directory).
os.makedirs(timeseries_root_dir)


# Fetch the Yahoo Finance info dict once per symbol; the metadata columns below
# are all read from it. A missing key raises KeyError.
ticker_infos = [yf.Ticker(symbol).info for symbol in symbols]

all_ids = list(symbols)
all_names = [info['shortName'].strip() for info in ticker_infos]
all_sectors = [info['sectorKey'] for info in ticker_infos]
all_industries = [info['industryKey'] for info in ticker_infos]

# One row per symbol, written as a tab-separated table without the index.
symbol_metadata = pd.DataFrame({
  'id': all_ids,
  'name': all_names,
  'sector': all_sectors,
  'industry': all_industries,
})
metadata_path = os.path.join(mkt_root_dir, 'asset_metadata.tsv')
symbol_metadata.to_csv(metadata_path, index=False, sep='\t')

# Write one CSV per symbol containing the daily (date, price) series.
for symbol in symbols:
  ticker = yf.Ticker(symbol)
  hist_df = (
    ticker.history(start=FIRST_DAY, interval='1d')
    .reset_index()
    # 'Open' or 'Close' are the recommended signals: the daily 'High' and
    # 'Low' prices are themselves random variables.
    .rename(columns={'Date': 'date', 'Close': 'price'})
    # yfinance returns datetimes; keep only the calendar date.
    .assign(date=lambda frame: frame['date'].dt.date)
  )
  csv_path = os.path.join(timeseries_root_dir, f'{symbol}.csv')
  hist_df[['date', 'price']].to_csv(csv_path, index=False)