In [1]:
import warnings
# Silence every warning category for the rest of the session. This also hides
# deprecation notices emitted by the libraries used in the cells below.
warnings.filterwarnings('ignore')

In [10]:
from pathlib import Path
import requests
from io import BytesIO
from zipfile import ZipFile, BadZipFile

import numpy as np
import pandas as pd
import pandas_datareader.data as web
from sklearn.datasets import fetch_openml

# Print wide DataFrame reprs without wrapping the columns across several blocks.
pd.set_option('display.expand_frame_repr', False)

In [12]:
# Path of the HDF5 file that the cells below write each dataset into
# (relative, so it is resolved against the current working directory).
DATA_STORE = Path('assets.h5')

### Quandl Wiki Prices

In [13]:

# Load the wiki price history from 'wiki_prices.csv', parse the 'date' column,
# index the frame by (date, ticker) and sort it by that index.
# `infer_datetime_format` is no longer passed: it is deprecated since pandas 2.0,
# where a consistent date format is inferred by default.
df = (pd.read_csv('wiki_prices.csv',
                  parse_dates=['date'],
                  index_col=['date', 'ticker'])
      .sort_index())

In [15]:
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 15389314 entries, (Timestamp('1962-01-02 00:00:00'), 'ARNC') to (Timestamp('2018-03-27 00:00:00'), 'ZUMZ')
Data columns (total 12 columns):
 #   Column       Dtype  
---  ------       -----  
 0   open         float64
 1   high         float64
 2   low          float64
 3   close        float64
 4   volume       float64
 5   ex-dividend  float64
 6   split_ratio  float64
 7   adj_open     float64
 8   adj_high     float64
 9   adj_low      float64
 10  adj_close    float64
 11  adj_volume   float64
dtypes: float64(12)
memory usage: 1.4+ GB
None


In [19]:
# Convert to the fast HDF format: write the price frame into the store
# under the 'quandl/wiki/prices' key.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put(key='quandl/wiki/prices', value=df)

### Wiki Prices Metadata

In [22]:
# Read the wiki ticker metadata, show its summary and save it to the store.
df = pd.read_csv('wiki_stocks.csv')
# info() prints the summary itself and returns None, so it is not wrapped in
# print(), which would only add a stray "None" line.
df.info()
with pd.HDFStore(DATA_STORE) as store:
    store.put('quandl/wiki/stocks', df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3199 entries, 0 to 3198
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   code    3199 non-null   object
 1   name    3199 non-null   object
dtypes: object(2)
memory usage: 50.1+ KB
None


### S&P500 prices

In [23]:
# Download the 'SP500' series from FRED and keep it as a one-column frame
# named 'close'.
# NOTE(review): start=2009 is requested, but the recorded summary begins at
# 2014-09-02 and has fewer non-null values than rows — presumably the source
# serves a limited history and includes non-trading days; confirm.
df = web.DataReader(name='SP500', data_source='fred', start=2009).squeeze().to_frame('close')
# info() prints the summary itself and returns None, so it is not wrapped in
# print(), which would only add a stray "None" line.
df.info()
with pd.HDFStore(DATA_STORE) as store:
    store.put('sp500/fred', df)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2609 entries, 2014-09-02 to 2024-08-30
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   close   2517 non-null   float64
dtypes: float64(1)
memory usage: 40.8 KB
None


In [27]:
# Load the S&P 500 history from '^spx_d.csv' and keep 1950 through 2019
# (label slicing on the DatetimeIndex includes the whole end year);
# column names are lower-cased. The frame is written to the store in the
# next cell.
sp500_stooq = (pd.read_csv('^spx_d.csv', index_col=0, parse_dates=True)
               .loc['1950':'2019']
               .rename(columns=str.lower))
# info() prints the summary itself and returns None, so it is not wrapped in
# print(), which would only add a stray "None" line.
sp500_stooq.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17700 entries, 1950-01-03 to 2019-12-31
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   open    17700 non-null  float64
 1   high    17700 non-null  float64
 2   low     17700 non-null  float64
 3   close   17700 non-null  float64
 4   volume  17700 non-null  int64  
dtypes: float64(4), int64(1)
memory usage: 829.7 KB
None


In [28]:
# Save the 1950-2019 index history under the 'sp500/stooq' key.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put(key='sp500/stooq', value=sp500_stooq)

### S&P500 Constituents

In [29]:
# Parse the HTML tables on the Wikipedia constituents page, using the first
# row of each table as its header, and keep the first table found.
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
tables = pd.read_html(url, header=0)
df = tables[0]

In [30]:
# Preview the first rows of the scraped table to check its column headers.
df.head()

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989


In [33]:
# Map the scraped headers to snake_case names by header name rather than by
# position, so a reordered or extended source table cannot silently mislabel a
# column. Selecting the mapped headers first raises a KeyError if one is missing.
column_names = {'Symbol': 'ticker',
                'Security': 'name',
                'GICS Sector': 'gics_sector',
                'GICS Sub-Industry': 'gics_sub_industry',
                'Headquarters Location': 'location',
                'Date added': 'first_added',
                'CIK': 'cik',
                'Founded': 'founded'}
df = (df.loc[:, list(column_names)]
      .rename(columns=column_names)
      .set_index('ticker'))

In [35]:
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 503 entries, MMM to ZTS
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   name               503 non-null    object
 1   gics_sector        503 non-null    object
 2   gics_sub_industry  503 non-null    object
 3   location           503 non-null    object
 4   first_added        503 non-null    object
 5   cik                503 non-null    int64 
 6   founded            503 non-null    object
dtypes: int64(1), object(6)
memory usage: 31.4+ KB
None


In [36]:
# Save the constituents table under the 'sp500/stocks' key.
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put(key='sp500/stocks', value=df)

### Metadata on US-traded companies

In [38]:
# Read one listing file per exchange with pandas' default CSV settings.
NASDAQ, AMEX, NYSE = map(pd.read_csv, ("NASDAQ.csv", "AMEX.csv", "NYSE.csv"))

In [45]:
# Stack the three exchange listings, then discard columns that contain no
# values at all. Each file's own row index is kept as-is.
listings = pd.concat([NASDAQ, AMEX, NYSE])
df = listings.dropna(axis="columns", how="all")

In [46]:
# Display the combined listings (pandas shortens the view to the first and last rows).
df

Unnamed: 0,Symbol,Name,Last Sale,Net Change,% Change,Market Cap,Country,IPO Year,Volume,Sector,Industry
0,AACG,ATA Creativity Global American Depositary Shares,$0.59,-0.0022,-0.371%,1.888008e+07,China,2008.0,6116,Real Estate,Other Consumer Services
1,AADI,Aadi Bioscience Inc. Common Stock,$1.67,-0.1400,-7.735%,4.110677e+07,United States,,29877,Health Care,Biotechnology: Pharmaceutical Preparations
2,AAGR,African Agriculture Holdings Inc. Common Stock,$0.138,0.0050,3.759%,7.985623e+06,United States,2021.0,31035,Consumer Staples,Farming/Seeds/Milling
3,AAGRW,African Agriculture Holdings Inc. Warrant,$0.009,0.0000,0.00%,5.208010e+05,United States,2021.0,2600,Consumer Staples,Farming/Seeds/Milling
4,AAL,American Airlines Group Inc. Common Stock,$10.61,-0.0100,-0.094%,6.967629e+09,United States,,3326786,Consumer Discretionary,Air Freight/Delivery Services
...,...,...,...,...,...,...,...,...,...,...,...
2775,ZTR,Virtus Total Return Fund Inc.,$5.9097,-0.0166,-0.28%,0.000000e+00,United States,1988.0,36752,Finance,Investment Managers
2776,ZTS,Zoetis Inc. Class A Common Stock,$184.295,0.8050,0.439%,8.349503e+10,United States,2013.0,529044,Health Care,Biotechnology: Pharmaceutical Preparations
2777,ZUO,Zuora Inc. Class A Common Stock,$8.64,-0.1500,-1.706%,1.308960e+09,United States,2018.0,146903,Technology,Computer Software: Prepackaged Software
2778,ZVIA,Zevia PBC Class A Common Stock,$0.9801,-0.0299,-2.96%,7.126562e+07,United States,2021.0,67401,Consumer Staples,Beverages (Production/Distribution)


In [48]:
# Lower-case every column name, then use the ticker symbol as the index.
lowercased = df.rename(columns=str.lower)
df = lowercased.set_index('symbol')

In [52]:
# Remove the space from the market-cap column name so it can be used as an attribute.
df = df.rename({"market cap": "marketcap"}, axis="columns")

In [53]:
# Preview the first rows after re-indexing by symbol and renaming the columns.
df.head()

Unnamed: 0_level_0,name,last sale,net change,% change,marketcap,country,ipo year,volume,sector,industry
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AACG,ATA Creativity Global American Depositary Shares,$0.59,-0.0022,-0.371%,18880080.0,China,2008.0,6116,Real Estate,Other Consumer Services
AADI,Aadi Bioscience Inc. Common Stock,$1.67,-0.14,-7.735%,41106770.0,United States,,29877,Health Care,Biotechnology: Pharmaceutical Preparations
AAGR,African Agriculture Holdings Inc. Common Stock,$0.138,0.005,3.759%,7985623.0,United States,2021.0,31035,Consumer Staples,Farming/Seeds/Milling
AAGRW,African Agriculture Holdings Inc. Warrant,$0.009,0.0,0.00%,520801.0,United States,2021.0,2600,Consumer Staples,Farming/Seeds/Milling
AAL,American Airlines Group Inc. Common Stock,$10.61,-0.01,-0.094%,6967629000.0,United States,,3326786,Consumer Discretionary,Air Freight/Delivery Services


In [54]:
# Keep only the first row for each symbol; duplicated() marks every later
# repeat of an index label.
is_repeat = df.index.duplicated()
df = df.loc[~is_repeat]

In [55]:
# Display the de-duplicated listings (pandas shortens the view to the first and last rows).
df

Unnamed: 0_level_0,name,last sale,net change,% change,marketcap,country,ipo year,volume,sector,industry
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AACG,ATA Creativity Global American Depositary Shares,$0.59,-0.0022,-0.371%,1.888008e+07,China,2008.0,6116,Real Estate,Other Consumer Services
AADI,Aadi Bioscience Inc. Common Stock,$1.67,-0.1400,-7.735%,4.110677e+07,United States,,29877,Health Care,Biotechnology: Pharmaceutical Preparations
AAGR,African Agriculture Holdings Inc. Common Stock,$0.138,0.0050,3.759%,7.985623e+06,United States,2021.0,31035,Consumer Staples,Farming/Seeds/Milling
AAGRW,African Agriculture Holdings Inc. Warrant,$0.009,0.0000,0.00%,5.208010e+05,United States,2021.0,2600,Consumer Staples,Farming/Seeds/Milling
AAL,American Airlines Group Inc. Common Stock,$10.61,-0.0100,-0.094%,6.967629e+09,United States,,3326786,Consumer Discretionary,Air Freight/Delivery Services
...,...,...,...,...,...,...,...,...,...,...
ZTR,Virtus Total Return Fund Inc.,$5.9097,-0.0166,-0.28%,0.000000e+00,United States,1988.0,36752,Finance,Investment Managers
ZTS,Zoetis Inc. Class A Common Stock,$184.295,0.8050,0.439%,8.349503e+10,United States,2013.0,529044,Health Care,Biotechnology: Pharmaceutical Preparations
ZUO,Zuora Inc. Class A Common Stock,$8.64,-0.1500,-1.706%,1.308960e+09,United States,2018.0,146903,Technology,Computer Software: Prepackaged Software
ZVIA,Zevia PBC Class A Common Stock,$0.9801,-0.0299,-2.96%,7.126562e+07,United States,2021.0,67401,Consumer Staples,Beverages (Production/Distribution)


In [56]:
# Summarise market capitalisation at every decile (10% ... 90%) and render
# each statistic as a whole number with thousands separators.
deciles = np.arange(.1, 1, .1).round(1)
marketcap_summary = df.marketcap.describe(percentiles=deciles)
marketcap_summary.apply(lambda stat: f'{int(stat):,d}')

count                6,576
mean        11,443,373,944
std         89,906,062,838
min                      0
10%                714,453
20%             16,022,801
30%             68,346,219
40%            202,630,401
50%            468,883,375
60%          1,022,424,394
70%          2,183,794,754
80%          5,086,933,014
90%         15,876,886,780
max      3,425,492,066,100
Name: marketcap, dtype: object

In [57]:
# Load the equity metadata from 'us_equities_meta_data.csv' and print its summary.
# NOTE(review): this rebinds `df`, so the frame assembled from the NASDAQ, AMEX
# and NYSE files in the preceding cells is not the one written to the store in
# the next cell — confirm that this is intended.
df = pd.read_csv('us_equities_meta_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6834 entries, 0 to 6833
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ticker     6834 non-null   object 
 1   name       6834 non-null   object 
 2   lastsale   6718 non-null   float64
 3   marketcap  5766 non-null   float64
 4   ipoyear    3038 non-null   float64
 5   sector     5288 non-null   object 
 6   industry   5288 non-null   object 
dtypes: float64(3), object(4)
memory usage: 373.9+ KB


In [58]:
# Index the metadata by ticker and save it under the 'us_equities/stocks' key.
equities_by_ticker = df.set_index('ticker')
with pd.HDFStore(DATA_STORE) as hdf:
    hdf.put(key='us_equities/stocks', value=equities_by_ticker)

### MNIST Data

In [59]:
# Fetch version 1 of the 'mnist_784' dataset through scikit-learn's OpenML loader.
mnist = fetch_openml('mnist_784', version=1)

In [60]:
# Print the description text that comes with the downloaded dataset.
print(mnist.DESCR)

**Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges  
**Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown  
**Please cite**:  

The MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  

It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image b

In [61]:
# List the fields available on the returned dataset object.
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [62]:
# Directory that receives the MNIST arrays saved in the next cell.
mnist_path = Path('mnist')
# exist_ok=True replaces the separate exists() check: the cell stays safe to
# re-run, and a non-directory entry at this path raises an error instead of
# being silently accepted.
mnist_path.mkdir(exist_ok=True)

In [63]:
# Write the pixel data and the targets as uint8 arrays into mnist_path;
# np.save appends the '.npy' suffix to each file name.
for stem, values in (('data', mnist.data), ('labels', mnist.target)):
    np.save(mnist_path / stem, values.astype(np.uint8))

### Fashion MNIST Image Data

In [64]:
# Fetch the 'Fashion-MNIST' dataset through scikit-learn's OpenML loader.
# NOTE(review): no `version` is passed here, unlike the 'mnist_784' fetch,
# which pins version=1 — consider pinning for reproducibility; confirm which
# version is wanted.
fashion_mnist = fetch_openml(name='Fashion-MNIST')

In [65]:
# Print the description text that comes with the downloaded dataset.
print(fashion_mnist.DESCR)

**Author**: Han Xiao, Kashif Rasul, Roland Vollgraf  
**Source**: [Zalando Research](https://github.com/zalandoresearch/fashion-mnist)  
**Please cite**: Han Xiao and Kashif Rasul and Roland Vollgraf, Fashion-MNIST: a Novel Image Dataset for Benchmarking Machine Learning Algorithms, arXiv, cs.LG/1708.07747  

Fashion-MNIST is a dataset of Zalando's article images, consisting of a training set of 60,000 examples and a test set of 10,000 examples. Each example is a 28x28 grayscale image, associated with a label from 10 classes. Fashion-MNIST is intended to serve as a direct drop-in replacement for the original MNIST dataset for benchmarking machine learning algorithms. It shares the same image size and structure of training and testing splits. 

Raw data available at: https://github.com/zalandoresearch/fashion-mnist

### Target classes
Each training and test example is assigned to one of the following labels:
Label  Description  
0  T-shirt/top  
1  Trouser  
2  Pullover  
3  Dress  
4  

In [66]:
# Map each numeric class label (0-9) to its clothing category; the position
# in the list is the label.
label_dict = dict(enumerate([
    'T-shirt/top',
    'Trouser',
    'Pullover',
    'Dress',
    'Coat',
    'Sandal',
    'Shirt',
    'Sneaker',
    'Bag',
    'Ankle boot',
]))

In [67]:
# Directory that receives the Fashion-MNIST label file and arrays saved below.
fashion_path = Path('fashion_mnist')
# exist_ok=True replaces the separate exists() check: the cell stays safe to
# re-run, and a non-directory entry at this path raises an error instead of
# being silently accepted.
fashion_path.mkdir(exist_ok=True)

In [68]:
# Write the class names one per line, in label order, with neither the index
# nor a header row. `header` is documented as bool or list of str, so False
# is passed rather than None (both suppress the header).
pd.Series(label_dict).to_csv(fashion_path / 'label_dict.csv', index=False, header=False)

In [69]:
# Write the pixel data and the targets as uint8 arrays into fashion_path;
# np.save appends the '.npy' suffix to each file name.
for stem, values in (('data', fashion_mnist.data), ('labels', fashion_mnist.target)):
    np.save(fashion_path / stem, values.astype(np.uint8))