# Import

In [1]:
# Required version of fix_yahoo_finance is 0.021
from os import listdir
import time
import numpy as np
import pandas as pd
import fix_yahoo_finance as yf
import warnings
warnings.filterwarnings('ignore')

# Download & Read & Clean All U.S. Equity Tickers from Database

In [2]:
# Load the entities export (entities.csv from Entities Manager) and preview its first rows.
all_us_tickers = pd.read_csv(filepath_or_buffer="us_entities_database.csv")
all_us_tickers.head(n=5)

Unnamed: 0,entity_share_class_figi,entity_ticker
0,BBG001S16908,HCHC
1,BBG001S169P1,MPC
2,BBG001S26WK3,ATE
3,BBG001S3DVN2,MEMP
4,BBG001S3RCX8,CFCB


In [3]:
# Tickers in our database may contain "-" or digits; rows flagged here are
# the ones to exclude.
def non_letter(x):
    """Return 1 when x["entity_ticker"] contains "-" or any digit, else 0.

    x is a row-like object that supports x["entity_ticker"] and yields a string.
    """
    symbol = x["entity_ticker"]
    return 1 if "-" in symbol or any(ch.isdigit() for ch in symbol) else 0
# Flag every row with non_letter (1 = ticker contains "-" or a digit) and
# keep only the rows flagged 0.
all_us_tickers["non_letter"] = all_us_tickers.apply(non_letter, axis=1)
all_us_tickers = all_us_tickers.loc[all_us_tickers["non_letter"] == 0]

# Rewrite "/" in a ticker as "-".
def process_dot(x):
    """Return x["entity_ticker"] with every "/" replaced by "-".

    A ticker without "/" is returned unchanged.
    """
    symbol = x["entity_ticker"]
    return symbol.replace("/", "-") if "/" in symbol else symbol
    
# Apply the "/" -> "-" rewrite to every ticker, drop the helper flag column,
# and order the rows by ticker with a fresh 0..n-1 index.
all_us_tickers["entity_ticker"] = all_us_tickers.apply(process_dot, axis=1)
all_us_tickers = (
    all_us_tickers
    .drop(columns=["non_letter"])
    .sort_values("entity_ticker")
    .reset_index(drop=True)
)

# Persist the cleaned ticker table.
all_us_tickers.to_csv("all_us_tickers.csv", index=False)

# Use `fix_yahoo_finance` to Download Price Data

In [4]:
# Date window handed to yf.download.
start_date = "2005-01-01"
end_date = "2019-05-15"

# Retry policy of the download loop: maximum attempts per batch and the
# number of seconds slept after each attempt.
PRICE_MAX_RETRY = 5
PRICE_MAX_SLEEP = 5

# Forwarded to yf.download as its `progress` argument.
YF_VERBOSE = False

# Ticker symbols to download, as a plain Python list.
ticker_list = all_us_tickers["entity_ticker"].tolist()

In [5]:
# Number of tickers queued for download.
len(ticker_list)

7368

In [None]:
# Download daily prices batch by batch. Within a batch, only the tickers that
# are still missing are re-requested, up to PRICE_MAX_RETRY attempts, and
# whatever was collected is written to ./us_daily_price/batch_<n>.csv.
BATCH_SIZE = 100
# Ceiling division so that a trailing partial batch is not dropped.
n_batches = (len(ticker_list) + BATCH_SIZE - 1) // BATCH_SIZE
# To re-download only specific batches, narrow this range (e.g. range(12, 13)).
for batch in range(1, n_batches + 1):
    tickers = ticker_list[(batch - 1) * BATCH_SIZE:batch * BATCH_SIZE]
    df_price = pd.DataFrame()
    retry = 0
    while tickers and retry < PRICE_MAX_RETRY:
        print("batch: " + str(batch) + " - Trial no. " + str(retry+1) +
              ", " + str(len(tickers)) + " tickers left...")
        # NOTE (from the original author): data returned excludes end_date.
        price = pd.DataFrame()
        success_tickers = []
        try:
            pan = yf.download(
                tickers,
                start_date,
                end_date,
                progress=YF_VERBOSE
            )
            if isinstance(pan, pd.core.panel.Panel):
                # Long format: one row per (major, minor) pair, row order reversed.
                price = pan.to_frame().iloc[::-1].reset_index()
            else:
                # Non-Panel result: the frame carries no ticker column, so it is
                # tagged with the first requested ticker (presumably the
                # single-ticker case -- confirm against the yf version in use).
                pan["minor"] = tickers[0]
                price = pan.copy()
                price.reset_index(drop=False, inplace=True)
            success_tickers = price["minor"].unique().tolist()

            if not price.empty:
                print("batch: " + str(batch) + " - Basic data transformation")
                price = price.rename(columns={
                    # The loading cell parses the date column under the name
                    # "major" (the name the Panel branch produces). Give the
                    # non-Panel branch's "Date" column the same name so a batch
                    # file never ends up with two different date columns.
                    "Date": "major",
                    "minor": "yahoo_quote",
                    "High": "high",
                    "Low": "low",
                    "Open": "open",
                    "Volume": "volume",
                    "Adj Close": "adj_close",
                    "Close": "close"
                })
                df_price = df_price.append(price)
        except Exception as err:  # pylint: disable=broad-except
            # Best effort: keep going, but report the reason instead of hiding it,
            # otherwise a systematic failure looks like "0 tickers succeeded".
            print("batch: " + str(batch) + " - download failed: " + repr(err))
            success_tickers = []

        print("batch: " + str(batch) + " - success rate: " +
              str(len(success_tickers)) + " / " + str(len(tickers)) + "\n")

        # Everything requested in this attempt that did not come back is retried.
        failed_tickers = sorted(set(tickers) - set(success_tickers))

        time.sleep(int(PRICE_MAX_SLEEP))
        tickers = failed_tickers
        # TODO: Test if it makes YF download better
        np.random.shuffle(tickers)
        retry += 1
    print("Saving batch " + str(batch) + " to csv\n")
    df_price.to_csv("./us_daily_price/batch_" + str(batch) + ".csv", index=False)

batch: 1 - Trial no. 1, 100 tickers left...


Review the data after the download. If a batch file is too small, identify which batch it belongs to, go back to the code above, change the range so that it covers that batch, and run it again. Repeat this process 4-5 times until all of the data that we need has been downloaded.

## Data Transformation

### Load All Downloaded CSV Files

In [14]:
# Folder holding the per-batch price files, and the names of the CSV files in it.
data_path = "./us_daily_price/"
csv_files = [entry for entry in listdir(data_path) if entry.endswith(".csv")]

In [15]:
# Read every batch CSV (parsing the "major" column as dates) and combine them
# with a single concat. Appending inside the loop would re-copy all previously
# loaded rows on every iteration.
price_frames = []
for filename in csv_files:
    price = pd.read_csv(data_path + filename, parse_dates=["major"])
    price_frames.append(price)
# pd.concat raises on an empty list, so fall back to an empty frame when no
# CSV file was found.
all_price = pd.concat(price_frames) if price_frames else pd.DataFrame()

In [16]:
# Drop exact duplicate rows, switch to the final column names, and order the
# rows by ticker and then date with a fresh 0..n-1 index.
all_price = (
    all_price
    .drop_duplicates()
    .rename(columns={"major": "date", "yahoo_quote": "ticker"})
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
)

In [59]:
# Keep only tickers whose history spans the whole window: earliest record on
# 2005-01-03 and latest record on 2019-05-15.
date_by_ticker = all_price.groupby("ticker")["date"]
min_date = date_by_ticker.min().reset_index()
max_date = date_by_ticker.max().reset_index()
min_date_tickers = min_date.query("date == '2005-01-03'").reset_index(drop=True)
max_date_tickers = max_date.query("date == '2019-05-15'").reset_index(drop=True)
# A ticker must satisfy both conditions to remain.
remain_tickers = list(set(min_date_tickers["ticker"]) & set(max_date_tickers["ticker"]))

In [60]:
# Number of tickers that pass both the first-date and the last-date filter.
len(remain_tickers)

2702

In [40]:
# Price rows belonging to the tickers that passed the date-window filter.
keep_mask = all_price["ticker"].isin(remain_tickers)
filtered_price = all_price[keep_mask]

## QA on Filtered Data

In [44]:
# Count the dated records of each ticker; the full range should have 3616 records.
qa = filtered_price.groupby("ticker")["date"].count().reset_index()
# Tickers whose record count differs from 3616 are removed entirely.
remove_tickers = qa.loc[qa["date"] != 3616, "ticker"].tolist()
final_price = (
    filtered_price
    .loc[~filtered_price["ticker"].isin(remove_tickers)]
    .reset_index(drop=True)
)

In [55]:
# Number of tickers whose record count is not 3616 (expected: 0).
(final_price.groupby("ticker")["date"].count() != 3616).sum()

0

In [89]:
# Some unpopular stocks may not have any trade volume on certain days.
# A fixed random_state makes the displayed sample reproducible across runs.
final_price[final_price['volume'] == 0].sample(10, random_state=42)

Unnamed: 0,date,ticker,adj_close,close,high,low,open,volume
7899773,2014-08-27,SLCT,7.55,7.55,7.55,7.55,7.55,0.0
8418654,2007-06-01,TENX,750.0,750.0,774.0,750.0,750.0,0.0
1396881,2009-05-26,CARV,83.755394,84.150002,84.150002,84.150002,84.150002,0.0
7761551,2011-06-02,SGB,8.973354,10.57,10.57,10.57,10.57,0.0
9435829,2011-09-12,WSO-B,45.214828,56.02,56.02,56.02,56.02,0.0
935727,2016-02-17,BDR,0.44,0.44,0.44,0.44,0.44,0.0
7741112,2016-06-01,SENEB,38.099998,38.099998,38.099998,38.099998,38.099998,0.0
923036,2008-10-20,BDGE,13.745096,20.25,20.25,20.25,20.25,0.0
4474199,2009-10-19,IPB,10.570426,18.25,18.25,18.25,18.25,0.0
5301905,2008-05-19,MAMS,1.1,1.1,1.1,1.1,1.1,0.0


In [90]:
# BAC issued a massive amount of stock in 2009/12, which caused a huge volume jump.
# Series.max() is vectorised and skips NaN; the builtin max() iterates in
# Python and returns NaN when the first element is NaN.
final_price[final_price['volume'] == final_price['volume'].max()]

Unnamed: 0,date,ticker,adj_close,close,high,low,open,volume
800376,2009-12-04,BAC,14.886605,16.280001,16.4,15.62,15.76,1226791000.0


In [93]:
# The stock is still traded on the over-the-counter market at a very low price.
# Series.min() is vectorised and skips NaN; the builtin min() iterates in
# Python and returns NaN when the first element is NaN.
final_price[final_price['close'] == final_price['close'].min()]

Unnamed: 0,date,ticker,adj_close,close,high,low,open,volume
8041209,2016-04-18,SPDC,0.0001,0.0001,0.0001,0.0001,0.0001,339.0


In [94]:
# The prices look unreasonable, but we cross-checked with Bloomberg and the
# prices are real. It might be due to a reverse stock split during that time.
# A reverse stock split is a type of corporate action which consolidates the
# number of existing shares of stock into fewer, proportionally more valuable,
# shares.
#
# Series.max() is vectorised and skips NaN; the builtin max() iterates in
# Python and returns NaN when the first element is NaN.
final_price[final_price['close'] == final_price['close'].max()]

Unnamed: 0,date,ticker,adj_close,close,high,low,open,volume
8569958,2005-02-28,TOPS,8157235000.0,8157240000.0,8316000000.0,7677180000.0,7862400000.0,0.0


## Conclusion on QA
Even though some numbers of certain stocks look unreasonable, they all can be explained. As a result, we will keep all the records.

## Define Helper Functions

In [101]:
def shift_cal(df, days=[1], price_type="open", cal_type=["pct", "dir"]):
    """Add lagged-price columns and change targets to df, in place.

    For every lag in `days`, a column "<price_type>_<lag>" is added holding
    the `price_type` value shifted by `lag` rows within each "ticker" group.
    For every entry of `cal_type`, a column "<p><p>_<lag>d_<cal_type>" is
    added, where <p> is the first character of `price_type`:
      - "pct": (price - lagged price) / lagged price
      - "dir": sign of (price - lagged price)

    Returns the same df object that was passed in.
    Raises ValueError when an entry of `cal_type` is neither "pct" nor "dir".
    """
    for lag in days:
        lagged_name = "{}_{}".format(price_type, lag)
        df[lagged_name] = df.groupby("ticker")[price_type].shift(lag)
        for kind in cal_type:
            target_name = "{}_{}d_{}".format(price_type[0] * 2, lag, kind)
            if kind not in ("pct", "dir"):
                raise ValueError("The given cal_type is not defined")
            change = df[price_type] - df[lagged_name]
            if kind == "pct":
                df[target_name] = change / df[lagged_name]
            else:
                df[target_name] = np.sign(change)
    return df

### Define Shift Days

In [102]:
# Lags, in rows per ticker, passed to shift_cal as `days`.
# NOTE(review): presumably 1 day / 1 week / 1 month / 1 quarter in trading days -- confirm.
shift_days = [1, 5, 22, 66]

### OO Transformation

In [103]:
# Open prices per ticker, ordered by ticker and date.
open_price = (
    final_price[["ticker", "date", "open"]]
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
)

# Add the lagged opens and the pct / dir targets for every lag in shift_days.
open_shifted = shift_cal(open_price, days=shift_days, price_type="open", cal_type=["pct", "dir"])

# Keep the key columns plus every column whose name contains "oo",
# then sort for export.
remain_cols = [col for col in open_shifted.columns if "oo" in col]
oo_final = (
    open_shifted[["ticker", "date"] + remain_cols]
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
)

In [125]:
# One frame per OO target (ticker, date, target column), each written to
# ./us_daily_target/<target>.csv.
oo_target_cols = [
    "oo_1d_pct", "oo_1d_dir",
    "oo_5d_pct", "oo_5d_dir",
    "oo_22d_pct", "oo_22d_dir",
    "oo_66d_pct", "oo_66d_dir",
]
oo_target_frames = [oo_final[["ticker", "date", target_col]] for target_col in oo_target_cols]
(
    oo_1d_pct, oo_1d_dir,
    oo_5d_pct, oo_5d_dir,
    oo_22d_pct, oo_22d_dir,
    oo_66d_pct, oo_66d_dir,
) = oo_target_frames
for target_col, target_frame in zip(oo_target_cols, oo_target_frames):
    target_frame.to_csv("./us_daily_target/" + target_col + ".csv", index=False)

### CC Transformation

In [106]:
# Adjusted close prices per ticker, ordered by ticker and date. The column is
# exposed as "close" so shift_cal produces "close_<lag>" / "cc_..." names.
# NOTE(review): this cell uses adj_close while the OO cell uses the raw
# "open" column -- confirm that the difference is intended.
close_price = (
    final_price[["ticker", "date", "adj_close"]]
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
    .rename(columns={"adj_close": "close"})
)

# Add the lagged closes and the pct / dir targets for every lag in shift_days.
close_shifted = shift_cal(close_price, days=shift_days, price_type="close", cal_type=["pct", "dir"])

# Keep the key columns plus every column whose name contains "cc",
# then sort for export.
remain_cols = [col for col in close_shifted.columns if "cc" in col]
cc_final = (
    close_shifted[["ticker", "date"] + remain_cols]
    .sort_values(["ticker", "date"])
    .reset_index(drop=True)
)

In [111]:
# Preview the first rows of the CC targets.
cc_final.head()

Unnamed: 0,ticker,date,cc_1d_pct,cc_1d_dir,cc_5d_pct,cc_5d_dir,cc_22d_pct,cc_22d_dir,cc_66d_pct,cc_66d_dir
0,A,2005-01-03,,,,,,,,
1,A,2005-01-04,-0.026382,-1.0,,,,,,
2,A,2005-01-05,-0.00043,-1.0,,,,,,
3,A,2005-01-06,-0.021945,-1.0,,,,,,
4,A,2005-01-07,-0.00088,-1.0,,,,,,


In [117]:
# One frame per CC target (ticker, date, target column), each written to
# ./us_daily_target/<target>.csv.
cc_target_cols = [
    "cc_1d_pct", "cc_1d_dir",
    "cc_5d_pct", "cc_5d_dir",
    "cc_22d_pct", "cc_22d_dir",
    "cc_66d_pct", "cc_66d_dir",
]
cc_target_frames = [cc_final[["ticker", "date", target_col]] for target_col in cc_target_cols]
(
    cc_1d_pct, cc_1d_dir,
    cc_5d_pct, cc_5d_dir,
    cc_22d_pct, cc_22d_dir,
    cc_66d_pct, cc_66d_dir,
) = cc_target_frames
for target_col, target_frame in zip(cc_target_cols, cc_target_frames):
    target_frame.to_csv("./us_daily_target/" + target_col + ".csv", index=False)

In [107]:
# Number of distinct tickers in the OO targets.
len(oo_final.ticker.unique())

2657

In [108]:
# Number of distinct tickers in the CC targets.
len(cc_final.ticker.unique())

2657

In [126]:
# Number of distinct tickers in the full downloaded price table, before any filtering.
len(all_price.ticker.unique())

5208