In [1]:
import pandas as pd
import numpy as np
from pandas_datareader import data as pdr
from datetime import datetime, timedelta
from utils import status_calc

In [2]:
# Inclusive date window (ISO "YYYY-MM-DD" strings) passed as start/end to the
# price downloads later in the notebook.
START_DATE, END_DATE = "2010-01-01", "2020-11-11"

In [3]:
# Read the raw SHARADAR SF1 data downloaded from Quandl, indexed by the
# "calendardate" column (kept as plain strings; no date parsing is requested).
df_raw = pd.read_csv(r"SHARADAR_SF1.csv", index_col="calendardate")

# Keep only the quarterly statements, i.e. rows whose dimension is "ARQ".
is_ARQ = df_raw["dimension"].eq("ARQ")
df_Quarter = df_raw.loc[is_ARQ]

# Factors carried forward into the keystats table.
factor_columns = [
    "ticker",
    "eps", "netinc", "opinc", "ncfo", "cashnequsd",
    "de", "ebitda", "fcf", "marketcap", "netmargin",
    "pb", "pe", "ps", "workingcapital", "ev",
    "divyield", "fcfps", "revenue", "invcap", "ebit", "equityusd", "assets",
]
df_selected_factor = df_Quarter[factor_columns]

# Drop every row that has a null in any selected factor; dropna returns a new
# frame, so df_selected_factor itself is left untouched.
df_new = df_selected_factor.dropna(axis=0, how="any")

# Calculate the return ratios. Each one is net income divided by a different
# base (equity, total assets, invested capital).
# NOTE(review): "ebit" is selected into df_ratio but never used below — confirm
# whether roic was meant to be based on it.
df_ratio = df_new[["equityusd", "netinc", "assets", "invcap", "ebit"]]
net_income = df_ratio["netinc"]
df_roe = (net_income / df_ratio["equityusd"]).to_frame("roe")
df_roa = (net_income / df_ratio["assets"]).to_frame("roa")
df_roic = (net_income / df_ratio["invcap"]).to_frame("roic")

# Append the three ratio columns to the factor table and persist the result.
keystats_new = pd.concat([df_new, df_roe, df_roa, df_roic], axis=1)
keystats_new.to_csv("keystats_new.csv")

In [57]:
# Creates the dataset containing S&P500 prices.
# Prices are downloaded for the "SPY" ticker between START_DATE and END_DATE
# and written out unmodified.
sp500_raw_data = pdr.get_data_yahoo("SPY", start=START_DATE, end=END_DATE)
sp500_raw_data.to_csv("sp500_index.csv")

# 60-row rolling mean of every downloaded column. The first 59 rows of the
# result are NaN because the window is not yet full.
sp500_raw_data_avg = sp500_raw_data.rolling(window=60).mean()
sp500_raw_data_avg.to_csv("sp500_index_avg.csv")

In [None]:
# Creates the dataset containing all stock prices.

# Unique tickers, in order of first appearance in the keystats table.
ticker_list = keystats_new.ticker.drop_duplicates().tolist()

# Get all Adjusted Close prices for all the tickers in our list,
# between START_DATE and END_DATE.
all_data = pdr.get_data_yahoo(ticker_list, start=START_DATE, end=END_DATE)

# Remove any columns that hold no data. The result is rebound rather than
# mutated with inplace=True: "Adj Close" is a selection taken from all_data,
# and in-place edits on such a selection can trigger SettingWithCopyWarning.
stock_raw_data = all_data["Adj Close"].dropna(how="all", axis=1)

# Report the tickers for which no price column survived.
available_tickers = set(stock_raw_data.columns)
missing_tickers = [
    ticker for ticker in ticker_list if ticker.upper() not in available_tickers
]
print(f"{len(missing_tickers)} tickers are missing: \n {missing_tickers} ")

# If there are only some missing datapoints, forward fill.
stock_raw_data = stock_raw_data.ffill()
stock_raw_data.to_csv("stock_prices.csv")

# 5 business day/ week * 4 week/month * 3 month = 60-row rolling window.
stock_raw_data_avg = stock_raw_data.rolling(window=60).mean()
stock_raw_data_avg.to_csv("stock_prices_avg.csv")

In [58]:
# The sp500 and stock price datasets written earlier have no rows for days
# when the market was closed (weekends and public holidays). This cell expands
# both to one row per calendar day, so that later per-date lookups always find
# a row inside the covered range.

# Read in the averaged SP500 and stock data, parsing the "Date" index as dates.
# Note that these are the rolling-average files, not the raw price files.
sp500_raw_data = pd.read_csv("sp500_index_avg.csv", index_col="Date", parse_dates=True)
stock_raw_data = pd.read_csv("stock_prices_avg.csv", index_col="Date", parse_dates=True)

# Daily calendar spanning the first to the last date of the stock data; the
# SP500 frame is aligned to this same range.
start_date = str(stock_raw_data.index[0])
end_date = str(stock_raw_data.index[-1])
idx = pd.date_range(start=start_date, end=end_date)

# Reindexing inserts the non-trading days as NaN rows; forward filling then
# gives each of them the most recent earlier value (e.g. weekends take
# Friday's averaged value).
sp500_raw_data = sp500_raw_data.reindex(idx).ffill()
stock_raw_data = stock_raw_data.reindex(idx).ffill()

In [59]:
# Attach stock and S&P 500 prices, and their 4-week percentage changes, to
# every row of keystats_new, then keep the rows that have a usable stock price.
#
# For each row, prices are looked up in stock_raw_data / sp500_raw_data at the
# row's calendar date and at that date plus 4 weeks. If any lookup raises
# KeyError (unknown ticker column or date outside the price range), all six
# values for that row are recorded as None.

stock_price = []
sp500_price = []
stock_1m_price = []
sp500_1m_price = []
stock_p_change = []
sp500_p_change = []

# Walk the index labels and the ticker values together, by position.
# keystats_new is indexed by date strings, so the previous
# `keystats_new.ticker[i]` relied on pandas falling back from label lookup to
# positional lookup for integer keys. That fallback is deprecated; if it
# resolved as a label instead, the resulting KeyError would be caught below
# and silently blank every row.
for current_date, ticker in zip(keystats_new.index, keystats_new["ticker"].to_numpy()):
    # Index labels are "%m/%d/%Y" strings; the forward date uses the same format.
    one_month_date = datetime.strptime(current_date, "%m/%d/%Y") + timedelta(weeks=4)
    one_month_date = one_month_date.strftime("%m/%d/%Y")
    try:
        stock_price_i = float(stock_raw_data.loc[current_date, ticker])
        stock_1m_price_i = float(stock_raw_data.loc[one_month_date, ticker])
        sp500_price_i = float(sp500_raw_data.loc[current_date, "Adj Close"])
        sp500_1m_price_i = float(sp500_raw_data.loc[one_month_date, "Adj Close"])
        # Percentage change over the 4-week horizon, rounded to 2 decimals.
        stock_p_change_i = round(
            (stock_1m_price_i - stock_price_i) / stock_price_i * 100, 2
        )
        sp500_p_change_i = round(
            (sp500_1m_price_i - sp500_price_i) / sp500_price_i * 100, 2
        )
    except KeyError:
        stock_price_i = None
        stock_1m_price_i = None
        sp500_price_i = None
        sp500_1m_price_i = None
        stock_p_change_i = None
        sp500_p_change_i = None
    stock_price.append(stock_price_i)
    stock_1m_price.append(stock_1m_price_i)
    sp500_price.append(sp500_price_i)
    sp500_1m_price.append(sp500_1m_price_i)
    stock_p_change.append(stock_p_change_i)
    sp500_p_change.append(sp500_p_change_i)

# NOTE(review): the 4-week-ahead price lists (stock_1m_price, sp500_1m_price)
# are collected but not stored as columns — confirm that is intended.
keystats_new["stock_price"] = stock_price
keystats_new["sp500_price"] = sp500_price
keystats_new["stock_p_change"] = stock_p_change
keystats_new["sp500_p_change"] = sp500_p_change

# Keep only the rows with both a stock price and a stock price change.
keystats_to_split = keystats_new.copy()
keystats_to_split = keystats_to_split.dropna(
    axis=0, subset=["stock_price", "stock_p_change"]
)
keystats_to_split.to_csv("keystats_to_split.csv", index = True)

In [60]:
# Split keystats_to_split per ticker: the last row of each ticker becomes the
# forward sample, and all of that ticker's earlier rows become training data.
# NOTE(review): "last" is positional, so this presumes each ticker's rows are
# already in chronological order in keystats_to_split — confirm.
ticker_list = keystats_to_split.ticker.drop_duplicates().tolist()

latest_frames = []
train_frames = []
# A single groupby pass replaces one full boolean-mask scan per ticker.
# sort=False yields groups in order of first appearance, matching ticker_list.
for _, ticker_rows in keystats_to_split.groupby("ticker", sort=False):
    latest_frames.append(ticker_rows.tail(1))
    # Tickers with a single row contribute nothing to the training set.
    if len(ticker_rows) > 1:
        train_frames.append(ticker_rows.iloc[:-1])

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; collect the pieces and concatenate once instead. When there is
# nothing to concatenate, fall back to an empty frame with the same columns.
empty_keystat = keystats_to_split.iloc[:0]
lastest_keystat = pd.concat(latest_frames) if latest_frames else empty_keystat
train_keystat = pd.concat(train_frames) if train_frames else empty_keystat

# NOTE(review): "ketstats_to_train.csv" looks like a typo for "keystats_...";
# the name is left unchanged in case downstream code reads this exact file.
train_keystat.to_csv("ketstats_to_train.csv", index = True)
lastest_keystat.to_csv("forward_sample.csv", index = True)