In [1]:
import pandas as pd
import numpy as np
import os
from datetime import date
import datetime
import time
from tqdm import tqdm
import yfinance as yf

# Class for aggregating longitudinally

In [71]:
class CreateLongData:
    """Aggregate daily stock-screener snapshots into one long (ticker x day) frame.

    ``directory`` is expected to contain one sub-folder per day named
    ``MM-DD-YYYY``.  Each folder holds a ``usa_stocks*.csv`` export plus any
    number of ``signal*.csv`` / ``dv*.csv`` files, each listing the tickers
    flagged by one signal.

    Parameters
    ----------
    directory : str
        Root folder holding the per-day sub-folders.
    df : pandas.DataFrame
        Previously aggregated *raw* data, i.e. the output of ``usa_stocks``
        followed by ``technical_signals`` (must have ``Date`` and ``Ticker``
        columns for ``combine_long`` / ``get_vix``).  The per-day loaders do
        not read it, so an empty frame is fine when only those are used.

    Notes
    -----
    ``combine_long`` writes its *processed* frame (lower-cased, prefixed and
    lagged columns) to ``df_long.csv`` by default.  That file can therefore not
    be fed back in as ``df``; pass a different ``output_path`` to keep the raw
    seed file intact.
    """

    # strptime format of the per-day folder names
    _SNAPSHOT_FOLDER_FORMAT = '%m-%d-%Y'
    # non-numeric identifier columns of the export
    _KEY_COLS = ('Ticker', 'Sector')
    # descriptive / fundamental columns that are dropped for now
    _FUNDAMENTAL_COLS = ('Company', 'Industry', 'Country', 'P/E', 'Forward P/E', 'P/B', 'Dividend Yield',
                         'Total Debt/Equity', 'EPS growth this year', 'EPS growth next year')
    # columns divided by 100 after the '%' sign has been stripped
    _PERCENT_COLS = ('Dividend Yield', 'EPS growth this year', 'EPS growth next year', 'Institutional Ownership',
                     'Institutional Transactions', 'Float Short', 'Performance (Week)', 'Performance (Month)',
                     'Volatility (Week)', 'Volatility (Month)', '20-Day Simple Moving Average',
                     '50-Day Simple Moving Average', '200-Day Simple Moving Average', '50-Day High', '50-Day Low',
                     '52-Week High', '52-Week Low', 'Relative Strength Index (14)', 'Change from Open', 'Gap',
                     'Change', 'After-Hours Change')
    # signal columns that are re-labelled as candlesticks
    _CANDLESTICK_COLS = frozenset([
        'signal_doji', 'signal_hammer', 'signal_long_lower_shadow', 'signal_long_upper_shadow',
        'signal_inverted_hammer', 'signal_spinning_top_white', 'signal_spinning_top_black',
        'signal_dragon_fly_doji', 'signal_gravestone_doji', 'signal_marubozu_white', 'signal_marubozu_black',
    ])
    # signal columns that are re-labelled as chart patterns
    _PATTERN_COLS = frozenset([
        'signal_channel_down', 'signal_channel_up', 'signal_double_bottom', 'signal_double_top',
        'signal_head_and_shoulders', 'signal_head_and_shoulders_inverse', 'signal_horizontal',
        'signal_multiple_top',
        # the original list only had the misspelt name; accept both spellings
        'signal_multple_bottom', 'signal_multiple_bottom',
        'signal_triangle_ascending', 'signal_triangle_descending',
        'signal_tl_resistance', 'signal_tl_support', 'signal_wedge', 'signal_wedge_down', 'signal_wedge_up',
    ])

    def __init__(self, directory, df):
        self.directory = directory
        self.df = df

    def usa_stocks(self, date):
        """Load and clean the ``usa_stocks*.csv`` export of one day.

        Parameters
        ----------
        date : str
            Name of the day folder, formatted ``MM-DD-YYYY``.

        Returns
        -------
        pandas.DataFrame
            One row per ticker without missing values: ``Ticker``, ``Sector``,
            the remaining columns as numerics (percentages as decimals, column
            names with ``()`` removed and ``-``/space turned into ``_``) and a
            ``Date`` column.

        Raises
        ------
        FileNotFoundError
            If the day folder has no ``usa_stocks*.csv`` file.
        """
        day_dir = f"{self.directory}/{date}"
        matches = [x for x in os.listdir(day_dir) if x.startswith('usa_stocks') and x.endswith('.csv')]
        if not matches:
            raise FileNotFoundError(f"No usa_stocks*.csv file found in {day_dir}")
        # the first column is dropped (presumably a saved row index -- confirm against the exporter)
        df = pd.read_csv(f"{day_dir}/{matches[0]}").iloc[:, 1:]
        # remove % from values
        df = df.replace({'%': ''}, regex=True)
        # a missing after-hours value means no change during after hours
        df['After-Hours Change'] = pd.to_numeric(df['After-Hours Change']).fillna(0)
        # drop fundamentals for now
        df = df.drop(columns=list(self._FUNDAMENTAL_COLS))
        # remove stocks with any missing value (market cap, institutional ownership, float short,
        # short ratio, volatility, rsi, ...)
        df = df.dropna()
        # ensure all columns except the identifiers are numeric
        key_cols = list(self._KEY_COLS)
        df = pd.concat([df[key_cols], df.drop(columns=key_cols).apply(pd.to_numeric)], axis=1)
        # convert % to decimals
        present_perc_cols = [c for c in self._PERCENT_COLS if c in df.columns]
        if present_perc_cols:
            df[present_perc_cols] = df[present_perc_cols] / 100
        # clean up column names; regex=False because "(" and ")" are meant literally and the
        # default of `regex` differs between pandas versions
        for old, new in (("(", ""), (")", ""), ("-", "_"), (" ", "_")):
            df.columns = df.columns.str.replace(old, new, regex=False)
        # add date
        df['Date'] = datetime.datetime.strptime(date, self._SNAPSHOT_FOLDER_FORMAT)
        return df

    def technical_signals(self, date, df_all):
        """Add one 0/1 column per ``signal*.csv`` / ``dv*.csv`` file of a day.

        Parameters
        ----------
        date : str
            Name of the day folder, formatted ``MM-DD-YYYY``.
        df_all : pandas.DataFrame
            Frame with a ``Ticker`` column (typically the output of ``usa_stocks``).

        Returns
        -------
        pandas.DataFrame
            ``df_all`` left-joined with every signal file; the column named
            after the file (without ``.csv``) is 1 for listed tickers, else 0.
        """
        day_dir = f"{self.directory}/{date}"
        signal_files = [x for x in os.listdir(day_dir)
                        if x.startswith(('signal', 'dv')) and x.endswith('.csv')]
        # iterate through all technical indicators
        for file_name in signal_files:
            signal_col = file_name[:-len('.csv')]
            flagged = pd.read_csv(f"{day_dir}/{file_name}").iloc[:, 1:]
            # a ticker listed twice must not duplicate the stock's row in the join below
            flagged = flagged.drop_duplicates(subset='Ticker').assign(**{signal_col: 1})
            df_all = df_all.merge(flagged, on='Ticker', how='left')
            # 0 if technical indicator not satisfied
            df_all[signal_col] = df_all[signal_col].fillna(0)
        return df_all

    def get_vix(self):
        """Download daily VIX closes from the earliest ``Date`` in ``self.df`` until today.

        Returns
        -------
        pandas.DataFrame
            Columns ``Date`` and ``vix`` (the ``Close`` price of ``^VIX``).
        """
        # one day past today so that today's bar is requested as well
        end = date.today() + datetime.timedelta(days=1)
        start = pd.to_datetime(self.df['Date']).min()
        vix = yf.download('^VIX', start=start, end=end)
        # recent yfinance releases return (field, ticker) MultiIndex columns even for one ticker
        if isinstance(vix.columns, pd.MultiIndex):
            vix.columns = vix.columns.get_level_values(0)
        vix = vix.reset_index()
        return vix[['Date', 'Close']].rename(columns={'Close': 'vix'})

    def create_lag_feats(self, df, num_lags=1):
        """Add ``previous{i}_*`` columns holding each ``current*`` column shifted by ``i`` rows per ticker.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a ``ticker`` column.  Rows are shifted in their existing
            order, so the frame should already be sorted by date within ticker.
        num_lags : int, default 1
            Number of lags to create.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` with the lag columns appended.
        """
        df_lag = df.copy()
        current_cols = [x for x in df_lag.columns if x.startswith('current')]
        if not current_cols:
            return df_lag
        grouped = df_lag.groupby('ticker')[current_cols]
        lag_frames = []
        for i in tqdm(range(1, num_lags + 1)):
            shifted = grouped.shift(i)
            # only swap the leading prefix: a name such as 'current_current_ratio' keeps its tail
            shifted.columns = [c.replace('current', f"previous{i}", 1) for c in current_cols]
            lag_frames.append(shifted)
        if not lag_frames:
            return df_lag
        lag_block = pd.concat(lag_frames, axis=1)
        # replace lag columns that already exist instead of duplicating them
        df_lag = df_lag.drop(columns=[c for c in lag_block.columns if c in df_lag.columns])
        # a single concat avoids fragmenting the frame with dozens of column inserts
        return pd.concat([df_lag, lag_block], axis=1)

    @classmethod
    def _feature_name(cls, name):
        """Map a raw column name to its final name (type prefix, 'current_' prefix, lower case)."""
        if name in cls._CANDLESTICK_COLS:
            name = name.replace('signal', 'candlestick')
        elif name in cls._PATTERN_COLS:
            name = name.replace('signal', 'pattern')
        name = name.replace('signal', 'indicator').replace('dv', 'indicator')
        if name not in ('Date', 'Ticker', 'Sector'):
            name = 'current_' + name
        return name.lower()

    def combine_long(self, num_lags=1, output_path='df_long.csv'):
        """Append new day folders to ``self.df`` and build the final feature frame.

        Steps: scan ``directory`` for day folders later than the earliest date
        in ``self.df`` and not yet present, load them, join the VIX close,
        rename the columns, add the day of week and the lag features, then
        write the result to ``output_path``.

        Parameters
        ----------
        num_lags : int, default 1
            Number of lags passed to ``create_lag_feats``.
        output_path : str or None, default 'df_long.csv'
            Destination CSV; ``None`` skips writing.

        Returns
        -------
        pandas.DataFrame
            The processed frame (also stored in ``self.df``).

        Raises
        ------
        KeyError
            If ``self.df`` has no ``Date`` column, e.g. because it is a frame
            that ``combine_long`` has already processed.
        """
        if 'Date' not in self.df.columns:
            raise KeyError(
                "self.df has no 'Date' column; combine_long expects the raw frame produced by "
                "usa_stocks/technical_signals, not one it has already processed"
            )
        # parse the dates once; a frame read back from CSV carries them as strings
        df_all = self.df.assign(Date=pd.to_datetime(self.df['Date']))
        known_dates = set(df_all['Date'].drop_duplicates())
        first_date = df_all['Date'].min()

        # first scan directory for new data
        new_folders = []
        for entry in os.listdir(f"{self.directory}"):
            try:
                entry_date = pd.Timestamp(datetime.datetime.strptime(entry, self._SNAPSHOT_FOLDER_FORMAT))
            except ValueError:
                # not a day folder (stray file such as desktop.ini)
                continue
            if entry_date > first_date and entry_date not in known_dates:
                new_folders.append((entry_date, entry))
        new_files = [entry for _, entry in sorted(new_folders)]

        if not new_files:
            print("Data is up to date")
        else:
            print(f"Detetected {len(new_files)} new files. Updating now.")
            frames = [df_all]
            for folder in tqdm(new_files):
                frames.append(self.technical_signals(folder, self.usa_stocks(folder)))
            # concatenate once instead of re-copying the growing frame for every day
            df_all = pd.concat(frames, axis=0)
            df_all['Date'] = pd.to_datetime(df_all['Date'])
        lead_cols = ['Date', 'Ticker']
        df_all = df_all[lead_cols + [c for c in df_all.columns if c not in lead_cols]]
        # the lag features below rely on rows being ordered by date within ticker
        self.df = df_all.sort_values(['Ticker', 'Date']).reset_index(drop=True)

        # add in vix
        vix_df = self.get_vix()
        self.df = self.df.merge(vix_df, on='Date', how='left')
        # clean-up columns names
        self.df.columns = [self._feature_name(c) for c in self.df.columns]
        # add in day of week
        self.df['current_day_of_week'] = self.df['date'].dt.dayofweek
        # add in lag terms
        self.df = self.create_lag_feats(self.df, num_lags=num_lags)
        # write out data
        if output_path is not None:
            self.df.to_csv(output_path, index=False)
        return self.df
    

In [72]:
# Seed the long-format file from a single day's snapshot.
init_date = '10-01-2021'
df_init = pd.DataFrame()  # placeholder: the per-day loaders used below never read `df`

# initialize class
cld = CreateLongData(directory='D:/Finviz Data', df=df_init)

# load the screener export of the seed day, then attach its signal flags
df_init = cld.technical_signals(
    date=init_date,
    df_all=cld.usa_stocks(date=init_date),
)

# persist the raw seed; the update cell reads it back from this file
df_init.to_csv('df_long.csv', index=False)
df_init

Unnamed: 0,Ticker,Sector,Market_Cap,Institutional_Ownership,Institutional_Transactions,Float_Short,Short_Ratio,Performance_Week,Performance_Month,Average_True_Range,...,signal_marubozu_white,signal_marubozu_black,signal_20daysma_crossed_above_50daysma,signal_20daysma_crossed_below_50daysma,signal_50daysma_crossed_above_200daysma,signal_50daysma_crossed_below_200daysma,signal_price_10perc_above_20daysma,signal_price_10perc_below_20daysma,signal_price_below_20daysma,signal_price_crossed_below_20daysma
0,A,Healthcare,47687.70,0.903,0.0100,0.0114,2.07,-0.0952,-0.1109,3.62,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,AA,Basic Materials,9145.24,0.790,-0.0265,0.0659,1.58,0.0264,0.0914,2.36,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AAC,Financial,1217.46,0.587,0.9225,0.0020,0.80,0.0010,0.0062,0.03,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,AADI,Healthcare,611.03,0.109,0.1883,0.0087,0.31,-0.0898,0.0603,1.89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,AAIC,Real Estate,119.61,0.441,-0.0477,0.0088,1.50,-0.0185,-0.0080,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3680,ZUMZ,Consumer Cyclical,1000.55,0.872,0.0472,0.0766,6.11,-0.0024,0.0002,1.95,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3681,ZUO,Technology,2065.97,0.716,0.0632,0.0407,5.78,-0.0251,-0.0240,0.65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3682,ZVO,Consumer Defensive,79.90,0.621,0.0079,0.0266,6.68,-0.0041,-0.0805,0.10,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3683,ZYNE,Healthcare,174.92,0.395,0.5756,0.0489,1.15,-0.0538,-0.0094,0.20,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [73]:
# Reload the saved raw data, pull in any newer day folders and build the feature frame.
# NOTE: combine_long() overwrites df_long.csv with the processed (renamed / lagged) frame,
# so re-running this cell without first re-running the seed cell fails on the missing 'Date' column.
cld = CreateLongData(
    directory='D:/Finviz Data',
    df=pd.read_csv('df_long.csv'),
)
df_long = cld.combine_long()
df_long

  0%|                                                                                                                                                                                                               | 0/5 [00:00<?, ?it/s]

Detetected 5 new files. Updating now.


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.19it/s]

[*********************100%***********************]  1 of 1 completed


  0%|                                                                                                                                                                                                               | 0/1 [00:00<?, ?it/s]




100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.24s/it]


Unnamed: 0,date,ticker,sector,current_market_cap,current_institutional_ownership,current_institutional_transactions,current_float_short,current_short_ratio,current_performance_week,current_performance_month,...,previous1_indicator_20daysma_crossed_above_50daysma,previous1_indicator_20daysma_crossed_below_50daysma,previous1_indicator_50daysma_crossed_above_200daysma,previous1_indicator_50daysma_crossed_below_200daysma,previous1_indicator_price_10perc_above_20daysma,previous1_indicator_price_10perc_below_20daysma,previous1_indicator_price_below_20daysma,previous1_indicator_price_crossed_below_20daysma,previous1_vix,previous1_day_of_week
0,2021-10-01,A,Healthcare,47687.70,0.903,0.0100,0.0114,2.07,-0.0952,-0.1109,...,,,,,,,,,,
1,2021-10-04,A,Healthcare,46577.98,0.903,0.0100,0.0114,2.07,-0.0843,-0.1441,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,21.100000,4.0
2,2021-10-05,A,Healthcare,46782.80,0.903,0.0100,0.0114,2.06,-0.0430,-0.1464,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,22.959999,0.0
3,2021-10-06,A,Healthcare,46972.34,0.903,0.0100,0.0114,2.06,-0.0397,-0.1354,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,21.299999,1.0
4,2021-10-07,A,Healthcare,47482.88,0.903,0.0100,0.0114,2.07,-0.0140,-0.1310,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,21.000000,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22135,2021-10-04,ZYXI,Healthcare,385.55,0.318,-0.0374,0.1841,11.56,-0.0945,-0.2007,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,21.100000,4.0
22136,2021-10-05,ZYXI,Healthcare,392.67,0.318,-0.0374,0.1841,11.39,-0.0645,-0.1631,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,22.959999,0.0
22137,2021-10-06,ZYXI,Healthcare,394.80,0.318,-0.0374,0.1841,11.44,-0.0423,-0.1547,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,21.299999,1.0
22138,2021-10-07,ZYXI,Healthcare,399.43,0.318,-0.0374,0.1841,11.49,-0.0149,-0.1302,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,21.000000,2.0


In [3]:
# Additional feature ideas
    # PCA feature engineering
    # method for dv creation
    
# Modeling pipeline
    # RNN
    # Mixed effect logistic regression
    # Use output of above models as features for an xgboost with hyperparameter tuning for final prediction
    
# performance tracking

# descriptive stats for columns
    
# Dependent variable ideas
    # top gainers
    # >5% increase next day
    # >5% increase within the week
    # Marubozu white
    # Green inverted hammer
    # model probability of different kinds of candlesticks
    # class 0 should be opposite of class 1 (i.e. 1 = up 3%, 0 = down 3%)
