In [1]:
import pandas as pd
import numpy as np
import os
from datetime import date
#from datetime import datetime
import datetime
import time
from tqdm import tqdm
import yfinance as yf

# Class for aggregating longitudinally

In [2]:
class CreateLongData:
    """
    """
    def __init__(self, directory, df):
        self.directory = directory
        self.df = df

    def usa_stocks(self, date):
        files = os.listdir(f"{self.directory}/{date}")
        usa_stocks_file = [x for x in files if (x.startswith('usa_stocks')) & (x.endswith('.csv'))][0]
        df = pd.read_csv(f"{self.directory}/{date}/{usa_stocks_file}").iloc[:, 1:]
        # remove % from values
        df = df.replace({'%':''}, regex=True) 
        # no change during after hours
        df['After-Hours Change'] = df['After-Hours Change'].replace(np.nan, 0) 
        # drop fundamentals for now
        df = df.drop(['Company', 'Industry', 'Country', 'P/E', 'Forward P/E', 'P/B', 'Dividend Yield', 'Total Debt/Equity',
                      'EPS growth this year', 'EPS growth next year', 'Total Debt/Equity'], axis=1) 
        # remove stock with missing market cap data
        df = df[~df['Market Cap'].isna()] 
        # remove remaining stock with missing either instituional ownership, float short, short ratio, volatility, rsi
        df = df.dropna() 
        # clean up column names
        df.columns = df.columns.str.replace("(", "")
        df.columns = df.columns.str.replace(")", "")
        df.columns = df.columns.str.replace("-", "_")
        df.columns = df.columns.str.replace(" ", "_")
        # ensure all columns are numerics
        df = pd.concat([df[['Ticker', 'Sector']], df.drop(['Ticker', 'Sector'], axis=1).apply(pd.to_numeric)], axis=1)
        # add date
        df['Date'] = datetime.datetime.strptime(date, '%m-%d-%Y')
        return df

    def technical_signals(self, date, df_all):
        files = os.listdir(f"{self.directory}/{date}")
        signal_files = [x for x in files if ((x.startswith('signal')) | (x.startswith('dv'))) & (x.endswith('.csv'))]
        # iterate through all technical indicators
        for i in signal_files:
            # for each stock flagged by the specified technical indicator in the filename, join to main df
            df_signal = pd.read_csv(f"{self.directory}/{date}/{i}").iloc[:, 1:]
            signal_col = i.replace('.csv','')
            # 1 for technical indicator satisfied
            df_signal[signal_col] = 1
            df_all = df_all.merge(df_signal, on='Ticker', how='left')
            # 0 if technical indicator not satisfied
            df_all[signal_col] = df_all[signal_col].replace(np.nan, 0)         
        return df_all
    
    def get_vix(self):
        end = date.today() + datetime.timedelta(days=1)
        start = self.df['Date'].min()
        vix = yf.download('^VIX', start=start, end=end)
        vix = vix.reset_index()
        vix_close = vix[['Date', 'Close']].rename(columns={'Close':'vix'})
        return vix_close
    
    def create_lag_feats(self):
        pass
    
    
    def combine_long(self):
        # first scan directory for new data
        files = os.listdir(f"{self.directory}")
        df_dates = [datetime.datetime.strptime(x, '%Y-%m-%d') for x in self.df['Date'].tolist()]
        new_files = [x for x in files if datetime.datetime.strptime(x, '%m-%d-%Y') not in df_dates]
        if not new_files:
            print("Data is up to date")
        else:
            print(f"Detetected {len(new_files)} new files. Updating now.")
            for i in tqdm(new_files):
                df1 = self.usa_stocks(i)
                df2 = self.technical_signals(i, df1)
                vix_df = self.get_vix()
                df2 = df2.merge(vix_df, on='Date', how='left')
                df_updt = pd.concat([self.df, df2], axis=0)
                df_updt['Date'] = pd.to_datetime(df_updt['Date'])
                df_updt = df_updt[['Date', 'Ticker'] + df_updt.drop(['Date', 'Ticker'], axis=1).columns.tolist()]
                self.df = df_updt.sort_values(['Ticker', 'Date']).reset_index().drop(['index'], axis=1)
        # write out data
        self.df.to_csv('df_long.csv', index=False)
        return self.df
    

In [4]:
df_init = pd.DataFrame()
init_date = '9-15-2021'

# initialize class
cld = CreateLongData(directory='D:/Finviz Data', df=df_init)
df_init = cld.usa_stocks(date=init_date)
df_init = cld.technical_signals(date=init_date, df_all=df_init)
df_init.to_csv('df_long.csv', index=False)
df_init

Unnamed: 0,Ticker,Sector,Market_Cap,Institutional_Ownership,Float_Short,Short_Ratio,Average_True_Range,Volatility_Week,Volatility_Month,Relative_Strength_Index_14,...,dv_today_up_5perc,dv_today_up,dv_today_down,dv_today_down_5perc,signal_price_above_20daysma,signal_price_crossed_above_20daysma,signal_lt_3perc_below_high,signal_lt_3perc_above_low,signal_doji,signal_hammer
0,PNRG,Energy,104.97,3.9,0.46,0.53,4.07,15.49,5.22,51.06,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,DNMR,Basic Materials,1529.56,48.1,12.06,4.98,1.63,11.36,9.92,36.05,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,STON,Consumer Cyclical,225.38,85.5,0.89,3.43,0.18,10.44,6.52,23.11,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CSCW,Communication Services,83.26,1.2,3.89,0.89,0.08,11.33,8.63,48.26,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MOSY,Technology,50.39,16.8,4.98,0.23,0.46,9.77,7.45,47.41,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3899,TIL,Healthcare,2617.97,55.0,4.59,21.57,1.58,12.19,8.37,53.45,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3900,OPAD,Real Estate,670.39,93.1,30.46,6.95,1.37,32.42,12.00,68.57,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3901,MPX,Consumer Cyclical,496.06,15.1,4.11,9.55,0.82,10.09,5.84,54.79,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3902,CDXC,Healthcare,454.29,30.4,8.22,7.97,0.48,11.75,6.14,23.69,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
cld = CreateLongData(directory='D:/Finviz Data', df=pd.read_csv('df_long.csv'))
df_long = cld.combine_long()
df_long

  0%|                                                                                                                                                                                                              | 0/15 [00:00<?, ?it/s]

Detetected 15 new files. Updating now.
[*********************100%***********************]  1 of 1 completed

  7%|█████████████▏                                                                                                                                                                                        | 1/15 [00:01<00:15,  1.14s/it]


[*********************100%***********************]  1 of 1 completed

 13%|██████████████████████████▍                                                                                                                                                                           | 2/15 [00:01<00:13,  1.03s/it]


[*********************100%***********************]  1 of 1 completed

 20%|███████████████████████████████████████▌                                                                                                                                                              | 3/15 [00:02<00:11,  1.03it/s]


[*********************100%***********************]  1 of 1 completed

 27%|████████████████████████████████████████████████████▊                                                                                                                                                 | 4/15 [00:03<00:10,  1.08it/s]


[*********************100%***********************]  1 of 1 completed

 33%|██████████████████████████████████████████████████████████████████                                                                                                                                    | 5/15 [00:04<00:09,  1.10it/s]


[*********************100%***********************]  1 of 1 completed

 40%|███████████████████████████████████████████████████████████████████████████████▏                                                                                                                      | 6/15 [00:05<00:08,  1.12it/s]


[*********************100%***********************]  1 of 1 completed

 47%|████████████████████████████████████████████████████████████████████████████████████████████▍                                                                                                         | 7/15 [00:06<00:07,  1.13it/s]


[*********************100%***********************]  1 of 1 completed

 53%|█████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                                                                            | 8/15 [00:07<00:06,  1.12it/s]


[*********************100%***********************]  1 of 1 completed

 60%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                                                                               | 9/15 [00:08<00:05,  1.10it/s]


[*********************100%***********************]  1 of 1 completed

 67%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                                                                 | 10/15 [00:09<00:05,  1.04s/it]


[*********************100%***********************]  1 of 1 completed

 73%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍                                                    | 11/15 [00:10<00:04,  1.04s/it]


[*********************100%***********************]  1 of 1 completed

 80%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▌                                       | 12/15 [00:11<00:03,  1.13s/it]


[*********************100%***********************]  1 of 1 completed

 87%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋                          | 13/15 [00:13<00:02,  1.22s/it]


[*********************100%***********************]  1 of 1 completed

 93%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊             | 14/15 [00:14<00:01,  1.30s/it]


[*********************100%***********************]  1 of 1 completed

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:15<00:00,  1.06s/it]







Unnamed: 0,Date,Ticker,Sector,Market_Cap,Institutional_Ownership,Float_Short,Short_Ratio,Average_True_Range,Volatility_Week,Volatility_Month,...,52_Week_High,52_Week_Low,signal_20daysma_crossed_above_50daysma,signal_20daysma_crossed_below_50daysma,signal_50daysma_crossed_above_200daysma,signal_50daysma_crossed_below_200daysma,signal_price_10perc_above_20daysma,signal_price_10perc_below_20daysma,signal_price_below_20daysma,signal_price_crossed_below_20daysma
0,2021-09-15,A,Healthcare,52609.71,90.2,1.24,2.10,3.00,2.11,1.82,...,,,,,,,,,,
1,2021-09-16,A,Healthcare,52455.32,90.2,1.24,2.09,3.02,2.18,1.82,...,,,,,,,,,,
2,2021-09-17,A,Healthcare,52655.12,90.2,1.24,2.10,3.16,2.47,1.85,...,,,,,,,,,,
3,2021-09-20,A,Healthcare,51671.00,90.2,1.24,2.11,3.32,1.96,1.73,...,,,,,,,,,,
4,2021-09-21,A,Healthcare,51701.16,90.2,1.24,2.14,3.30,1.99,1.69,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60049,2021-09-30,ZYXI,Healthcare,408.45,31.8,18.41,11.45,0.42,3.05,3.41,...,,,,,,,,,,
60050,2021-10-01,ZYXI,Healthcare,395.87,31.8,18.41,11.40,0.44,3.75,3.57,...,-51.42,-2.03,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
60051,2021-10-04,ZYXI,Healthcare,385.55,31.8,18.41,11.56,0.44,3.95,3.58,...,-52.69,1.03,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
60052,2021-10-05,ZYXI,Healthcare,392.67,31.8,18.41,11.39,0.43,3.95,3.47,...,-51.81,3.47,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# Additional feature ideas
    # prior short squeezes
    # prior trend reversals
    # previous day candlestick
    # % difference from 20 day sma (continuous)
    # % difference from 50 day sma (continuous)
    # % difference from 200 day sma (continuous)
    # difference b/w 20 and 50 day sma (continuous)
    # difference b/w 50 and 200 day sma (continuous)
    # difference b/w 20 and 200 days sma (continuous)
    # % difference from 50 day high (continuous)
    # % difference from 50 day low (continuous)
    # % difference from 52 week high (continuous)
    # % difference from 52 week low (continuous)
    # Day of week
    # Number day ordinally and use as slope within sector or stock
    # VIX price
    # PCA on signal features
    
# Modeling pipeline
    # RNN
    # Mixed effect logistic regression
    # Use output of above models as features for an xgboost with hyperparameter tuning for final prediction
    
# performance tracking
    
# Dependent variable ideas
    # top gainers
    # >5% increase next day
    # >5% increase within the week
    # Marubozu white
    # Green inverted hammer
    # model probablitity of different kinds of candlesticks
    # class 0 should be opposite of class 1 (i.e. 1 = up 3%, 0 = down 3%)
    
# cleaning 
    # convert % to decimals
    # review finviz data dictionary
    # rename dv cols to signal
    
# data process flow
    # rolling 10 day window configuration
    # descriptive stats for features