In [1]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Class for aggregating per-date snapshot files into one longitudinal (long-format) table

In [51]:
class CreateLongData:
    """Build and maintain a long-format table with one row per ticker per date.

    Snapshots are read from ``<directory>/<date>/`` where ``<date>`` is a
    folder named in ``%m-%d-%Y`` form. A snapshot folder is expected to hold
    one ``usa_stocks*.csv`` file plus any number of ``signal*.csv`` /
    ``dv*.csv`` files, each containing a ``Ticker`` column.

    Parameters
    ----------
    directory : str
        Root folder that contains one sub-folder per snapshot date.
    df : pandas.DataFrame
        Long table accumulated so far. May be empty; otherwise it must carry
        a ``Date`` column that ``pandas.to_datetime`` can parse.
    """

    # Fundamentals that are discarded for now.
    _FUNDAMENTAL_COLS = (
        'Company', 'Industry', 'Country', 'P/E', 'Forward P/E', 'P/B',
        'Dividend Yield', 'Total Debt/Equity', 'EPS growth this year',
        'EPS growth next year',
    )
    # Non-numeric identifier columns kept as-is.
    _ID_COLS = ('Ticker', 'Sector')
    # Folder-name format of a snapshot date.
    _FOLDER_DATE_FMT = '%m-%d-%Y'

    def __init__(self, directory, df):
        self.directory = directory
        self.df = df

    def usa_stocks(self, date):
        """Load and clean the ``usa_stocks*.csv`` file of one snapshot.

        Parameters
        ----------
        date : str
            Snapshot folder name, formatted ``%m-%d-%Y``.

        Returns
        -------
        pandas.DataFrame
            ``Ticker``, ``Sector``, the remaining columns converted to
            numeric, and a ``Date`` column holding the parsed snapshot date.

        Raises
        ------
        IndexError
            If the snapshot folder has no ``usa_stocks*.csv`` file.
        """
        folder = f"{self.directory}/{date}"
        candidates = [name for name in os.listdir(folder)
                      if name.startswith('usa_stocks') and name.endswith('.csv')]
        # The first column of the file is dropped (a saved row index).
        df = pd.read_csv(f"{folder}/{candidates[0]}").iloc[:, 1:]
        # remove % from values
        df = df.replace({'%': ''}, regex=True)
        # a missing after-hours change means no change; convert to numeric
        # first so the fill value never has to fit into a string column
        df['After-Hours Change'] = pd.to_numeric(df['After-Hours Change']).fillna(0)
        # drop fundamentals for now (tolerate snapshots that lack some of them)
        df = df.drop(columns=list(self._FUNDAMENTAL_COLS), errors='ignore')
        # remove every stock with a missing value in any remaining column
        # (this includes stocks without market cap data)
        df = df.dropna()
        # clean up column names; regex=False keeps "(" and ")" literal
        for old, new in (("(", ""), (")", ""), ("-", "_"), (" ", "_")):
            df.columns = df.columns.str.replace(old, new, regex=False)
        # ensure all non-identifier columns are numeric
        id_cols = list(self._ID_COLS)
        df = pd.concat(
            [df[id_cols], df.drop(columns=id_cols).apply(pd.to_numeric)],
            axis=1,
        )
        # add date
        df['Date'] = datetime.strptime(date, self._FOLDER_DATE_FMT)
        return df

    def technical_signals(self, date, df_all):
        """Left-join one 0/1 flag column per signal file onto ``df_all``.

        Every ``signal*.csv`` / ``dv*.csv`` file in the snapshot folder adds a
        column named after the file (without ``.csv``): 1 for tickers present
        in the file, 0 otherwise.

        Parameters
        ----------
        date : str
            Snapshot folder name, formatted ``%m-%d-%Y``.
        df_all : pandas.DataFrame
            Frame with a ``Ticker`` column to join onto.

        Returns
        -------
        pandas.DataFrame
            ``df_all`` with the flag columns appended.
        """
        # Use the instance's directory; the bare name ``directory`` only
        # resolved when a notebook global of that name happened to exist.
        folder = f"{self.directory}/{date}"
        signal_files = [name for name in os.listdir(folder)
                        if name.startswith(('signal', 'dv')) and name.endswith('.csv')]
        # iterate through all technical indicators
        for file_name in signal_files:
            df_signal = pd.read_csv(f"{folder}/{file_name}").iloc[:, 1:]
            signal_col = file_name.replace('.csv', '')
            # 1 for technical indicator satisfied
            df_signal[signal_col] = 1
            df_all = df_all.merge(df_signal, on='Ticker', how='left')
            # 0 if technical indicator not satisfied
            df_all[signal_col] = df_all[signal_col].fillna(0)
        return df_all

    def create_lag_feats(self):
        """Placeholder; not implemented yet."""
        pass

    def combine_long(self, output_path='df_long.csv'):
        """Append every snapshot folder not yet present in ``self.df``.

        Directory entries whose name is not a ``%m-%d-%Y`` date, or that are
        not folders, are skipped. When new snapshots exist, ``self.df`` is
        replaced by the combined table (``Date`` and ``Ticker`` first, sorted
        by ticker then date) and written to ``output_path``.

        Parameters
        ----------
        output_path : str, optional
            CSV file the combined table is written to.

        Returns
        -------
        pandas.DataFrame
            ``self.df`` after the update (unchanged if nothing was new).
        """
        # An empty starting frame simply means there is no history yet.
        if self.df.empty:
            history = None
            known_dates = set()
        else:
            history = self.df.assign(Date=pd.to_datetime(self.df['Date']))
            known_dates = set(history['Date'].dt.normalize().drop_duplicates())

        # first scan directory for new data
        pending = {}
        for name in os.listdir(f"{self.directory}"):
            try:
                snapshot_date = pd.Timestamp(datetime.strptime(name, self._FOLDER_DATE_FMT))
            except ValueError:
                # not a snapshot folder (e.g. a stray file) - ignore it
                continue
            if not os.path.isdir(f"{self.directory}/{name}"):
                continue
            if snapshot_date not in known_dates:
                pending.setdefault(snapshot_date, name)

        if not pending:
            print("Data is up to date")
            return self.df

        print(f"Detetected {len(pending)} new files. Updating now.")
        frames = [] if history is None else [history]
        for _, name in sorted(pending.items()):
            frames.append(self.technical_signals(name, self.usa_stocks(name)))

        # Concatenate and write once instead of once per snapshot.
        df_updt = pd.concat(frames, axis=0)
        df_updt['Date'] = pd.to_datetime(df_updt['Date'])
        lead_cols = ['Date', 'Ticker']
        df_updt = df_updt[lead_cols + df_updt.drop(columns=lead_cols).columns.tolist()]
        self.df = df_updt.sort_values(['Ticker', 'Date']).reset_index(drop=True)
        self.df.to_csv(output_path, index=False)
        return self.df
    

In [55]:
# Seed the long table from the first snapshot date.
init_date = '9-15-2021'

# The class starts out with an empty frame; the real data is built below.
cld = CreateLongData(directory='D:/Finviz Data', df=pd.DataFrame())

# Clean the stock snapshot, then attach the technical-signal flag columns.
df_init = cld.technical_signals(
    date=init_date,
    df_all=cld.usa_stocks(date=init_date),
)

# Persist the seed table so the update cell can reload it.
df_init.to_csv('df_long.csv', index=False)
df_init

Unnamed: 0,Ticker,Sector,Market_Cap,Institutional_Ownership,Float_Short,Short_Ratio,Average_True_Range,Volatility_Week,Volatility_Month,Relative_Strength_Index_14,...,dv_today_up_5perc,dv_today_up,dv_today_down,dv_today_down_5perc,signal_price_above_20daysma,signal_price_crossed_above_20daysma,signal_lt_3perc_below_high,signal_lt_3perc_above_low,signal_doji,signal_hammer
0,PNRG,Energy,104.97,3.9,0.46,0.53,4.07,15.49,5.22,51.06,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,DNMR,Basic Materials,1529.56,48.1,12.06,4.98,1.63,11.36,9.92,36.05,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,STON,Consumer Cyclical,225.38,85.5,0.89,3.43,0.18,10.44,6.52,23.11,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CSCW,Communication Services,83.26,1.2,3.89,0.89,0.08,11.33,8.63,48.26,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MOSY,Technology,50.39,16.8,4.98,0.23,0.46,9.77,7.45,47.41,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3899,TIL,Healthcare,2617.97,55.0,4.59,21.57,1.58,12.19,8.37,53.45,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3900,OPAD,Real Estate,670.39,93.1,30.46,6.95,1.37,32.42,12.00,68.57,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3901,MPX,Consumer Cyclical,496.06,15.1,4.11,9.55,0.82,10.09,5.84,54.79,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3902,CDXC,Healthcare,454.29,30.4,8.22,7.97,0.48,11.75,6.14,23.69,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# Reload the persisted long table and append any snapshot folders it does not contain yet.
cld = CreateLongData(
    directory='D:/Finviz Data',
    df=pd.read_csv('df_long.csv'),
)
df_long = cld.combine_long()
df_long

Detetected 2 new files. Updating now.


Unnamed: 0,Date,Ticker,Sector,Market_Cap,Institutional_Ownership,Float_Short,Short_Ratio,Average_True_Range,Volatility_Week,Volatility_Month,...,dv_today_down,dv_today_down_5perc,signal_price_above_20daysma,signal_price_crossed_above_20daysma,signal_lt_3perc_below_high,signal_lt_3perc_above_low,signal_doji,signal_hammer,Shares_Float,Gap
0,2021-09-15,A,Healthcare,52609.71,90.2,1.24,2.10,3.00,2.11,1.82,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,,
1,2021-09-16,A,Healthcare,52455.32,90.2,1.24,2.09,3.02,2.18,1.82,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,301.92,
2,2021-09-17,A,Healthcare,52655.12,90.2,1.24,2.10,3.16,2.47,1.85,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,,0.37
3,2021-09-15,AA,Basic Materials,9268.81,79.1,5.99,1.37,2.33,4.18,4.75,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,,
4,2021-09-16,AA,Basic Materials,8861.68,79.1,5.99,1.37,2.39,4.32,4.67,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,185.07,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11709,2021-09-16,ZYNE,Healthcare,181.40,39.5,4.73,1.17,0.21,3.72,4.91,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,39.13,
11710,2021-09-17,ZYNE,Healthcare,177.62,39.5,4.73,1.17,0.22,4.65,5.01,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,-0.69
11711,2021-09-15,ZYXI,Healthcare,429.09,32.5,16.54,10.64,0.51,3.62,3.83,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,,
11712,2021-09-16,ZYXI,Healthcare,431.90,32.5,16.54,10.57,0.50,3.73,3.76,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.61,
