# Import Packages

In [96]:
import pandas as pd
import numpy as np
import os
from datetime import datetime
import statsmodels.api as sm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Initialize Variables

In [77]:
train_dt = '9-15-2021' # must be in m-dd-yyyy format
dv_dt = '9-16-2021' # must be in m-dd-yyyy format
directory = 'D:/Finviz Data'
files = os.listdir(f"{directory}/{train_dt}")
files

['usa_stocks_no_etfs.csv',
 'signal_top_gainers.csv',
 'signal_top_losers.csv',
 'signal_new_high.csv',
 'signal_new_low.csv',
 'signal_most_volatile.csv',
 'signal_most_active.csv',
 'signal_unusual_volume.csv',
 'signal_overbought.csv',
 'signal_oversold.csv',
 'signal_downgrades.csv',
 'signal_upgrades.csv',
 'signal_earnings_before.csv',
 'signal_recent_insider_buying.csv',
 'signal_recent_insider_selling.csv',
 'signal_major_news.csv',
 'signal_horizontal.csv',
 'signal_tl_resistance.csv',
 'signal_tl_support.csv',
 'signal_wedge_up.csv',
 'signal_wedge_down.csv',
 'signal_triangle_ascending.csv',
 'signal_triangle_descending.csv',
 'signal_wedge.csv',
 'signal_channel_up.csv',
 'signal_channel_down.csv',
 'signal_double_top.csv',
 'signal_double_bottom.csv',
 'signal_multiple_top.csv',
 'signal_multple_bottom.csv',
 'signal_head_and_shoulders.csv',
 'signal_head_and_shoulders_inverse.csv',
 'signal_relative_volume_gt_5.csv',
 'signal_high_short_float.csv',
 'dv_today_up_5perc.csv

# All USA Stocks (no ETFs)

In [66]:
usa_stocks_file = [x for x in files if (x.startswith('usa_stocks')) & (x.endswith('.csv'))][0]

def usa_stocks(filename, directory, date):
    df = pd.read_csv(f"{directory}/{filename}").iloc[:, 1:]
    # remove % from values
    df = df.replace({'%':''}, regex=True) 
    # no change during after hours
    df['After-Hours Change'] = df['After-Hours Change'].replace(np.nan, 0) 
    # drop fundamentals for now
    df = df.drop(['Company', 'Industry', 'Country', 'P/E', 'Forward P/E', 'P/B', 'Dividend Yield', 'Total Debt/Equity',
                  'EPS growth this year', 'EPS growth next year', 'Total Debt/Equity'], axis=1) 
    # remove stock with missing market cap data
    df = df[~df['Market Cap'].isna()] 
    # remove remaining stock with missing either instituional ownership, float short, short ratio, volatility, rsi
    df = df.dropna() 
    # one-hot-encode sector
    df = pd.concat([df, pd.get_dummies(df['Sector'], prefix = 'Sector')], axis=1).drop(['Sector'], axis=1) 
    # clean up column names
    df.columns = df.columns.str.replace("(", "")
    df.columns = df.columns.str.replace(")", "")
    df.columns = df.columns.str.replace("-", "_")
    df.columns = df.columns.str.replace(" ", "_")
    # ensure all columns are numerics
    df = pd.concat([df[['Ticker']], df.drop(['Ticker'], axis=1).apply(pd.to_numeric)], axis=1)
    # add date
    df['Date'] = datetime.strptime(date, '%m-%d-%Y')
    return df

df_stocks = usa_stocks(usa_stocks_file, f"{directory}/{train_dt}", train_dt)
df_stocks

Unnamed: 0,Ticker,Market_Cap,Institutional_Ownership,Float_Short,Short_Ratio,Average_True_Range,Volatility_Week,Volatility_Month,Relative_Strength_Index_14,Change_from_Open,...,Sector_Consumer_Cyclical,Sector_Consumer_Defensive,Sector_Energy,Sector_Financial,Sector_Healthcare,Sector_Industrials,Sector_Real_Estate,Sector_Technology,Sector_Utilities,Date
0,PNRG,104.97,3.9,0.46,0.53,4.07,15.49,5.22,51.06,-14.32,...,0,0,1,0,0,0,0,0,0,2021-09-15
1,DNMR,1529.56,48.1,12.06,4.98,1.63,11.36,9.92,36.05,-13.61,...,0,0,0,0,0,0,0,0,0,2021-09-15
2,STON,225.38,85.5,0.89,3.43,0.18,10.44,6.52,23.11,-13.57,...,1,0,0,0,0,0,0,0,0,2021-09-15
3,CSCW,83.26,1.2,3.89,0.89,0.08,11.33,8.63,48.26,-13.39,...,0,0,0,0,0,0,0,0,0,2021-09-15
4,MOSY,50.39,16.8,4.98,0.23,0.46,9.77,7.45,47.41,-13.35,...,0,0,0,0,0,0,0,1,0,2021-09-15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4763,TIL,2617.97,55.0,4.59,21.57,1.58,12.19,8.37,53.45,15.98,...,0,0,0,0,1,0,0,0,0,2021-09-15
4765,OPAD,670.39,93.1,30.46,6.95,1.37,32.42,12.00,68.57,16.91,...,0,0,0,0,0,0,1,0,0,2021-09-15
4766,MPX,496.06,15.1,4.11,9.55,0.82,10.09,5.84,54.79,20.58,...,1,0,0,0,0,0,0,0,0,2021-09-15
4767,CDXC,454.29,30.4,8.22,7.97,0.48,11.75,6.14,23.69,25.85,...,0,0,0,0,1,0,0,0,0,2021-09-15


# Include Technical and Performance Indicators

In [78]:
signal_files = [x for x in files if ((x.startswith('signal')) | (x.startswith('dv'))) & (x.endswith('.csv'))]

def technical_signals(signal_files, directory, df_all, dv=False):
    # iterate through all technical indicators
    for i in signal_files:
        # for each stock flagged by the specified technical indicator in the filename, join to main df
        df_signal = pd.read_csv(f"{directory}/{i}").iloc[:, 1:]
        signal_col = i.replace('.csv','')
        # 1 for technical indicator satisfied
        df_signal[signal_col] = 1
        df_all = df_all.merge(df_signal, on='Ticker', how='left')
        # 0 if technical indicator not satisfied
        df_all[signal_col] = df_all[signal_col].replace(np.nan, 0)
        # replace today with yesterday 
        if dv == False:
            df_all.columns = df_all.columns.str.replace('today', 'yesterday')          
    return df_all
    
df_final = technical_signals(signal_files, f"{directory}/{train_dt}", df_stocks)    
df_final

Unnamed: 0,Ticker,Market_Cap,Institutional_Ownership,Float_Short,Short_Ratio,Average_True_Range,Volatility_Week,Volatility_Month,Relative_Strength_Index_14,Change_from_Open,...,dv_yesterday_up_5perc,dv_yesterday_up,dv_yesterday_down,dv_yesterday_down_5perc,signal_price_above_20daysma,signal_price_crossed_above_20daysma,signal_lt_3perc_below_high,signal_lt_3perc_above_low,signal_doji,signal_hammer
0,PNRG,104.97,3.9,0.46,0.53,4.07,15.49,5.22,51.06,-14.32,...,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1,DNMR,1529.56,48.1,12.06,4.98,1.63,11.36,9.92,36.05,-13.61,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,STON,225.38,85.5,0.89,3.43,0.18,10.44,6.52,23.11,-13.57,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CSCW,83.26,1.2,3.89,0.89,0.08,11.33,8.63,48.26,-13.39,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,MOSY,50.39,16.8,4.98,0.23,0.46,9.77,7.45,47.41,-13.35,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3899,TIL,2617.97,55.0,4.59,21.57,1.58,12.19,8.37,53.45,15.98,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3900,OPAD,670.39,93.1,30.46,6.95,1.37,32.42,12.00,68.57,16.91,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3901,MPX,496.06,15.1,4.11,9.55,0.82,10.09,5.84,54.79,20.58,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0
3902,CDXC,454.29,30.4,8.22,7.97,0.48,11.75,6.14,23.69,25.85,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Dependent Variable (Close Up or Down Today)

In [79]:
dv_files = [x for x in os.listdir(f"{directory}/{dv_dt}") if (x.startswith('dv')) & (x.endswith('.csv'))]
df_final = technical_signals(dv_files, f"{directory}/{dv_dt}", df_final, dv=True)    
df_final

Unnamed: 0,Ticker,Market_Cap,Institutional_Ownership,Float_Short,Short_Ratio,Average_True_Range,Volatility_Week,Volatility_Month,Relative_Strength_Index_14,Change_from_Open,...,signal_price_above_20daysma,signal_price_crossed_above_20daysma,signal_lt_3perc_below_high,signal_lt_3perc_above_low,signal_doji,signal_hammer,dv_today_down,dv_today_down_5perc,dv_today_up,dv_today_up_5perc
0,PNRG,104.97,3.9,0.46,0.53,4.07,15.49,5.22,51.06,-14.32,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,DNMR,1529.56,48.1,12.06,4.98,1.63,11.36,9.92,36.05,-13.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,STON,225.38,85.5,0.89,3.43,0.18,10.44,6.52,23.11,-13.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,CSCW,83.26,1.2,3.89,0.89,0.08,11.33,8.63,48.26,-13.39,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,MOSY,50.39,16.8,4.98,0.23,0.46,9.77,7.45,47.41,-13.35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3899,TIL,2617.97,55.0,4.59,21.57,1.58,12.19,8.37,53.45,15.98,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3900,OPAD,670.39,93.1,30.46,6.95,1.37,32.42,12.00,68.57,16.91,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3901,MPX,496.06,15.1,4.11,9.55,0.82,10.09,5.84,54.79,20.58,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3902,CDXC,454.29,30.4,8.22,7.97,0.48,11.75,6.14,23.69,25.85,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [83]:
conditions = [df_final['dv_today_up'] == 1, df_final['dv_today_down'] == 1]
choices = [1, 0]
df_final['dv_up_or_down'] = np.select(conditions, choices, default=np.nan)

Unnamed: 0,Ticker,Market_Cap,Institutional_Ownership,Float_Short,Short_Ratio,Average_True_Range,Volatility_Week,Volatility_Month,Relative_Strength_Index_14,Change_from_Open,...,signal_price_crossed_above_20daysma,signal_lt_3perc_below_high,signal_lt_3perc_above_low,signal_doji,signal_hammer,dv_today_down,dv_today_down_5perc,dv_today_up,dv_today_up_5perc,dv_up_or_down
0,PNRG,104.97,3.9,0.46,0.53,4.07,15.49,5.22,51.06,-14.32,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,DNMR,1529.56,48.1,12.06,4.98,1.63,11.36,9.92,36.05,-13.61,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,STON,225.38,85.5,0.89,3.43,0.18,10.44,6.52,23.11,-13.57,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,CSCW,83.26,1.2,3.89,0.89,0.08,11.33,8.63,48.26,-13.39,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,MOSY,50.39,16.8,4.98,0.23,0.46,9.77,7.45,47.41,-13.35,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3899,TIL,2617.97,55.0,4.59,21.57,1.58,12.19,8.37,53.45,15.98,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3900,OPAD,670.39,93.1,30.46,6.95,1.37,32.42,12.00,68.57,16.91,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3901,MPX,496.06,15.1,4.11,9.55,0.82,10.09,5.84,54.79,20.58,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3902,CDXC,454.29,30.4,8.22,7.97,0.48,11.75,6.14,23.69,25.85,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [88]:
conditions = [df_final['dv_today_up_5perc'] == 1, df_final['dv_today_down_5perc'] == 1]
choices = [1, 0]
df_final['dv_up_or_down_5perc'] = np.select(conditions, choices, default=np.nan)
df_final['dv_up_or_down_5perc']
df_final

Unnamed: 0,Ticker,Market_Cap,Institutional_Ownership,Float_Short,Short_Ratio,Average_True_Range,Volatility_Week,Volatility_Month,Relative_Strength_Index_14,Change_from_Open,...,signal_lt_3perc_below_high,signal_lt_3perc_above_low,signal_doji,signal_hammer,dv_today_down,dv_today_down_5perc,dv_today_up,dv_today_up_5perc,dv_up_or_down,dv_up_or_down_5perc
0,PNRG,104.97,3.9,0.46,0.53,4.07,15.49,5.22,51.06,-14.32,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,
1,DNMR,1529.56,48.1,12.06,4.98,1.63,11.36,9.92,36.05,-13.61,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
2,STON,225.38,85.5,0.89,3.43,0.18,10.44,6.52,23.11,-13.57,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,
3,CSCW,83.26,1.2,3.89,0.89,0.08,11.33,8.63,48.26,-13.39,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,
4,MOSY,50.39,16.8,4.98,0.23,0.46,9.77,7.45,47.41,-13.35,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3899,TIL,2617.97,55.0,4.59,21.57,1.58,12.19,8.37,53.45,15.98,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,
3900,OPAD,670.39,93.1,30.46,6.95,1.37,32.42,12.00,68.57,16.91,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
3901,MPX,496.06,15.1,4.11,9.55,0.82,10.09,5.84,54.79,20.58,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3902,CDXC,454.29,30.4,8.22,7.97,0.48,11.75,6.14,23.69,25.85,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [89]:
df_final.columns.tolist()

['Ticker',
 'Market_Cap',
 'Institutional_Ownership',
 'Float_Short',
 'Short_Ratio',
 'Average_True_Range',
 'Volatility_Week',
 'Volatility_Month',
 'Relative_Strength_Index_14',
 'Change_from_Open',
 'Average_Volume',
 'Relative_Volume',
 'Price',
 'After_Hours_Change',
 'Sector_Basic_Materials',
 'Sector_Communication_Services',
 'Sector_Consumer_Cyclical',
 'Sector_Consumer_Defensive',
 'Sector_Energy',
 'Sector_Financial',
 'Sector_Healthcare',
 'Sector_Industrials',
 'Sector_Real_Estate',
 'Sector_Technology',
 'Sector_Utilities',
 'Date',
 'signal_top_gainers',
 'signal_top_losers',
 'signal_new_high',
 'signal_new_low',
 'signal_most_volatile',
 'signal_most_active',
 'signal_unusual_volume',
 'signal_overbought',
 'signal_oversold',
 'signal_downgrades',
 'signal_upgrades',
 'signal_earnings_before',
 'signal_recent_insider_buying',
 'signal_recent_insider_selling',
 'signal_major_news',
 'signal_horizontal',
 'signal_tl_resistance',
 'signal_tl_support',
 'signal_wedge_up',


# Check Fill Rate of Features

In [90]:
def feature_summary(df):
    """
    Get fill count and descriptvie stats for each feature
    """
    sum_stats = pd.DataFrame(df.describe())
    num_non_zero = pd.DataFrame((df != 0).astype(int).sum()).T
    num_non_zero = num_non_zero.rename(index={0: 'Num_non_zeros'})
    final_summary = pd.concat([sum_stats, num_non_zero], axis=0)[1:]
    final_summary = final_summary.T.reset_index().rename(columns={'index': 'Feature'})
    return final_summary

feat_summary = feature_summary(df_final)
feat_summary

Unnamed: 0,Feature,mean,std,min,25%,50%,75%,max,Num_non_zeros
0,Market_Cap,13155.336701,86782.965797,5.95,249.175,957.79,4721.7575,2547828.78,3904.0
1,Institutional_Ownership,62.668686,29.060791,0.02,40.000,70.50,88.2000,100.00,3904.0
2,Float_Short,4.324816,5.090182,0.00,1.190,2.58,5.5700,45.26,3899.0
3,Short_Ratio,4.173968,4.072618,0.00,1.720,3.20,5.3000,52.62,3902.0
4,Average_True_Range,2.751148,73.091217,0.02,0.300,0.74,1.6600,4562.31,3904.0
...,...,...,...,...,...,...,...,...,...
70,dv_today_up_5perc,0.035605,0.185326,0.00,0.000,0.00,0.0000,1.00,139.0
71,dv_up_or_down,0.475813,0.499481,0.00,0.000,0.00,1.0000,1.00,1921.0
72,dv_up_or_down_5perc,0.655660,0.476277,0.00,0.000,1.00,1.0000,1.00,3831.0
73,Ticker,,,,,,,,3904.0


# Logistic Regression

In [95]:
# get features and dv
other_feats = ['Market_Cap', 'Institutional_Ownership', 'Float_Short', 'Short_Ratio', 'Average_True_Range', 'Volatility_Week',
               'Volatility_Month', 'Relative_Strength_Index_14', 'Change_from_Open', 'Average_Volume', 'Relative_Volume', 'Price']
signal_feats = [x for x in df_final.columns if x.startswith('signal')]
prior_dv = [x for x in df_final.columns if 'yesterday' in x]
final_feats = signal_feats + prior_dv + other_feats
dv = ['dv_up_or_down']

# create data
train_df = df_final[~df_final[dv[0]].isna()]
x_train = sm.add_constant(train_df[final_feats])
y_train = train_df[dv]

# fit model
model = sm.GLM(y_train, x_train, family=sm.families.Binomial())
res = model.fit()

res.summary()

0,1,2,3
Dep. Variable:,dv_up_or_down,No. Observations:,3783.0
Model:,GLM,Df Residuals:,3727.0
Model Family:,Binomial,Df Model:,55.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2516.3
Date:,"Thu, 16 Sep 2021",Deviance:,5032.7
Time:,21:20:55,Pearson chi2:,3810.0
No. Iterations:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0032,0.373,-0.009,0.993,-0.735,0.728
signal_top_gainers,-0.3971,0.322,-1.232,0.218,-1.029,0.235
signal_top_losers,0.9195,0.771,1.192,0.233,-0.592,2.431
signal_new_high,-0.3973,0.287,-1.386,0.166,-0.959,0.165
signal_new_low,0.1275,0.284,0.449,0.654,-0.430,0.685
signal_most_volatile,-0.1572,0.251,-0.626,0.531,-0.649,0.335
signal_most_active,-0.3551,0.272,-1.305,0.192,-0.888,0.178
signal_unusual_volume,0.2075,0.360,0.576,0.565,-0.499,0.914
signal_overbought,-0.1752,0.496,-0.353,0.724,-1.148,0.797


In [97]:
# training performance
y_train_pred = res.predict(x_train)
print(recall_score(y_train, (y_train_pred > 0.5)*1))
print(precision_score(y_train, (y_train_pred > 0.5)*1))
print(f1_score(y_train, (y_train_pred > 0.5)*1))
print(roc_auc_score(y_train, (y_train_pred > 0.5)*1))

0.4811111111111111
0.6115819209039548
0.5385572139303483
0.6018767860144562


In [None]:
# theres probably a lot of colinearity going on here
# will try the following
    # tree based methods
    # glmnet 
    # pca