In [15]:
'''
Loads necessary packages
'''

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from finta import TA
from utils.append_indicators import append_indicators
import pickle

In [16]:
'''
Creates MySQL connection object
'''

engine = create_engine(
    'mysql://Quotermain:Quotermain233@192.168.0.105:3306/trading_data'
)

In [17]:
'''
Creates collections with timeframes 
for candles and indicators
'''

dict_of_tf = {
    '1_': 480, #problem
    '4_': 120,
    '15_': 32,
    '30_': 16, #problem
    '2_': 240, #problem
    '120_': 4,
    '20_': 24, #problem
    '240_': 2,
    '5_': 96,
    '6_': 80,
    '10_': 48, #problem
    '3_': 160,
    '60_': 8
}

list_with_indicators = [
    'SMA', 'SMM', 'EMA_13', 'EMA_26', 'EMA_DIF', 'DEMA', 'TEMA', 'TRIMA', 'TRIX',
    'VAMA', 'ER', 'ZLEMA', 'WMA', 'HMA', 'EVWMA', 'VWAP', 'SMMA', 'MOM',
    'ROC', 'RSI', 'IFT_RSI', 'TR', 'ATR', 'BBWIDTH', 'PERCENT_B', 'ADX', 'STOCH', 
    'STOCHD', 'STOCHRSI', 'WILLIAMS', 'UO', 'AO', 'TP', 'ADL', 'CHAIKIN', 'MFI',
    'OBV', 'WOBV', 'VZO', 'EFI', 'CFI', 'EMV', 'CCI', 'COPP', 'CMO', 'FISH', 
    'SQZMI', 'VPT', 'FVE', 'VFI', 'MSD', 'return'
]

In [20]:
'''
Reads the LIMITED data for SBER,
sets the datetime index, drops
duplicates and nulls
'''

df = pd.read_sql('SELECT * FROM SBER_train LIMIT 100000', engine)
df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce')
df.dropna(inplace=True)
df = df.set_index('date_time')
df.drop_duplicates(inplace=True)

In [23]:
'''
Appends columns with target variable
as max distance to low and high during
time_range
'''

df['dist_to_max_per_range'] = np.array(df[['close']]\
    .iloc[::-1].rolling(30, min_periods=1).max().iloc[::-1])\
    - np.array(df[['close']])

df['dist_to_min_per_range'] = np.array(df[['close']])\
    - np.array(df[['close']]\
    .iloc[::-1].rolling(30, min_periods=1).min().iloc[::-1])

In [25]:
'''
Calculates proportion of each row 
in order book to the apropriate 
section(bid or offer)
'''

df_offer_count_proportion = df.loc[:, 'offer_count_10':'offer_count_1']\
    .div(df.loc[:, 'offer_count_10':'offer_count_1'].sum(axis=1), axis=0)

df_bid_count_proportion = df.loc[:, 'bid_count_10':'bid_count_1']\
    .div(df.loc[:, 'bid_count_10':'bid_count_1'].sum(axis=1), axis=0)

In [26]:
'''
Calculates offer/bid ratio per row
and drops columns with separate bids
and asks
'''

offer_bid_ratio = pd.DataFrame(df.loc[:, 'offer_count_10':'offer_count_1'].sum(axis=1) /\
    df.loc[:, 'bid_count_10':'bid_count_1'].sum(axis=1))

df = df.drop([
    'offer_count_10', 'offer_count_9', 'offer_count_8', 'offer_count_7',
    'offer_count_6', 'offer_count_5', 'offer_count_4', 'offer_count_3',
    'offer_count_2', 'offer_count_1', 'bid_count_10', 'bid_count_9', 
    'bid_count_8', 'bid_count_7',
    'bid_count_6', 'bid_count_5', 'bid_count_4', 'bid_count_3',
    'bid_count_2', 'bid_count_1'], axis = 1)

In [27]:
'''
Concatenates single df for analysis
and drops nulls
'''

list_of_dfs = [
    df,
    df_offer_count_proportion, 
    df_bid_count_proportion, 
    offer_bid_ratio
]

temp_df = pd.concat(list_of_dfs, axis=1)

temp_df = temp_df.dropna()

In [28]:
'''
Appends indicators and drops nulls
'''

for key in dict_of_tf:
    temp_df = append_indicators(
        temp_df, key, list_with_indicators
    )

temp_df = temp_df.dropna()

  (log((1 + _smooth) / (1 - _smooth))).ewm(span=3).mean(),


In [29]:
'''
Copies the df with uploaded indicators
to avoid waiting
'''

df_to_analyze = temp_df.copy()

In [30]:
'''
Creates column to indicate movement above and below
median movement of the price as the target variable
'''

conditions = [
    np.logical_and(
        df_to_analyze['dist_to_max_per_range'] > np.percentile(
            df_to_analyze['dist_to_max_per_range'], 50
        ),
        df_to_analyze['dist_to_min_per_range'] < np.percentile(
            df_to_analyze['dist_to_min_per_range'], 50
        )
    ),
    np.logical_and(
        df_to_analyze['dist_to_max_per_range'] < np.percentile(
            df_to_analyze['dist_to_max_per_range'], 50
        ),
        df_to_analyze['dist_to_min_per_range'] > np.percentile(
            df_to_analyze['dist_to_min_per_range'], 50
        )
    )
]

choices = ['up', 'down']
df_to_analyze['y'] = np.select(conditions, choices, default='nothing')
df_to_analyze.y=df_to_analyze.y.shift(-1)
df_to_analyze = df_to_analyze.dropna()

In [None]:
'''
Splits the data into features and targets
and further splits it into train and test
'''

X = df_to_analyze.drop(['dist_to_max_per_range', 'dist_to_min_per_range', 'y'], axis=1)
y = df_to_analyze.y

#Creates the oldest data as the train set and the newest as the test set
train_size = int(df_to_analyze.shape[0] * 0.75)
X_train = X.iloc[:train_size, :]
y_train = y[:train_size]
X_test = X.iloc[train_size:, :]
y_test = y.iloc[train_size:]

In [6]:


clf_rf = RandomForestClassifier(
    n_estimators = 300 ,
    max_depth = 9,
    min_samples_split = 3,
    min_samples_leaf = 2,
    n_jobs = -1
)

clf_rf.fit(X_train, y_train)

sel = SelectFromModel(clf_rf)
sel.fit(X_train, y_train)

X_important_train = sel.transform(X_train)
X_important_test = sel.transform(X_test)

clf_important = RandomForestClassifier(
    n_estimators = 9,
    max_depth = 9,
    min_samples_split = 3,
    min_samples_leaf = 2,
    n_jobs = -1
)

clf_important.fit(X_important_train, y_train)

y_pred = clf_rf.predict(X_test)
y_important_pred = clf_important.predict(X_important_test)

file_with_model = asset + '_model.sav'
pickle.dump(clf_important, open(file_with_model, 'wb'))

file_with_features = asset + '_features.sav'
pickle.dump(X_train.columns[sel.get_support()], open(file_with_features, 'wb'))

print(asset)
print('Clf')
print(classification_report(y_test, y_pred))
print('Clf_important')
print(classification_report(y_test, y_important_pred))
print(X_train.columns[sel.get_support()])
print()
break

  (log((1 + _smooth) / (1 - _smooth))).ewm(span=3).mean(),


SBER
Clf


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

        down       0.45      0.51      0.48      3309
     nothing       0.00      0.00      0.00      1832
          up       0.42      0.61      0.49      3045

   micro avg       0.43      0.43      0.43      8186
   macro avg       0.29      0.37      0.33      8186
weighted avg       0.34      0.43      0.38      8186

Clf_important
              precision    recall  f1-score   support

        down       0.43      0.41      0.42      3309
     nothing       0.24      0.19      0.22      1832
          up       0.39      0.46      0.42      3045

   micro avg       0.38      0.38      0.38      8186
   macro avg       0.35      0.35      0.35      8186
weighted avg       0.37      0.38      0.37      8186

Index(['5_open', '5_volume', '6_open', '6_volume', '10_open', '10_high',
       '10_volume', '15_volume', '20_volume', '30_volume',
       ...
       '60_ADL', '60_CHAIKIN', '60_OBV', '60_WOBV', '60_EFI', '60_CFI',
       '6

In [13]:
list(X_train.columns[sel.get_support()])

['5_open',
 '5_volume',
 '6_open',
 '6_volume',
 '10_open',
 '10_high',
 '10_volume',
 '15_volume',
 '20_volume',
 '30_volume',
 '60_volume',
 '120_open',
 '120_volume',
 '240_high',
 '240_volume',
 '1_SMA',
 '1_TRIMA',
 '1_VWAP',
 '1_SMMA',
 '1_ADX',
 '1_ADL',
 '1_OBV',
 '1_WOBV',
 '1_CFI',
 '1_EMV',
 '1_VPT',
 '1_VFI',
 '4_EVWMA',
 '4_VWAP',
 '4_SMMA',
 '4_TR',
 '4_ADX',
 '4_AO',
 '4_ADL',
 '4_OBV',
 '4_WOBV',
 '4_CFI',
 '4_EMV',
 '4_FISH',
 '4_VPT',
 '4_VFI',
 '15_VWAP',
 '15_TR',
 '15_ATR',
 '15_ADX',
 '15_AO',
 '15_TP',
 '15_ADL',
 '15_CHAIKIN',
 '15_OBV',
 '15_WOBV',
 '15_EFI',
 '15_CFI',
 '15_EMV',
 '15_FISH',
 '15_VPT',
 '15_VFI',
 '30_VWAP',
 '30_SMMA',
 '30_TR',
 '30_ATR',
 '30_ADX',
 '30_STOCH',
 '30_WILLIAMS',
 '30_UO',
 '30_AO',
 '30_TP',
 '30_ADL',
 '30_CHAIKIN',
 '30_OBV',
 '30_WOBV',
 '30_EFI',
 '30_CFI',
 '30_EMV',
 '30_FISH',
 '30_VPT',
 '30_VFI',
 '2_EVWMA',
 '2_VWAP',
 '2_ADX',
 '2_AO',
 '2_ADL',
 '2_OBV',
 '2_WOBV',
 '2_CFI',
 '2_EMV',
 '2_FISH',
 '2_VPT',
 '2_VFI'

In [14]:
clf_important.feature_importances_

array([0.00226871, 0.0030992 , 0.00044537, 0.00300653, 0.00151642,
       0.0037976 , 0.00138416, 0.00559962, 0.00389098, 0.01052893,
       0.0077664 , 0.00150352, 0.00412202, 0.00269028, 0.00350319,
       0.00242328, 0.00601214, 0.00456422, 0.00450119, 0.00228785,
       0.00799021, 0.01164905, 0.00452787, 0.00489151, 0.00456277,
       0.00503516, 0.00367892, 0.00795708, 0.0033358 , 0.00419946,
       0.00199037, 0.00708916, 0.00564659, 0.00304962, 0.00787863,
       0.00234641, 0.00606174, 0.00289602, 0.00259313, 0.00746668,
       0.00711167, 0.00536263, 0.00614678, 0.00374779, 0.00439107,
       0.00720966, 0.00224892, 0.00375214, 0.00140641, 0.00165306,
       0.01215079, 0.00284448, 0.00333391, 0.00808304, 0.00145467,
       0.00553978, 0.00501615, 0.00475353, 0.00241703, 0.00559361,
       0.00435671, 0.00390439, 0.00119335, 0.00742131, 0.00258663,
       0.00340881, 0.00139658, 0.00572459, 0.00466908, 0.0074943 ,
       0.0051658 , 0.00218064, 0.00422776, 0.0054814 , 0.00610