In [43]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from finta import TA
from utils.append_indicators import append_indicators

In [44]:
engine = create_engine('mysql://Quotermain:Quotermain233@192.168.0.105:3306/trading_data')

In [45]:
assets = [
	'ALRS', 
    'CHMF', 
    'GAZP', 
    'GMKN', 
    'HYDR', 
    'LKOH', 
    'MGNT', 
    'MOEX', 
    'MTLR', 
    'MTSS', 
	'NVTK', 
    'ROSN', 
    'RTKM', 
    'SBER', 
    'SBERP', 
    'SIBN', 
    'SNGS', 
    'SNGSP', 
    'TATN', 
    'YNDX'
]
dict_of_tf = {
    '1_': 480, #problem
    '4_': 120,
    '15_': 32,
    '30_': 16, #problem
    '2_': 240, #problem
    '120_': 4,
    '20_': 24, #problem
    '240_': 2,
    '5_': 96,
    '6_': 80,
    '10_': 48, #problem
    '3_': 160,
    '60_': 8
}

list_with_indicators = [
            'SMA', 'SMM', 'EMA_13', 'EMA_26', 'EMA_DIF', 'DEMA', 'TEMA', 'TRIMA', 'TRIX',
            'VAMA', 'ER', 'ZLEMA', 'WMA', 'HMA', 'EVWMA', 'VWAP', 'SMMA', 'MOM',
            'ROC', 'RSI', 'IFT_RSI', 'TR', 'ATR', 'BBWIDTH', 'PERCENT_B', 'ADX', 'STOCH', 
            'STOCHD', 'STOCHRSI', 'WILLIAMS', 'UO', 'AO', 'TP', 'ADL', 'CHAIKIN', 'MFI',
            'OBV', 'WOBV', 'VZO', 'EFI', 'CFI', 'EMV', 'CCI', 'COPP', 'CMO', 'FISH', 
            'SQZMI', 'VPT', 'FVE', 'VFI', 'MSD', 'return'
        ]

In [46]:
df = pd.read_sql('SELECT * FROM ' + 'ALRS' + '_train', engine)
df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce')
df = df.dropna()
df = df.set_index('date_time')
df = df.drop_duplicates()

In [47]:
df['dist_to_max_per_range'] = np.array(df[['close']]\
    .iloc[::-1].rolling(100, min_periods=1).max().iloc[::-1])\
    - np.array(df[['close']])
df['dist_to_min_per_range'] = np.array(df[['close']])\
    - np.array(df[['close']]\
    .iloc[::-1].rolling(100, min_periods=1).min().iloc[::-1])

In [48]:
#Calculates proportion of each row in order book to the apropriate section(bid or offer)
df_offer_count_proportion = df.loc[:, 'offer_count_10':'offer_count_1']\
    .div(df.loc[:, 'offer_count_10':'offer_count_1'].sum(axis=1), axis=0)
df_bid_count_proportion = df.loc[:, 'bid_count_10':'bid_count_1']\
    .div(df.loc[:, 'bid_count_10':'bid_count_1'].sum(axis=1), axis=0)
#Calculates offer/bid ratio per row
offer_bid_ratio = pd.DataFrame(df.loc[:, 'offer_count_10':'offer_count_1'].sum(axis=1) /\
    df.loc[:, 'bid_count_10':'bid_count_1'].sum(axis=1))
df = df.drop([
    'offer_count_10', 'offer_count_9', 'offer_count_8', 'offer_count_7',
    'offer_count_6', 'offer_count_5', 'offer_count_4', 'offer_count_3',
    'offer_count_2', 'offer_count_1', 'bid_count_10', 'bid_count_9', 
    'bid_count_8', 'bid_count_7',
    'bid_count_6', 'bid_count_5', 'bid_count_4', 'bid_count_3',
    'bid_count_2', 'bid_count_1'], axis = 1)

In [49]:
#Concatenates single df for analysis
list_of_dfs = [
    df,
    df_offer_count_proportion, 
    df_bid_count_proportion, 
    offer_bid_ratio
]
df_to_analyze = pd.concat(list_of_dfs, axis=1)

In [50]:
df_to_analyze = df_to_analyze.dropna()

In [51]:
for key in dict_of_tf:
    df_to_analyze = append_indicators(
        df_to_analyze, key, list_with_indicators
    )

  (log((1 + _smooth) / (1 - _smooth))).ewm(span=3).mean(),


In [52]:
df_to_analyze = df_to_analyze.dropna()

In [53]:
df_to_analyze.shape

(8815, 772)

In [54]:
#df_to_analyze = df_to_analyze.resample('1T').first()

In [55]:
conditions = [
    np.logical_and(
        df['dist_to_max_per_range'] > np.percentile(df['dist_to_max_per_range'], 70),
        df['dist_to_min_per_range'] < np.percentile(df['dist_to_min_per_range'], 20)
    ),
    np.logical_and(
        df['dist_to_max_per_range'] < np.percentile(df['dist_to_max_per_range'], 20),
        df['dist_to_min_per_range'] > np.percentile(df['dist_to_min_per_range'], 70)
    )
]
choices = ['up', 'down']
df['y'] = np.select(conditions, choices, default='nothing')
df.y=df.y.shift(-1) # shifting back because we want to predict using current state
df = df.dropna()

X = df.drop(['dist_to_max_per_range', 'dist_to_min_per_range', 'y'], axis=1)
y = df.y

X_train, X_test, y_train, y_test = train_test_split(X, y)

clf_rf = RandomForestClassifier(
    n_estimators = 9,
    max_depth = 9,
    min_samples_split = 3,
    min_samples_leaf = 2,
    n_jobs = -1
)

clf_rf.fit(X_train, y_train)

sel = SelectFromModel(clf_rf)
sel.fit(X_train, y_train)

X_important_train = sel.transform(X_train)
X_important_test = sel.transform(X_test)

clf_important = RandomForestClassifier(
    n_estimators = 9,
    max_depth = 9,
    min_samples_split = 3,
    min_samples_leaf = 2,
    n_jobs = -1
)

clf_important.fit(X_important_train, y_train)

y_pred = clf_rf.predict(X_test)
y_important_pred = clf_important.predict(X_important_test)

#print(asset)
print('Clf')
print(classification_report(y_test, y_pred))
print('Clf_important')
print(classification_report(y_test, y_important_pred))
print(X_train.columns[sel.get_support()])
print()

Clf
              precision    recall  f1-score   support

        down       0.90      0.52      0.66       337
     nothing       0.84      0.98      0.90      1933
          up       0.87      0.38      0.53       341

   micro avg       0.84      0.84      0.84      2611
   macro avg       0.87      0.63      0.70      2611
weighted avg       0.85      0.84      0.82      2611

Clf_important
              precision    recall  f1-score   support

        down       0.88      0.54      0.67       337
     nothing       0.85      0.97      0.90      1933
          up       0.85      0.44      0.58       341

   micro avg       0.85      0.85      0.85      2611
   macro avg       0.86      0.65      0.72      2611
weighted avg       0.85      0.85      0.83      2611

Index(['offer_price_8', 'offer_price_7', 'offer_price_1', '4_open', '6_low',
       '6_volume', '10_low', '10_volume', '15_open', '15_volume', '20_open',
       '20_high', '20_volume', '30_open', '30_high', '30_volume', 

In [56]:
df_to_analyze[['1_SMA', '1_SMM', '1_EMA_13', '1_EMA_26', '1_EMA_DIF', '1_DEMA',
       '1_TEMA', '1_TRIMA', '1_TRIX', '1_VAMA', '1_ER', '1_KAMA', '1_ZLEMA',
       '1_WMA', '1_HMA', '1_MOM', '1_ROC', '1_RSI', '1_IFT_RSI', '1_ATR',
       '1_BBWIDTH', '1_PERCENT_B', '1_ADX', '1_STOCH', '1_STOCHD',
       '1_STOCHRSI', '1_WILLIAMS', '1_UO', '1_AO', '1_ADL', '1_CHAIKIN',
       '1_MFI', '1_WOBV', '1_EFI', '1_CFI', '1_EMV', '1_CCI', '1_COPP',
       '1_VPT', '1_FVE', '1_VFI', '1_MSD', '1_return']].isna().sum()

KeyError: "['1_KAMA'] not in index"