In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from finta import TA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sqlalchemy import create_engine

from utils.append_indicators import append_indicators

In [2]:
# SECURITY: database credentials must not live in the notebook. The connection
# URL is read from the TRADING_DB_URL environment variable; the hard-coded
# literal below is kept only as a backward-compatible fallback and should be
# removed (and the password rotated) once the variable is set everywhere.
engine = create_engine(
    os.environ.get(
        'TRADING_DB_URL',
        'mysql://Quotermain:Quotermain233@192.168.0.105:3306/trading_data',
    )
)

In [None]:
# Tickers to process. Each name is used to build the `<ticker>_train` table
# name and the `<ticker>_model.sav` / `<ticker>_features.sav` output files.
# Every entry (commented out or not) ends with a comma: without it, enabling
# the line after an active entry would silently concatenate the two adjacent
# string literals (e.g. 'SBER' 'SBERP' -> 'SBERSBERP').
assets = [
    #'ALRS',
    #'CHMF',
    #'GAZP',
    #'GMKN',
    #'HYDR',
    #'LKOH',
    #'MGNT',
    #'MOEX',
    #'MTLR',
    #'MTSS',
    #'NVTK',
    #'ROSN',
    #'RTKM',
    'SBER',
    #'SBERP',
    #'SIBN',
    #'SNGS',
    #'SNGSP',
    #'TATN',
    #'YNDX',
]
# Timeframe table. Keys are the timeframe tags handed to `append_indicators`
# (only the keys are read in this cell); every value is 480 divided by the
# number in its key -- presumably bars per 480-minute session, TODO confirm.
# The tuple order is the iteration order and must not be changed.
# The original author tagged the 1, 30, 2, 20 and 10 entries with "#problem"
# (no further explanation was recorded).
dict_of_tf = {
    str(minutes) + '_': 480 // minutes
    for minutes in (1, 4, 15, 30, 2, 120, 20, 240, 5, 6, 10, 3, 60)
}

# Indicator names passed to `append_indicators` for every timeframe.
# Order is preserved exactly; how each name is resolved is decided inside
# `append_indicators` (presumably finta `TA` functions plus derived columns
# such as 'EMA_DIF' and 'return' -- verify against that helper).
list_with_indicators = """
    SMA SMM EMA_13 EMA_26 EMA_DIF DEMA TEMA TRIMA TRIX
    VAMA ER ZLEMA WMA HMA EVWMA VWAP SMMA MOM
    ROC RSI IFT_RSI TR ATR BBWIDTH PERCENT_B ADX STOCH
    STOCHD STOCHRSI WILLIAMS UO AO TP ADL CHAIKIN MFI
    OBV WOBV VZO EFI CFI EMV CCI COPP CMO FISH
    SQZMI VPT FVE VFI MSD return
""".split()

# Fixed seed so both forests (and the clone that SelectFromModel refits) give
# the same result on every Restart & Run All.
RANDOM_STATE = 42

# Raw order-book depth columns; they are replaced by per-side proportions.
ORDER_BOOK_COUNT_COLUMNS = (
    ['offer_count_%d' % level for level in range(10, 0, -1)]
    + ['bid_count_%d' % level for level in range(10, 0, -1)]
)

# For every ticker: load ticks, engineer features, label rows as
# 'up' / 'down' / 'nothing', train a full forest, keep only the features it
# finds important, train a small forest on those and pickle model + features.
for asset in assets:
    # ---- Load ---------------------------------------------------------
    # `asset` comes from the hard-coded list above, so building the table name
    # by concatenation is not exposed to untrusted input.
    df = pd.read_sql('SELECT * FROM ' + asset + '_train LIMIT 100000', engine)
    df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce')
    df = df.dropna()  # also removes rows whose date_time failed to parse (NaT)
    df = df.set_index('date_time')
    # The query has no ORDER BY, so row order is not guaranteed. The rolling
    # windows and the chronological train/test split below both rely on time
    # order; a stable sort leaves already-ordered data unchanged.
    df = df.sort_index(kind='mergesort')
    # NOTE(review): drop_duplicates compares column values only (the index is
    # ignored), so rows at different timestamps with identical values are
    # dropped as well -- confirm this is intended.
    df = df.drop_duplicates()

    # ---- Label inputs ---------------------------------------------------
    # Reversing before rolling makes the 30-row window look *forward*: each row
    # sees itself and the next 29 rows. These two columns therefore contain
    # future information and are used only to build `y`, never as features.
    close_values = df['close'].values
    forward_window = df['close'].iloc[::-1].rolling(30, min_periods=1)
    df['dist_to_max_per_range'] = forward_window.max().iloc[::-1].values - close_values
    df['dist_to_min_per_range'] = close_values - forward_window.min().iloc[::-1].values

    # ---- Order-book features -------------------------------------------
    offer_counts = df.loc[:, 'offer_count_10':'offer_count_1']
    bid_counts = df.loc[:, 'bid_count_10':'bid_count_1']
    offer_total = offer_counts.sum(axis=1)
    bid_total = bid_counts.sum(axis=1)
    # Share of each order-book level within its own side (offer or bid).
    df_offer_count_proportion = offer_counts.div(offer_total, axis=0)
    df_bid_count_proportion = bid_counts.div(bid_total, axis=0)
    # Offer/bid ratio per row.
    # NOTE(review): built from an unnamed Series, so the resulting column label
    # is the integer 0. Mixed int/str column labels are rejected by recent
    # scikit-learn releases; it is left unchanged here because the pickled
    # feature list may be consumed elsewhere under that label -- confirm
    # before renaming.
    offer_bid_ratio = pd.DataFrame(offer_total / bid_total)
    df = df.drop(ORDER_BOOK_COUNT_COLUMNS, axis=1)

    # Concatenates single df for analysis
    list_of_dfs = [
        df,
        df_offer_count_proportion,
        df_bid_count_proportion,
        offer_bid_ratio,
    ]
    df_to_analyze = pd.concat(list_of_dfs, axis=1)
    df_to_analyze = df_to_analyze.dropna()

    # ---- Technical indicators, one pass per timeframe key ---------------
    for key in dict_of_tf:
        df_to_analyze = append_indicators(
            df_to_analyze, key, list_with_indicators
        )
    df_to_analyze = df_to_analyze.dropna()

    #df_to_analyze = df_to_analyze.resample('1T').first()

    # ---- Target -----------------------------------------------------------
    # 'up'   : far below the upcoming max AND close to the upcoming min.
    # 'down' : close to the upcoming max AND far above the upcoming min.
    # NOTE(review): the percentile thresholds are taken over all rows, i.e.
    # over what later becomes both the train and the test part.
    dist_to_max = df_to_analyze['dist_to_max_per_range']
    dist_to_min = df_to_analyze['dist_to_min_per_range']
    conditions = [
        (dist_to_max > np.percentile(dist_to_max, 70))
        & (dist_to_min < np.percentile(dist_to_min, 30)),
        (dist_to_max < np.percentile(dist_to_max, 30))
        & (dist_to_min > np.percentile(dist_to_min, 70)),
    ]
    choices = ['up', 'down']
    df_to_analyze['y'] = np.select(conditions, choices, default='nothing')
    # shifting back because we want to predict using current state:
    # each row receives the label of the row that follows it.
    df_to_analyze['y'] = df_to_analyze['y'].shift(-1)
    df_to_analyze = df_to_analyze.dropna()  # the last row has no next label

    X = df_to_analyze.drop(['dist_to_max_per_range', 'dist_to_min_per_range', 'y'], axis=1)
    y = df_to_analyze['y']

    # ---- Chronological 75 / 25 split (no shuffling) ----------------------
    train_size = int(df_to_analyze.shape[0] * 0.75)
    X_train = X.iloc[:train_size, :]
    y_train = y.iloc[:train_size]
    X_test = X.iloc[train_size:, :]
    y_test = y.iloc[train_size:]

    # ---- Full model on every feature --------------------------------------
    clf_rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=9,
        min_samples_split=3,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    clf_rf.fit(X_train, y_train)

    # ---- Feature selection ------------------------------------------------
    # SelectFromModel.fit trains its own clone of `clf_rf`; with the fixed
    # seed that clone matches the forest fitted above.
    sel = SelectFromModel(clf_rf)
    sel.fit(X_train, y_train)
    selected_features = X_train.columns[sel.get_support()]

    X_important_train = sel.transform(X_train)
    X_important_test = sel.transform(X_test)

    # ---- Compact model on the selected features ---------------------------
    clf_important = RandomForestClassifier(
        n_estimators=9,
        max_depth=9,
        min_samples_split=3,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=RANDOM_STATE,
    )
    clf_important.fit(X_important_train, y_train)

    y_pred = clf_rf.predict(X_test)
    y_important_pred = clf_important.predict(X_important_test)

    # ---- Persist (context managers close the files even on error) ---------
    file_with_model = asset + '_model.sav'
    with open(file_with_model, 'wb') as model_file:
        pickle.dump(clf_important, model_file)

    file_with_features = asset + '_features.sav'
    with open(file_with_features, 'wb') as features_file:
        pickle.dump(selected_features, features_file)

    # ---- Report -------------------------------------------------------------
    print(asset)
    print('Clf')
    print(classification_report(y_test, y_pred))
    print('Clf_important')
    print(classification_report(y_test, y_important_pred))
    print(selected_features)
    print()

  (log((1 + _smooth) / (1 - _smooth))).ewm(span=3).mean(),


In [82]:
# Sanity check: (rows, columns) of the frame left in `df_to_analyze` by the training loop above.
df_to_analyze.shape

(11788, 75)