In [1]:
import numpy as np
import pandas as pd

from scipy.signal import find_peaks
from datetime import datetime, timedelta
from statsmodels.tsa.api import Holt

from ta import add_all_ta_features
from ta.utils import dropna
from copy import deepcopy
from tqdm.notebook import tqdm

import warnings
# Silences every warning for the rest of the session. This also hides
# deprecation and chained-assignment warnings raised by the libraries used
# below. NOTE(review): consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Show all columns when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw candle export. The file is semicolon-separated and the column
# names are supplied here explicitly, so the first row is read as data.
dataframe = pd.read_csv(
    'MATICUSDT_list.csv',
    sep=';',
    names=['openTime', 'open', 'high', 'low', 'close', 'volume', 'closeTime'],
)

In [3]:
# Number of hold-out rows: one fifth of the data set, rounded down.
num_test = len(dataframe) // 5

In [4]:
# Parse both candle boundary columns into pandas datetimes.
for time_column in ('openTime', 'closeTime'):
    dataframe[time_column] = pd.to_datetime(dataframe[time_column])

In [5]:
# Close prices as a Series keyed by candle close time; this series is what
# the Holt smoother is fitted on.
index = dataframe['closeTime']
data = list(dataframe['close'])
livestock = pd.Series(data=data, index=index)

In [6]:
# Holt smoothing of the close-price series with the exponential-trend variant
# and fixed parameters (optimized=False, so nothing is estimated from data).
# The two positional 0.06 values are presumably the level and trend smoothing
# coefficients — TODO confirm against the installed statsmodels version.
fit = Holt(livestock, exponential=True).fit(0.06, 0.06, optimized=False)

In [7]:
def delete_adjacent_extremums(values, peaks_, mins_, extremums_):
    """Make peaks and minima strictly alternate by thinning same-type runs.

    Walks the extremum positions in ascending order. Whenever several peaks
    follow each other with no minimum in between, only the highest one is
    kept; whenever several minima follow each other with no peak in between,
    only the lowest one is kept. Ties are resolved in favour of the earliest
    position in the run.

    Parameters
    ----------
    values : array-like
        The series the extrema were detected on. It is converted with
        ``np.asarray`` and indexed *positionally*, so a pandas Series with a
        non-integer (e.g. datetime) index is handled correctly.
    peaks_ : array-like of int
        Positions of local maxima in ``values``.
    mins_ : array-like of int
        Positions of local minima in ``values``. Kept for interface
        compatibility: any position in ``extremums_`` that is not listed in
        ``peaks_`` is treated as a minimum.
    extremums_ : array-like of int
        All extremum positions (peaks and minima together).

    Returns
    -------
    (peaks_, mins_, extremums_) : tuple of np.ndarray
        The surviving peak positions, the surviving minimum positions and
        their sorted union. All three are empty arrays for empty input.
    """
    series = np.asarray(values)
    # Sort defensively: run detection below relies on ascending positions.
    extremums_ = np.sort(np.asarray(extremums_, dtype=np.intp))
    # Classify every extremum once, instead of repeated `in` scans.
    is_peak = np.isin(extremums_, np.asarray(peaks_, dtype=np.intp))

    kept_peaks = []
    kept_mins = []
    start = 0
    total = len(extremums_)
    while start < total:
        # Extend the run while the extremum type stays the same; the bound
        # check keeps a same-type run at the very end from overrunning.
        stop = start + 1
        while stop < total and is_peak[stop] == is_peak[start]:
            stop += 1

        run = extremums_[start:stop]
        if is_peak[start]:
            kept_peaks.append(run[np.argmax(series[run])])
        else:
            kept_mins.append(run[np.argmin(series[run])])
        start = stop

    peaks_ = np.array(kept_peaks, dtype=np.intp)
    mins_ = np.array(kept_mins, dtype=np.intp)
    extremums_ = np.sort(np.concatenate([peaks_, mins_]))
    return peaks_, mins_, extremums_

In [8]:
# Reference window: one calendar day, used only to count how many candles
# fall into a day so the minimum peak width can be scaled from it.
day = datetime.strptime('2022-01-03', '%Y-%m-%d')
start_timestamp = pd.Timestamp(day)
finish_timestamp = pd.Timestamp(day + timedelta(days=1))

close_times = dataframe['closeTime']
in_reference_day = (close_times >= start_timestamp) & (close_times <= finish_timestamp)

# Minimum width is 1/70 of the candles in the reference day; minimum
# prominence is 1/1000 of the mean smoothed price.
extremum_width = int(in_reference_day.sum() / 70)
smoothed = fit.fittedvalues
extremum_prominence = smoothed.mean() / 1000

# Minima are found as peaks of the negated curve.
peaks, _ = find_peaks(smoothed, width=extremum_width, prominence=extremum_prominence)
mins, _ = find_peaks(-smoothed, width=extremum_width, prominence=extremum_prominence)
extremums = np.sort(np.concatenate([peaks, mins]))

peaks, mins, extremums = delete_adjacent_extremums(smoothed, peaks, mins, extremums)

In [9]:
# Report how many extrema of each kind remain and check none was dropped
# from the combined array.
print(f"Peaks count: {len(peaks)}")
print(f"Mins count: {len(mins)}")
assert len(peaks) + len(mins) == len(extremums), 'We lost an extremum'

Peaks count: 8811
Mins count: 8811


In [10]:
# Label every row with a binary state that flips right after each extremum.
# The starting state is True when the first extremum is a peak, False when it
# is a minimum. Row i carries the state before the flip caused by an extremum
# located at row i itself, so its label depends on the parity of the number
# of extrema strictly before i.
#
# This is computed in one vectorized pass. Growing an array with np.append
# inside a loop copies it on every iteration and `i in extremums` scans the
# whole extremum array each time, which is quadratic over ~1.4M rows.
signal = bool(peaks[0] < mins[0])
is_extremum = np.isin(np.arange(len(dataframe)), extremums)
flips_before = np.cumsum(is_extremum) - is_extremum.astype(int)
# Stored as float (1.0 / 0.0), the dtype produced by appending booleans to an
# empty float array, so downstream cells see the same values as before.
signals = np.where(flips_before % 2 == 0, signal, not signal).astype(float)

HBox(children=(FloatProgress(value=0.0, max=1455406.0), HTML(value='')))




In [11]:
# Attach the per-row `signals` array to the frame as the `signal` column.
dataframe['signal'] = signals
# Optional export of the labelled frame (disabled):
# dataframe.to_csv('MATICUSDT_signal.csv', sep=';')

In [12]:
# Chronological split: the last `num_test` rows form the test set, everything
# before them the training set. An explicit split position is used instead of
# negative slicing so that num_test == 0 yields an empty test set rather than
# an empty training set.
split_at = len(dataframe) - num_test

# Labels are kept separately from the feature frames. `.copy()` detaches them
# from `dataframe`, and `drop(columns=...)` returns new frames instead of
# mutating slices of the original in place.
signal_train = dataframe['signal'].iloc[:split_at].copy()
signal_test = dataframe['signal'].iloc[split_at:].copy()
dataframe_train = dataframe.iloc[:split_at].drop(columns=['signal'])
dataframe_test = dataframe.iloc[split_at:].drop(columns=['signal'])

In [None]:
# Clean the training candles with ta's dropna helper, derive the full set of
# technical-analysis features from the OHLCV columns, and replace the NaNs
# left in the result with 0.
df_train = add_all_ta_features(
    dropna(dataframe_train),
    open="open",
    high="high",
    low="low",
    close="close",
    volume="volume",
).fillna(0)

In [None]:
# Same feature pipeline as for the training set, applied to the test candles
# on their own: ta's dropna helper, all TA features, remaining NaNs set to 0.
df_test = add_all_ta_features(
    dropna(dataframe_test),
    open="open",
    high="high",
    low="low",
    close="close",
    volume="volume",
).fillna(0)

In [15]:
# Remove every row that contains +inf or -inf in any column. The axis is
# passed by keyword: recent pandas versions no longer accept it positionally
# in DataFrame.any.
before = len(df_train) + len(df_test)
df_train = df_train[~df_train.isin([np.inf, -np.inf]).any(axis=1)]
df_test = df_test[~df_test.isin([np.inf, -np.inf]).any(axis=1)]
after = len(df_train) + len(df_test)
# The number printed is the count of removed *rows*, not of individual cells.
print(str(before - after) + ' infinite values.')

1123 infinite values.


In [16]:
# Re-attach the labels. Assignment aligns on the row index, so rows removed
# from the feature frames by the cleaning steps drop their label as well.
df_train['signal'] = signal_train
df_test['signal'] = signal_test
# Class balance per split. np.bincount only accepts integer input, while the
# label column holds 0.0 / 1.0 floats, hence the explicit cast.
print('Train signal: ' + str(np.bincount(df_train['signal'].astype(int))))
print('Test signal: ' + str(np.bincount(df_test['signal'].astype(int))))

Train signal: [502573 500166]
Test signal: [145127 145954]


In [17]:
# df_train.to_csv('MATICUSDT_TA_train.csv', sep=';')
# df_test.to_csv('MATICUSDT_TA_test.csv', sep=';')

In [18]:
# Replace both timestamp columns with their proleptic Gregorian day ordinal
# so they are plain integers; note that this keeps the date only.
for frame in (df_train, df_test):
    for time_column in ('openTime', 'closeTime'):
        frame[time_column] = frame[time_column].apply(lambda stamp: stamp.toordinal())

In [19]:
# Move the label column out of each feature frame: `pop` returns the column
# and removes it from the frame in one step.
y_train = df_train.pop('signal')
y_test = df_test.pop('signal')

In [20]:
# Sanity check: feature frames and label vectors must have matching lengths.
print(f'Train shape: {df_train.shape}')
print(f'Train signal: {len(y_train)}')
print(f'Test shape: {df_test.shape}')
print(f'Test signal: {len(y_test)}')

Train shape: (1002739, 93)
Train signal: 1002739
Test shape: (291081, 93)
Test signal: 291081


In [21]:
# Shallow random forest (depth 4) with a fixed seed; verbose=2 logs the
# construction of every tree.
clf = RandomForestClassifier(
    max_depth=4,
    random_state=17,
    verbose=2,
)
clf.fit(X=df_train, y=y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 100


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s


building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  3.7min finished


RandomForestClassifier(max_depth=4, random_state=17, verbose=2)

In [22]:
# Score the held-out set. scikit-learn metrics take (y_true, y_pred) in that
# order; accuracy happens to be symmetric, but the conventional order keeps
# this line correct if it is swapped for an asymmetric metric.
preds = clf.predict(df_test)
print(accuracy_score(y_test, preds))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


0.8990315410487115


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.1s finished


In [23]:
# Persist the fitted classifier to disk.
with open('RandomForest_depth_4.pickle', mode='wb') as model_file:
    pickle.dump(clf, model_file)