In [2]:
# import packages
import numpy as np
import pandas as pd
from tools import FeatureEngineer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# load and prepare data
bnb = pd.read_csv('bnb.csv') # load data
bnb = bnb.sample(n=10000, replace=False, random_state=1) # 10000 samples
tool = FeatureEngineer()
bnb = tool.build_technical_indicators(bnb) # construct technical indicators
bnb.drop(columns=['Asset_ID', 'KAMA', 'PSAR+', 'PSAR-'], inplace=True) # drop columns with too many NAs
bnb.dropna(axis=0, inplace=True) # drop rows contain missing
r = bnb.index[np.isinf(bnb).any(1)]
bnb.drop([i for i in r], axis=0, inplace=True) # drop rows contain infinity

In [4]:
# set parameters
split_ratio = 0.2
scaler = MinMaxScaler()
x = bnb.drop(columns=['Target', 'Open', 'High', 'Low'])
y = bnb['Target']
clf = RandomForestRegressor()

In [5]:
# train
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_ratio, random_state=42)
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))

Mean Squared Error: 1.691063404804819e-05


In [6]:
# check feature importance
clf.feature_importances_

array([0.0257791 , 0.02329998, 0.00812041, 0.04157261, 0.00983132,
       0.02709737, 0.0471757 , 0.01633233, 0.01741093, 0.01775159,
       0.02706617, 0.01419328, 0.00676541, 0.0087481 , 0.02480277,
       0.01814059, 0.01859304, 0.0069844 , 0.00889828, 0.01609507,
       0.01465327, 0.01506935, 0.0161027 , 0.00773287, 0.01577121,
       0.00767739, 0.01245826, 0.0223793 , 0.01049326, 0.01968399,
       0.01779427, 0.00036308, 0.01043391, 0.01080334, 0.01108904,
       0.00736143, 0.01160226, 0.00407817, 0.0110184 , 0.0135078 ,
       0.00699709, 0.01882252, 0.00684033, 0.01068555, 0.01383817,
       0.00427605, 0.00431546, 0.00423618, 0.01427234, 0.01431883,
       0.01589434, 0.01367903, 0.00565011, 0.00931013, 0.00838011,
       0.00806363, 0.01069588, 0.02101936, 0.01118985, 0.00984811,
       0.01815048, 0.0164131 , 0.01752626, 0.00911494, 0.01992827,
       0.014998  , 0.02548008, 0.01051017, 0.00829255, 0.01221323,
       0.01030809])

In [7]:
threshold = 0.01
not_important = {}
important_features = {}
feature_name = [item for item in x.columns]
feature_importance = clf.feature_importances_

In [9]:

for idx, val in enumerate(feature_importance):
    if val > threshold:
        important_features[feature_name[idx]] = round(val, 4)
    else:
        not_important[feature_name[idx]] = round(val, 4)
        
print(important_features)
print()
print(not_important)

{'timestamp': 0.0258, 'Count': 0.0233, 'Volume': 0.0416, 'open_sub_close': 0.0271, 'high_div_low': 0.0472, 'ma8_vol': 0.0163, 'ma21_vol': 0.0174, 'ma50_vol': 0.0178, 'ma200_vol': 0.0271, 'AO': 0.0142, 'PVO': 0.0248, 'PVO_signal': 0.0181, 'ROC': 0.0186, 'RSI_stoch_d': 0.0161, 'RSI_stoch_k': 0.0147, 'stoch': 0.0151, 'stoch_signal': 0.0161, 'ult': 0.0158, 'ADI': 0.0125, 'CMF': 0.0224, 'EoM': 0.0105, 'EoM_signal': 0.0197, 'MFI': 0.0178, 'OBV': 0.0104, 'VPT': 0.0108, 'ATR': 0.0111, 'BOLL-': 0.0116, 'BOLL_percent': 0.011, 'BOLL_width': 0.0135, 'DC-': 0.0188, 'DC_percent': 0.0107, 'DC_width': 0.0138, 'KC_percent': 0.0143, 'KC_width': 0.0143, 'Ulcer': 0.0159, 'ADX': 0.0137, 'ema50_price': 0.0107, 'ema200_price': 0.021, 'MACD': 0.0112, 'MI': 0.0182, 'PSAR': 0.0164, 'STC': 0.0175, 'VI': 0.0199, 'VI+': 0.015, 'VI-': 0.0255, 'WMA': 0.0105, 'DLR': 0.0122, 'DR': 0.0103}

{'Close': 0.0081, 'VWAP': 0.0098, 'PPO': 0.0068, 'PPO_signal': 0.0087, 'RSI': 0.007, 'RSI_stoch': 0.0089, 'TSI': 0.0077, 'WRI': 0.