In [3]:
# import packages
import numpy as np
import pandas as pd
from tools import FeatureEngineer


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
# load data
bnb = pd.read_csv('bnb.csv')
bnb.drop(columns='Asset_ID', inplace=True)

In [5]:
fe = FeatureEngineer()
bnb = fe.build_technical_indicators(bnb)
bnb.head()

Unnamed: 0,timestamp,Count,Open,High,Low,Close,Volume,VWAP,Target,open_sub_close,...,PSAR-,STC,TRIX,VI,VI+,VI-,WMA,CR,DLR,DR
0,1523956260,7.0,12.4195,12.4195,12.4101,12.4195,794.7,12.411386,-0.004366,0.0,...,,,,,,,,0.0,,
1,1523956320,33.0,12.4195,12.4195,12.4001,12.415,1117.73,12.407532,-0.00394,0.0045,...,,,,,,,,-0.036233,-0.03624,-0.036233
2,1523956380,32.0,12.415,12.4195,12.4003,12.41,1062.37,12.401607,-0.003153,0.005,...,12.4195,,,,,,,-0.076493,-0.040282,-0.040274
3,1523956440,38.0,12.4006,12.41,12.3931,12.4,2259.55,12.399251,-0.003429,0.0006,...,12.4195,,,,,,,-0.157011,-0.080613,-0.08058
4,1523956500,79.0,12.4,12.421,12.393,12.3942,7113.37,12.406144,-0.002187,0.0058,...,,,,,,,,-0.203712,-0.046785,-0.046774


In [6]:
# inspect missing values
temp = bnb.isna().sum()
print([i for i in temp.index if temp[i] > 1000])
print([temp[i] for i in temp.index if temp[i] > 1000])

['RSI_stoch', 'RSI_stoch_d', 'RSI_stoch_k', 'MFI', 'PSAR+', 'PSAR-']
[1905, 2193, 2049, 3153, 888055, 915717]


In [7]:
# drop columns with too many NAs
bnb.drop(columns=['KAMA', 'PSAR+', 'PSAR-'], inplace=True)

In [8]:
# drop NAs
bnb.dropna(axis=0, inplace=True)
bnb.isna().sum()

timestamp    0
Count        0
Open         0
High         0
Low          0
            ..
VI-          0
WMA          0
CR           0
DLR          0
DR           0
Length: 75, dtype: int64

In [9]:
# check if df contains infinite values
r = bnb.index[np.isinf(bnb).any(1)]
print(r)

Int64Index([ 697988,  697989,  697990,  697991,  697992,  697993,  697994,
             697995,  697996,  787255,  787256,  787257,  787259,  787260,
             787261,  787262,  787263,  787264,  787289,  787291,  787292,
             787293,  787294,  787317,  787318,  787319,  787320,  787321,
             787322,  787323,  787324,  787325,  787326,  787327,  787532,
             787533,  787534,  787570,  787664,  787665,  787955,  787957,
             787958,  787959,  788065,  788066,  788080,  788082,  798090,
             798091,  798092,  954316,  954319,  954321,  954323,  954324,
             954325,  969425,  969426,  969427, 1063757, 1063758, 1063759,
            1063760, 1063764, 1063765, 1063768, 1063769, 1063771, 1063772,
            1063798, 1063799, 1063800, 1063809, 1063810, 1063811, 1063872,
            1063873, 1155936, 1155937, 1155972, 1155974, 1155976, 1155978,
            1156057, 1156073, 1156076, 1156077, 1156080, 1156081, 1156083],
           dtype='int64'

In [10]:
bnb.drop([i for i in r], axis=0, inplace=True)

In [11]:
bnb.corr()

Unnamed: 0,timestamp,Count,Open,High,Low,Close,Volume,VWAP,Target,open_sub_close,...,PSAR,STC,TRIX,VI,VI+,VI-,WMA,CR,DLR,DR
timestamp,1.000000,0.455384,0.653054,0.653041,0.653075,0.653053,0.114014,0.653057,0.002765,0.002816,...,0.653062,0.001778,0.006659,0.003630,0.108174,0.101319,0.653053,0.653053,0.000995,0.001154
Count,0.455384,1.000000,0.600764,0.601603,0.599749,0.600683,0.507294,0.600682,0.012887,0.039572,...,0.600851,-0.002977,-0.024643,-0.002600,0.056555,0.061422,0.600835,0.600683,-0.026222,-0.024679
Open,0.653054,0.600764,1.000000,0.999998,0.999998,0.999998,0.065864,0.999999,-0.001489,0.004342,...,0.999971,0.000077,0.005279,0.006751,0.105078,0.092363,0.999998,0.999998,-0.000399,-0.000201
High,0.653041,0.601603,0.999998,1.000000,0.999994,0.999998,0.066385,0.999998,-0.001497,0.003221,...,0.999968,0.000096,0.005148,0.006787,0.105091,0.092306,0.999997,0.999998,0.000235,0.000437
Low,0.653075,0.599749,0.999998,0.999994,1.000000,0.999998,0.065236,0.999999,-0.001498,0.003082,...,0.999967,0.000141,0.005411,0.006889,0.105241,0.092266,0.999996,0.999998,0.000293,0.000487
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VI-,0.101319,0.061422,0.092363,0.092306,0.092266,0.092247,-0.000867,0.092274,0.003984,0.053769,...,0.093879,-0.647129,-0.384579,-0.939508,-0.765278,1.000000,0.092754,0.092247,-0.123745,-0.123733
WMA,0.653053,0.600835,0.999998,0.999997,0.999996,0.999996,0.065899,0.999997,-0.001484,0.003966,...,0.999980,-0.000288,0.005121,0.006325,0.104670,0.092754,1.000000,0.999996,-0.000246,-0.000047
CR,0.653053,0.600683,0.999998,0.999998,0.999998,1.000000,0.065818,0.999999,-0.001500,0.002175,...,0.999967,0.000155,0.005281,0.006881,0.105207,0.092247,0.999996,1.000000,0.000808,0.001006
DLR,0.000995,-0.026222,-0.000399,0.000235,0.000293,0.000808,-0.037896,0.000308,-0.005916,-0.557282,...,-0.000943,0.069754,0.003529,0.136757,0.133220,-0.123745,-0.000246,0.000808,1.000000,0.999966


In [12]:
def model_training(x, y, split_ratio, scaler, model_list):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=split_ratio, random_state=42)
    # x_train = scaler.fit_transform(x_train)
    # x_test = scaler.transform(x_test)
    for model_name, clf in model_list.items():
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        print(model_name)
        print('R^2 Score:', r2_score(y_test, y_pred))
        print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
        print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))
        print()

In [13]:
split_ratio = 0.2
scaler = MinMaxScaler()
x = bnb.drop(columns=['Target', 'Open', 'High', 'Low', 'VWAP'])
y = bnb['Target']
model_list = {'linear regression': LinearRegression(),
              'ridge': Ridge(),
              'elastic net': ElasticNet()}

model_training(x, y, split_ratio, scaler, model_list)

linear regression
R^2 Score: 0.0008145034851070321
Mean Absolute Error: 0.0027143575457516237
Root Mean Squared Error: 0.004348497068449981



  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


ridge
R^2 Score: 0.002944262645648066
Mean Absolute Error: 0.002711747846550771
Root Mean Squared Error: 0.00434386019579549



  model = cd_fast.enet_coordinate_descent(


elastic net
R^2 Score: 0.00025167105120349476
Mean Absolute Error: 0.002713695446123922
Root Mean Squared Error: 0.00434972163117337



In [14]:
split_ratio = 0.2
scaler = StandardScaler()
x = bnb.drop(columns=['Target'])
y = bnb['Target']
model_list = {'linear regression': LinearRegression(),
              'ridge': Ridge(),
              'elastic net': ElasticNet()}

model_training(x, y, split_ratio, scaler, model_list)

linear regression
R^2 Score: 0.0008923505211246852
Mean Absolute Error: 0.0027135342217234665
Root Mean Squared Error: 0.004348327668372185



  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


ridge
R^2 Score: 0.0030557416759781075
Mean Absolute Error: 0.002711691299159997
Root Mean Squared Error: 0.00434361734936229



  model = cd_fast.enet_coordinate_descent(


elastic net
R^2 Score: 0.00025167105120349476
Mean Absolute Error: 0.002713695446123922
Root Mean Squared Error: 0.00434972163117337



In [15]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn import datasets

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# split data
x, y = np.array(bnb.drop(columns='Target')), np.array(bnb['Target'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# normalization
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

# convert to tensor
x_train, x_test = torch.from_numpy(x_train).float(), torch.from_numpy(x_test).float()
y_train, y_test = torch.from_numpy(y_train).float(), torch.from_numpy(y_test).float()
y_train = y_train.view(y_train.shape[0], 1)

n_samples, n_features = x_train.shape

In [16]:
# construct model
input_size, output_size = n_features, 1
model = nn.Linear(input_size, output_size)

In [17]:
# loss & optimizer
learning_rate = 0.01
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [18]:
# training loop
num_epochs = 2000
for epoch in range(num_epochs):
    
    # forward pass and loss
    y_pred = model(x_train)
    loss = criterion(y_pred, y_train)
    
    # backward pass and update
    loss.backward()
    optimizer.step()
    
    # zero grad before new step
    optimizer.zero_grad()
    
    if (epoch+1) % 50 == 0:
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')

epoch: 50, loss = 0.0041
epoch: 100, loss = 0.0032
epoch: 150, loss = 0.0025
epoch: 200, loss = 0.0021
epoch: 250, loss = 0.0017
epoch: 300, loss = 0.0015
epoch: 350, loss = 0.0013
epoch: 400, loss = 0.0011
epoch: 450, loss = 0.0010
epoch: 500, loss = 0.0008
epoch: 550, loss = 0.0007
epoch: 600, loss = 0.0007
epoch: 650, loss = 0.0006
epoch: 700, loss = 0.0006
epoch: 750, loss = 0.0005
epoch: 800, loss = 0.0005
epoch: 850, loss = 0.0004
epoch: 900, loss = 0.0004
epoch: 950, loss = 0.0004
epoch: 1000, loss = 0.0004
epoch: 1050, loss = 0.0003
epoch: 1100, loss = 0.0003
epoch: 1150, loss = 0.0003
epoch: 1200, loss = 0.0003
epoch: 1250, loss = 0.0003
epoch: 1300, loss = 0.0003
epoch: 1350, loss = 0.0003
epoch: 1400, loss = 0.0003
epoch: 1450, loss = 0.0002
epoch: 1500, loss = 0.0002
epoch: 1550, loss = 0.0002
epoch: 1600, loss = 0.0002
epoch: 1650, loss = 0.0002
epoch: 1700, loss = 0.0002
epoch: 1750, loss = 0.0002
epoch: 1800, loss = 0.0002
epoch: 1850, loss = 0.0002
epoch: 1900, loss = 0

In [19]:
# evaluation
pred = model(x_test).detach().numpy()
pred = [pred[i] for i in range(len(pred))]
print('R^2 Score:', r2_score(y_test, pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, pred)))

R^2 Score: -8.86538682870751
Mean Absolute Error: 0.010368501
Root Mean Squared Error: 0.013663853
