## Import Libraries

In [22]:
# Standard libraries
import pandas as pd
import numpy as np

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# xgboost
from xgboost import XGBClassifier

# talib
from talib import BBANDS
from talib import SAR
from talib import RSI
from talib import STOCH
from talib import EMA
from talib import WILLR

## Data Import

In [23]:
train_df = pd.read_csv("training.csv", names=("open", "high", "low", "close"))
test_df = pd.read_csv("testing.csv", names=("open", "high", "low", "close"))
len(test_df)

20

In [24]:
n_days_in = 20
n_days_out = len(test_df)
train_len = len(train_df)

## Data Preprocessing

In [25]:
# MinMax Normalization
maxValue = train_df.to_numpy().max()
minValue = train_df.to_numpy().min()
diff = maxValue - minValue
train = train_df.transform(lambda x: (x - minValue) / diff)
test = test_df.transform(lambda x: (x - minValue) / diff)
train = pd.concat([train, test], axis=0)
train = train.reset_index(drop=True)
train

Unnamed: 0,open,high,low,close
0,186.73,188.71,186.00,186.30
1,185.57,186.33,184.94,185.54
2,184.81,185.03,183.10,184.66
3,184.39,184.48,182.31,182.54
4,182.20,182.27,180.27,181.59
...,...,...,...,...
1503,151.82,153.00,151.50,152.50
1504,152.51,153.86,152.50,152.83
1505,152.95,153.18,152.61,153.13
1506,153.20,154.12,153.20,154.04


In [26]:
train["upperband"], train["middleband"], train["lowerband"] = BBANDS(train.close.to_numpy())
train["sar"] = SAR(train.high.to_numpy(), train.low.to_numpy())
train["rsi"] = RSI(train.close.to_numpy(), timeperiod=5)
train["slowk"], train["slowd"] = STOCH(train.high.to_numpy(), train.low.to_numpy(), train.close.to_numpy())
train["ema"] = EMA(train.close.to_numpy(), timeperiod=5)
train["willr"] = WILLR(train.high.to_numpy(), train.low.to_numpy(), train.close.to_numpy(), timeperiod=9)
train.isnull().sum()

open          0
high          0
low           0
close         0
upperband     4
middleband    4
lowerband     4
sar           1
rsi           5
slowk         8
slowd         8
ema           4
willr         8
dtype: int64

In [27]:
train_data = train.dropna()
train_data = train_data.reset_index(drop=True)
train

Unnamed: 0,open,high,low,close,upperband,middleband,lowerband,sar,rsi,slowk,slowd,ema,willr
0,186.73,188.71,186.00,186.30,,,,,,,,,
1,185.57,186.33,184.94,185.54,,,,188.710000,,,,,
2,184.81,185.03,183.10,184.66,,,,188.634600,,,,,
3,184.39,184.48,182.31,182.54,,,,188.413216,,,,,
4,182.20,182.27,180.27,181.59,187.698956,184.126,180.553044,188.047023,,,,184.126000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,151.82,153.00,151.50,152.50,154.034751,152.702,151.369249,156.048006,42.470391,22.596115,22.366772,152.656289,-84.119497
1504,152.51,153.86,152.50,152.83,153.799424,152.602,151.404576,155.683366,47.145425,29.306072,23.497817,152.714193,-74.427481
1505,152.95,153.18,152.61,153.13,153.739473,152.582,151.424527,155.347896,51.613636,51.959934,34.620707,152.852795,-54.696133
1506,153.20,154.12,153.20,154.04,154.455874,152.800,151.144126,155.039265,63.358632,71.763836,51.009947,153.248530,-5.204461


In [28]:
train_data

Unnamed: 0,open,high,low,close,upperband,middleband,lowerband,sar,rsi,slowk,slowd,ema,willr
0,179.48,179.61,177.35,179.16,183.135180,180.986,178.836820,185.531287,10.760278,35.685147,30.969656,181.130321,-84.066901
1,180.36,182.00,179.32,180.00,182.836037,180.668,178.499963,184.549533,24.037590,35.196490,34.271958,180.753547,-70.489978
2,179.83,181.60,179.50,181.07,182.739132,180.620,178.500868,183.685589,38.586495,46.049189,38.976942,180.859032,-51.562500
3,181.79,182.36,180.35,180.52,181.551635,180.260,178.968365,182.925318,34.358325,57.668859,46.304846,180.746021,-55.539972
4,185.77,188.97,184.75,188.52,188.637788,181.854,175.070212,177.350000,78.063128,75.844229,59.854092,183.337347,-3.872633
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,151.82,153.00,151.50,152.50,154.034751,152.702,151.369249,156.048006,42.470391,22.596115,22.366772,152.656289,-84.119497
1496,152.51,153.86,152.50,152.83,153.799424,152.602,151.404576,155.683366,47.145425,29.306072,23.497817,152.714193,-74.427481
1497,152.95,153.18,152.61,153.13,153.739473,152.582,151.424527,155.347896,51.613636,51.959934,34.620707,152.852795,-54.696133
1498,153.20,154.12,153.20,154.04,154.455874,152.800,151.144126,155.039265,63.358632,71.763836,51.009947,153.248530,-5.204461


## Training

In [29]:
y = list()
for i in range(len(train_data)):
    isBull = (train_data["open"][i] > train_data["sar"][i], 
              train_data["open"][i] >= train_data["middleband"][i],
              train_data["rsi"][i] > 50,
              train_data["slowk"][i] >= train_data["slowd"][i],
              train_data["open"][i] >= train_data["ema"][i],
              train_data["willr"][i] > -50)
    if np.count_nonzero(isBull) > 4:
        y.append(2)
    elif np.count_nonzero(isBull) < 2:
        y.append(0)
    else:
        y.append(1)
y = np.array(y, dtype=int)

In [30]:
X = list()
for i in range(n_days_in, len(train_data)):
    X.append(train_data.loc[i-n_days_in:i-1, :].values)
X = np.array(X)

In [31]:
y = y[n_days_in + n_days_out :] # why?
len(y)

1460

In [32]:
test = X[-len(test_df):]
len(test)

20

In [33]:
new_X = X[:-len(test_df)]

new_X = new_X.reshape((len(y), -1))

In [34]:
new_X.shape

(1460, 260)

In [35]:
X_train, X_val, y_train, y_val = train_test_split(new_X, y, test_size=0.2, shuffle=False)

In [36]:
xgb = XGBClassifier(learning_rate=0.1, 
                    objective='multi:softmax',
                    num_class=3,
                    n_estimators=1000, max_depth=1, min_child_weight=2, use_label_encoder=False)
# model = xgb.fit(X_train, y_train,
#                eval_set=[(X_val, y_val)],
#               eval_metric="auc",
#                verbose=True)

In [37]:
parameters = {
    'max_depth': list(range(1, 10)),
    'min_child_weight': list(range(1, 10)),
    "n_estimators": list(range(100, 1001, 100))
}
# gsearch = GridSearchCV(xgb, param_grid=parameters, scoring="f1", cv=2)
# gsearch.fit(X_train, y_train,  eval_set=[(X_val, y_val)], eval_metric="auc", verbose=True)
# best_parameters = gsearch.best_estimator_.get_params()
rsearch = RandomizedSearchCV(xgb, param_distributions=parameters, scoring="f1", cv=2)
rsearch.fit(X_train, y_train,  eval_set=[(X_val, y_val)], eval_metric="auc", verbose=False)
best_parameters = rsearch.best_estimator_.get_params()

Traceback (most recent call last):
  File "/opt/anaconda3/envs/python3_8/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/opt/anaconda3/envs/python3_8/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 216, in __call__
    return self._score(
  File "/opt/anaconda3/envs/python3_8/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 264, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/opt/anaconda3/envs/python3_8/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 1123, in f1_score
    return fbeta_score(
  File "/opt/anaconda3/envs/python3_8/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 1261, in fbeta_score
    _, _, f, _ = precision_recall_fscore_support(
  File "/opt/anaconda3/envs/python3_8/lib/python3.8/site-packages/sklearn/metrics/_classification.py", line 1544, in precision_recall

In [38]:
print(best_parameters)

{'objective': 'multi:softprob', 'use_label_encoder': False, 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 0, 'gpu_id': -1, 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 5, 'min_child_weight': 3, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 1000, 'n_jobs': 8, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': None, 'subsample': 1, 'tree_method': 'exact', 'validate_parameters': 1, 'verbosity': None, 'num_class': 3}


In [39]:
xgb = XGBClassifier(learning_rate=0.1, n_estimators=30,
                    objective='multi:softmax',
                    num_class=3,
                    max_depth=3, min_child_weight=10, use_label_encoder=False)
model = xgb.fit(X_train, y_train,
                eval_set=[(X_val, y_val)],
                eval_metric="mlogloss",
                verbose=True)

[0]	validation_0-mlogloss:1.09821
[1]	validation_0-mlogloss:1.10103
[2]	validation_0-mlogloss:1.10074
[3]	validation_0-mlogloss:1.10070
[4]	validation_0-mlogloss:1.10235
[5]	validation_0-mlogloss:1.10282
[6]	validation_0-mlogloss:1.10318
[7]	validation_0-mlogloss:1.10346
[8]	validation_0-mlogloss:1.10455
[9]	validation_0-mlogloss:1.10622
[10]	validation_0-mlogloss:1.10682
[11]	validation_0-mlogloss:1.10894
[12]	validation_0-mlogloss:1.11034
[13]	validation_0-mlogloss:1.11147
[14]	validation_0-mlogloss:1.11309
[15]	validation_0-mlogloss:1.11488
[16]	validation_0-mlogloss:1.11629
[17]	validation_0-mlogloss:1.11623
[18]	validation_0-mlogloss:1.11572
[19]	validation_0-mlogloss:1.11700
[20]	validation_0-mlogloss:1.12119
[21]	validation_0-mlogloss:1.12322
[22]	validation_0-mlogloss:1.12543
[23]	validation_0-mlogloss:1.12773
[24]	validation_0-mlogloss:1.12795
[25]	validation_0-mlogloss:1.12783
[26]	validation_0-mlogloss:1.12800
[27]	validation_0-mlogloss:1.12775
[28]	validation_0-mlogloss:1.1

In [40]:
model.predict(test.reshape(20, -1))

array([2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2])

In [41]:
# test["upperband"], test["middleband"], test["lowerband"] = BBANDS(test.close.to_numpy())
# # train["ma5"] = MA(train.close.to_numpy(), timeperiod=5)
# test["sar"] = SAR(test.high.to_numpy(), test.low.to_numpy())
# test["rsi"] = RSI(test.close.to_numpy(), timeperiod=5)
# test["slowk"], test["slowd"] = STOCH(test.high.to_numpy(), test.low.to_numpy(), test.close.to_numpy())

In [42]:
predictions = model.predict(test.values)
len(predictions)

AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [None]:
ans = []
val = 0
for i in range(1, len(predictions)):
    _sum = sum(predictions[i-1:i+1])
    if _sum == 2:
        val = 1 if (val == 1) else (val + 1) 
    elif _sum == 1:
        val = val
    else:
        val = -1 if (val == -1) else val - 1
    ans.append(val)
print(ans, len(ans), sep='\n\n')
with open("./output.csv", "w") as fp:
    for i in range(len(ans)):
        print(ans[i], file=fp)