In [None]:
import datetime
import re
import warnings
from datetime import timedelta

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score as AUC
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import features

## Feature engineering

In [None]:
# generate features
# Silences every warning for the rest of the kernel session.
warnings.filterwarnings("ignore")
ni_raw = pd.read_feather('ni.feather')
ni_i = features.first_and_last(ni_raw)
# NOTE(review): presumably these helpers write the *.feather files that the
# following cells read back (ni_minute, date, labels) — confirm in features.py.
features.make_minute(ni_i)
features.make_datetime(ni_i)
features.make_labels(ni_i)

ni = pd.read_feather('ni_minute.feather')
# Selecting several columns needs a list inside the brackets; a bare
# comma-separated sequence is treated as one tuple key and raises KeyError.
# NOTE(review): 'Close' is capitalised unlike the other column names — confirm.
features.make_diff_period_close_features(ni[['Close', 'price_max', 'price_min']])
features.make_prices_features(ni[['price_mean', 'ask_mean', 'bid_mean']])
features.make_qty_features(ni[['volume_mean', 'ask_qty_mean', 'bid_qty_mean', 'open_int_mean']])

In [None]:
# predict next 1 minute
# Builds `ni_features_labels` from the per-source feature files plus the labels.
# Loaded frames are bound to `feature_frame` rather than `features`, so the
# imported `features` module stays usable after this cell has run.
ni_features_labels = pd.DataFrame()

ni_minute = pd.read_feather('date.feather')
ni_features_labels['date'] = ni_minute['date']
ni_features_labels['datetime'] = ni_minute['datetime']

# Slot suffixes ("_m<slot>") to copy from each feature file; an empty list
# disables that column group.
close_slots = [1, 2, 5]
mean_slots = {
    'price': [], 'ask': [], 'bid': [],
    'volume': [1], 'ask_qty': [1], 'bid_qty': [1],
    'open_int': [],
}
rate_slots = {
    'price': [1], 'ask': [1], 'bid': [1],
    'volume': [1, 2, 5, 10], 'ask_qty': [1, 2, 5, 10], 'bid_qty': [1, 2, 5, 10],
    'open_int': [1, 2, 5, 10],
}

# close features
feature_frame = pd.read_feather('close_features.feather')
for slot in close_slots:
    for col in ['RSI3', 'RSI6', 'RSI12', 'RSI24', 'MOM', 'DIFF', 'MACD', 'talib_K', 'talib_D', 'talib_J']:
        ni_features_labels['close_' + col + '_m' + str(slot)] = feature_frame[col + '_m' + str(slot)]

# price/ask/bid, volume/ask_qty/bid_qty and open_int features:
# columns "<source>_mean<0..19>_m<slot>" followed by "<source>_rate<0..19>_m<slot>"
for f in mean_slots:
    feature_frame = pd.read_feather(f + '_features.feather')
    for kind, slots in (('mean', mean_slots[f]), ('rate', rate_slots[f])):
        for slot in slots:
            for m in range(20):
                name = f + '_' + kind + str(m) + '_m' + str(slot)
                ni_features_labels[name] = feature_frame[name]

ni_labels = pd.read_feather('ni_labels.feather')
ni_features_labels = pd.concat([ni_features_labels, ni_labels], axis=1)

# Drop every row with a missing value, then renumber rows from 0.
ni_features_labels = ni_features_labels.dropna(axis=0, how='any').reset_index(drop=True)
ni_features_labels.columns

In [None]:
# predict next 5 minutes
# Builds `ni_features_labels` from the per-source feature files plus the labels.
# Loaded frames are bound to `feature_frame` rather than `features`, so the
# imported `features` module stays usable after this cell has run.
# NOTE(review): this cell reads CSV files while the 1- and 30-minute cells read
# feather files, and it does not copy a 'datetime' column although load_data
# selects data[['date', 'datetime']] — confirm ni_label.csv provides it.
ni_features_labels = pd.DataFrame()

ni_minute = pd.read_csv('ni_minute_1.csv')
ni_features_labels['date'] = ni_minute['date']

# Slot suffixes ("_m<slot>") to copy from each feature file; an empty list
# disables that column group.
close_slots = [1, 5]
mean_slots = {
    'price': [], 'ask': [], 'bid': [],
    'volume': [1, 5], 'ask_qty': [1, 5], 'bid_qty': [1, 5],
    'open_int': [1, 5],
}
rate_slots = {
    'price': [1, 5], 'ask': [1, 5], 'bid': [1, 5],
    'volume': [1, 5, 10, 30, 60, 120],
    'ask_qty': [1, 5, 10, 30, 60, 120],
    'bid_qty': [1, 5, 10, 30, 60, 120],
    'open_int': [1, 5, 10, 30, 60, 120],
}

# close features
feature_frame = pd.read_csv('close_features.csv')
for slot in close_slots:
    for col in ['RSI3', 'RSI6', 'RSI12', 'RSI24', 'MOM', 'DIFF', 'MACD', 'talib_K', 'talib_D', 'talib_J']:
        ni_features_labels['close_' + col + '_m' + str(slot)] = feature_frame[col + '_m' + str(slot)]

# price/ask/bid, volume/ask_qty/bid_qty and open_int features:
# columns "<source>_mean<0..19>_m<slot>" followed by "<source>_rate<0..19>_m<slot>"
for f in mean_slots:
    feature_frame = pd.read_csv(f + '_features.csv')
    for kind, slots in (('mean', mean_slots[f]), ('rate', rate_slots[f])):
        for slot in slots:
            for m in range(20):
                name = f + '_' + kind + str(m) + '_m' + str(slot)
                ni_features_labels[name] = feature_frame[name]

ni_labels = pd.read_csv('ni_label.csv')
ni_features_labels = pd.concat([ni_features_labels, ni_labels], axis=1)

# Disabled: keep only every 5th row.
# ni_features_labels.insert(0, 'index', range(len(ni_features_labels)), allow_duplicates=False)
# ni_features_labels = ni_features_labels.loc[ni_features_labels['index'] % 5 == 0]
# ni_features_labels.drop(columns=['index'], inplace=True)

# Drop every row with a missing value, then renumber rows from 0.
ni_features_labels = ni_features_labels.dropna(axis=0, how='any').reset_index(drop=True)

In [None]:
# predict next 10 minutes
# Builds `ni_features_labels` from the per-source feature files plus the labels.
# Loaded frames are bound to `feature_frame` rather than `features`, so the
# imported `features` module stays usable after this cell has run.
# NOTE(review): this cell reads CSV files while the 1- and 30-minute cells read
# feather files, and it does not copy a 'datetime' column although load_data
# selects data[['date', 'datetime']] — confirm ni_label.csv provides it.
ni_features_labels = pd.DataFrame()

ni_minute = pd.read_csv('ni_minute_1.csv')
ni_features_labels['date'] = ni_minute['date']

# Slot suffixes ("_m<slot>") to copy from each feature file; an empty list
# disables that column group.
close_slots = [1, 5, 10]
mean_slots = {
    'price': [], 'ask': [], 'bid': [],
    'volume': [1, 5], 'ask_qty': [1, 5], 'bid_qty': [1, 5],
    'open_int': [],
}
rate_slots = {
    'price': [1], 'ask': [1], 'bid': [1],
    'volume': [1, 5, 10, 30, 60, 120, 240],
    'ask_qty': [1, 5, 10, 30, 60, 120, 240],
    'bid_qty': [1, 5, 10, 30, 60, 120, 240],
    # NOTE(review): open_int stops at 120 while the quantity sources go to 240 — confirm.
    'open_int': [1, 5, 10, 30, 60, 120],
}

# close features
feature_frame = pd.read_csv('close_features.csv')
for slot in close_slots:
    for col in ['RSI3', 'RSI6', 'RSI12', 'RSI24', 'MOM', 'DIFF', 'MACD', 'talib_K', 'talib_D', 'talib_J']:
        ni_features_labels['close_' + col + '_m' + str(slot)] = feature_frame[col + '_m' + str(slot)]

# price/ask/bid, volume/ask_qty/bid_qty and open_int features:
# columns "<source>_mean<0..19>_m<slot>" followed by "<source>_rate<0..19>_m<slot>"
for f in mean_slots:
    feature_frame = pd.read_csv(f + '_features.csv')
    for kind, slots in (('mean', mean_slots[f]), ('rate', rate_slots[f])):
        for slot in slots:
            for m in range(20):
                name = f + '_' + kind + str(m) + '_m' + str(slot)
                ni_features_labels[name] = feature_frame[name]

ni_labels = pd.read_csv('ni_label.csv')
ni_features_labels = pd.concat([ni_features_labels, ni_labels], axis=1)

# Disabled: keep only every 5th row.
# ni_features_labels.insert(0, 'index', range(len(ni_features_labels)), allow_duplicates=False)
# ni_features_labels = ni_features_labels.loc[ni_features_labels['index'] % 5 == 0]
# ni_features_labels.drop(columns=['index'], inplace=True)

# Drop every row with a missing value, then renumber rows from 0.
ni_features_labels = ni_features_labels.dropna(axis=0, how='any').reset_index(drop=True)

In [None]:
# predict next 30 minutes
# Builds `ni_features_labels` from the per-source feature files plus the labels.
# Loaded frames are bound to `feature_frame` rather than `features`, so the
# imported `features` module stays usable after this cell has run.
ni_features_labels = pd.DataFrame()

ni_minute = pd.read_feather('date.feather')
ni_features_labels['date'] = ni_minute['date']
ni_features_labels['datetime'] = ni_minute['datetime']
#ni_features_labels['datetime'] = ni_minute['datetime'].shift(-30)

# Slot suffixes ("_m<slot>") to copy from each feature file; an empty list
# disables that column group.
close_slots = [1, 5]
mean_slots = {
    'price': [], 'ask': [], 'bid': [],
    'volume': [1, 10, 30], 'ask_qty': [1, 10, 30], 'bid_qty': [1, 10, 30],
    'open_int': [],
}
rate_slots = {
    'price': [1], 'ask': [1], 'bid': [1],
    'volume': [1, 5, 10, 30, 60, 120, 240],
    'ask_qty': [1, 5, 10, 30, 60, 120, 240],
    'bid_qty': [1, 5, 10, 30, 60, 120, 240],
    'open_int': [1, 5, 10, 30, 60, 120, 240],
}

# close features
feature_frame = pd.read_feather('close_features.feather')
for slot in close_slots:
    for col in ['RSI3', 'RSI6', 'RSI12', 'RSI24', 'MOM', 'DIFF', 'MACD', 'talib_K', 'talib_D', 'talib_J']:
        ni_features_labels['close_' + col + '_m' + str(slot)] = feature_frame[col + '_m' + str(slot)]

# price/ask/bid, volume/ask_qty/bid_qty and open_int features:
# columns "<source>_mean<0..19>_m<slot>" followed by "<source>_rate<0..19>_m<slot>"
for f in mean_slots:
    feature_frame = pd.read_feather(f + '_features.feather')
    for kind, slots in (('mean', mean_slots[f]), ('rate', rate_slots[f])):
        for slot in slots:
            for m in range(20):
                name = f + '_' + kind + str(m) + '_m' + str(slot)
                ni_features_labels[name] = feature_frame[name]

ni_labels = pd.read_feather('ni_labels.feather')
ni_features_labels = pd.concat([ni_features_labels, ni_labels], axis=1)

# Drop every row with a missing value, then renumber rows from 0.
ni_features_labels = ni_features_labels.dropna(axis=0, how='any').reset_index(drop=True)
ni_features_labels.columns

## Train

### Load training and test data

In [None]:
def load_data(data, label_bar, rolling_date):
    """Split the feature/label table into model inputs and train/test rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with the feature columns, a 'date' and a 'datetime' column, and
        the label columns 'label_long_<label_bar>', 'label_short_<label_bar>',
        'label_long_invest' and 'label_short_invest<label_bar>'.
    label_bar : int or str
        Suffix that selects which label columns are used.
    rolling_date : dict
        Must provide 'train_start_date', 'train_end_date', 'test_start_date'
        and 'test_end_date'. Both ranges are half-open: start <= date < end.

    Returns
    -------
    tuple
        ``(features, long_label, short_label, train_index, test_index, invest)``.
        ``features`` holds every column whose name contains neither 'label'
        nor 'date'. The two label series are 1 where the corresponding return
        column is >= 0 and 0 otherwise. ``train_index`` and ``test_index`` are
        one-element lists wrapping the list of selected index values (callers
        use ``train_index[0]``). ``invest`` carries the invest/return columns.

    Side effects
    ------------
    Writes the 'date' and 'datetime' columns of the test rows to
    'test_set_date.feather'.
    """
    train_start_date = rolling_date['train_start_date']
    train_end_date = rolling_date['train_end_date']
    test_start_date = rolling_date['test_start_date']
    test_end_date = rolling_date['test_end_date']

    feature_index = [
        col for col in data.columns
        if not re.search(r'label', col) and not re.search(r'date', col)
    ]
    features = data[feature_index]

    long_label = (data['label_long_' + str(label_bar)] >= 0).astype('int')
    short_label = (data['label_short_' + str(label_bar)] >= 0).astype('int')
    # NOTE(review): the long invest column has no label_bar suffix while the
    # short one does ('label_short_invest<label_bar>') — confirm both names
    # against the columns written by features.make_labels.
    invest = pd.DataFrame({'long_invest': data['label_long_invest'],
                           'short_invest': data['label_short_invest' + str(label_bar)],
                           'long_return': data['label_long_' + str(label_bar)],
                           'short_return': data['label_short_' + str(label_bar)]})

    # Each comparison needs its own parentheses: `&` binds tighter than
    # `>=` / `<`, so the unparenthesised form does not build a row mask.
    in_train = (data['date'] >= train_start_date) & (data['date'] < train_end_date)
    in_test = (data['date'] >= test_start_date) & (data['date'] < test_end_date)
    train_index = [data.loc[in_train].index.tolist()]
    test_index = [data.loc[in_test].index.tolist()]

    # Persist the timestamps of the test rows for the backtesting step.
    date = data.iloc[test_index[0]][['date', 'datetime']].reset_index(drop=True)
    date.to_feather('test_set_date.feather')

    return features, long_label, short_label, train_index, test_index, invest

# Single fixed train/test split. `datetime` is imported as a module, so the
# class has to be addressed as datetime.datetime. The test window must end
# after it starts, hence 2021-08-31 for the test end.
# NOTE(review): load_data treats both ranges as half-open, so rows dated
# 2020-12-31 fall in neither the training nor the test set — confirm intended.
date = {'train_start_date': datetime.datetime(2020, 1, 1),
        'train_end_date': datetime.datetime(2020, 12, 31),
        'test_start_date': datetime.datetime(2021, 1, 1),
        'test_end_date': datetime.datetime(2021, 8, 31)}
features, long_label, short_label, train_index, test_index, invest = load_data(ni_features_labels, 30, date)

In [None]:
print(features.columns)
# Class counts of the long label on the training rows. Despite their names,
# `num_train` is the number of 0-labels and `num_test` the number of 1-labels;
# train() reads both globals to set LightGBM's scale_pos_weight.
_train_long_label = long_label.iloc[train_index[0]]
num_train = _train_long_label.eq(0).sum()
num_test = _train_long_label.eq(1).sum()

### Training

In [None]:
def train(features, labels):
    """Fit a LightGBM binary classifier on the given rows.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns of the training rows.
    labels : array-like of 0/1
        Binary target aligned with ``features``.

    Returns
    -------
    The booster returned by ``lgb.train`` after 10000 boosting rounds.
    """
    # Weight the positive class by the negative/positive ratio of *these*
    # labels, so the long and the short model (and every rolling window) each
    # get their own ratio instead of one shared notebook-level value.
    label_values = np.asarray(labels)
    n_negative = int((label_values == 0).sum())
    n_positive = int((label_values == 1).sum())
    # Without any positive sample the ratio is undefined; use a neutral weight.
    scale_pos_weight = n_negative / n_positive if n_positive else 1.0

    train_data = lgb.Dataset(features, label=labels)
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'num_leaves': 100,
        'max_depth': 12,
        'learning_rate': 0.01,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.6,
        # NOTE(review): per the LightGBM parameter docs bagging_freq=0 disables
        # bagging, which would leave bagging_fraction unused — confirm intended.
        'bagging_freq': 0,
        'seed': 100,
        'verbose': 1,
        'lambda_l1': 0.3,
        'lambda_l2': 1e-03,
        'scale_pos_weight': scale_pos_weight,
    }
    lgbm = lgb.train(params, train_data, num_boost_round=10000)
    return lgbm

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float('nan')
    return numerator / denominator


def _precision_recall(conf):
    """Precision and recall of class 1 from a 2x2 matrix indexed [true][predicted]."""
    true_positive = conf[1][1]
    precision = _safe_ratio(true_positive, conf[0][1] + true_positive)
    recall = _safe_ratio(true_positive, conf[1][0] + true_positive)
    return precision, recall


def test(features, long_labels, short_labels, invest, lgbm_long, lgbm_short):
    """Score the long and short models on held-out rows and save the outputs.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature columns of the test rows.
    long_labels, short_labels : pandas.Series of 0/1
        True labels of the test rows.
    invest : pandas.DataFrame
        Columns 'long_invest', 'short_invest', 'long_return', 'short_return'
        for the same rows.
    lgbm_long, lgbm_short : fitted models exposing ``predict(features)``.

    Returns
    -------
    dict
        Keys 'precision_long', 'recall_long', 'return_rate_long',
        'precision_short', 'recall_short', 'return_rate_short'; each maps to a
        list with one entry per threshold in [0.5, 0.9, 0.99, 0.995]. An entry
        is NaN when no row qualifies, instead of raising ZeroDivisionError.

    Side effects
    ------------
    Writes 'diff_long.feather', 'diff_short.feather' and 'rate_of_return.txt',
    and prints the confusion matrices (all long ones first, then all short).
    """
    y_pred_long = lgbm_long.predict(features)
    y_pred_short = lgbm_short.predict(features)
    long_invest = invest['long_invest'].to_numpy()
    short_invest = invest['short_invest'].to_numpy()
    long_return = invest['long_return'].to_numpy()
    short_return = invest['short_return'].to_numpy()

    # Persist per-row predictions next to labels and returns for backtesting.
    diff_long = pd.DataFrame({'label': long_labels, 'pred': y_pred_long,
                              'long_invest': invest['long_invest'], 'long_return': invest['long_return']})
    diff_long.reset_index(drop=True).to_feather('diff_long.feather')
    diff_short = pd.DataFrame({'label': short_labels, 'pred': y_pred_short,
                               'short_invest': invest['short_invest'], 'short_return': invest['short_return']})
    diff_short.reset_index(drop=True).to_feather('diff_short.feather')

    result = {
        'precision_long': [], 'recall_long': [], 'return_rate_long': [],
        'precision_short': [], 'recall_short': [], 'return_rate_short': [],
    }
    C_long = []
    C_short = []
    long_label = np.asarray(long_labels).ravel()
    short_label = np.asarray(short_labels).ravel()

    for thr in [0.5, 0.9, 0.99, 0.995]:
        picked_long = y_pred_long >= thr
        picked_short = y_pred_short >= thr

        # labels=[0, 1] fixes the class order; the entries can be replaced
        # with your own class names, e.g. 'cat'.
        c = confusion_matrix(long_label, picked_long.astype('int'), labels=[0, 1])
        C_long.append(c)
        precision, recall = _precision_recall(c)
        result['precision_long'].append(precision)
        result['recall_long'].append(recall)
        # Total return over total invested amount of the rows that were picked.
        result['return_rate_long'].append(
            _safe_ratio(long_return[picked_long].sum(), long_invest[picked_long].sum()))

        c = confusion_matrix(short_label, picked_short.astype('int'), labels=[0, 1])
        C_short.append(c)
        precision, recall = _precision_recall(c)
        result['precision_short'].append(precision)
        result['recall_short'].append(recall)
        result['return_rate_short'].append(
            _safe_ratio(short_return[picked_short].sum(), short_invest[picked_short].sum()))

    for c in C_long:
        print(c)
    for c in C_short:
        print(c)

    # Combined strategy: trade a side only when its score exceeds the other
    # side's score by at least the threshold.
    rate_of_return = []
    for thr in [0.9, 0.95, 0.99, 0.995]:
        go_long = (y_pred_long - y_pred_short) >= thr
        go_short = (y_pred_short - y_pred_long) >= thr
        inv = long_invest[go_long].sum() + short_invest[go_short].sum()
        ret = long_return[go_long].sum() + short_return[go_short].sum()
        rate_of_return.append(_safe_ratio(ret, inv))
    df = pd.DataFrame(rate_of_return)
    df.to_csv('rate_of_return.txt', index=False)

    return result

In [None]:
# Preview the first 20 rows of the invest/return table restricted to the test rows.
invest.iloc[test_index[0]].head(20)

In [None]:
# Fit one model per direction on the training rows, save both, then score
# them on the test rows.
train_rows = train_index[0]
test_rows = test_index[0]
lgbm_long = train(features.iloc[train_rows], long_label.iloc[train_rows])
lgbm_short = train(features.iloc[train_rows], short_label.iloc[train_rows])
lgbm_long.save_model('lgbm_long.txt')
lgbm_short.save_model('lgbm_short.txt')
result = test(
    features.iloc[test_rows],
    long_label.iloc[test_rows],
    short_label.iloc[test_rows],
    invest.iloc[test_rows],
    lgbm_long,
    lgbm_short,
)

# One "<feature name>, <importance>" line per feature of the long model.
importance = lgbm_long.feature_importance()
names = lgbm_long.feature_name()
with open('./features_importance.txt', 'w+') as file:
    file.writelines(name + ', ' + str(score) + '\n' for name, score in zip(names, importance))

result_df = pd.DataFrame(result)
result_df.to_csv('result.txt', index=False)

### Hyperparameter tuning

In [None]:
# Training rows of the long model; X_train / y_train are reused by the
# grid-search cells that follow.
X_train = features.iloc[train_index[0]]
y_train = long_label.iloc[train_index[0]]

params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    nthread=4,
    learning_rate=0.01,
    num_leaves=32,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
)

data_train = lgb.Dataset(X_train, y_train)
# 5-fold CV with early stopping; the length of the returned AUC history is
# reported below as the number of boosting rounds.
# NOTE(review): newer LightGBM releases replaced the early_stopping_rounds
# argument with the lgb.early_stopping callback and renamed the result keys —
# confirm against the installed version.
cv_results = lgb.cv(
    params,
    data_train,
    num_boost_round=10000,
    nfold=5,
    stratified=False,
    shuffle=True,
    metrics='auc',
    early_stopping_rounds=100,
    seed=0,
)
auc_mean = cv_results['auc-mean']
print('best n_estimators:', len(auc_mean))
print('best cv score:', pd.Series(auc_mean).max())

In [None]:
# select max_depth and num_leaves
params_test1={'max_depth': range(10,18,1), 'num_leaves':range(80, 200, 20)}
#params_test1={'num_leaves':range(80, 120, 1)}
# n_estimators had no value (a syntax error). 778 is the value every later
# search cell uses.
# NOTE(review): presumably 778 is the "best n_estimators" printed by the
# lgb.cv cell — confirm and update if the CV result differs.
gsearch1 = GridSearchCV(
    estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc',
                                 learning_rate=0.01, n_estimators=778, max_depth=12,
                                 bagging_fraction=0.8, feature_fraction=0.8),
    param_grid=params_test1, scoring='roc_auc', cv=5, n_jobs=-1)
gsearch1.fit(X_train,y_train)
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# select max_bin and min_data_in_leaf
# Grid over histogram bin count and minimum leaf size; the tree shape is held
# at max_depth=12 / num_leaves=100.
params_test2 = {
    'max_bin': range(5, 300, 10),
    'min_data_in_leaf': range(1, 102, 10),
}

gsearch2 = GridSearchCV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        metrics='auc',
        learning_rate=0.01,
        n_estimators=778,
        max_depth=12,
        num_leaves=100,
        bagging_fraction=0.8,
        feature_fraction=0.8,
    ),
    param_grid=params_test2,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
gsearch2.fit(X_train, y_train)
gsearch2.cv_results_, gsearch2.best_params_, gsearch2.best_score_

In [None]:
 params_test3={'feature_fraction': [0.6,0.7,0.8,0.9,1.0],
              'bagging_fraction': [0.6,0.7,0.8,0.9,1.0],
              'bagging_freq': range(0,81,10)
}

gsearch3 = GridSearchCV(estimator = lgb.LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',learning_rate=0.01, n_estimators=778, max_depth=12, num_leaves=100,max_bin=65,min_data_in_leaf=41),
                       param_grid = params_test3, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch3.fit(X_train,y_train)
gsearch3.cv_results_, gsearch3.best_params_, gsearch3.best_score_

In [None]:
# select lambda_l1 and lambda_l2
# Each value appears once: 1e-1 and 0.1 are the same number, so listing both
# only repeats identical fits (100 candidates instead of 81).
regularization_grid = [0.0, 1e-5, 1e-3, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
params_test4 = {
    'lambda_l1': regularization_grid,
    'lambda_l2': regularization_grid,
}

gsearch4 = GridSearchCV(
    estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc',
                                 learning_rate=0.01, n_estimators=778, max_depth=12,
                                 num_leaves=100, max_bin=65, min_data_in_leaf=41,
                                 bagging_freq=0, feature_fraction=0.8),
    param_grid=params_test4, scoring='roc_auc', cv=5, n_jobs=-1)
gsearch4.fit(X_train,y_train)
gsearch4.cv_results_, gsearch4.best_params_, gsearch4.best_score_

In [None]:
# select min_split_gain
params_test5={'min_split_gain':[0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]}

# learning_rate is 0.01 here, matching the other search cells and the final
# training parameters, so min_split_gain is tuned under the same settings.
gsearch5 = GridSearchCV(
    estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metrics='auc',
                                 learning_rate=0.01, n_estimators=778, max_depth=12,
                                 num_leaves=100, max_bin=65, min_data_in_leaf=41,
                                 bagging_freq=0, feature_fraction=0.8,
                                 lambda_l1=0.3, lambda_l2=1e-03),
    param_grid=params_test5, scoring='roc_auc', cv=5, n_jobs=-1)
gsearch5.fit(X_train,y_train)
gsearch5.cv_results_, gsearch5.best_params_, gsearch5.best_score_

## Rolling training

In [None]:
def gen_rolling_dates(start_date, end_date, update, train_days, test_days, slot=0):
    """Build the list of rolling train/test windows.

    Parameters
    ----------
    start_date : date or datetime
        Start of the first training window.
    end_date : date or datetime
        Windows are generated while the test start is strictly before this.
    update : int
        Number of days each successive window is shifted by; must be positive.
    train_days : int
        Length of every training window in days.
    test_days : int
        Length of every test window in days.
    slot : int, optional
        Gap in days between the end of training and the start of testing.

    Returns
    -------
    list of dict
        One dict per window with the keys 'train_start_date',
        'train_end_date', 'test_start_date' and 'test_end_date'. The last
        test window may extend past ``end_date``.

    Raises
    ------
    ValueError
        If ``update`` is not positive (the windows would never advance).
    """
    if update <= 0:
        raise ValueError('update must be a positive number of days')

    step = timedelta(days=update)
    train_span = timedelta(days=train_days)
    test_offset = timedelta(days=train_days + slot)
    test_span = timedelta(days=test_days)

    rollings = []
    train_start = start_date
    while train_start + test_offset < end_date:
        test_start = train_start + test_offset
        rollings.append({
            'train_start_date': train_start,
            'train_end_date': train_start + train_span,
            'test_start_date': test_start,
            'test_end_date': test_start + test_span,
        })
        train_start = train_start + step
    return rollings

# Windows of 300 training days followed by 60 test days, shifted by 60 days.
rollings = gen_rolling_dates(
    start_date=datetime.date(2020, 1, 1),
    end_date=datetime.date(2021, 8, 31),
    update=60,
    train_days=300,
    test_days=60,
)

In [None]:
# Train and evaluate once per rolling window. The metrics of every window are
# kept in `rolling_results` (same order as `rollings`); `result` still holds
# the last window's metrics afterwards.
# NOTE: load_data() and test() write fixed file names, so the saved
# *.feather / rate_of_return.txt files only reflect the last window.
rolling_results = []
for rolling in rollings:
    features, long_label, short_label, train_index, test_index, invest = load_data(ni_features_labels, 30, rolling)
    train_rows = train_index[0]
    test_rows = test_index[0]
    lgbm_long = train(features.iloc[train_rows], long_label.iloc[train_rows])
    lgbm_short = train(features.iloc[train_rows], short_label.iloc[train_rows])
    result = test(features.iloc[test_rows], long_label.iloc[test_rows], short_label.iloc[test_rows],
                  invest.iloc[test_rows], lgbm_long, lgbm_short)
    rolling_results.append(result)

## Backtesting

In [None]:
# Turn the saved test-set predictions into a trade list and write it to result.csv.
LONG_THRESHOLD = 0.992    # open a long trade when long_pred - short_pred >= this
SHORT_THRESHOLD = -0.988  # open a short trade when long_pred - short_pred <= this
CLOSE_SHIFT_ROWS = 30     # closing timestamp is taken this many rows ahead

date = pd.read_feather('test_set_date.feather')
diff_long = pd.read_feather('diff_long.feather')
diff_short = pd.read_feather('diff_short.feather')
datetime_end = date['datetime'].shift(-CLOSE_SHIFT_ROWS)
df = pd.DataFrame({'date': date['date'], 'datetime_open': date['datetime'], 'datetime_close': datetime_end,
                   'long_pred': diff_long['pred'], 'long_invest': diff_long['long_invest'], 'long_return': diff_long['long_return'],
                   'short_pred': diff_short['pred'], 'short_invest': diff_short['short_invest'],
                   'short_return': diff_short['short_return']})
df['diff'] = df['long_pred'] - df['short_pred']
# .copy() so the column assignments below modify this frame, not a view.
df = df.loc[(df['diff'] >= LONG_THRESHOLD) | (df['diff'] <= SHORT_THRESHOLD)].copy()

# Close one second before the shifted timestamp.
df['datetime_close'] = pd.to_datetime(df['datetime_close']) - pd.Timedelta(seconds=1)

# Rows that passed the filter but are not long trades are short trades.
is_long = df['diff'] >= LONG_THRESHOLD
sell = int((~is_long).sum())
df['buy'] = is_long.astype(int)
# Long: open at the invested amount, close at invest + return.
# Short: the two are swapped (open at invest + return, close at invest).
df['open'] = np.where(is_long, df['long_invest'], df['short_invest'] + df['short_return'])
df['close'] = np.where(is_long, df['long_invest'] + df['long_return'], df['short_invest'])
df['return'] = np.where(is_long, df['long_return'], df['short_return'])

df = df.drop(columns=['diff', 'long_pred', 'long_invest', 'long_return',
                      'short_pred', 'short_invest', 'short_return']).reset_index(drop=True)
print(df)
df.to_csv('result.csv', index=False)