In [1]:
import lightgbm as lgb 
import xgboost as xgb 
import catboost as cbt 
import numpy as np 
import joblib 
import pandas as pd 
import warnings

import optuna
from sklearn.metrics import mean_absolute_error



In [2]:
def calculate_rsi(prices, window=14):
    # Calculate daily price changes
    delta = prices.diff()

    # Separate gains and losses
    gains = delta.where(delta > 0, 0)
    losses = -delta.where(delta < 0, 0)

    # Calculate average gains and losses over the specified window
    avg_gains = gains.rolling(window=window, min_periods=1).mean()
    avg_losses = losses.rolling(window=window, min_periods=1).mean()

    # Calculate relative strength (RS) and RSI
    rs = avg_gains / avg_losses
    rsi = 100 - (100 / (1 + rs))

    return rsi

In [3]:
def generate_features(df):
    features = ['seconds_in_bucket','imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2', 'imbalance_ratio', 'bid_size_over_ask_size', 'wap_ask_price_imb',
                'wap_bid_price_imb', 'wap_reference_price_imb', 'bid_price_reference_price_imb',
                'rsi'
               ]


    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')
    df['imbalance_ratio'] = df.eval('imbalance_size/matched_size')
    df["bid_size_over_ask_size"] = df.eval('bid_size/ask_size')
    df['wap_ask_price_imb']= df.eval('(wap-ask_price)/(wap+ask_price)')
    df['wap_bid_price_imb']= df.eval('(wap-bid_price)/(wap+bid_price)')
    df['wap_reference_price_imb']=df.eval('(wap-reference_price)/(wap+reference_price)')
    df['bid_price_reference_price_imb']=df.eval('(bid_price-reference_price)/(bid_price+reference_price)')
    
    # Calculate RSI for each stock    TODO::: checke the index of rsi, macd
    df['rsi'] = df.groupby('stock_id')['wap'].apply(calculate_rsi).reset_index(level=0, drop=True)
    
    return df[features]

In [4]:
df_orig = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
df_orig.fillna(0, inplace = True)

In [5]:
#X = generate_features(df_orig)
#X= X.fillna(0, inplace = True)

In [6]:
X = generate_features(df_orig).values
Y = df_orig['target'].values

In [7]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mae',
    'device': 'gpu', 
    'num_leaves': 25,
    'max_depth':7,
}

model = lgb.LGBMRegressor(**params, n_estimators=895)
model.fit(X, Y)

In [8]:
#lgb.plot_importance(model, importance_type="gain")

In [9]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

In [10]:
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    feat = generate_features(test)
    sample_prediction['target'] = model.predict(feat)
    env.predict(sample_prediction)
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [11]:
sample_prediction

Unnamed: 0,row_id,target
0,480_540_0,-1.324023
1,480_540_1,-0.551237
2,480_540_2,0.767845
3,480_540_3,-1.433244
4,480_540_4,-0.859411
...,...,...
195,480_540_195,-1.815581
196,480_540_196,-1.175608
197,480_540_197,-0.299041
198,480_540_198,1.226490
