In [None]:
!pip install catboost
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pandas as pd
import numpy as np

data = pd.read_csv('train.csv')

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
# Columns that are known to contain NaN in the raw table.
columns_with_missing_values = [
    'imbalance_size', 'reference_price', 'matched_size',
    'far_price', 'near_price', 'bid_price', 'ask_price',
    'wap', 'target'
]

# A row without a label can be neither learned from nor scored against.
# Filling the target with -1 would fabricate labels and distort both the fit
# and the reported MAE, so unlabeled rows are removed instead.
data = data.dropna(subset=['target']).reset_index(drop=True)

# Only input columns get the -1 sentinel for "missing".
# NOTE(review): -1 is an arbitrary placeholder; CatBoost can also consume NaN
# in numeric features directly - consider leaving the gaps as NaN.
feature_columns_to_fill = [col for col in columns_with_missing_values if col != 'target']
data[feature_columns_to_fill] = data[feature_columns_to_fill].fillna(-1)


In [None]:
# Derived features. All ten columns are computed from the existing columns of
# `data` and appended in one step; none of them depends on another new column.
ref_price = data['reference_price']
matched = data['matched_size']

# NOTE(review): rolling()/diff() follow the row order of the whole frame. If the
# file interleaves several instruments or days, the 5-row windows will straddle
# them - confirm whether a groupby is needed.
# NOTE(review): day_of_week assumes date_id counts consecutive days, and
# hour_of_day is constant 0 whenever seconds_in_bucket < 3600 - verify both.
data = data.assign(
    bid_ref_price_diff=data['bid_price'] - ref_price,
    bid_ref_price_ratio=data['bid_price'] / ref_price,
    ref_price_ma_5=ref_price.rolling(window=5).mean(),
    price_momentum=ref_price.diff(),
    volume_weighted_price=(ref_price * matched) / matched.sum(),
    bid_size_volume_ratio=data['bid_size'] / matched,
    imbalance_volume_interaction=data['imbalance_size'] * matched,
    day_of_week=data['date_id'] % 7,
    hour_of_day=(data['seconds_in_bucket'] // 3600) % 24,
    price_volatility=ref_price.rolling(window=5).std(),
)

In [None]:
# These derived columns can contain NaN (e.g. the leading rows of a rolling
# window or of diff()); replace the gaps with the -1 sentinel in a single
# vectorised call.
new_columns_with_missing_values = ['ref_price_ma_5', 'price_momentum', 'price_volatility']
data[new_columns_with_missing_values] = data[new_columns_with_missing_values].fillna(-1)

In [None]:
# Columns taken directly from the input table.
raw_features = [
    'imbalance_size',
    'imbalance_buy_sell_flag',
    'reference_price',
    'matched_size',
    'far_price',
    'near_price',
    'bid_price',
    'bid_size',
    'ask_price',
    'ask_size',
    'wap',
]

# Columns created by the feature-engineering step.
engineered_features = [
    'bid_ref_price_diff',
    'bid_ref_price_ratio',
    'ref_price_ma_5',
    'price_momentum',
    'volume_weighted_price',
    'bid_size_volume_ratio',
    'imbalance_volume_interaction',
    'day_of_week',
    'hour_of_day',
    'price_volatility',
]

# Model inputs (order matters for the resulting design matrix) and the label.
features = raw_features + engineered_features
target_column = 'target'

In [None]:
# Design matrix and label vector.
X = data.loc[:, features]
y = data.loc[:, target_column]

# Ordered hold-out: with shuffle=False the first 75% of rows (in file order)
# become the training set and the last 25% the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=False,
)

In [None]:
# Gradient-boosted regressor.
# - eval_metric='MAE': monitor (and pick the best iteration by) the same metric
#   the notebook reports, instead of the default metric of the training loss.
# - early_stopping_rounds=50: stop once the eval metric has not improved for 50
#   iterations rather than always paying for all 500 boosting rounds.
# - day_of_week / hour_of_day are integer codes handled as categorical features.
model = CatBoostRegressor(
    iterations=500,
    depth=8,
    learning_rate=0.1,
    eval_metric='MAE',
    early_stopping_rounds=50,
    cat_features=['day_of_week', 'hour_of_day'],
    verbose=10,
)

# NOTE(review): the hold-out set is also used here for best-iteration selection,
# so the MAE later reported on it is optimistic; consider carving a separate
# validation slice out of the training rows.
model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 9.8001404	test: 8.7690656	best: 8.7690656 (0)	total: 1.58s	remaining: 13m 9s
10:	learn: 9.7082135	test: 8.7176041	best: 8.7176041 (10)	total: 19.8s	remaining: 14m 38s
20:	learn: 9.6805778	test: 8.7088065	best: 8.7088065 (20)	total: 36.8s	remaining: 13m 58s
30:	learn: 9.6604027	test: 8.7058237	best: 8.7058237 (30)	total: 53.2s	remaining: 13m 25s
40:	learn: 9.6436749	test: 8.7043528	best: 8.7043528 (40)	total: 1m 9s	remaining: 12m 59s
50:	learn: 9.6296390	test: 8.7035560	best: 8.7029992 (49)	total: 1m 26s	remaining: 12m 39s
60:	learn: 9.6179039	test: 8.7031342	best: 8.7029946 (54)	total: 1m 42s	remaining: 12m 17s
70:	learn: 9.6064087	test: 8.7023832	best: 8.7023102 (69)	total: 2m	remaining: 12m 5s
80:	learn: 9.5967401	test: 8.7019420	best: 8.7018943 (73)	total: 2m 17s	remaining: 11m 52s
90:	learn: 9.5882436	test: 8.7015463	best: 8.7015463 (90)	total: 2m 34s	remaining: 11m 34s
100:	learn: 9.5773363	test: 8.7026864	best: 8.7013257 (92)	total: 2m 55s	remaining: 11m 33s
110:	learn:

<catboost.core.CatBoostRegressor at 0x7d54ba75ed70>

In [None]:
# Score the fitted model on the rows it was trained on ...
train_preds = model.predict(X_train)
mae_train = mean_absolute_error(y_train, train_preds)

# ... and on the held-out rows.
test_preds = model.predict(X_test)
mae_test = mean_absolute_error(y_test, test_preds)

# Report both errors side by side so over-/under-fitting is visible at a glance.
print(f"Mean Absolute Error (Train): {mae_train}")
print(f"Mean Absolute Error (Test): {mae_test}")

Mean Absolute Error (Train): 6.503802233496159
Mean Absolute Error (Test): 6.046937598219381
