In [4]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Reading the subsample of the training data
train_df = pd.read_csv('train.csv')

In [5]:
# Using a combination of forward and backward filling to impute missing values
# Forward fill first, then backward fill
train_df.fillna(method='ffill', inplace=True)
train_df.fillna(method='bfill', inplace=True)

# Check if there are any missing values left
missing_values_after = train_df.isnull().sum()
missing_values_after

stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
far_price                  0
near_price                 0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap                        0
target                     0
time_id                    0
row_id                     0
dtype: int64

In [6]:
# Calculate the difference between bid and ask price
train_df['bid_ask_spread'] = train_df['ask_price'] - train_df['bid_price']

# Calculate the mid price between the bid and ask price
train_df['mid_price'] = (train_df['bid_price'] + train_df['ask_price']) / 2

# Calculate the difference between the reference price and the mid price
train_df['ref_mid_diff'] = train_df['reference_price'] - train_df['mid_price']

# Calculate the difference between the far price and the near price
train_df['far_near_diff'] = train_df['far_price'] - train_df['near_price']

# Calculate the difference between the near price and the reference price
train_df['near_ref_diff'] = train_df['near_price'] - train_df['reference_price']

# Calculate the difference between the far price and the reference price
train_df['far_ref_diff'] = train_df['far_price'] - train_df['reference_price']

# Calculate the rolling average for weighted average price (wap) for each stock_id and date_id
train_df['wap_rolling_avg'] = train_df.groupby(['stock_id', 'date_id'])['wap'].transform(lambda x: x.rolling(window=5, min_periods=1).mean())

# Calculate the exponential moving average for weighted average price (wap) for each stock_id and date_id
train_df['wap_ewm_avg'] = train_df.groupby(['stock_id', 'date_id'])['wap'].transform(lambda x: x.ewm(span=5).mean())

# Show the first few rows of the dataframe with the newly created features
train_df.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,...,time_id,row_id,bid_ask_spread,mid_price,ref_mid_diff,far_near_diff,near_ref_diff,far_ref_diff,wap_rolling_avg,wap_ewm_avg
0,0,0,0,3180602.69,1,0.999812,13380276.64,1.000241,1.000241,0.999812,...,0,0_0_0,0.000214,0.999919,-0.000107,0.0,0.000429,0.000429,1.0,1.0
1,1,0,0,166603.91,-1,0.999896,1642214.25,1.000241,1.000241,0.999896,...,0,0_0_1,0.000764,1.000278,-0.000382,0.0,0.000345,0.000345,1.0,1.0
2,2,0,0,302879.87,-1,0.999561,1819368.03,1.000241,1.000241,0.999403,...,0,0_0_2,0.000895,0.99985,-0.000289,0.0,0.00068,0.00068,1.0,1.0
3,3,0,0,11917682.27,-1,1.000171,18389745.62,1.000241,1.000241,0.999999,...,0,0_0_3,0.000215,1.000107,6.4e-05,0.0,7e-05,7e-05,1.0,1.0
4,4,0,0,447549.96,-1,0.999532,17860614.95,1.000241,1.000241,0.999394,...,0,0_0_4,0.000622,0.999705,-0.000173,0.0,0.000709,0.000709,1.0,1.0


In [7]:
#    _____     ____
#  /      \  |  o | 
# |        |/ ___\| 
# |_________/     
# |_|_| |_|_|


import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Preparing the features and target variable
# Include only the columns you want as features
feature_columns = [
    'stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size', 'imbalance_buy_sell_flag',
    'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price',
    'bid_ask_spread', 'mid_price', 'ref_mid_diff', 'far_near_diff', 'near_ref_diff',
    'far_ref_diff', 'wap_rolling_avg', 'wap_ewm_avg'
]
X = train_df[feature_columns]
y = train_df['target']

# Splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing the XGBoost model
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    n_estimators=100
)

# Training the model
model.fit(X_train, y_train)

# Making predictions
y_pred = model.predict(X_test)

# Calculating the Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error (MAE): {mae}")

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Mean Absolute Error (MAE): 6.387081420497385
