In [12]:
import pandas as pd

train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [13]:
def preprocess_data(df):
    """
    Preprocess the given dataframe.
    
    Parameters:
    - df: pandas DataFrame to preprocess
    
    Returns:
    - Preprocessed DataFrame
    """
    df = df.dropna(subset=['wap'])
    
    # Ensure 'imbalance_buy_sell_flag' exists before computing and dropping
    if 'imbalance_buy_sell_flag' in df.columns:
        df['imbalance_size'] = df['imbalance_size'] * df['imbalance_buy_sell_flag']
        df = df.drop(columns=['imbalance_buy_sell_flag'])
    
    # Replace missing values in 'far_price' and 'near_price' with -999
    df['far_price'].fillna(-999, inplace=True)
    df['near_price'].fillna(-999, inplace=True)
    
    return df

# Example usage:
# train_df = preprocess_data(train_df)
# test_df = preprocess_data(test_df)


In [14]:
train_df = preprocess_data(train_df)
train_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['imbalance_size'] = df['imbalance_size'] * df['imbalance_buy_sell_flag']


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,0.999812,13380276.64,-999.0,-999.0,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,-166603.91,0.999896,1642214.25,-999.0,-999.0,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,-302879.87,0.999561,1819368.03,-999.0,-999.0,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,-11917682.27,1.000171,18389745.62,-999.0,-999.0,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,-447549.96,0.999532,17860614.95,-999.0,-999.0,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [18]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

df = train_df


# Prepare the dataset
features_to_use = [
    'stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size', 
    'reference_price', 'matched_size', 
    'far_price', 'near_price', 'bid_price'
]
X = df[features_to_use]
X = df.drop(columns=['target', 'row_id'])  # Drop target and row_id to get features
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a CatBoost regressor
model = CatBoostRegressor(iterations=500, learning_rate=0.1, depth=6, verbose=0)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Compute and print the MAE
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')



In [16]:
from public_timeseries_testing_util import MockApi

# Initialize the MockApi
env = MockApi()
iter_test = env.iter_test()

# Loop to make predictions in a time-series manner
for (test_df, revealed_targets_df, sample_prediction_df) in iter_test:
    # Preprocess the test data
    feat = preprocess_data(test_df)
    
    # Make predictions using the trained model
    predictions = model.predict(feat)
    
    # Update the 'target' column in sample_prediction_df with the predictions
    sample_prediction_df['target'] = predictions
    
    # Submit predictions back to the local environment
    env.predict(sample_prediction_df)

   stock_id  date_id  seconds_in_bucket  imbalance_size  \
0         0      481                  0      5497966.18   
1         1      481                  0            0.00   
2         2      481                  0      5439944.90   

   imbalance_buy_sell_flag  reference_price  matched_size  far_price  \
0                       -1         1.000126   26466712.51        NaN   
1                        0         1.000209   11301206.67        NaN   
2                       -1         0.999948   13413115.07        NaN   

   near_price  bid_price  bid_size  ask_price   ask_size  wap   row_id  
0         NaN   0.999950  11324.00   1.000126   28315.00  1.0  481_0_0  
1         NaN   0.999778  18758.73   1.000209   17651.95  1.0  481_0_1  
2         NaN   0.999948   9445.50   1.000636  116259.60  1.0  481_0_2  


KeyError: "['time_id'] not in index"