In [21]:
import pandas as pd
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Load training and testing datasets
train_csv_path = '/content/drive/MyDrive/Colab Notebooks/train.csv'
test_csv_path = '/content/drive/MyDrive/Colab Notebooks/test.csv'

# Load data
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

# Display the first few rows of the training and testing datasets
print("Training Data:")
print(train_data.head())

print("\nTesting Data:")
print(test_data.head())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training Data:
   stock_id  date_id  seconds_in_bucket  imbalance_size  \
0         0        0                  0      3180602.69   
1         1        0                  0       166603.91   
2         2        0                  0       302879.87   
3         3        0                  0     11917682.27   
4         4        0                  0       447549.96   

   imbalance_buy_sell_flag  reference_price  matched_size  far_price  \
0                        1         0.999812   13380276.64        NaN   
1                       -1         0.999896    1642214.25        NaN   
2                       -1         0.999561    1819368.03        NaN   
3                       -1         1.000171   18389745.62        NaN   
4                       -1         0.999532   17860614.95        NaN   

   near_price  bid_price  bid_size  ask_price   ask_size  wap    tar

In [23]:
#
# 1. Data Process

def preprocess_missing_values_advanced(data):
    # Fill independent fields
    data['bid_price'] = data['bid_price'].fillna(data['bid_price'].median())
    data['ask_price'] = data['ask_price'].fillna(data['ask_price'].median())
    data['bid_size'] = data['bid_size'].fillna(0)  # Default to 0 if no bid size
    data['ask_size'] = data['ask_size'].fillna(0)  # Default to 0 if no ask size

    # Compute 'wap' (Weighted Average Price)
    data['wap'] = (
        (data['bid_price'] * data['ask_size'] + data['ask_price'] * data['bid_size']) /
        (data['bid_size'] + data['ask_size'] + 1e-9)  # Avoid division by zero
    )
    data['wap'] = data['wap'].fillna(data['wap'].median())

    # Fill 'far_price' and 'near_price'
    data['far_price'] = data['far_price'].fillna(data['ask_price'])
    data['far_price'] = data['far_price'].fillna(data['reference_price'])
    data['far_price'] = data['far_price'].fillna(data['far_price'].median())

    data['near_price'] = data['near_price'].fillna(data['bid_price'])
    data['near_price'] = data['near_price'].fillna(data['reference_price'])
    data['near_price'] = data['near_price'].fillna(data['near_price'].median())

    # Compute 'imbalance_size'
    data['imbalance_size'] = data['imbalance_size'].fillna(
        (data['bid_size'] + data['ask_size']) - data['matched_size']
    )
    data['imbalance_size'] = data['imbalance_size'].fillna(0)  # Default to 0

    # Compute 'matched_size'
    data['matched_size'] = data['matched_size'].fillna(
        (data['bid_size'] + data['ask_size']) - data['imbalance_size']
    )
    data['matched_size'] = data['matched_size'].fillna(0)  # Default to 0

    # Fill 'reference_price'
    data['reference_price'] = data['reference_price'].fillna(
        data[['near_price', 'far_price']].mean(axis=1)
    )
    data['reference_price'] = data['reference_price'].fillna(data['reference_price'].median())

    # Fill 'target' (only in training data)
    if 'target' in data.columns:
        data['target'] = data['target'].fillna(data['target'].median())

    # Validate no missing values remain
    remaining_missing = data.isnull().sum()
    if remaining_missing.sum() > 0:
        print("Still missing values in the following columns:")
        print(remaining_missing[remaining_missing > 0])
        raise AssertionError("There are still unprocessed missing values!")

    return data

# Process training and testing datasets
train_data = preprocess_missing_values_advanced(train_data)
test_data = preprocess_missing_values_advanced(test_data)

# Print results
print("\nMissing Values After Processing (Training Data):")
print(train_data.isnull().sum())

print("\nMissing Values After Processing (Testing Data):")
print(test_data.isnull().sum())


Missing Values After Processing (Training Data):
stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
far_price                  0
near_price                 0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap                        0
target                     0
time_id                    0
row_id                     0
dtype: int64

Missing Values After Processing (Testing Data):
stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
far_price                  0
near_price                 0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap               

In [25]:
#
# 2. Feature Engineer

def add_features(data):
    # Add basic common features
    data['mid_price'] = (data['ask_price'] + data['bid_price']) / 2
    data['price_spread'] = (data['ask_price'] - data['bid_price']) / data['mid_price']
    data['imbalance_ratio'] = data['imbalance_size'] / (data['matched_size'] + 1e-9)

    data['price_pressure'] = data['imbalance_size'] * (data['ask_price'] - data['bid_price'])
    data['imbalance_momentum'] = data['imbalance_size'].diff().fillna(0)

    data['mid_price_zscore'] = (data['mid_price'] - data['mid_price'].mean()) / data['mid_price'].std()

    # Create `seconds_in_bucket_group`
    data['seconds_in_bucket_group'] = pd.cut(
        data['seconds_in_bucket'], bins=[0, 300, 480, float('inf')], labels=[0, 1, 2]
    ).cat.add_categories([-1]).fillna(-1).astype('int')

    # Grouped statistics features
    group_cols = ['date_id', 'seconds_in_bucket_group', 'stock_id']
    agg_features = data.groupby(group_cols).agg(
        mid_price_group_mean=('mid_price', 'mean'),
        price_spread_group_mean=('price_spread', 'mean'),
        imbalance_ratio_group_mean=('imbalance_ratio', 'mean'),
        mid_price_group_std=('mid_price', 'std'),
    ).reset_index()

    # Merge back to the original data
    data = pd.merge(data, agg_features, on=group_cols, how='left')

    # Rolling window statistics features
    rolling_window = 50
    data['rolling_mid_price_mean'] = (
        data.groupby(['date_id', 'stock_id'])['mid_price']
        .transform(lambda x: x.rolling(window=rolling_window, min_periods=1).mean())
    )
    data['rolling_mid_price_std'] = (
        data.groupby(['date_id', 'stock_id'])['mid_price']
        .transform(lambda x: x.rolling(window=rolling_window, min_periods=1).std())
    )

    # Ranking features
    data['mid_price_rank'] = data.groupby(['date_id', 'seconds_in_bucket'])['mid_price'].rank(method='average')
    data['mid_price_rank_pct'] = data['mid_price_rank'] / data.groupby(
        ['date_id', 'seconds_in_bucket']
    )['mid_price'].transform('count')

    # Final missing value imputation
    data.fillna(0, inplace=True)

    return data

# Apply feature engineering to training and testing datasets
train_data = add_features(train_data)
test_data = add_features(test_data)

# Print results to confirm the new features have been added
print("Training Data with New Features:")
print(train_data.head())

print("\nTesting Data with New Features:")
print(test_data.head())

# Check for any remaining missing values
print("\nMissing Values After Feature Engineering (Train):")
print(train_data.isnull().sum())

print("\nMissing Values After Feature Engineering (Test):")
print(test_data.isnull().sum())


Training Data with New Features:
   stock_id  date_id  seconds_in_bucket  imbalance_size  \
0         0        0                  0      3180602.69   
1         1        0                  0       166603.91   
2         2        0                  0       302879.87   
3         3        0                  0     11917682.27   
4         4        0                  0       447549.96   

   imbalance_buy_sell_flag  reference_price  matched_size  far_price  \
0                        1         0.999812   13380276.64   1.000026   
1                       -1         0.999896    1642214.25   1.000660   
2                       -1         0.999561    1819368.03   1.000298   
3                       -1         1.000171   18389745.62   1.000214   
4                       -1         0.999532   17860614.95   1.000016   

   near_price  bid_price  ...  imbalance_ratio_group_mean_x  \
0    0.999812   0.999812  ...                      0.237708   
1    0.999896   0.999896  ...                      0.