# Importing Packages, Loading in the Training Data, and Exploring Memory Usage

In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Input
from sklearn.linear_model import LinearRegression
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import tensorflow as tf
from sklearn.metrics import mean_absolute_error
import random


# Load the training data
data = pd.read_csv('/Users/samhitha/Desktop/JPMC_Samhitha/train.csv')

# Preview the first few rows of the dataset
data.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [24]:
len(data.columns)

17

In [25]:
data.shape

(5237980, 17)

In [26]:
def reduce_memory_usage(df):
    print("Memory Usage Before Optimization:")
    print(df.memory_usage(deep=True).sum() / (1024 ** 2), "MB")

    # Iterate through each column in the DataFrame
    for col in df.columns:
        col_type = df[col].dtype

        # Check if the column's data type is numeric
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()

            # Check if the column's data type is an integer
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Check if the column's data type is a float
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    # Display the memory usage after optimization
    print("\nMemory Usage After Optimization:")
    print(df.memory_usage(deep=True).sum() / (1024 ** 2), "MB")
    return df

data = reduce_memory_usage(data)

Memory Usage Before Optimization:
974.2033262252808 MB

Memory Usage After Optimization:
599.5538015365601 MB


# Rolling Average Method to deal with na Values

In [27]:
print(data.isnull().sum())

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                 220
imbalance_buy_sell_flag          0
reference_price                220
matched_size                   220
far_price                  2894342
near_price                 2857180
bid_price                      220
bid_size                         0
ask_price                      220
ask_size                         0
wap                            220
target                          88
time_id                          0
row_id                           0
dtype: int64


In [28]:
columns_with_missing_values = ['imbalance_size', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'ask_price', 'wap', 'target']

In [29]:
# Checking if any columns start with missing values

first_missing_index_imbalance_size = data['imbalance_size'].isnull().idxmax() if data['imbalance_size'].isnull().any() else None
first_missing_index_reference_price = data['reference_price'].isnull().idxmax() if data['reference_price'].isnull().any() else None
first_missing_index_matched_size = data['matched_size'].isnull().idxmax() if data['matched_size'].isnull().any() else None
first_missing_index_far_price = data['far_price'].isnull().idxmax() if data['far_price'].isnull().any() else None
first_missing_index_near_price = data['near_price'].isnull().idxmax() if data['near_price'].isnull().any() else None
first_missing_index_bid_price = data['bid_price'].isnull().idxmax() if data['bid_price'].isnull().any() else None
first_missing_index_ask_price = data['ask_price'].isnull().idxmax() if data['ask_price'].isnull().any() else None
first_missing_index_wap = data['wap'].isnull().idxmax() if data['wap'].isnull().any() else None
first_missing_index_target = data['target'].isnull().idxmax() if data['target'].isnull().any() else None


first_missing_index_imbalance_size, first_missing_index_reference_price, first_missing_index_matched_size,
first_missing_index_far_price, first_missing_index_near_price, first_missing_index_bid_price,
first_missing_index_ask_price, first_missing_index_wap, first_missing_index_target

(369508, 369508, 369508)

'far_price' and 'near_price' columns start with missing values so we need to handle these edge cases using -1 imputations

In [30]:
for col in columns_with_missing_values:

    data[col] = data[col].fillna(-1)

print(data.isnull().sum()) #should be 0 if all missing elements have been handled

stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
far_price                  0
near_price                 0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap                        0
target                     0
time_id                    0
row_id                     0
dtype: int64


# Creating New Features

In [31]:
data['bid_ref_price_diff'] = data['bid_price'] - data['reference_price']
data['bid_ref_price_ratio'] = data['bid_price'] / data['reference_price']
data['ref_price_ma_5'] = data['reference_price'].rolling(window=5).mean()
data['price_momentum'] = data['reference_price'].diff()
data['volume_weighted_price'] = (data['reference_price'] * data['matched_size']) / data['matched_size'].sum()
data['bid_size_volume_ratio'] = data['bid_size'] / data['matched_size']
data['imbalance_volume_interaction'] = data['imbalance_size'] * data['matched_size']
data['day_of_week'] = data['date_id'] % 7
data['hour_of_day'] = (data['seconds_in_bucket'] // 3600) % 24
data['price_volatility'] = data['reference_price'].rolling(window=5).std()

In [32]:
print(data.isnull().sum())

stock_id                        0
date_id                         0
seconds_in_bucket               0
imbalance_size                  0
imbalance_buy_sell_flag         0
reference_price                 0
matched_size                    0
far_price                       0
near_price                      0
bid_price                       0
bid_size                        0
ask_price                       0
ask_size                        0
wap                             0
target                          0
time_id                         0
row_id                          0
bid_ref_price_diff              0
bid_ref_price_ratio             0
ref_price_ma_5                  4
price_momentum                  1
volume_weighted_price           0
bid_size_volume_ratio           0
imbalance_volume_interaction    0
day_of_week                     0
hour_of_day                     0
price_volatility                4
dtype: int64


In [33]:
new_columns_with_missing_values = ['ref_price_ma_5', 'price_momentum', 'price_volatility']

In [34]:
window_size = 5 # Number of elements before current missing element to take an average of

for col in new_columns_with_missing_values:
    data[col] = data[col].fillna(-1) # Using backward fill

print(data.isnull().sum()) #should be 0 if all missing elements have been handled

stock_id                        0
date_id                         0
seconds_in_bucket               0
imbalance_size                  0
imbalance_buy_sell_flag         0
reference_price                 0
matched_size                    0
far_price                       0
near_price                      0
bid_price                       0
bid_size                        0
ask_price                       0
ask_size                        0
wap                             0
target                          0
time_id                         0
row_id                          0
bid_ref_price_diff              0
bid_ref_price_ratio             0
ref_price_ma_5                  0
price_momentum                  0
volume_weighted_price           0
bid_size_volume_ratio           0
imbalance_volume_interaction    0
day_of_week                     0
hour_of_day                     0
price_volatility                0
dtype: int64


In [35]:
seed_value = 42
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
random.seed(seed_value)

In [36]:
features = ['imbalance_size', 'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
            'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'ask_size',
            'wap', 'bid_ref_price_diff', 'bid_ref_price_ratio', 'ref_price_ma_5',
            'price_momentum', 'volume_weighted_price', 'bid_size_volume_ratio',
            'imbalance_volume_interaction', 'day_of_week', 'hour_of_day', 'price_volatility']
target_column = 'target'

In [37]:
X = data[features].values
y = data[target_column].values.reshape(-1, 1)

In [38]:
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()

In [39]:
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y)

In [40]:
time_step = 55  # Number of previous time steps to use
batch_size = 64  # Batch size for training

# Create time-series generators for training and testing
split_index = int(len(X_scaled) * 0.75)
X_train, X_test = X_scaled[:split_index], X_scaled[split_index:]
y_train, y_test = y_scaled[:split_index], y_scaled[split_index:]

# Training and validation generators
train_generator = TimeseriesGenerator(X_train, y_train, length=time_step, batch_size=batch_size)
test_generator = TimeseriesGenerator(X_test, y_test, length=time_step, batch_size=batch_size)

In [41]:
time_step = 55
n_features = X_scaled.shape[1]

# Create a Sequential model
model = Sequential()

# Add the first GRU layer (with return_sequences=True for stacking)
model.add(GRU(128, activation='tanh', return_sequences=True, input_shape=(time_step, n_features)))

# Add a second GRU layer
model.add(GRU(64, activation='tanh', return_sequences=True))

# Add a third GRU layer (without return_sequences as it's the final GRU layer)
model.add(GRU(32, activation='tanh'))

# Add a Dropout layer for regularization
model.add(Dropout(0.2))

# Add a Dense output layer (assuming regression task; adjust units and activation if classification)
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss='mae', metrics=['mae'])

# Print the model summary
model.summary()

  super().__init__(**kwargs)


In [20]:
# n_features = X_scaled.shape[1]

In [20]:
# model = Sequential([
#     GRU(50, activation='relu', input_shape=(time_step, n_features)),  # Single GRU layer
#     Dense(1)  # Output layer for regression
# ])

  super().__init__(**kwargs)


In [21]:
# model.compile(optimizer=Adam(), loss='mae')

In [42]:
tf.keras.backend.clear_session()

In [43]:
model.fit(train_generator, validation_data=test_generator, epochs=5)

  self._warn_if_super_not_called()


Epoch 1/5
[1m61382/61382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12391s[0m 202ms/step - loss: 0.0114 - mae: 0.0114 - val_loss: 0.0074 - val_mae: 0.0074
Epoch 2/5
[1m61382/61382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8272s[0m 135ms/step - loss: 0.0079 - mae: 0.0079 - val_loss: 0.0075 - val_mae: 0.0075
Epoch 3/5
[1m61382/61382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4981s[0m 81ms/step - loss: 0.0079 - mae: 0.0079 - val_loss: 0.0075 - val_mae: 0.0075
Epoch 4/5
[1m61382/61382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44352s[0m 723ms/step - loss: 0.0079 - mae: 0.0079 - val_loss: 0.0074 - val_mae: 0.0074
Epoch 5/5
[1m61382/61382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28528s[0m 465ms/step - loss: 0.0079 - mae: 0.0079 - val_loss: 0.0075 - val_mae: 0.0075


<keras.src.callbacks.history.History at 0x1461cc0d0>

In [44]:
y_train_pred = model.predict(train_generator)  

y_train_inverse = scaler_y.inverse_transform(y_train) 
y_train_pred_inverse = scaler_y.inverse_transform(y_train_pred)

[1m61382/61382[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2811s[0m 46ms/step


In [45]:
y_pred = model.predict(test_generator)

# Transform y_test and y_pred back to their original scale
y_test_inverse = scaler_y.inverse_transform(y_test)  # Assuming y_test is already in the correct shape
y_pred_inverse = scaler_y.inverse_transform(y_pred)

[1m20460/20460[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m631s[0m 31ms/step


In [46]:
y_train_inverse_flat = y_train_inverse[-len(y_train_pred_inverse):].flatten()
y_train_pred_inverse_flat = y_train_pred_inverse.flatten()

# Calculate the training MAE
mae_train = mean_absolute_error(y_train_inverse_flat, y_train_pred_inverse_flat)
print("Training Set Mean Absolute Error:", mae_train)

Training Set Mean Absolute Error: 6.5478053


In [47]:
y_test_inverse_flat = y_test_inverse[-len(y_pred_inverse):].flatten()  # Ensure the same length as y_pred_inverse
y_pred_inverse_flat = y_pred_inverse.flatten()

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test_inverse_flat, y_pred_inverse_flat)
print("Test Set Mean Absolute Error:", mae)

Test Set Mean Absolute Error: 6.198259
