In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout

In [2]:
data = pd.read_csv('train.csv') # Loading training data 

In [3]:
def reduce_memory_usage(df):
    print("Memory Usage Before Optimization:")
    print(df.memory_usage(deep=True).sum() / (1024 ** 2), "MB")

    # 🔄 Iterate through each column in the DataFrame
    for col in df.columns:
        col_type = df[col].dtype

        # Check if the column's data type is not 'object' (i.e., numeric)
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            
            # Check if the column's data type is an integer
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                # Check if the column's data type is a float
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)
                    
    # Display the memory usage after optimization
    print("\nMemory Usage After Optimization:")
    print(df.memory_usage(deep=True).sum() / (1024 ** 2), "MB")
    return df

data = reduce_memory_usage(data)

Memory Usage Before Optimization:
974.2033262252808 MB

Memory Usage After Optimization:
599.5538015365601 MB


In [4]:
data.head() # Reading first few examples 

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180603.0,1,0.999812,13380277.0,,,0.999812,60651.5,1.000026,8493.030273,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.9,-1,0.999896,1642214.25,,,0.999896,3233.040039,1.00066,20605.089844,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.9,-1,0.999561,1819368.0,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917680.0,-1,1.000171,18389746.0,,,0.999999,2324.899902,1.000214,479032.40625,1.0,-4.010201,0,0_0_3
4,4,0,0,447550.0,-1,0.999532,17860614.0,,,0.999394,16485.539062,1.000016,434.100006,1.0,-7.349849,0,0_0_4


## Rolling Averages for missing data values

In [5]:
print(data.isnull().sum())

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                 220
imbalance_buy_sell_flag          0
reference_price                220
matched_size                   220
far_price                  2894342
near_price                 2857180
bid_price                      220
bid_size                         0
ask_price                      220
ask_size                         0
wap                            220
target                          88
time_id                          0
row_id                           0
dtype: int64


In [6]:
columns_with_missing_values = ['imbalance_size', 'reference_price', 'matched_size', 'far_price', 'near_price', 'bid_price', 'ask_price', 'wap', 'target']

In [7]:
window_size = 5

for col in columns_with_missing_values:
    # Create rolling mean for the column
    if data[col].isnull().iloc[0]: # If first few values are missing from data, use forward fill
        data[col].fillna(method='bfill', inplace=True) # forward fill 

    data[col + '_rolling_mean'] = data[col].rolling(window=window_size, min_periods=1).mean()

    # Fill missing values in the original column with the rolling average
    data[col] = data[col].fillna(data[col + '_rolling_mean'])

    # Drop the temporary rolling mean column
    data.drop(columns=[col + '_rolling_mean'], inplace=True)
    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(method='bfill', inplace=True) # forward fill
  data[col].fillna(method='bfill', inplace=True) # forward fill
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(method='bfill', inplace=True) # forward fill
  data[col].fillna(method='bfill', inplace=T

In [8]:
print(data.isnull().sum())

stock_id                   0
date_id                    0
seconds_in_bucket          0
imbalance_size             0
imbalance_buy_sell_flag    0
reference_price            0
matched_size               0
far_price                  0
near_price                 0
bid_price                  0
bid_size                   0
ask_price                  0
ask_size                   0
wap                        0
target                     0
time_id                    0
row_id                     0
dtype: int64


## Creating new Features

In [9]:
data['bid_ref_price_diff'] = data['bid_price'] - data['reference_price']
data['bid_ref_price_ratio'] = data['bid_price'] / data['reference_price']
data['ref_price_ma_5'] = data['reference_price'].rolling(window=5).mean()
data['price_momentum'] = data['reference_price'].diff()
data['volume_weighted_price'] = (data['reference_price'] * data['matched_size']) / data['matched_size'].sum()
data['bid_size_volume_ratio'] = data['bid_size'] / data['matched_size']
data['imbalance_volume_interaction'] = data['imbalance_size'] * data['matched_size']
data['day_of_week'] = data['date_id'] % 7
data['hour_of_day'] = (data['seconds_in_bucket'] // 3600) % 24
data['price_volatility'] = data['reference_price'].rolling(window=5).std()

In [10]:
target_column = 'target'

scaler = MinMaxScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

In [None]:
sequence_length = 100

def create_sequences(data, target_col, seq_len):
    X = []
    y = []
    for i in range(len(data) - seq_len):
        X.append(data.iloc[i:i + seq_len].values)
        y.append(data[target_col].iloc[i + seq_len])
    return np.array(X), np.array(y)

X, y = create_sequences(scaled_data, target_column, sequence_length)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
model = Sequential()

model.add(GRU(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))

# Add another GRU layer
model.add(GRU(units=50, return_sequences=False))
model.add(Dropout(0.2))

# Output layer
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

# Make predictions
predictions = model.predict(X_test)

# Rescale predictions back to original scale
predictions = scaler.inverse_transform(predictions)