### Libraries

In [12]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import preprocessing

In [13]:
# Read the merged trade/quote records from the parquet file and preview the first rows.
trades_quotes_file = "Trades_Quote.parquet"
df = pd.read_parquet(trades_quotes_file)
df.head()

Unnamed: 0,COMPANY,DATE,TRADE_TIME,QUOTE_TIME,EX,SIZE,PRICE,BID,BIDSIZ,ASK,ASKSIZ
0,C,2024-01-02,2024-01-02 09:32:40.845895,2024-01-02 09:32:40.842364,H,100,51.33,51.33,1.0,51.36,1.0
1,C,2024-01-02,2024-01-02 09:32:40.888035,2024-01-02 09:32:40.888012,H,100,51.33,51.32,1.0,51.35,1.0
2,C,2024-01-02,2024-01-02 09:32:40.888311,2024-01-02 09:32:40.888308,H,50,51.33,51.32,1.0,51.35,1.0
3,C,2024-01-02,2024-01-02 09:32:40.929882,2024-01-02 09:32:40.897981,H,200,51.31,51.31,2.0,51.33,1.0
4,C,2024-01-02,2024-01-02 09:32:40.953091,2024-01-02 09:32:40.937756,H,1,51.3,51.3,3.0,51.32,1.0


In [14]:
# Make sure both timestamp columns hold real datetimes so the `.dt` accessor works below.
for ts_col in ("QUOTE_TIME", "TRADE_TIME"):
    df[ts_col] = pd.to_datetime(df[ts_col])

# One-minute bucket of the quote timestamp, floored to the start of the minute.
# This is the grouping key for the quote-based per-minute aggregates.
df["minute"] = df["QUOTE_TIME"].dt.floor("1min")

In [None]:
# Quote midpoint, and the bid-ask spread expressed as a fraction of that midpoint.
quoted_spread = df["ASK"] - df["BID"]
df["mid"] = df["BID"].add(df["ASK"]).div(2)
df["rel_spread"] = quoted_spread / df["mid"]

# Mean relative spread over all rows that share the same quote minute.
avg_rel_spread = df["rel_spread"].groupby(df["minute"]).mean()

# Bucket every row by the minute of its trade timestamp, then total SIZE per bucket.
df["trade_minute"] = df["TRADE_TIME"].dt.floor("1min")
traded_volume = df["SIZE"].groupby(df["trade_minute"]).sum()

# Sign each trade against its matched quote:
#   +1  trade printed at or above the ask (treated as buyer-initiated)
#   -1  trade printed at or below the bid (treated as seller-initiated)
#    0  trade printed strictly inside the spread, or a quote side is missing
# Subtracting the two boolean masks keeps the rule symmetric: when BOTH conditions
# hold (locked or crossed quote, i.e. BID >= ASK) the trade is left unsigned (0)
# instead of being forced to -1 just because the bid assignment ran last.
at_or_above_ask = df["PRICE"] >= df["ASK"]
at_or_below_bid = df["PRICE"] <= df["BID"]
df["trade_sign"] = at_or_above_ask.astype("int64") - at_or_below_bid.astype("int64")

df["signed_volume"] = df["trade_sign"] * df["SIZE"]

# Signed volume is a trade quantity, so bucket it by the trade's own minute
# ("trade_minute", the same key used for traded_volume). Grouping by the quote
# minute could place a trade's signed and unsigned volume in different minutes
# whenever the matched quote was posted before a minute boundary.
order_imbalance = df.groupby("trade_minute")["signed_volume"].sum()

# Quoted-size imbalance scaled by total quoted size:
# positive when ASKSIZ exceeds BIDSIZ, negative when BIDSIZ is larger.
size_gap = df["ASKSIZ"] - df["BIDSIZ"]
size_total = df["ASKSIZ"] + df["BIDSIZ"]
df["depth_imb"] = size_gap / size_total

# Mean depth imbalance over all rows that share the same quote minute.
avg_depth_imb = df["depth_imb"].groupby(df["minute"]).mean()

# Last quote midpoint observed in each quote-minute bucket. A stable sort keeps rows
# that share an identical QUOTE_TIME in their original order, so `.last()` always
# picks the same row on every run.
mid_close = df.sort_values("QUOTE_TIME", kind="stable").groupby("minute")["mid"].last()

# Log return between consecutive minute closes, differenced WITHIN each calendar day.
# The data covers many trading days; a plain `.diff()` over the whole series would put
# the close-to-open overnight move into the first bar of every day and label it a
# one-minute return. With the per-day diff that first bar is NaN instead.
log_mid_close = np.log(mid_close)
one_min_return = log_mid_close.groupby(log_mid_close.index.normalize()).diff()

# Realized volatility proxy: rolling sum of squared returns over the last 60 bars.
# min_periods=60 requires 60 non-NaN returns, so any window that reaches back across
# a day boundary (and therefore contains that day's NaN first bar) yields NaN rather
# than mixing returns from two different sessions.
realized_vol = one_min_return.pow(2).rolling(window=60, min_periods=60).sum()

In [None]:
# Collect the per-minute series under their final column names; the DataFrame
# constructor aligns them on their (minute) index.
minute_features = {
    "avg_rel_spread": avg_rel_spread,
    "order_imbalance": order_imbalance,
    "avg_depth_imbalance": avg_depth_imb,
    "traded_volume": traded_volume,
    "one_min_return": one_min_return,
    "realized_volatility": realized_vol,
}
minute_df = pd.DataFrame(minute_features)

# Show the first rows explicitly; the last rows render as the cell's output.
display(minute_df.head())
minute_df.tail()

Unnamed: 0,avg_rel_spread,order_imbalance,avg_depth_imbalance,traded_volume,one_min_return,realized_volatility
2024-01-31 15:55:00,0.000503,-4104,-0.031801,116761,0.0,0.001464
2024-01-31 15:56:00,0.000267,-25900,0.106103,93430,8.9e-05,0.001464
2024-01-31 15:57:00,0.000425,-10214,-0.130577,99239,0.0,0.001417
2024-01-31 15:58:00,0.000292,-13297,0.031209,141986,-0.00089,0.001374
2024-01-31 15:59:00,0.000798,-51662,-0.068316,380383,-0.000178,0.001373


### Train/Test Split

In [None]:


# Build the design matrix X and target Y from the per-minute feature table.
# NOTE(review): the original call was `train_test_split(, Y, ...)` — a syntax error —
# and neither X nor Y is defined in the visible notebook. The setup below (predict the
# NEXT minute's return from the current minute's features) is an assumption; change
# FEATURE_COLS / TARGET_COL if a different model is intended. TODO confirm.
FEATURE_COLS = [
    "avg_rel_spread",
    "order_imbalance",
    "avg_depth_imbalance",
    "traded_volume",
    "one_min_return",
    "realized_volatility",
]
TARGET_COL = "one_min_return"

# shift(-1) pairs each row's features with the following row's value of TARGET_COL.
# Rows with a missing feature (e.g. before the 60-bar volatility window fills) or a
# missing target (the final row) are dropped so the estimators receive no NaNs.
model_df = (
    minute_df
    .assign(target=minute_df[TARGET_COL].shift(-1))
    .dropna(subset=FEATURE_COLS + ["target"])
)
X = model_df[FEATURE_COLS]
Y = model_df["target"]

# NOTE(review): train_test_split shuffles rows by default. For time-ordered bars with
# overlapping rolling-window features that can leak future information into the
# training set — consider `shuffle=False` for a chronological split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=24)

# Fit the scaler on the training rows only, then apply the same transform to the
# test rows so no test-set statistics enter the scaling.
scaler = preprocessing.StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Linear Regression