### Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import preprocessing

In [2]:
# Read the trade and quote records from the parquet file, print the column
# schema, and end the cell on the first five rows so they render as its output.
df = pd.read_parquet(path="Trades_Quote.parquet")
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1991986 entries, 0 to 1991985
Data columns (total 11 columns):
 #   Column      Dtype         
---  ------      -----         
 0   COMPANY     object        
 1   DATE        object        
 2   TRADE_TIME  datetime64[us]
 3   QUOTE_TIME  datetime64[us]
 4   EX          object        
 5   SIZE        int64         
 6   PRICE       float64       
 7   BID         float64       
 8   BIDSIZ      float64       
 9   ASK         float64       
 10  ASKSIZ      float64       
dtypes: datetime64[us](2), float64(5), int64(1), object(3)
memory usage: 167.2+ MB


Unnamed: 0,COMPANY,DATE,TRADE_TIME,QUOTE_TIME,EX,SIZE,PRICE,BID,BIDSIZ,ASK,ASKSIZ
0,C,2024-01-02,2024-01-02 09:32:40.845895,2024-01-02 09:32:40.842364,H,100,51.33,51.33,1.0,51.36,1.0
1,C,2024-01-02,2024-01-02 09:32:40.888035,2024-01-02 09:32:40.888012,H,100,51.33,51.32,1.0,51.35,1.0
2,C,2024-01-02,2024-01-02 09:32:40.888311,2024-01-02 09:32:40.888308,H,50,51.33,51.32,1.0,51.35,1.0
3,C,2024-01-02,2024-01-02 09:32:40.929882,2024-01-02 09:32:40.897981,H,200,51.31,51.31,2.0,51.33,1.0
4,C,2024-01-02,2024-01-02 09:32:40.953091,2024-01-02 09:32:40.937756,H,1,51.3,51.3,3.0,51.32,1.0


In [3]:
# Assign every trade to a 1-minute bucket by truncating TRADE_TIME down to the
# start of its minute (e.g. 09:32:40.845895 -> 09:32:00).
df['INTERVAL_TIME'] = df['TRADE_TIME'].dt.floor(freq='min')
df.head(n=5)

Unnamed: 0,COMPANY,DATE,TRADE_TIME,QUOTE_TIME,EX,SIZE,PRICE,BID,BIDSIZ,ASK,ASKSIZ,INTERVAL_TIME
0,C,2024-01-02,2024-01-02 09:32:40.845895,2024-01-02 09:32:40.842364,H,100,51.33,51.33,1.0,51.36,1.0,2024-01-02 09:32:00
1,C,2024-01-02,2024-01-02 09:32:40.888035,2024-01-02 09:32:40.888012,H,100,51.33,51.32,1.0,51.35,1.0,2024-01-02 09:32:00
2,C,2024-01-02,2024-01-02 09:32:40.888311,2024-01-02 09:32:40.888308,H,50,51.33,51.32,1.0,51.35,1.0,2024-01-02 09:32:00
3,C,2024-01-02,2024-01-02 09:32:40.929882,2024-01-02 09:32:40.897981,H,200,51.31,51.31,2.0,51.33,1.0,2024-01-02 09:32:00
4,C,2024-01-02,2024-01-02 09:32:40.953091,2024-01-02 09:32:40.937756,H,1,51.3,51.3,3.0,51.32,1.0,2024-01-02 09:32:00


In [None]:
# Calculate variables: last midprice, quoted spread, relative quoted spread, effective relative spread, order imbalance
# and depth imbalance, number of trades and total traded volume [+ 1-minute returns and realized volatility over the past hour]
# This cell derives the row-level quantities from the BID/ASK quote stored on each trade row.

# Midpoint and width of the quote attached to each trade.
df['MID_PRICE'] = (df['BID'] + df['ASK']) / 2.0
df['QUOTED_SPREAD'] = df['ASK'] - df['BID']

# Spreads expressed as a fraction of the midprice. A zero midprice is replaced by NaN
# first, so the ratio comes out as NaN instead of +/-inf.
df['RELATIVE_SPREAD'] = (df['QUOTED_SPREAD'] / df['MID_PRICE'].replace(0, np.nan))
df['EFFECTIVE_RELATIVE_SPREAD'] = (2 * np.abs((df['PRICE'] - df['MID_PRICE']) / df['MID_PRICE'].replace(0, np.nan)))

# (ask size - bid size) / (total quoted size); a zero total is replaced by NaN for the
# same reason as above. Positive values mean more size is quoted on the ask side.
df['DEPTH_IMBALANCE'] = ((df['ASKSIZ'] - df['BIDSIZ']) / (df['BIDSIZ'] + df['ASKSIZ']).replace(0, np.nan))

# Classify trades as buyer-initiated (executed at or above the ask) or
# seller-initiated (executed at or below the bid).
# The strict comparison against the opposite quote keeps the two sides mutually
# exclusive: with a locked (BID == ASK) or crossed (BID > ASK) quote, a trade priced
# between the quotes satisfies both `PRICE >= ASK` and `PRICE <= BID`, and without the
# extra check its size would be added to buy AND sell volume. Such ambiguous trades
# get 0 on both sides; rows with BID < ASK are unaffected by the extra comparison.
is_buy = (df['PRICE'] >= df['ASK']) & (df['PRICE'] > df['BID'])
is_sell = (df['PRICE'] <= df['BID']) & (df['PRICE'] < df['ASK'])
df['BUY_VOLUME'] = np.where(is_buy, df['SIZE'], 0)
df['SELL_VOLUME'] = np.where(is_sell, df['SIZE'], 0)

### Train/Test Split

In [None]:

# TODO: Define X and Y
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=24,
)

# Learn the scaling statistics from the training rows only, then apply that
# same fitted scaler to both partitions.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Linear Regression