<a href="https://colab.research.google.com/github/KorvenDalas/HFT/blob/main/HFT_CW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install linearmodels

Collecting linearmodels
  Downloading linearmodels-6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.9 kB)
Collecting mypy-extensions>=0.4 (from linearmodels)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Collecting pyhdfe>=0.1 (from linearmodels)
  Downloading pyhdfe-0.2.0-py3-none-any.whl.metadata (4.0 kB)
Collecting formulaic>=1.0.0 (from linearmodels)
  Downloading formulaic-1.1.1-py3-none-any.whl.metadata (6.9 kB)
Collecting setuptools-scm<9.0.0,>=8.0.0 (from setuptools-scm[toml]<9.0.0,>=8.0.0->linearmodels)
  Downloading setuptools_scm-8.2.0-py3-none-any.whl.metadata (6.8 kB)
Collecting interface-meta>=1.2.0 (from formulaic>=1.0.0->linearmodels)
  Downloading interface_meta-1.3.0-py3-none-any.whl.metadata (6.7 kB)
Downloading linearmodels-6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from linearmodels.system import SUR
from scipy.stats import f

# Q1

In [3]:
# Read the df from GitHub
df = pd.read_csv('https://raw.githubusercontent.com/KorvenDalas/HFT/refs/heads/main/CW2_HFT_GSK_01022011.csv')

In [4]:
df = df.rename(columns = {'#RIC': 'RIC',
                          'Date-Time': 'DateTime',
                          'Type': 'Type',
                          'Price': 'Price',
                          'Volume': 'Volume',
                          'Bid Price': 'Bid_Price',
                          'Bid Size': 'Bid_Size',
                          'Ask Price': 'Ask_Price',
                          'Ask Size': 'Ask_Size',
                          'direction': 'Direction'
                          })

In [5]:
# Convert 'Time' column to datetime
df['DateTime'] = pd.to_datetime(df['DateTime'])

# Create new column with time in seconds (including fractional seconds) since midnight
df['Time_S'] = df['DateTime'].dt.hour * 3600 + df['DateTime'].dt.minute * 60 + df['DateTime'].dt.second + df['DateTime'].dt.microsecond / 1e6

# Set 'DateTime' as the index
df.set_index('DateTime', inplace = True)

df.head()

Unnamed: 0_level_0,RIC,Type,Price,Volume,Bid_Price,Bid_Size,Ask_Price,Ask_Size,Direction,Time_S
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2011-02-01 07:50:00.089774+00:00,GSK.L,Quote,,,1127.0,135.0,1172.0,2000.0,0,28200.089774
2011-02-01 07:50:02.746387+00:00,GSK.L,Quote,,,1127.0,135.0,1095.0,5726.0,0,28202.746387
2011-02-01 07:50:02.746387+00:00,GSK.L,Auction,1125.0,5726.0,1127.0,,1095.0,,0,28202.746387
2011-02-01 07:50:02.746387+00:00,GSK.L,Quote,,,1127.0,135.0,1095.0,8252.0,0,28202.746387
2011-02-01 07:50:02.746387+00:00,GSK.L,Auction,1115.0,8252.0,1127.0,,1095.0,,0,28202.746387


In [6]:
# Filter df based on time range
df = df[(df['Time_S'] >= 28800) & (df['Time_S'] <= 59400)]

# 28800  →  08:00 AM      = 8 * 60 * 60 seconds (market open)
# 59400  →  04:30 PM      = 16 * 60 * 60 + 30 * 60 = 57600 + 1800 = 59400 seconds (market close)

## Q1.1 Midquote Series

In [7]:
df = df.copy()
df['Ask_Price'] = df['Ask_Price'].ffill()
df['Bid_Price'] = df['Bid_Price'].ffill()
df['Spread'] = df['Ask_Price'] - df['Bid_Price']
df['Midquote'] = 0.5 * (df['Ask_Price'] + df['Bid_Price'])

print(df['Midquote'].head())

DateTime
2011-02-01 08:00:00.235137+00:00    1128.5
2011-02-01 08:00:00.253174+00:00    1128.5
2011-02-01 08:00:00.359596+00:00    1128.5
2011-02-01 08:00:00.365400+00:00    1128.5
2011-02-01 08:00:00.365400+00:00    1128.5
Name: Midquote, dtype: float64


## Q1.2 Time-Weighted Daily Best Bid-Ask Spread

In [8]:
# Compute time change between successive rows
timechange = df.Time_S.shift(-1) - df.Time_S
timechange.iloc[-1] = 59400 - df['Time_S'].iloc[-1]  # Correct the last time difference

# Time-weighted spread
qspread = 10000 * (df['Ask_Price'] - df['Bid_Price']) / df['Midquote']   # in percentage
TWqspread = np.nansum(qspread * timechange) / np.nansum(timechange)

print(TWqspread)

5.660910921553593


## Q1.3 Time-Weighted Daily Market Depth

In [9]:
# Time-Weighted Depth
depth = df['Ask_Size'] + df['Bid_Size']
TWdepth = np.nansum(depth * timechange) / np.nansum(timechange)

print(TWdepth)

19276.20745082458


## Q1.4 Volume-Weighted Daily Effective Spread

In [10]:
# Filter trades and calculate effective spread
df['Seq'] = range(len(df))
dfTrades = df[(df['Type'] == 'Trade')]
espread = 10000 * 2 * (dfTrades['Direction'] * (dfTrades['Price'] - dfTrades['Midquote'])) / dfTrades['Midquote']
VWdespread = np.nansum(espread * dfTrades['Volume']) / np.nansum(dfTrades['Volume'])
VWdespread

np.float64(8.842557982963504)

## Q1.5 Volume-Weighted Daily 5-Minute Price Impact

In [11]:
# Define time intervals
tau = [300]  # 300s = 5m   # [0.05, 0.1, 0.5, 1, 5, 10, 30, 60, 300]

# Iterate over tau to calculate next midquote and price impact
for t in tau:
    nexMQ = []
    thiMQ = []

    for j in dfTrades.Seq:
        thisMQ = df.iloc[j].Midquote
        thisSec = df.iloc[j].Time_S
        zz = df['Time_S'] - (thisSec + 300)
        zz=pd.concat([zz, df.Seq], axis = 1)
        xx = zz[zz >= 0].Seq

        # Get the first index where time is greater than or equal to thisSec + t
        if not xx.empty:
            xk = xx.iloc[0]
        else:
            xk = np.nan

        # Handle missing values
        if pd.isna(xk):
            nextMQ = np.nan
        else:
            nextMQ = df.iloc[xk].Midquote

        thiMQ.append(thisMQ)
        nexMQ.append(nextMQ)

    # Add next midquote to dfTrades if lengths match
    if len(nexMQ) == len(dfTrades):
        dfTrades[f'NextMQ_tau_{t}'] = nexMQ
    else:
        print(f"Length mismatch for tau = {t}, skipping this tau.")
        continue

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfTrades[f'NextMQ_tau_{t}'] = nexMQ


In [12]:
# Calculate realized impact
PI = 100 * 2 * (dfTrades['Direction'] * (dfTrades['NextMQ_tau_300'] - thiMQ)) / dfTrades['Midquote']
VWpi = np.nansum(PI * dfTrades['Volume']) / np.nansum(dfTrades['Volume'])

print(f"VWpi: {VWpi}")

VWpi: -0.14571193136021285


## Q1.6 Volume-Weighted Daily 5-Minute Realised Spread

In [13]:
# Calculate realized spread
rspread = 100 * 2 * (dfTrades['Direction'] * (dfTrades['Price'] - dfTrades['NextMQ_tau_300'])) / dfTrades['Midquote']
VWrspread = np.nansum(rspread * dfTrades['Volume']) / np.nansum(dfTrades['Volume'])

print(f"VWrspread: {VWrspread}")

VWrspread: 0.23413751118984794


## Q1.7 Daily Realised Volatility

In [14]:
log_returns = np.log(df['Midquote'] / df['Midquote'].shift(1))

# Realized volatility
realised_volatility =  np.sqrt(np.nansum(log_returns))

print(f"Realised Volatility: {realised_volatility}")

Realised Volatility: 0.11395510642592484


## Q1.8 Tick Return Series Based on Midquote

In [15]:
df['TickReturn'] = np.log(df['Midquote'] / df['Midquote'].shift(1))

print(df['TickReturn'].head())

DateTime
2011-02-01 08:00:00.235137+00:00    NaN
2011-02-01 08:00:00.253174+00:00    0.0
2011-02-01 08:00:00.359596+00:00    0.0
2011-02-01 08:00:00.365400+00:00    0.0
2011-02-01 08:00:00.365400+00:00    0.0
Name: TickReturn, dtype: float64


## Q1.9 Return Series Based on Midquote at 5-Min Frequency

In [16]:
Midquote_5m = df['Midquote'].resample('5T').last()

Return_5m = np.log(Midquote_5m / Midquote_5m.shift(1))

df_5m = pd.DataFrame({'Midquote_5m': Midquote_5m,
                      'Return_5m': Return_5m})

print(df_5m.head())

                           Midquote_5m  Return_5m
DateTime                                         
2011-02-01 08:00:00+00:00      1134.50        NaN
2011-02-01 08:05:00+00:00      1134.00  -0.000441
2011-02-01 08:10:00+00:00      1132.75  -0.001103
2011-02-01 08:15:00+00:00      1130.25  -0.002209
2011-02-01 08:20:00+00:00      1128.25  -0.001771


  Midquote_5m = df['Midquote'].resample('5T').last()


## Q1.10 Order Imbalance Series at 5-Min Frequency

In [17]:
# Order Imbalance
df['SignedVolume'] = df['Volume'] * df['Direction']
OrderFlows = df['SignedVolume'].resample('5T').sum()
TotalVolume = df['Volume'].resample('5T').sum()
OrderImbalance = abs(OrderFlows) / TotalVolume

print(OrderImbalance.head())

DateTime
2011-02-01 08:00:00+00:00    0.016421
2011-02-01 08:05:00+00:00    0.285091
2011-02-01 08:10:00+00:00    0.032638
2011-02-01 08:15:00+00:00    0.119250
2011-02-01 08:20:00+00:00    0.053576
Freq: 5min, dtype: float64


  OrderFlows = df['SignedVolume'].resample('5T').sum()
  TotalVolume = df['Volume'].resample('5T').sum()


# Q2

In [18]:
dfTrades

Unnamed: 0_level_0,RIC,Type,Price,Volume,Bid_Price,Bid_Size,Ask_Price,Ask_Size,Direction,Time_S,Spread,Midquote,Seq,NextMQ_tau_300
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-02-01 08:00:09.059176+00:00,GSK.L,Trade,1131.0,100549.0,1196.0,,1061.0,,1,28809.059176,-135.0,1128.50,89,1128.5
2011-02-01 08:00:15.171723+00:00,GSK.L,Trade,1132.5,462.0,1132.5,,1133.5,,-1,28815.171723,1.0,1133.00,100,1128.5
2011-02-01 08:00:15.171723+00:00,GSK.L,Trade,1132.5,462.0,1132.5,,1133.5,,-1,28815.171723,1.0,1133.00,101,1128.5
2011-02-01 08:00:15.223623+00:00,GSK.L,Trade,1131.0,260.0,1131.0,,1133.5,,-1,28815.223623,2.5,1132.25,110,1128.5
2011-02-01 08:00:15.249775+00:00,GSK.L,Trade,1131.0,1067.0,1130.5,,1131.0,,1,28815.249775,0.5,1130.75,112,1128.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2011-02-01 16:29:51.426497+00:00,GSK.L,Trade,1143.0,700.0,1143.0,,1144.0,,-1,59391.426497,1.0,1143.50,55333,1128.5
2011-02-01 16:29:51.426497+00:00,GSK.L,Trade,1143.0,1485.0,1143.0,,1144.0,,-1,59391.426497,1.0,1143.50,55334,1128.5
2011-02-01 16:29:51.430574+00:00,GSK.L,Trade,1143.0,300.0,1143.0,,1144.0,,-1,59391.430574,1.0,1143.50,55335,1128.5
2011-02-01 16:29:51.430574+00:00,GSK.L,Trade,1143.0,1684.0,1143.0,,1144.0,,-1,59391.430574,1.0,1143.50,55336,1128.5


## Q2.b Estimation of Coefficients

In [19]:
## define variables ##
panel = pd.DataFrame()
panel['d'] = dfTrades['Direction']                                        # Already exists
panel['delta_d'] = dfTrades['Direction'] - dfTrades['Direction'].shift(1)
panel['q'] = dfTrades['Direction'] * dfTrades['Volume']                   # Quantity can be rescaled by daily average trading volume
panel['delta_q'] = panel['q'] - panel['q'].shift(1)
panel['q_lag'] = panel['q'].shift(1)
panel['p'] = dfTrades['Price']
panel['delta_p'] = panel['p'] - panel['p'].shift(1)
panel = panel.dropna()

In [20]:
## Regression ##
# 2-way:
# First Regression
model1 = sm.OLS(panel['delta_p'], panel.iloc[ :, :2] )
results1 = model1.fit()
coeffs1=results1.params
print(results1.summary())

                                 OLS Regression Results                                
Dep. Variable:                delta_p   R-squared (uncentered):                   0.144
Model:                            OLS   Adj. R-squared (uncentered):              0.144
Method:                 Least Squares   F-statistic:                              738.1
Date:                Thu, 03 Apr 2025   Prob (F-statistic):                   6.01e-297
Time:                        10:15:00   Log-Likelihood:                         -2405.8
No. Observations:                8759   AIC:                                      4816.
Df Residuals:                    8757   BIC:                                      4830.
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

## Q2.d Estimation of Coefficients, Two Equations

In [21]:
# 3-way
equations = {
    'eq1': 'q ~ q_lag',                     # Equation 1: AR(1) for q_t
    'eq2': 'delta_p ~ q + q_lag + delta_d'} # Equation 2: Price impact regression

# Fit the SUR model
sur_model = SUR.from_formula(equations, panel)
results = sur_model.fit()
print(results.summary)

                           System GLS Estimation Summary                           
Estimator:                        GLS   Overall R-squared:                   0.0014
No. Equations.:                     2   McElroy's R-squared:                 0.0896
No. Observations:                8759   Judge's (OLS) R-squared:             0.0013
Date:                Thu, Apr 03 2025   Berndt's R-squared:                  0.1453
Time:                        10:15:00   Dhrymes's R-squared:                 0.0014
                                        Cov. Estimator:                      robust
                                        Num. Constraints:                      None
                     Equation: eq1, Dependent Variable: q                     
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
q_lag          0.0367     0.0141     2.6064     0.0092      0.0091      0.0643
            

In [22]:
print(results.params)

eq1_q_lag      0.036682
eq2_q          0.000009
eq2_q_lag     -0.000007
eq2_delta_d    0.161276
Name: params, dtype: float64


In [23]:
# Extract coefficients
phi      = results.params['eq1_q_lag']     # phi
b_q      = results.params['eq2_q']         # (lambda + beta)
b_q_lag  = results.params['eq2_q_lag']     # -lambda * phi
gamma    = results.params['eq2_delta_d']   # gamma

# Calculate lambda and beta
lambda_ = - b_q_lag / phi
beta = b_q - lambda_

# Print everything
print(f"phi     = {phi:.6f}")
print(f"lambda  = {lambda_:.6f}")
print(f"beta    = {beta:.6f}")
print(f"gamma   = {gamma:.6f}")

phi     = 0.036682
lambda  = 0.000192
beta    = -0.000183
gamma   = 0.161276
