In [49]:
import re
import polars as pl

In [50]:
# Function to parse the log file
def parse_log_file(log_file_path):
    user_list = []
    token_list = []
    
    with open(log_file_path, "r") as f:
        for line in f:
            # Check if the line contains the word 'confirm'
            if 'confirm' in line:
                # Extract user and token using regular expressions
                user_match = re.search(r'trader (\d+): confirm', line)
                token_match = re.search(r'token (\w+)\.', line)
                
                if user_match and token_match:
                    user = user_match.group(1)
                    token = token_match.group(1)
                    
                    user_list.append(user)
                    token_list.append(token)

    # Create a Polars DataFrame
    df = pl.DataFrame(
        {
            "user": user_list,
            "token": token_list
        }
    )

    return df.distinct()


In [51]:
# Path to your log file
log_file_path = "log_hft.txt"

# Parse the log file and create a Polars DataFrame
df = parse_log_file(log_file_path)

# Show the DataFrame
print(df)


shape: (701, 2)
┌──────┬────────────────┐
│ user ┆ token          │
│ ---  ┆ ---            │
│ str  ┆ str            │
╞══════╪════════════════╡
│ 1    ┆ INVES000100001 │
│ 1    ┆ INVEB000100002 │
│ 1    ┆ INVES000100003 │
│ 1    ┆ INVES000100003 │
│ …    ┆ …              │
│ 2281 ┆ AAAAB228100609 │
│ 2282 ┆ BBBBS228200326 │
│ 1    ┆ INVES000100230 │
│ 1    ┆ INVES000100219 │
└──────┴────────────────┘


In [52]:
# Count unique users per token
users_by_token = df.groupby("token").agg(
    pl.col("user").n_unique().alias("unique_users")
)

In [53]:
users_by_token.filter(pl.col("unique_users") > 1)

token,unique_users
str,u32


In [54]:
# Count unique tokens per user
tokens_by_user = df.groupby("user").agg(
    pl.col("token").n_unique().alias("unique_tokens")
)

print(tokens_by_user)

shape: (3, 2)
┌──────┬───────────────┐
│ user ┆ unique_tokens │
│ ---  ┆ ---           │
│ str  ┆ u32           │
╞══════╪═══════════════╡
│ 2281 ┆ 135           │
│ 1    ┆ 230           │
│ 2282 ┆ 75            │
└──────┴───────────────┘


In [55]:
def read_token_crossing_log(file_path):
    buy_tokens = []
    sell_tokens = []
    crossing_prices = []
    volumes = []
    
    # Regex pattern to extract buy token, sell token, crossing price, and volume
    pattern = r"Orders \(b'(.*?)', b'(.*?)'\) crossed at price (\d+), volume (\d+)"
    
    # Read log file line-by-line and extract relevant information
    with open(file_path, 'r') as f:
        for line in f:
            if 'root.process_cross' in line:
                match = re.search(pattern, line)
                if match:
                    buy_token, sell_token, crossing_price, volume = match.groups()
                    buy_tokens.append(buy_token)
                    sell_tokens.append(sell_token)
                    crossing_prices.append(int(crossing_price))
                    volumes.append(int(volume))
                    
    # Create a DataFrame using the extracted information
    df_crossing = pl.DataFrame({
        'buy_token': buy_tokens,
        'sell_token': sell_tokens,
        'crossing_price': crossing_prices,
        'volume': volumes
    })
    
    return df_crossing.distinct()

In [56]:
# Example usage
file_path = 'log_exchange_server.txt'  # Replace with the actual path to your log file
df_crossing = read_token_crossing_log(file_path)
print(df_crossing)

shape: (116, 4)
┌────────────────┬────────────────┬────────────────┬────────┐
│ buy_token      ┆ sell_token     ┆ crossing_price ┆ volume │
│ ---            ┆ ---            ┆ ---            ┆ ---    │
│ str            ┆ str            ┆ i64            ┆ i64    │
╞════════════════╪════════════════╪════════════════╪════════╡
│ INVES000100003 ┆ INVEB000100002 ┆ 1700000        ┆ 1      │
│ AAAAS228100001 ┆ INVEB000100005 ┆ 1710000        ┆ 1      │
│ BBBBS228200001 ┆ INVEB000100004 ┆ 1690000        ┆ 1      │
│ INVES000100007 ┆ AAAAB228100004 ┆ 1670000        ┆ 1      │
│ …              ┆ …              ┆ …              ┆ …      │
│ INVEB000100225 ┆ INVES000100218 ┆ 1270000        ┆ 1      │
│ INVEB000100227 ┆ AAAAS228100600 ┆ 1280000        ┆ 1      │
│ BBBBS228200321 ┆ AAAAB228100606 ┆ 1270000        ┆ 1      │
│ AAAAB228100607 ┆ INVES000100223 ┆ 1280000        ┆ 1      │
└────────────────┴────────────────┴────────────────┴────────┘


In [57]:
# Joining to get buyer information
final_df = (
    df_crossing
    .join(df, left_on="buy_token", right_on="token", how="left")
    .rename({"user": "buyer"})
    .join(df, left_on="sell_token", right_on="token", how="left")
    .rename({"user": "seller"})
    )


In [58]:
print(final_df)


shape: (304, 6)
┌────────────────┬────────────────┬────────────────┬────────┬───────┬────────┐
│ buy_token      ┆ sell_token     ┆ crossing_price ┆ volume ┆ buyer ┆ seller │
│ ---            ┆ ---            ┆ ---            ┆ ---    ┆ ---   ┆ ---    │
│ str            ┆ str            ┆ i64            ┆ i64    ┆ str   ┆ str    │
╞════════════════╪════════════════╪════════════════╪════════╪═══════╪════════╡
│ INVES000100003 ┆ INVEB000100002 ┆ 1700000        ┆ 1      ┆ 1     ┆ 1      │
│ INVES000100003 ┆ INVEB000100002 ┆ 1700000        ┆ 1      ┆ 1     ┆ 1      │
│ INVES000100003 ┆ INVEB000100002 ┆ 1700000        ┆ 1      ┆ 1     ┆ 1      │
│ INVES000100003 ┆ INVEB000100002 ┆ 1700000        ┆ 1      ┆ 1     ┆ 1      │
│ …              ┆ …              ┆ …              ┆ …      ┆ …     ┆ …      │
│ AAAAB228100607 ┆ INVES000100223 ┆ 1280000        ┆ 1      ┆ 2281  ┆ 1      │
│ AAAAB228100607 ┆ INVES000100223 ┆ 1280000        ┆ 1      ┆ 2281  ┆ 1      │
│ AAAAB228100607 ┆ INVES000100223 ┆ 

In [59]:
self_crossing_df = final_df.filter(
    (pl.col("buyer") == pl.col("seller")) &
    (pl.col("buyer") != "1")
)


In [60]:
self_crossing_df.write_csv("self_crossing.csv")

In [61]:
filtered_df_missing_info = final_df.filter(
    pl.col("buyer").is_null() | pl.col("seller").is_null()
)

In [48]:
filtered_df_missing_info

buy_token,sell_token,crossing_price,volume,buyer,seller
str,str,i64,i64,str,str


In [32]:
latent_bid = 101
latent_offer = 100

In [33]:
latent_bid = latent_offer - 1 if latent_bid == latent_offer else latent_bid

In [34]:
latent_bid

101