In [62]:
import re
import polars as pl

In [79]:
# Function to parse the log file
def parse_log_file(log_file_path):
    """Build a de-duplicated (user, token) table from confirmation lines in a log.

    Only lines containing the substring 'confirm' are inspected. From each
    such line the function extracts:

    * the digits between ``trader `` and ``: confirm`` as the user, and
    * the word characters between ``token `` and a literal ``.`` as the token.

    Lines where either piece cannot be found are skipped.

    Parameters
    ----------
    log_file_path : str or path-like
        Path of the text log to read.

    Returns
    -------
    pl.DataFrame
        Two string columns, ``user`` and ``token``, with duplicate rows
        removed. Rows keep the order of first appearance in the log so that
        re-running the notebook yields the same table.
    """
    # Compile once instead of looking the patterns up for every line.
    user_pattern = re.compile(r'trader (\d+): confirm')
    token_pattern = re.compile(r'token (\w+)\.')

    user_list = []
    token_list = []

    with open(log_file_path, "r") as f:
        for line in f:
            # Cheap substring test before running the regular expressions.
            if 'confirm' not in line:
                continue

            user_match = user_pattern.search(line)
            token_match = token_pattern.search(line)

            # Both pieces are required; a partial match is ignored.
            if user_match and token_match:
                user_list.append(user_match.group(1))
                token_list.append(token_match.group(1))

    # Explicit string dtypes: with no matching lines the lists are empty and
    # dtype inference would not produce string columns, which would break
    # later joins against string token columns.
    df = pl.DataFrame(
        [
            pl.Series("user", user_list, dtype=pl.Utf8),
            pl.Series("token", token_list, dtype=pl.Utf8),
        ]
    )

    # maintain_order=True makes the de-duplicated output deterministic.
    return df.unique(maintain_order=True)


In [80]:
# Location of the log that holds the order confirmations
log_file_path = "log_hft.txt"

# Build the (user, token) table from that log
df = parse_log_file(log_file_path=log_file_path)

# Print the resulting table
print(df)


shape: (543, 2)
┌──────┬────────────────┐
│ user ┆ token          │
│ ---  ┆ ---            │
│ str  ┆ str            │
╞══════╪════════════════╡
│ 1    ┆ INVEB000100028 │
│ 2342 ┆ BBBBB234200614 │
│ 1    ┆ INVEB000100177 │
│ 2341 ┆ AAAAB234100012 │
│ …    ┆ …              │
│ 2341 ┆ AAAAS234100219 │
│ 2341 ┆ AAAAS234100329 │
│ 1    ┆ INVEB000100104 │
│ 2342 ┆ BBBBB234200928 │
└──────┴────────────────┘


In [81]:
# For every token, how many distinct users confirmed it
unique_user_count = pl.col("user").n_unique().alias("unique_users")
users_by_token = df.groupby("token").agg(unique_user_count)

In [82]:
# Tokens that were confirmed by more than one distinct user
token_has_several_users = pl.col("unique_users") > 1
users_by_token.filter(token_has_several_users)

token,unique_users
str,u32


In [83]:
# For every user, how many distinct tokens they confirmed
unique_token_count = pl.col("token").n_unique().alias("unique_tokens")
tokens_by_user = df.groupby("user").agg(unique_token_count)

print(tokens_by_user)

shape: (3, 2)
┌──────┬───────────────┐
│ user ┆ unique_tokens │
│ ---  ┆ ---           │
│ str  ┆ u32           │
╞══════╪═══════════════╡
│ 2342 ┆ 161           │
│ 2341 ┆ 152           │
│ 1    ┆ 230           │
└──────┴───────────────┘


In [84]:
def read_token_crossing_log(file_path):
    """Extract order-crossing records from a log into a de-duplicated table.

    Only lines containing 'root.process_cross' are inspected. Each such line
    is searched for text of the form::

        Orders (b'<first>', b'<second>') crossed at price <digits>, volume <digits>

    The first token is stored as ``buy_token`` and the second as
    ``sell_token``; price and volume are converted to integers. Lines that do
    not match the pattern are skipped.

    NOTE(review): in this notebook's sample output most ``buy_token`` values
    carry an 'S' and most ``sell_token`` values a 'B' as their fifth
    character. If that letter encodes the order side, the buy/sell labels may
    not reflect the tuple order in the log -- confirm against the log format.

    NOTE(review): duplicate rows are dropped, so two separate crosses of the
    same order pair at the same price and volume collapse into a single row
    -- confirm that this is intended.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text log to read.

    Returns
    -------
    pl.DataFrame
        Columns ``buy_token`` (str), ``sell_token`` (str), ``crossing_price``
        (i64) and ``volume`` (i64) with duplicate rows removed. Rows keep the
        order of first appearance in the log so that re-running the notebook
        yields the same table.
    """
    # Regex pattern to extract buy token, sell token, crossing price, and volume
    # (compiled once rather than looked up for every line).
    pattern = re.compile(
        r"Orders \(b'(.*?)', b'(.*?)'\) crossed at price (\d+), volume (\d+)"
    )

    buy_tokens = []
    sell_tokens = []
    crossing_prices = []
    volumes = []

    # Read log file line-by-line and extract relevant information
    with open(file_path, 'r') as f:
        for line in f:
            # Cheap substring test before running the regular expression.
            if 'root.process_cross' not in line:
                continue

            match = pattern.search(line)
            if match is None:
                continue

            buy_token, sell_token, crossing_price, volume = match.groups()
            buy_tokens.append(buy_token)
            sell_tokens.append(sell_token)
            crossing_prices.append(int(crossing_price))
            volumes.append(int(volume))

    # Explicit dtypes: with no matching lines the lists are empty and dtype
    # inference would not yield str/i64 columns, which would break the later
    # joins on the token columns.
    df_crossing = pl.DataFrame(
        [
            pl.Series('buy_token', buy_tokens, dtype=pl.Utf8),
            pl.Series('sell_token', sell_tokens, dtype=pl.Utf8),
            pl.Series('crossing_price', crossing_prices, dtype=pl.Int64),
            pl.Series('volume', volumes, dtype=pl.Int64),
        ]
    )

    # maintain_order=True makes the de-duplicated output deterministic.
    return df_crossing.unique(maintain_order=True)

In [85]:
# Load the crossing records from the exchange server log
file_path = 'log_exchange_server.txt'  # Replace with the actual path to your log file
df_crossing = read_token_crossing_log(file_path=file_path)
print(df_crossing)

shape: (153, 4)
┌────────────────┬────────────────┬────────────────┬────────┐
│ buy_token      ┆ sell_token     ┆ crossing_price ┆ volume │
│ ---            ┆ ---            ┆ ---            ┆ ---    │
│ str            ┆ str            ┆ i64            ┆ i64    │
╞════════════════╪════════════════╪════════════════╪════════╡
│ BBBBS234200532 ┆ AAAAB234100325 ┆ 1140000        ┆ 1      │
│ INVES000100093 ┆ INVEB000100085 ┆ 1170000        ┆ 1      │
│ BBBBS234200001 ┆ INVEB000100004 ┆ 1690000        ┆ 1      │
│ AAAAS234100308 ┆ BBBBB234200382 ┆ 1170000        ┆ 1      │
│ …              ┆ …              ┆ …              ┆ …      │
│ BBBBS234200056 ┆ INVEB000100071 ┆ 1140000        ┆ 1      │
│ BBBBS234200290 ┆ AAAAB234100288 ┆ 1120000        ┆ 1      │
│ INVEB000100098 ┆ BBBBS234200086 ┆ 1100000        ┆ 1      │
│ AAAAS234100331 ┆ INVEB000100188 ┆ 1150000        ┆ 1      │
└────────────────┴────────────────┴────────────────┴────────┘


In [86]:
# Attach the confirming user to both sides of every cross.
# `df` maps token -> user, so it is used twice: once keyed on the buy token
# (user labelled "buyer") and once keyed on the sell token (user labelled
# "seller"). Left joins keep crosses whose tokens have no known user.
buyer_lookup = df.rename({"token": "buy_token", "user": "buyer"})
seller_lookup = df.rename({"token": "sell_token", "user": "seller"})

final_df = (
    df_crossing
    .join(buyer_lookup, on="buy_token", how="left")
    .join(seller_lookup, on="sell_token", how="left")
)


In [87]:
# Print the crossing table with the buyer and seller columns attached
print(final_df)


shape: (153, 6)
┌────────────────┬────────────────┬────────────────┬────────┬───────┬────────┐
│ buy_token      ┆ sell_token     ┆ crossing_price ┆ volume ┆ buyer ┆ seller │
│ ---            ┆ ---            ┆ ---            ┆ ---    ┆ ---   ┆ ---    │
│ str            ┆ str            ┆ i64            ┆ i64    ┆ str   ┆ str    │
╞════════════════╪════════════════╪════════════════╪════════╪═══════╪════════╡
│ BBBBS234200532 ┆ AAAAB234100325 ┆ 1140000        ┆ 1      ┆ 2342  ┆ 2341   │
│ INVES000100093 ┆ INVEB000100085 ┆ 1170000        ┆ 1      ┆ 1     ┆ 1      │
│ BBBBS234200001 ┆ INVEB000100004 ┆ 1690000        ┆ 1      ┆ 2342  ┆ 1      │
│ AAAAS234100308 ┆ BBBBB234200382 ┆ 1170000        ┆ 1      ┆ 2341  ┆ 2342   │
│ …              ┆ …              ┆ …              ┆ …      ┆ …     ┆ …      │
│ BBBBS234200056 ┆ INVEB000100071 ┆ 1140000        ┆ 1      ┆ 2342  ┆ 1      │
│ BBBBS234200290 ┆ AAAAB234100288 ┆ 1120000        ┆ 1      ┆ 2342  ┆ 2341   │
│ INVEB000100098 ┆ BBBBS234200086 ┆ 

In [88]:
# Crosses where the same user is on both sides, leaving out user "1"
same_user_both_sides = pl.col("buyer") == pl.col("seller")
buyer_is_not_user_1 = pl.col("buyer") != "1"

self_crossing_df = final_df.filter(same_user_both_sides & buyer_is_not_user_1)


In [89]:
# Save the self-crossing rows to a CSV file in the current working directory
self_crossing_df.write_csv("self_crossing.csv")

In [90]:
# Crosses where at least one side has no matching user in `df`
buyer_unknown = pl.col("buyer").is_null()
seller_unknown = pl.col("seller").is_null()

filtered_df_missing_info = final_df.filter(buyer_unknown | seller_unknown)

In [91]:
# Display the crosses with a missing buyer or seller
filtered_df_missing_info

buy_token,sell_token,crossing_price,volume,buyer,seller
str,str,i64,i64,str,str
