In [31]:
import re
import polars as pl

In [32]:
# Function to parse the log file
def parse_log_file(log_file_path):
    """Collect the distinct (user, token) pairs found on 'confirm' log lines.

    A line contributes a pair only when it contains both
    ``trader <digits>: confirm`` and ``token <word>.``; every other line is
    ignored.

    Args:
        log_file_path: Path of the text log to scan.

    Returns:
        A Polars DataFrame with string columns ``user`` and ``token`` holding
        each distinct pair once, in order of first appearance. Both columns
        are typed as strings even when no line matches, so later joins on
        these columns keep working on an empty result.

    Raises:
        OSError: If the file cannot be opened.
    """
    user_re = re.compile(r'trader (\d+): confirm')
    token_re = re.compile(r'token (\w+)\.')

    user_list = []
    token_list = []

    # Decode explicitly: the platform default encoding varies between
    # machines, and a single undecodable byte should not abort the scan.
    with open(log_file_path, "r", encoding="utf-8", errors="replace") as f:
        for line in f:
            # Cheap substring test before running the regular expressions
            if 'confirm' not in line:
                continue

            user_match = user_re.search(line)
            token_match = token_re.search(line)
            if user_match and token_match:
                user_list.append(user_match.group(1))
                token_list.append(token_match.group(1))

    # Explicit dtypes: empty Python lists would otherwise not be inferred
    # as string columns.
    df = pl.DataFrame(
        [
            pl.Series("user", user_list, dtype=pl.Utf8),
            pl.Series("token", token_list, dtype=pl.Utf8),
        ]
    )

    # maintain_order makes the row order reproducible between runs
    return df.unique(maintain_order=True)


In [33]:
# Log scanned for 'confirm' lines; the path is relative to the working directory
log_file_path = "log_hft.txt"

# One row per distinct (user, token) pair; later cells join against this frame
df = parse_log_file(log_file_path)

# Plain-text dump of the frame (Polars abbreviates the middle rows)
print(df)


shape: (422, 2)
┌──────┬────────────────┐
│ user ┆ token          │
│ ---  ┆ ---            │
│ str  ┆ str            │
╞══════╪════════════════╡
│ 1    ┆ INVEB000100047 │
│ 3061 ┆ AAAAB306100009 │
│ 1    ┆ INVEB000100043 │
│ 3061 ┆ AAAAB306100299 │
│ …    ┆ …              │
│ 3061 ┆ AAAAB306100341 │
│ 1    ┆ INVES000100021 │
│ 1    ┆ INVES000100082 │
│ 1    ┆ INVEB000100147 │
└──────┴────────────────┘


In [34]:
# For every token, how many distinct users confirmed it
# NOTE: newer Polars releases spell this method `group_by`.
unique_user_count = pl.col("user").n_unique().alias("unique_users")
users_by_token = df.groupby("token").agg(unique_user_count)

In [35]:
# Tokens confirmed by more than one user; an empty result means each token
# maps to a single user.
shared_token = pl.col("unique_users") > 1
users_by_token.filter(shared_token)

token,unique_users
str,u32


In [36]:
# For every user, how many distinct tokens they confirmed
# NOTE: newer Polars releases spell this method `group_by`.
unique_token_count = pl.col("token").n_unique().alias("unique_tokens")
tokens_by_user = df.groupby("user").agg(unique_token_count)

print(tokens_by_user)

shape: (3, 2)
┌──────┬───────────────┐
│ user ┆ unique_tokens │
│ ---  ┆ ---           │
│ str  ┆ u32           │
╞══════╪═══════════════╡
│ 3062 ┆ 86            │
│ 3061 ┆ 106           │
│ 1    ┆ 230           │
└──────┴───────────────┘


In [37]:
def read_token_crossing_log(file_path):
    """Collect the distinct order crossings recorded in a log file.

    Only lines that contain ``root.process_cross`` and match
    ``Orders (b'<token>', b'<token>') crossed at price <int>, volume <int>``
    are used; every other line is ignored.

    NOTE(review): the first captured token is stored as ``buy_token`` and the
    second as ``sell_token`` purely by position. The sample output in this
    notebook shows tokens with 'S' as their fifth character under
    ``buy_token``, so the log may not always list the buy side first.
    Confirm against the code that writes the log.

    Args:
        file_path: Path of the text log to scan.

    Returns:
        A Polars DataFrame with string columns ``buy_token`` and
        ``sell_token`` and 64-bit integer columns ``crossing_price`` and
        ``volume``. Each distinct row appears once, in order of first
        appearance, and the column types are the same when nothing matches.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Captures: first token, second token, crossing price, volume
    pattern = re.compile(
        r"Orders \(b'(.*?)', b'(.*?)'\) crossed at price (\d+), volume (\d+)"
    )

    buy_tokens = []
    sell_tokens = []
    crossing_prices = []
    volumes = []

    # Decode explicitly: the platform default encoding varies between
    # machines, and a single undecodable byte should not abort the scan.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        for line in f:
            # Cheap substring test before running the regular expression
            if 'root.process_cross' not in line:
                continue

            match = pattern.search(line)
            if match:
                buy_token, sell_token, crossing_price, volume = match.groups()
                buy_tokens.append(buy_token)
                sell_tokens.append(sell_token)
                crossing_prices.append(int(crossing_price))
                volumes.append(int(volume))

    # Explicit dtypes: empty Python lists would otherwise not be inferred
    # as string / integer columns.
    df_crossing = pl.DataFrame(
        [
            pl.Series('buy_token', buy_tokens, dtype=pl.Utf8),
            pl.Series('sell_token', sell_tokens, dtype=pl.Utf8),
            pl.Series('crossing_price', crossing_prices, dtype=pl.Int64),
            pl.Series('volume', volumes, dtype=pl.Int64),
        ]
    )

    # maintain_order makes the row order reproducible between runs
    return df_crossing.unique(maintain_order=True)

In [38]:
# Load the crossings recorded in the exchange-server log
file_path = 'log_exchange_server.txt'  # Relative to the working directory; point this at your own log file
df_crossing = read_token_crossing_log(file_path)
print(df_crossing)

shape: (112, 4)
┌────────────────┬────────────────┬────────────────┬────────┐
│ buy_token      ┆ sell_token     ┆ crossing_price ┆ volume │
│ ---            ┆ ---            ┆ ---            ┆ ---    │
│ str            ┆ str            ┆ i64            ┆ i64    │
╞════════════════╪════════════════╪════════════════╪════════╡
│ BBBBS306200147 ┆ INVEB000100095 ┆ 1120000        ┆ 1      │
│ BBBBB306200099 ┆ INVES000100077 ┆ 1150000        ┆ 1      │
│ INVEB000100216 ┆ AAAAS306100318 ┆ 1140000        ┆ 1      │
│ INVEB000100122 ┆ AAAAS306100172 ┆ 1040000        ┆ 1      │
│ …              ┆ …              ┆ …              ┆ …      │
│ AAAAS306100004 ┆ INVEB000100004 ┆ 1690000        ┆ 1      │
│ INVES000100014 ┆ AAAAB306100014 ┆ 1680000        ┆ 1      │
│ INVES000100027 ┆ BBBBB306200029 ┆ 1680000        ┆ 1      │
│ INVES000100103 ┆ AAAAB306100148 ┆ 1090000        ┆ 1      │
└────────────────┴────────────────┴────────────────┴────────┘


In [39]:
# Resolve the token on each side of a crossing to the user that confirmed it.
# Both joins are left joins, so a token with no match keeps a null user.
with_buyer = (
    df_crossing
    .join(df, left_on="buy_token", right_on="token", how="left")
    .rename({"user": "buyer"})
)
final_df = (
    with_buyer
    .join(df, left_on="sell_token", right_on="token", how="left")
    .rename({"user": "seller"})
)


In [40]:
# Plain-text dump of the crossings with the buyer and seller columns attached
print(final_df)


shape: (112, 6)
┌────────────────┬────────────────┬────────────────┬────────┬───────┬────────┐
│ buy_token      ┆ sell_token     ┆ crossing_price ┆ volume ┆ buyer ┆ seller │
│ ---            ┆ ---            ┆ ---            ┆ ---    ┆ ---   ┆ ---    │
│ str            ┆ str            ┆ i64            ┆ i64    ┆ str   ┆ str    │
╞════════════════╪════════════════╪════════════════╪════════╪═══════╪════════╡
│ BBBBS306200147 ┆ INVEB000100095 ┆ 1120000        ┆ 1      ┆ 3062  ┆ 1      │
│ BBBBB306200099 ┆ INVES000100077 ┆ 1150000        ┆ 1      ┆ 3062  ┆ 1      │
│ INVEB000100216 ┆ AAAAS306100318 ┆ 1140000        ┆ 1      ┆ 1     ┆ 3061   │
│ INVEB000100122 ┆ AAAAS306100172 ┆ 1040000        ┆ 1      ┆ 1     ┆ 3061   │
│ …              ┆ …              ┆ …              ┆ …      ┆ …     ┆ …      │
│ AAAAS306100004 ┆ INVEB000100004 ┆ 1690000        ┆ 1      ┆ 3061  ┆ 1      │
│ INVES000100014 ┆ AAAAB306100014 ┆ 1680000        ┆ 1      ┆ 1     ┆ 3061   │
│ INVES000100027 ┆ BBBBB306200029 ┆ 

In [41]:
# Crossings whose two sides resolve to the same user, leaving out user "1"
# NOTE(review): the reason user "1" is exempt is not stated in this notebook; confirm.
same_party = pl.col("buyer") == pl.col("seller")
not_user_one = pl.col("buyer") != "1"
self_crossing_df = final_df.filter(same_party & not_user_one)


In [42]:
# Plain-text dump of the self-crossing rows; zero rows means none were found
print(self_crossing_df)

shape: (0, 6)
┌───────────┬────────────┬────────────────┬────────┬───────┬────────┐
│ buy_token ┆ sell_token ┆ crossing_price ┆ volume ┆ buyer ┆ seller │
│ ---       ┆ ---        ┆ ---            ┆ ---    ┆ ---   ┆ ---    │
│ str       ┆ str        ┆ i64            ┆ i64    ┆ str   ┆ str    │
╞═══════════╪════════════╪════════════════╪════════╪═══════╪════════╡
└───────────┴────────────┴────────────────┴────────┴───────┴────────┘


In [43]:
# Save the self-crossing rows to the working directory; this runs
# unconditionally, so the file is written even when the frame has no rows.
self_crossing_df.write_csv("self_crossing.csv")

In [44]:
# Crossings where at least one side's token had no match in the confirmation table
buyer_unknown = pl.col("buyer").is_null()
seller_unknown = pl.col("seller").is_null()
filtered_df_missing_info = final_df.filter(buyer_unknown | seller_unknown)

In [45]:
# Display the unresolved crossings; an empty frame means every token was matched
filtered_df_missing_info

buy_token,sell_token,crossing_price,volume,buyer,seller
str,str,i64,i64,str,str
