## Dump the join logs first, then run the cells, setting `join` to 'credits' or 'ratings' accordingly.

## How to dump all the logs:

Run in the terminal:
```bash
for i in $(seq 1 5); do docker logs join_credits_$i >& cred$i.log; docker logs join_ratings_$i >& rat$i.log; done
```

Then run the cells below to read the logs and create the DataFrames.

In [16]:
import re
import pandas as pd
from pathlib import Path

# Which join stage to analyse. Any value other than 'credits' selects the
# ratings settings below.
join = 'credits'

# file_prefix: leading part of the dumped log file names (cred<N>.log / rat<N>.log)
# log:         tag a line must contain to be considered at all
file_prefix, log = (
    ('cred', '[Join-Credits]') if join == 'credits' else ('rat', '[Join-Ratings]')
)

# Capture groups, in order: message id, client id, joined-data length
pattern = re.compile(
    r'Message ID: (\d+) .*? for client ([a-f0-9-]+) has joined data of length (\d+)\.',
    flags=re.IGNORECASE,
)

# Function to parse one file and extract relevant data
def extract_joined_data(filepath, prefix=None, tag=None, line_pattern=None):
    """Parse one dumped join log and return its "has joined data" records.

    Parameters
    ----------
    filepath : pathlib.Path
        Log file to read. Its name must end in ``<prefix><N>.log``; ``N`` is
        stored as the node number of every record.
    prefix : str, optional
        File-name prefix. Defaults to the notebook-level ``file_prefix``.
    tag : str, optional
        Substring a line must contain to be considered. Defaults to the
        notebook-level ``log``.
    line_pattern : re.Pattern, optional
        Compiled regex whose groups 1-3 are message id, client id and joined
        length. Defaults to the notebook-level ``pattern``.

    Returns
    -------
    list[dict]
        One dict per matching line with the keys ``node``, ``message_id``,
        ``client_id``, ``original_length`` and ``joined_length``. The list is
        empty when the file name does not match the expected pattern.
        ``original_length`` is ``None`` when the line carries no
        ``length <N>`` before the joined length.
    """
    # Fall back to the notebook configuration for anything not passed in.
    prefix = file_prefix if prefix is None else prefix
    tag = log if tag is None else tag
    line_pattern = pattern if line_pattern is None else line_pattern

    data = []
    # Escape the prefix so it is always matched literally.
    node_match = re.search(rf'{re.escape(prefix)}(\d+)\.log$', filepath.name)
    if not node_match:
        print(f"Skipping file {filepath} as it does not match expected pattern.")
        return data
    node = int(node_match.group(1))

    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        print(f"Processing file: {filepath}")
        for line in f:
            if tag not in line:
                continue
            match = line_pattern.search(line)
            if not match:
                continue

            # Look for the original length only in the text that precedes the
            # joined-length digits. Searching the whole line would fall back
            # to the joined length itself when no earlier length is present
            # and silently report it as the original length.
            length_match = re.search(r'length (\d+)', line[:match.start(3)])
            original_length = int(length_match.group(1)) if length_match else None

            data.append({
                'node': node,
                'message_id': int(match.group(1)),
                'client_id': match.group(2),
                'original_length': original_length,
                'joined_length': int(match.group(3)),
            })
    return data

# Load all relevant log files. Path.glob yields entries in arbitrary order, so
# sort them to make the processing order (and the DataFrame row order) the
# same on every run.
log_files = sorted(Path('.').glob(f'{file_prefix}*.log'))

print(f"Found {len(log_files)} log files matching pattern '{file_prefix}*.log'.")

# Extract and combine all the data
all_data = []
for log_file in log_files:
    all_data.extend(extract_joined_data(log_file))

# Convert to DataFrame. The columns are listed explicitly so the frame keeps
# its schema when no line was parsed; without them an empty result has no
# 'node' column and the value_counts() below raises KeyError.
df = pd.DataFrame(
    all_data,
    columns=['node', 'message_id', 'client_id', 'original_length', 'joined_length'],
)

# Number of parsed records per node
df['node'].value_counts()


Found 5 log files matching pattern 'cred*.log'.
Processing file: cred1.log
Processing file: cred2.log
Processing file: cred3.log
Processing file: cred4.log
Processing file: cred5.log


node
2    112
1     86
4     48
3     36
5     20
Name: count, dtype: int64

In [17]:
# Display the parsed records as a visual sanity check
df

Unnamed: 0,node,message_id,client_id,original_length,joined_length
0,1,64,56b1abcc-77c4-4b28-9379-b1caacfd0bb5,180,1
1,1,64,c19ff158-75ab-4c5a-be48-a8cc1b10babe,180,1
2,1,67,56b1abcc-77c4-4b28-9379-b1caacfd0bb5,159,2
3,1,67,c19ff158-75ab-4c5a-be48-a8cc1b10babe,159,2
4,1,74,56b1abcc-77c4-4b28-9379-b1caacfd0bb5,148,1
...,...,...,...,...,...
297,5,212,c19ff158-75ab-4c5a-be48-a8cc1b10babe,67,3
298,5,213,c19ff158-75ab-4c5a-be48-a8cc1b10babe,103,2
299,5,192,56b1abcc-77c4-4b28-9379-b1caacfd0bb5,17,1
300,5,192,c19ff158-75ab-4c5a-be48-a8cc1b10babe,17,1


In [18]:
import pandas as pd

# Assumes `df` exists with: client_id, message_id, original_length, joined_length, node

# Step 1: Create a unique key for comparison.
# Missing values are mapped to None first: NaN never compares equal to another
# NaN, so keys carrying a missing length would make two otherwise identical
# sets look different.
df['msg_key'] = [
    tuple(None if pd.isna(value) else value for value in key)
    for key in zip(df['message_id'], df['original_length'], df['joined_length'])
]

# Step 2: Create sets of msg_keys per (client, node)
grouped = df.groupby(['client_id', 'node'])['msg_key'].apply(set).reset_index()

# Step 3: Pivot to have clients as columns, nodes as rows.
# A (node, client) pair without any record ends up as NaN in this table.
pivoted = grouped.pivot(index='node', columns='client_id', values='msg_key')

# Step 4: Compare all client sets pairwise per node
def compare_clients(row):
    """Return True when every client in ``row`` saw the same set of keys.

    ``row`` is one row of the pivoted table (a Series indexed by client id)
    or any iterable of per-client sets. A cell that is not a set (NaN after
    the pivot, i.e. the client has no records on that node) counts as an
    empty set, so a client that saw nothing is reported as a mismatch
    instead of being ignored. The ``all_clients_equal`` flag column, if
    already present on the row, is excluded from the comparison.
    """
    if isinstance(row, pd.Series):
        # Do not compare the result column against the client sets.
        row = row.drop(labels='all_clients_equal', errors='ignore')
    sets = [cell if isinstance(cell, set) else set() for cell in row]
    return all(s == sets[0] for s in sets[1:])

# One boolean per node (row): did compare_clients accept that row?
pivoted['all_clients_equal'] = pivoted.apply(compare_clients, axis=1)

# Keep only the rows whose flag is False and show them
inconsistent_nodes = pivoted.loc[~pivoted['all_clients_equal']]
print(f"⚠️ {len(inconsistent_nodes)} nodes where clients saw different message data:")
inconsistent_nodes


⚠️ 0 nodes where clients saw different message data:


client_id,56b1abcc-77c4-4b28-9379-b1caacfd0bb5,c19ff158-75ab-4c5a-be48-a8cc1b10babe,all_clients_equal
node,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [None]:
# Choose a specific node to inspect; it must be one of the values in pivoted.index
node_to_check = 1
client_sets = pivoted.loc[node_to_check, pivoted.columns != 'all_clients_equal']

# A client with no records on this node is NaN after the pivot. Replace such
# entries with an empty set so the set arithmetic below cannot raise TypeError.
sets_by_client = {
    client: seen if isinstance(seen, set) else set()
    for client, seen in client_sets.items()
}

# Report the differences for every unordered pair of clients
clients = client_sets.index.tolist()
for i, c1 in enumerate(clients):
    for c2 in clients[i + 1:]:
        only_c1 = sets_by_client[c1] - sets_by_client[c2]
        only_c2 = sets_by_client[c2] - sets_by_client[c1]
        print(f"\nBetween {c1} and {c2} on node {node_to_check}:")
        print(f"  ✅ Shared: {len(sets_by_client[c1] & sets_by_client[c2])}")
        print(f"  ❌ Only in {c1}: {only_c1}")
        print(f"  ❌ Only in {c2}: {only_c2}")


Between 3df84a97-7358-4202-9721-a445e8fef206 and fdeb2329-64cd-4805-9f62-93b098f50f0c on node 3:
  ✅ Shared: 18
  ❌ Only in 3df84a97-7358-4202-9721-a445e8fef206: set()
  ❌ Only in fdeb2329-64cd-4805-9f62-93b098f50f0c: set()


In [None]:
# Choose a specific node to inspect; it must be one of the values in pivoted.index
node_to_check = 2
client_sets = pivoted.loc[node_to_check, pivoted.columns != 'all_clients_equal']

# A client with no records on this node is NaN after the pivot. Replace such
# entries with an empty set so the set arithmetic below cannot raise TypeError.
sets_by_client = {
    client: seen if isinstance(seen, set) else set()
    for client, seen in client_sets.items()
}

# Report the differences for every unordered pair of clients
clients = client_sets.index.tolist()
for i, c1 in enumerate(clients):
    for c2 in clients[i + 1:]:
        only_c1 = sets_by_client[c1] - sets_by_client[c2]
        only_c2 = sets_by_client[c2] - sets_by_client[c1]
        print(f"\nBetween {c1} and {c2} on node {node_to_check}:")
        print(f"  ✅ Shared: {len(sets_by_client[c1] & sets_by_client[c2])}")
        print(f"  ❌ Only in {c1}: {only_c1}")
        print(f"  ❌ Only in {c2}: {only_c2}")