In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.subplots as sp
import seaborn as sns


In [16]:
data = np.load("cooc_n_batches_500_layer_8/matryoshka_jaccard_8_0.npz")

# Get the co-occurrence matrix
cooc_matrix = data["arr_0"]


In [17]:
np.fill_diagonal(cooc_matrix, 0)

In [18]:
# Calculate mean of each row
row_means = np.mean(cooc_matrix, axis=1)

# Create indices array
indices = np.arange(len(row_means))

# Create scatter plot using plotly
fig = px.scatter(
    x=indices,
    y=row_means,
    labels={"x": "Feature Index", "y": "Mean Co-occurrence"},
    title="Mean Co-occurrence vs Feature Index",
)

# Update layout for better readability
fig.update_layout(
    showlegend=False,
    template="plotly_white",
)
# fig.update_yaxes(type='log')

fig.show()


In [49]:
graph_data = pd.read_csv(
    "graph_cooc_n_batches_500_layer_8/matryoshka/dataframes/matryoshka_node_info_8_0.csv"
)


In [50]:
import plotly.express as px

# Assuming 'feature_activations' is a column in merged_data
fig_histogram = px.histogram(
    graph_data,
    x="feature_activations",
    y=None,
    title="Histogram of Feature Activations",
    labels={"feature_activations": "Number of Activations"},
    histnorm="density",
)

fig_histogram.update_layout(template="plotly_white", height=500, width=1200)

fig_histogram.show()

In [20]:
# Calculate correlation between node_id and subgraph_size
correlation = graph_data["node_id"].corr(graph_data["subgraph_size"])

# Create scatter plot using plotly
fig_correlation = px.scatter(
    graph_data,
    x="node_id",
    y="subgraph_size",
    labels={"node_id": "Node ID", "subgraph_size": "Subgraph Size"},
    title=f"Correlation between Node ID and Subgraph Size (Correlation: {correlation:.2f})",
)

# Update layout for better readability
fig_correlation.update_layout(
    showlegend=False,
    template="plotly_white",
)

fig_correlation.show()


In [21]:
# Filter out subgraphs of size <=2 and >=100
filtered_graph_data = graph_data[
    (graph_data["subgraph_size"] > 2) & (graph_data["subgraph_size"] < 100)
]

# Calculate correlation between node_id and subgraph_size
correlation = filtered_graph_data["node_id"].corr(filtered_graph_data["subgraph_size"])

# Create scatter plot using plotly
fig_correlation = px.scatter(
    filtered_graph_data,
    x="node_id",
    y="subgraph_size",
    labels={"node_id": "Node ID", "subgraph_size": "Subgraph Size"},
    title=f"Correlation between Node ID and Subgraph Size (Correlation: {correlation:.2f})",
)

# Update layout for better readability
fig_correlation.update_layout(
    showlegend=False,
    template="plotly_white",
)

fig_correlation.show()

In [25]:
import plotly.express as px

# Define group sizes
# Define group sizes
group_sizes = [768, 768, 768 * 2, 768 * 4]

# Create a new column for group assignment based on node_id
# Fix: Use cumulative sums for proper binning
group_boundaries = [0] + list(np.cumsum(group_sizes)) + [graph_data["node_id"].max()]
graph_data["group"] = pd.cut(
    graph_data["node_id"],
    bins=group_boundaries,
    labels=[f"Group {i + 1}" for i in range(len(group_sizes) + 1)],
    right=False,
)

# Filter for subgraphs of size >= 2
filtered_graph_data = graph_data[
    (graph_data["subgraph_size"] >= 2) & (graph_data["subgraph_size"] != 192)
]

# Create box plot
fig_box = px.box(
    filtered_graph_data,
    x="group",
    y="subgraph_size",
    labels={"group": "Group", "subgraph_size": "Subgraph Size"},
    title="Box Plot of Subgraph Sizes per Group",
)

# Update layout for better readability
fig_box.update_layout(
    showlegend=False,
    template="plotly_white",
)

fig_box.show()


In [27]:
import plotly.express as px

# Define group sizes
group_sizes = [768, 768, 768 * 2, 768 * 4]

# Create a new column for group assignment based on node_id
group_boundaries = [0] + list(np.cumsum(group_sizes)) + [graph_data["node_id"].max()]
graph_data["group"] = pd.cut(
    graph_data["node_id"],
    bins=group_boundaries,
    labels=[f"matryoshka_group_{i + 1}" for i in range(len(group_sizes) + 1)],
    right=False,
)

# Filter for subgraphs of size >= 2
filtered_graph_data = graph_data[
    (graph_data["subgraph_size"] >= 2) & (graph_data["subgraph_size"] != 192)
]

# Create box plot
fig_box = px.box(
    filtered_graph_data,
    x="group",
    y="subgraph_size",
    labels={"group": "Group", "subgraph_size": "Subgraph Size"},
    title="Box Plot of Subgraph Sizes per Group",
    category_orders={
        "group": [f"matryoshka_group_{i + 1}" for i in range(len(group_sizes) + 1)]
    },
)

# Update layout for better readability
fig_box.update_layout(
    showlegend=False,
    template="plotly_white",
)

fig_box.show()

In [28]:
import pandas as pd

# Calculate the correlation between subgraph size and group
correlation_data = (
    filtered_graph_data.groupby("group")["subgraph_size"].mean().reset_index()
)
correlation = filtered_graph_data["subgraph_size"].corr(
    filtered_graph_data["group"].cat.codes
)

# Print the correlation result
print(f"Correlation between subgraph size and group: {correlation:.4f}")

# Optionally, visualize the correlation
fig_corr = px.scatter(
    correlation_data,
    x="group",
    y="subgraph_size",
    title="Correlation between Subgraph Size and Group",
    labels={"group": "Group", "subgraph_size": "Average Subgraph Size"},
)
fig_corr.show()


Correlation between subgraph size and group: -0.1807






In [34]:
import pandas as pd

# Calculate the average subgraph size per group
correlation_data = (
    filtered_graph_data.groupby("group")["subgraph_size"].mean().reset_index()
)

# Calculate the number of nodes per group
nodes_per_group = (
    graph_data.groupby("group")["node_id"].nunique().reset_index(name="node_count")
)

# Merge the correlation data with nodes per group
correlation_data = pd.merge(correlation_data, nodes_per_group, on="group")

# Normalize the average subgraph size by the number of nodes per group
correlation_data["normalized_subgraph_size"] = (
    correlation_data["subgraph_size"] / correlation_data["node_count"]
)

# Calculate the correlation between normalized subgraph size and group
correlation = correlation_data["normalized_subgraph_size"].corr(
    correlation_data["group"].cat.codes
)

# Print the correlation result
print(f"Correlation between normalized subgraph size and group: {correlation:.4f}")

# Optionally, visualize the correlation
fig_corr = px.scatter(
    correlation_data,
    x="group",
    y="normalized_subgraph_size",
    title="Correlation between Normalized Subgraph Size and Group",
    labels={
        "group": "Group",
        "normalized_subgraph_size": "Normalized Average Subgraph Size",
    },
)
fig_corr.show()

Correlation between normalized subgraph size and group: -0.9825








In [32]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Calculate the number of subgraphs per group, filtering by size
subgraph_count_data = (
    filtered_graph_data.groupby("group").size().reset_index(name="subgraph_count")
)

# Calculate the number of nodes per group
nodes_per_group = (
    graph_data.groupby("group")["node_id"].nunique().reset_index(name="node_count")
)

# Merge the counts
merged_data = pd.merge(subgraph_count_data, nodes_per_group, on="group")

# Calculate normalized counts
merged_data["normalized_count"] = (
    merged_data["subgraph_count"] / merged_data["node_count"]
)

# Create subplots with raw counts and normalized counts
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Raw Subgraph Counts", "Normalized Subgraph Counts (per Node)"),
)

# Raw counts
fig.add_trace(
    go.Bar(x=merged_data["group"], y=merged_data["subgraph_count"], name="Raw Count"),
    row=1,
    col=1,
)

# Normalized counts
fig.add_trace(
    go.Bar(
        x=merged_data["group"],
        y=merged_data["normalized_count"],
        name="Normalized Count",
    ),
    row=1,
    col=2,
)

# Update layout
fig.update_layout(
    height=500,
    width=1200,
    showlegend=False,
    template="plotly_white",
    title_text="Subgraph Distribution Analysis",
)

# Update axes labels
fig.update_xaxes(title_text="Group", row=1, col=1)
fig.update_xaxes(title_text="Group", row=1, col=2)
fig.update_yaxes(title_text="Number of Subgraphs", row=1, col=1)
fig.update_yaxes(title_text="Subgraphs per Node", row=1, col=2)

fig.show()

# Print the detailed statistics
print("\nDetailed Statistics:")
print(merged_data.to_string(index=False))








Detailed Statistics:
             group  subgraph_count  node_count  normalized_count
matryoshka_group_1             342         768          0.445312
matryoshka_group_2             387         768          0.503906
matryoshka_group_3             584        1536          0.380208
matryoshka_group_4             801        3072          0.260742
matryoshka_group_5            2166       18431          0.117519


# Compare to a normal SAE

Will pretend that there are groups as in matryoshka case

In [38]:
graph_data = pd.read_csv(
    "graph_cooc_n_batches_500_layer_8/resjb/dataframes/resjb_node_info_8_0.csv"
)

In [None]:
import plotly.express as px

# Define group sizes
group_sizes = [768, 768, 768 * 2, 768 * 4]

# Create a new column for group assignment based on node_id
group_boundaries = [0] + list(np.cumsum(group_sizes)) + [graph_data["node_id"].max()]
graph_data["group"] = pd.cut(
    graph_data["node_id"],
    bins=group_boundaries,
    labels=[f"matryoshka_group_{i + 1}" for i in range(len(group_sizes) + 1)],
    right=False,
)

# Filter for subgraphs of size >= 2
filtered_graph_data = graph_data[
    (graph_data["subgraph_size"] >= 2) & (graph_data["subgraph_size"] != 192)
]

# Create box plot
fig_box = px.box(
    filtered_graph_data,
    x="group",
    y="subgraph_size",
    labels={"group": "Group", "subgraph_size": "Subgraph Size"},
    title="Box Plot of Subgraph Sizes per Group",
    category_orders={
        "group": [
            f"false_matryoshka_group_{i + 1}" for i in range(len(group_sizes) + 1)
        ]
    },
)

# Update layout for better readability
fig_box.update_layout(
    showlegend=False,
    template="plotly_white",
)

fig_box.show()

In [40]:
import pandas as pd

# Calculate the correlation between subgraph size and group
correlation_data = (
    filtered_graph_data.groupby("group")["subgraph_size"].mean().reset_index()
)
correlation = filtered_graph_data["subgraph_size"].corr(
    filtered_graph_data["group"].cat.codes
)

# Print the correlation result
print(f"Correlation between subgraph size and group: {correlation:.4f}")

# Optionally, visualize the correlation
fig_corr = px.scatter(
    correlation_data,
    x="group",
    y="subgraph_size",
    title="Correlation between Subgraph Size and Group",
    labels={"group": "Group", "subgraph_size": "Average Subgraph Size"},
)
fig_corr.show()


Correlation between subgraph size and group: -0.0077






In [41]:
import pandas as pd

# Calculate the average subgraph size per group
correlation_data = (
    filtered_graph_data.groupby("group")["subgraph_size"].mean().reset_index()
)

# Calculate the number of nodes per group
nodes_per_group = (
    graph_data.groupby("group")["node_id"].nunique().reset_index(name="node_count")
)

# Merge the correlation data with nodes per group
correlation_data = pd.merge(correlation_data, nodes_per_group, on="group")

# Normalize the average subgraph size by the number of nodes per group
correlation_data["normalized_subgraph_size"] = (
    correlation_data["subgraph_size"] / correlation_data["node_count"]
)

# Calculate the correlation between normalized subgraph size and group
correlation = correlation_data["normalized_subgraph_size"].corr(
    correlation_data["group"].cat.codes
)

# Print the correlation result
print(f"Correlation between normalized subgraph size and group: {correlation:.4f}")

# Optionally, visualize the correlation
fig_corr = px.scatter(
    correlation_data,
    x="group",
    y="normalized_subgraph_size",
    title="Correlation between Normalized Subgraph Size and Group",
    labels={
        "group": "Group",
        "normalized_subgraph_size": "Normalized Average Subgraph Size",
    },
)
fig_corr.show()

Correlation between normalized subgraph size and group: -0.9859








In [42]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Calculate the number of subgraphs per group, filtering by size
subgraph_count_data = (
    filtered_graph_data.groupby("group").size().reset_index(name="subgraph_count")
)

# Calculate the number of nodes per group
nodes_per_group = (
    graph_data.groupby("group")["node_id"].nunique().reset_index(name="node_count")
)

# Merge the counts
merged_data = pd.merge(subgraph_count_data, nodes_per_group, on="group")

# Calculate normalized counts
merged_data["normalized_count"] = (
    merged_data["subgraph_count"] / merged_data["node_count"]
)

# Create subplots with raw counts and normalized counts
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Raw Subgraph Counts", "Normalized Subgraph Counts (per Node)"),
)

# Raw counts
fig.add_trace(
    go.Bar(x=merged_data["group"], y=merged_data["subgraph_count"], name="Raw Count"),
    row=1,
    col=1,
)

# Normalized counts
fig.add_trace(
    go.Bar(
        x=merged_data["group"],
        y=merged_data["normalized_count"],
        name="Normalized Count",
    ),
    row=1,
    col=2,
)

# Update layout
fig.update_layout(
    height=500,
    width=1200,
    showlegend=False,
    template="plotly_white",
    title_text="Subgraph Distribution Analysis",
)

# Update axes labels
fig.update_xaxes(title_text="Group", row=1, col=1)
fig.update_xaxes(title_text="Group", row=1, col=2)
fig.update_yaxes(title_text="Number of Subgraphs", row=1, col=1)
fig.update_yaxes(title_text="Subgraphs per Node", row=1, col=2)

fig.show()

# Print the detailed statistics
print("\nDetailed Statistics:")
print(merged_data.to_string(index=False))








Detailed Statistics:
             group  subgraph_count  node_count  normalized_count
matryoshka_group_1             335         768          0.436198
matryoshka_group_2             339         768          0.441406
matryoshka_group_3             681        1536          0.443359
matryoshka_group_4            1400        3072          0.455729
matryoshka_group_5            8008       18431          0.434485


In [44]:
graph_data.columns


Index(['node_id', 'sae_name', 'activity_threshold', 'subgraph_id',
       'subgraph_size', 'feature_activations', 'group'],
      dtype='object')

In [47]:
import plotly.express as px

# Assuming 'feature_activations' is a column in merged_data
fig_histogram = px.histogram(
    graph_data,
    x="feature_activations",
    y=None,
    title="Histogram of Feature Activations",
    labels={"feature_activations": "Number of Activations"},
    histnorm="density",
)

fig_histogram.update_layout(template="plotly_white", height=500, width=1200)

fig_histogram.show()
