<a href="https://colab.research.google.com/github/LostMa-ERC/simMAtree_workshop/blob/main/simmatree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Manuscript Transmission as Speciation: Using Agent-Based Models and Bayesian Inference

### Digital Approaches to Pre-Modern Texts and Manuscripts (Workshop)

#### Jean-Baptiste Camps, Kelly Christensen, Ulysse Godreau, and Théo Moins

12 June 2025

## Simulation Based Inference using SimMAtree

#### Package installation

In [None]:
# Install simmatree directly from GitHub

# !pip install git+https://github.com/LostMa-ERC/simMAtree.git

# Dependency issue on Colab! Run these two lines instead:
!pip install git+https://github.com/LostMa-ERC/simMAtree.git --no-deps
!pip install pandas numpy matplotlib seaborn pydantic click rich pyyaml sbi

# Test installation
!simmatree-test

### Imports and Setup

In [None]:
import os
import tempfile
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import simmatree functions directly
from src.cli.config import Config
from src.cli.generate import generate
from src.cli.inference import inference
from src.cli.score import score

# Only reached when every import in this cell has succeeded
print("All imports successful!")

### Configuration example

In [None]:
# Experiment configuration for the Yule model, assembled section by section

# Simulator section: which model to run and its settings
yule_model_section = {
    'name': 'Yule',  # 'Yule' or 'BirthDeath' here
    'config': {
        'n_init': 1,       # Initial number of trees
        'Nact': 1000,      # Number of active iterations
        'Ninact': 1000,    # Number of inactive iterations (only deaths)
        'max_pop': 50000,  # Maximum population size
    },
}

# Rates of a simulation, used for generation and for scoring
yule_rates = {'LDA': 0.3, 'lda': 0.008, 'gamma': 0.001, 'mu': 0.0033}

# Inference section (other inference methods may be investigated in the future)
sbi_section = {
    'name': 'SBI',
    'config': {
        'method': 'NPE',
        'num_simulations': 200,
        'num_rounds': 2,
        'random_seed': 42,
        'num_samples': 100,
        'num_workers': 2,  # Reduced for Colab!
        'device': 'cpu',
    },
}

config_dict_Yule = {
    'model': yule_model_section,
    'params': yule_rates,
    'inference': sbi_section,
}

# Working directory for the experiment (hard-coded Colab path, not a fresh temp dir)
temp_dir = "/content/"
config_file_Yule = os.path.join(temp_dir, 'Yule_example.yml')

# Write the configuration out as block-style YAML
with open(config_file_Yule, 'w') as yaml_out:
    yaml.dump(config_dict_Yule, yaml_out, default_flow_style=False)

# Let simmatree's Config class parse the file that was just written
config_Yule = Config(config_file_Yule)

print(f"Configuration saved to: {config_file_Yule}")


### Abundance Data generation

This simulates the copying and transmission process of manuscripts.

In [None]:
# Destination CSV for the simulated data (read back in the next cell with sep=';')
synthetic_data_file = os.path.join(temp_dir, 'sample_data/synthetic_data.csv')

# Nothing in this notebook creates `sample_data/`; it presumably relies on the
# folder Colab provides by default. Create it explicitly so the output path is
# valid even if that folder is missing.
os.makedirs(os.path.dirname(synthetic_data_file), exist_ok=True)

# Use the generate function directly, with the model and rates from the Yule config
success = generate(
    data_path=synthetic_data_file,
    model=config_Yule.model,
    parameters=config_Yule.params,
    seed=42,
    show_params=False
)

print(f"\nGeneration successful: {success}")
print(f"Synthetic data saved to: {synthetic_data_file}")

# CLI equivalent:
# NOTE(review): the hint passes --show-params while the call above uses
# show_params=False; confirm which one is intended.
print(f"\n💡 CLI equivalent: simmatree -c {config_file_Yule} generate -o {synthetic_data_file} --show-params")

In [None]:
# Read the simulated witnesses back from disk (semicolon-separated CSV)
df = pd.read_csv(synthetic_data_file, sep=';')

print("\n🔍 First 10 rows:")
print(df.head(10))

# Number of witness_ID entries recorded for each text_ID
witness_counts = df.groupby('text_ID')['witness_ID'].count()

# Summary of the witnesses-per-text distribution, printed one line at a time
summary_lines = [
    f"\n📈 Witness Distribution Statistics:",
    f"Mean number of witnesses per text: {witness_counts.mean():.2f}",
    f"Median number of witnesses per text: {witness_counts.median():.1f}",
    f"Max number of witnesses for one text: {witness_counts.max()}",
    f"Texts with only 1 witness: {(witness_counts == 1).sum()}",
]
for summary_line in summary_lines:
    print(summary_line)

In [None]:
# Two side-by-side views of the witnesses-per-text distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
hist_ax, loglog_ax = axes

# Left panel: histogram of witness counts per text
hist_ax.hist(witness_counts, bins=20, alpha=0.7, edgecolor='black')
hist_ax.set(
    xlabel='Number of Witnesses per Text',
    ylabel='Number of Texts',
    title='Distribution of Witnesses per Text',
)
hist_ax.grid(True, alpha=0.3)

# Right panel: how many texts share each witness count, on log-log axes
witness_freq = witness_counts.value_counts().sort_index()
line_style = dict(linestyle='--', marker='o', markersize=6, linewidth=2, alpha=0.8)
loglog_ax.plot(witness_freq.index.values, witness_freq.values, **line_style)
loglog_ax.set(
    xscale='log',
    yscale='log',
    xlabel='Number of Witnesses per Text (log scale)',
    ylabel='Number of Texts (log scale)',
    title='Distribution of Witnesses per Text (Log-Log Scale)',
)
loglog_ax.grid(True, alpha=0.3, which='both')

plt.tight_layout()
plt.show()

## Comparison with the Birth-Death distribution

In [None]:
# Define another experiment configuration: same rates and inference settings,
# but with the BirthDeath model and 150 initial trees.
#
# The Birth-Death dictionary is built as a new object instead of editing
# `config_dict_Yule` in place, so the Yule dictionary keeps describing the
# Yule experiment. Only the nested levels that change are copied.
config_dict_BD = {
    **config_dict_Yule,
    'model': {
        **config_dict_Yule['model'],
        'name': 'BirthDeath',
        'config': {**config_dict_Yule['model']['config'], 'n_init': 150},
    },
}

config_file_BD = os.path.join(temp_dir, 'BD_example.yml')

# Save configuration to YAML file
with open(config_file_BD, 'w') as f:
    yaml.dump(config_dict_BD, f, default_flow_style=False)

# Parse configuration using simmatree's Config class
config_BD = Config(config_file_BD)


# Simulate a dataset under the Birth-Death configuration, with the same seed
# as the Yule run
synthetic_data_BD = os.path.join(temp_dir, 'sample_data/synthetic_data_BD.csv')
success = generate(
    data_path=synthetic_data_BD,
    model=config_BD.model,
    parameters=config_BD.params,
    seed=42,
    show_params=False
)


In [None]:
# Reload `df` and `witness_counts` from the Birth-Death simulation output
df = pd.read_csv(synthetic_data_BD, sep=';')
witness_counts = df.groupby('text_ID')['witness_ID'].count()

# Same summary statistics as for the Yule data, emitted in a single print call
print(
    f"\n📈 Witness Distribution Statistics:",
    f"Mean number of witnesses per text: {witness_counts.mean():.2f}",
    f"Median number of witnesses per text: {witness_counts.median():.1f}",
    f"Max number of witnesses for one text: {witness_counts.max()}",
    f"Texts with only 1 witness: {(witness_counts == 1).sum()}",
    sep="\n",
)


In [None]:
# Histogram and log-log frequency plot for the current `witness_counts`
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
left_panel = axes[0]
right_panel = axes[1]

# Histogram of witness counts per text
left_panel.hist(witness_counts, bins=20, alpha=0.7, edgecolor='black')
left_panel.set_xlabel('Number of Witnesses per Text')
left_panel.set_ylabel('Number of Texts')
left_panel.set_title('Distribution of Witnesses per Text')
left_panel.grid(True, alpha=0.3)

# Frequency of each witness count, ordered by count
witness_freq = witness_counts.value_counts().sort_index()
x_values, y_values = witness_freq.index.values, witness_freq.values

right_panel.plot(
    x_values,
    y_values,
    linestyle='--',
    marker='o',
    markersize=6,
    linewidth=2,
    alpha=0.8,
)
# Both axes are logarithmic for this panel
right_panel.set_xscale('log')
right_panel.set_yscale('log')
right_panel.set_xlabel('Number of Witnesses per Text (log scale)')
right_panel.set_ylabel('Number of Texts (log scale)')
right_panel.set_title('Distribution of Witnesses per Text (Log-Log Scale)')
right_panel.grid(True, alpha=0.3, which='both')

plt.tight_layout()
plt.show()

# Bayesian Inference

This will estimate the model parameters from the observed data.
This may take a few minutes depending on the configuration.

### Applying SBI with Birth Death simulator

In [None]:
# Set up results directory (the following cells read the summary CSV and plots from it)
results_dir = Path(temp_dir) / 'inference_results'
results_dir.mkdir(exist_ok=True)

# Run inference using the Python function directly, with the Birth-Death
# model and backend.
# NOTE(review): the observed data is the Yule-generated file, not
# `synthetic_data_BD`; presumably intentional so that both models are fitted
# to the same data. Confirm.
inference_data = inference(
    csv_file=synthetic_data_file,
    model=config_BD.model,
    backend=config_BD.backend,
    dir=results_dir,
    csv_separator=';'
)

# List generated files
result_files = list(results_dir.glob('*'))
print(f"\n Generated files: {[f.name for f in result_files]}")

# CLI equivalent: the call above uses the Birth-Death configuration, so the
# hint must reference the Birth-Death config file (it previously printed the
# Yule one).
print(f"\n💡 CLI equivalent: simmatree -c {config_file_BD} infer -i {synthetic_data_file} -o {results_dir}")


In [None]:

# Read the posterior summary table from the results directory
posterior_summary = pd.read_csv(results_dir / 'posterior_summary.csv')

print(" Posterior Summary Statistics:")
print(posterior_summary.round(6))

# Pair the configured rates with the 'hpdi_95%' column, matched by position.
# NOTE(review): assumes the summary rows come in the order lda, mu and that
# 'hpdi_95%' holds one numeric value per row; confirm against the file.
true_params = config_BD.params
estimated_params = posterior_summary['hpdi_95%'].values

param_names = ['lda', 'mu']
true_values = [true_params[name] for name in param_names]

print(f"\n Parameter Comparison:")
print(f"{'Parameter':<10} {'True Value':<12} {'HPDI Point':<12} {'Relative Error':<15}")
print("-" * 55)

# One table row per parameter; relative error is expressed in percent
for idx in range(len(param_names)):
    name = param_names[idx]
    true_val = true_values[idx]
    estimated_val = estimated_params[idx]
    rel_error = abs(estimated_val - true_val) / true_val * 100
    print(f"{name:<10} {true_val:<12.6f} {estimated_val:<12.6f} {rel_error:<15.2f}%")

In [None]:
# Render the figures saved in the results directory
from IPython.display import Image, display
import matplotlib.image as mpimg

# (file name, figure title) pairs, shown in this order
plots_to_show = [
    ('pairplot.png', 'Parameter Correlations and Posterior Distributions'),
    ('posterior.png', 'Marginal Posterior Distributions'),
    ('pp_summaries.png', 'Posterior Predictive Checks')
]

for plot_file, title in plots_to_show:
    plot_path = results_dir / plot_file

    # Report missing files and move on to the next one
    if not plot_path.exists():
        print(f" Plot not found: {plot_file}")
        continue

    print(f"\n{title}")
    img = mpimg.imread(plot_path)
    plt.figure(figsize=(12, 8))
    plt.imshow(img)
    plt.axis('off')  # the PNG already carries its own axes
    plt.title(title)
    plt.tight_layout()
    plt.show()

Since we know the ground truth here, we can evaluate how accurate our inference is:

In [None]:
print("Evaluating inference performance against true parameters...")

# Ground-truth values for the two rates scored here, taken from the
# configuration that generated the observed data
params_BD = {'lda': config_Yule.params['lda'], 'mu': config_Yule.params['mu']}

# Run scoring using the Python function directly
score(param_dict=params_BD, results_dir=str(results_dir))

# Load evaluation metrics (only if the metrics file is present after scoring)
metrics_file = results_dir / 'summary_metrics.csv'
if metrics_file.exists():
    metrics = pd.read_csv(metrics_file)
    print("\n Evaluation Metrics:")
    print(metrics.round(6))

    # Headline numbers come from the first row of the metrics table
    print(f"\n Performance Summary:")
    print(f"Root Mean Square Error (RMSE): {metrics['rmse'].iloc[0]:.6f}")
    print(f"Normalized RMSE: {metrics['nrmse'].iloc[0]:.6f}")
    print(f"Mean Relative Error: {metrics['mean_rel_error_pct'].iloc[0]:.2f}%")
    print(f"Coverage Probability: {metrics['coverage_probability'].iloc[0]:.2f}")
else:
    print(" Evaluation metrics file not found.")

# CLI equivalent: `results_dir` holds the Birth-Death inference results, so
# the hint references the Birth-Death config file (it previously printed the
# Yule one).
print(f"\n💡 CLI equivalent: simmatree -c {config_file_BD} score -d {results_dir}")

In [None]:
# Show the saved figures again, now including the relative-error plot
from IPython.display import Image, display
import matplotlib.image as mpimg

# File names paired with the titles to print and draw
plots_to_show = [
    ('pairplot.png', 'Parameter Correlations and Posterior Distributions'),
    ('posterior.png', 'Marginal Posterior Distributions'),
    ('relative_error.png', 'Relative error of estimates')
]

for entry in plots_to_show:
    plot_file, title = entry
    plot_path = results_dir / plot_file

    found = plot_path.exists()
    if found:
        print(f"\n{title}")
        img = mpimg.imread(plot_path)
        plt.figure(figsize=(12, 8))
        plt.imshow(img)
        plt.axis('off')
        plt.title(title)
        plt.tight_layout()
        plt.show()
    if not found:
        print(f" Plot not found: {plot_file}")

In [None]:
# Separate output folder for the Yule run
results_dir_Yule = Path(temp_dir).joinpath('inference_results_Yule')
results_dir_Yule.mkdir(exist_ok=True)

# Same observed data as before, now fitted with the Yule model and backend
yule_inference_kwargs = dict(
    csv_file=synthetic_data_file,
    model=config_Yule.model,
    backend=config_Yule.backend,
    dir=results_dir_Yule,
    csv_separator=';',
)
inference_data = inference(**yule_inference_kwargs)

# Everything now present in the output folder
result_files = [*results_dir_Yule.glob('*')]
print(f"\n Generated files: {[f.name for f in result_files]}")

# CLI equivalent:
print(f"\n💡 CLI equivalent: simmatree -c {config_file_Yule} infer -i {synthetic_data_file} -o {results_dir_Yule}")


In [None]:
# Render the figures saved in the Yule results directory
from IPython.display import Image, display
import matplotlib.image as mpimg

# (file name, figure title) pairs, shown in this order
plots_to_show = [
    ('pairplot.png', 'Parameter Correlations and Posterior Distributions'),
    ('posterior.png', 'Marginal Posterior Distributions'),
    ('pp_summaries.png', 'Posterior Predictive Checks')
]

for plot_file, title in plots_to_show:
    plot_path = results_dir_Yule / plot_file

    # Skip files that are absent, with a message
    if not plot_path.exists():
        print(f" Plot not found: {plot_file}")
        continue

    print(f"\n{title}")
    img = mpimg.imread(plot_path)
    plt.figure(figsize=(12, 8))
    plt.imshow(img)
    plt.axis('off')
    plt.title(title)
    plt.tight_layout()
    plt.show()

In [None]:
print("Evaluating inference performance against true parameters...")

# Ground-truth value of each of the four rates, read from the Yule configuration
params_Yule = {key: config_Yule.params[key] for key in ('LDA', 'lda', 'gamma', 'mu')}

# Score the Yule inference results against those values
score(param_dict=params_Yule, results_dir=str(results_dir_Yule))

# Report the metrics if a summary file is present after scoring
metrics_file = results_dir_Yule / 'summary_metrics.csv'
if not metrics_file.exists():
    print(" Evaluation metrics file not found.")
else:
    metrics = pd.read_csv(metrics_file)
    print("\n Evaluation Metrics:")
    print(metrics.round(6))

    # Headline numbers come from the first row of the metrics table
    print(f"\n Performance Summary:")
    print(f"Root Mean Square Error (RMSE): {metrics['rmse'].iloc[0]:.6f}")
    print(f"Normalized RMSE: {metrics['nrmse'].iloc[0]:.6f}")
    print(f"Mean Relative Error: {metrics['mean_rel_error_pct'].iloc[0]:.2f}%")
    print(f"Coverage Probability: {metrics['coverage_probability'].iloc[0]:.2f}")

# CLI equivalent:
print(f"\n💡 CLI equivalent: simmatree -c {config_file_Yule} score -d {results_dir_Yule}")

In [None]:
# Show the saved Yule figures again, now including the relative-error plot
from IPython.display import Image, display
import matplotlib.image as mpimg

# File names paired with the titles to print and draw
plots_to_show = [
    ('pairplot.png', 'Parameter Correlations and Posterior Distributions'),
    ('posterior.png', 'Marginal Posterior Distributions'),
    ('relative_error.png', 'Relative error of estimates')
]

for entry in plots_to_show:
    plot_file, title = entry
    plot_path = results_dir_Yule / plot_file

    found = plot_path.exists()
    if found:
        print(f"\n{title}")
        img = mpimg.imread(plot_path)
        plt.figure(figsize=(12, 8))
        plt.imshow(img)
        plt.axis('off')
        plt.title(title)
        plt.tight_layout()
        plt.show()
    if not found:
        print(f" Plot not found: {plot_file}")