In [None]:
# !pip install git+https://github.com/LostMa-ERC/simMAtree.git
# !pip install openpyxl
# !pip install copia

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import numpy as np
import matplotlib.ticker as mticker


## 1. Corpus Analysis


##### Copia import

In [None]:
from copia import density, accumulation_curve, species_accumulation
from copia.utils import evenness, survival_ratio
from copia.hill import hill_numbers
from copia.plot import evenness_plot


##### Load and prepare abundance data

In [None]:
# Excel workbooks holding the two sub-corpora, and the folder they live in
files = ["prose.xlsx", "poetry.xlsx"]
folder_path = "data/"

# One DataFrame per workbook actually found on disk, each tagged with its genre
dfs_with_source = []

for file in files:
    file_path = os.path.join(folder_path, file)
    if not os.path.exists(file_path):
        # A missing workbook is reported but does not stop the loop
        print(f"Warning: File {file_path} not found.")
        continue
    df = pd.read_excel(file_path)
    # The genre label is derived from the file name
    if 'prose' in file.lower():
        df['source'] = 'Prose'
    else:
        df['source'] = 'Poetry'
    dfs_with_source.append(df)

# Nothing could be loaded at all: abort with an explicit error
if len(dfs_with_source) == 0:
    raise FileNotFoundError("No data files found. Check path and file names.")

df_combined = pd.concat(dfs_with_source, ignore_index=True)

# Number of rows per 'workTitle' over the pooled corpus (prose and poetry together);
# each row is taken to be one witness of the work -- TODO confirm against the data schema
counts_all = df_combined['workTitle'].value_counts()
all_witness_counts = counts_all.values

## 2. Results on prose and poetry texts combined

#### 2.1 Unseen Species approach

Bootstrap distribution of the upper bound on the text survival rate, obtained with the iChao1 estimator:

In [None]:
# Bootstrap the survival ratio with the iChao1 estimator (n_iter=5000 iterations, one job)
wsurvival_all = survival_ratio(all_witness_counts, method='ichao1', n_iter=5000, n_jobs=1)

# Draw the bootstrap density on a dedicated 300-dpi figure
fig, ax = plt.subplots(figsize=(10, 7), dpi=300)
density(wsurvival_all, ax = ax, figsize=(10, 7), xlim=(0.1, 1.1))

# White background with a light grid drawn underneath the plotted artists
ax.set_facecolor('white')
ax.grid(True, alpha=0.4, linestyle='-', linewidth=0.5, color='gray')
ax.set_axisbelow(True)

# y axis: hide tick labels, tick marks and the left spine; only the axis label is kept.
# (The former `ax.set_ylabel(None)` was dropped: it was immediately overridden below.)
ax.set_yticklabels([])
ax.tick_params(axis='y', which='both', left=False)
ax.spines['left'].set_visible(False)
ax.set_ylabel('Density', fontsize=24, fontweight='bold')

# x axis: large label and tick labels, thicker bottom spine, no top/right frame
ax.set_xlabel('Survival Ratio', fontsize=24)
ax.tick_params(axis='x', which='major', labelsize=20)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_linewidth(1.5)

# Thicken every line present on the axes
for line in ax.get_lines():
    line.set_linewidth(2.5)
# Restyle the vertical grid lines as dashed
ax.grid(axis='x', alpha=0.3, linestyle='--', linewidth=0.8)

# Enlarge the text annotations found on the axes and put them in a rounded box
for text in ax.texts:
    text.set_fontsize(25)
    text.set_bbox(dict(boxstyle='round,pad=0.5', facecolor='white', 
                       edgecolor='gray', alpha=0.8, linewidth=1.5))


plt.tight_layout()
# savefig does not create missing directories: make sure "figs/" exists first,
# otherwise a fresh checkout fails here with FileNotFoundError
os.makedirs("figs", exist_ok=True)
plt.savefig("figs/ichaoboot.png")
plt.show()

##### Witness survival rates

Estimation and variability using the bootstrap procedure. 
The red line corresponds to the estimate, and the dashed lines represent the 95% confidence interval.

In [None]:
# Bootstrap the survival ratio with the 'minsample' method (n_iter=1000 iterations, one job)
dsurvival_all = survival_ratio(all_witness_counts, method='minsample', n_iter=1000, n_jobs=1)

# Draw the bootstrap density on a dedicated 300-dpi figure
fig, ax = plt.subplots(figsize=(10, 7), dpi=300)
density(dsurvival_all, ax = ax, figsize=(10, 7), xlim=(0, 0.4))

# White background with a light grid drawn underneath the plotted artists
ax.set_facecolor('white')
ax.grid(True, alpha=0.4, linestyle='-', linewidth=0.5, color='gray')
ax.set_axisbelow(True)

# y axis: hide tick labels, tick marks and the left spine; only the axis label is kept.
# (The former `ax.set_ylabel(None)` was dropped: it was immediately overridden below.)
ax.set_yticklabels([])
ax.tick_params(axis='y', which='both', left=False)
ax.spines['left'].set_visible(False)
ax.set_ylabel('Density', fontsize=24, fontweight='bold')

# x axis: large label and tick labels, thicker bottom spine, no top/right frame
ax.set_xlabel('Survival Ratio', fontsize=24)
ax.tick_params(axis='x', which='major', labelsize=20)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.spines['bottom'].set_linewidth(1.5)

# Thicken every line present on the axes
for line in ax.get_lines():
    line.set_linewidth(2.5)
# Restyle the vertical grid lines as dashed
ax.grid(axis='x', alpha=0.3, linestyle='--', linewidth=0.8)

# Enlarge the text annotations found on the axes and put them in a rounded box
for text in ax.texts:
    text.set_fontsize(25)
    text.set_bbox(dict(boxstyle='round,pad=0.5', facecolor='white', 
                       edgecolor='gray', alpha=0.8, linewidth=1.5))


plt.tight_layout()
# savefig does not create missing directories: make sure "figs/" exists first,
# otherwise a fresh checkout fails here with FileNotFoundError
os.makedirs("figs", exist_ok=True)
plt.savefig("figs/minsampleboot.png")
plt.show()

##### Species accumulation curve

The violet point represents empirical data, the dashed line is the extrapolation curve, and the green shaded area illustrates the uncertainty range.

In [None]:
# Number of steps for the accumulation computation; also used as the x-axis upper limit
max_steps = 20000

# Calculate species accumulation
accumulation = species_accumulation(all_witness_counts, max_steps=max_steps)

fig, ax = plt.subplots(figsize=(10, 7), dpi=300)
ax.set_facecolor('white')
ax.grid(True, alpha=0.4, linestyle='-', linewidth=0.5, color='gray')
ax.set_axisbelow(True)

# Plot accumulation curve.
# `all_witness_counts` holds one entry per text (its number of witnesses), so the
# x axis (sample size, up to max_steps) counts witnesses and the y axis counts
# distinct texts. The labels were previously swapped.
accumulation_curve(all_witness_counts, accumulation, c0='C2', c1='C2',
                   xlabel='Witnesses', ylabel='Texts',
                   ax=ax, xlim=(0, max_steps), label='Accumulation')

# Axis cosmetics on the main axes: scientific notation on x, large tick and axis labels
ax.xaxis.get_offset_text().set_fontsize(24)
ax.tick_params(axis='both', labelsize=20)
ax.tick_params(axis='y', which='minor', left=False)
ax.ticklabel_format(style='sci', axis='x', scilimits=(0, 0), useMathText=True)
ax.set_xlabel('Witnesses', fontsize=24, fontweight='bold')
ax.set_ylabel('Texts', fontsize=24, fontweight='bold')

# Increase line width for better visibility
for line in ax.get_lines():
    line.set_linewidth(3)

# Final grid style (overrides the solid grid configured above)
ax.grid(True, alpha=0.3, linestyle='--', linewidth=0.8)
# savefig does not create missing directories: make sure "figs/" exists first,
# otherwise a fresh checkout fails here with FileNotFoundError
os.makedirs("figs", exist_ok=True)
plt.savefig("figs/accumulation_curve.png")
plt.show()

#### 2.2 Birth-Death approach

In [None]:
# !simmatree -c config_church_fathers_pretrained.yml infer -i data/all.csv -o results_BD/all

In [None]:
from src.cli.inference import inference
from src.cli.config import Config
from pathlib import Path


# Read the run configuration from the YAML file
config_cf = Config("config_church_fathers_pretrained.yml")

# Folder handed to `inference` through its `dir` argument
output_dir = Path("results_BD/all/")

# Run the inference on the combined data set; the generator, statistics, prior and
# backend are all taken from the configuration, and `save_model` is switched off.
idata = inference(
    csv_file="data/all.csv",
    generator=config_cf.generator,
    stats=config_cf.stats,
    prior=config_cf.prior,
    backend=config_cf.backend,
    dir=output_dir,
    csv_separator=";",
    save_model=False,
)