## Supporting information

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# The project root is taken to be three directory levels above the working directory
PROJECT_DIR = Path.cwd().parent.parent.parent
print(f"Project directory: {PROJECT_DIR}")

# Render matplotlib figures at 300 dpi
plt.rcParams["figure.dpi"] = 300

# Locations of the inputs (model predictions, train/validation splits)
# and of the supplementary-material output folder
PHYLO_DATA_PATH = PROJECT_DIR.joinpath(
    "data", "03_Model_Outputs", "predictions", "phylogenetic_error_assesment"
)
APPENDIX_PATH = PROJECT_DIR.joinpath("data", "04_Supplementary_Material")
DATA_PATH = PROJECT_DIR.joinpath("data", "01_Training_Validation_Data", "splits")

# Stop early, with a readable message, if any expected folder is missing
assert PHYLO_DATA_PATH.exists(), f"Phylogenetic data path not found: {PHYLO_DATA_PATH}"
assert APPENDIX_PATH.exists(), f"Appendix path not found: {APPENDIX_PATH}"
assert DATA_PATH.exists(), f"Data path not found: {DATA_PATH}"

In [None]:
# Read the genus-level statistics CSV of the phylogenetic error results
data_path = PHYLO_DATA_PATH.joinpath(
    "phylogenetic_error_results_anglecam_v2_genus_statistics.csv"
)
phylo_df = pd.read_csv(data_path)

In [None]:
# Read the training and validation splits; their genus distributions are
# tabulated in a later cell
training_df = pd.read_csv(DATA_PATH / "training.csv")

data_path = DATA_PATH / "validation.csv"
validation_df = pd.read_csv(data_path)

In [None]:
# Preview the first five rows of the phylogenetic statistics
phylo_df.head(n=5)

In [None]:
# Preview the first five rows of the training split
training_df.head(n=5)

In [None]:
# Preview the first five rows of the validation split
validation_df.head(n=5)

In [None]:
# Build the per-genus appendix table (training / validation / phylogenetic
# counts plus phylogenetic MAE), save it as CSV and print a short report.

# Report dataset sizes (the three frames are loaded in earlier cells)
print(f"Training: {len(training_df)} samples")
print(f"Validation: {len(validation_df)} samples")
print(f"Phylogenetic: {len(phylo_df)} genera")

# Count rows per genus in the training and validation splits
training_counts = training_df.groupby('genus').size().reset_index(name='training_count')
validation_counts = validation_df.groupby('genus').size().reset_index(name='validation_count')

# Keep only the phylogenetic columns needed here, under table-specific names
phylo_summary = phylo_df[['genus', 'sample_size', 'mean_abs_error']].copy()
phylo_summary.columns = ['genus', 'phylo_count', 'phylo_mean_abs_error']

# Union of the genera that occur in any of the three sources, alphabetically
all_genera = sorted(set(
    training_counts['genus'].tolist() +
    validation_counts['genus'].tolist() +
    phylo_summary['genus'].tolist()
))

# One row per genus; left-merge so genera absent from a source get NaN there
genus_table = pd.DataFrame({'genus': all_genera})
genus_table = (genus_table
               .merge(training_counts, on='genus', how='left')
               .merge(validation_counts, on='genus', how='left')
               .merge(phylo_summary, on='genus', how='left'))

# A genus missing from a source contributes zero samples from that source
genus_table['training_count'] = genus_table['training_count'].fillna(0).astype(int)
genus_table['validation_count'] = genus_table['validation_count'].fillna(0).astype(int)
genus_table['phylo_count'] = genus_table['phylo_count'].fillna(0).astype(int)

# Total image count and a flag for genera that have a phylogenetic MAE value
genus_table['total_samples'] = genus_table['training_count'] + genus_table['validation_count']
genus_table['has_phylo_data'] = ~genus_table['phylo_mean_abs_error'].isna()

# Drop three genera from the table; all summary numbers below exclude them.
# NOTE(review): the reason for excluding these genera is not recorded in this
# notebook — confirm and document it.
genus_table = genus_table[~genus_table['genus'].isin(['Pinus', 'Podocarpus', 'Abies'])]

# Order: genera with phylo data first, then by total samples (descending),
# then alphabetically for ties
genus_table = genus_table.sort_values([
    'has_phylo_data',
    'total_samples',
    'genus'
], ascending=[False, False, True]).reset_index(drop=True)

# Columns that go into the saved appendix table (the helper flag is left out)
final_table = genus_table[['genus', 'training_count', 'validation_count',
                          'total_samples', 'phylo_count', 'phylo_mean_abs_error']].copy()

# Save the table; floats (the MAE column) are written with four decimals
output_file = APPENDIX_PATH / 'genus_appendix_table.csv'
final_table.to_csv(output_file, index=False, float_format='%.4f')

# Display results
print(f"\n{'='*70}")
print("GENUS APPENDIX TABLE FOR MANUSCRIPT")
print(f"{'='*70}")

with_phylo = genus_table['has_phylo_data'].sum()
without_phylo = len(genus_table) - with_phylo

print(f"Total genera: {len(genus_table)}")
print(f"Genera with phylogenetic data: {with_phylo}")
print(f"Genera without phylogenetic data: {without_phylo}")
print(f"Total training samples: {genus_table['training_count'].sum()}")
print(f"Total validation samples: {genus_table['validation_count'].sum()}")
print(f"Total phylogenetic samples: {genus_table['phylo_count'].sum()}")

# The table is sorted by phylo availability before sample count, so the first
# rows are not simply the largest genera; the heading states the real order.
print("\nFirst 20 genera (phylogenetic data first, then by total sample count):")
print(f"{'#':<3} {'Genus':<18} {'Train':<6} {'Val':<5} {'Total':<6} {'Phylo':<6} {'MAE':<8}")
print("-" * 65)

# Number the rows with enumerate so the rank does not depend on the index
for rank, (_, row) in enumerate(genus_table.head(20).iterrows(), start=1):
    mae = f"{row['phylo_mean_abs_error']:.3f}" if pd.notna(row['phylo_mean_abs_error']) else "N/A"
    print(f"{rank:<3} {row['genus']:<18} {row['training_count']:<6} {row['validation_count']:<5} "
          f"{row['total_samples']:<6} {row['phylo_count']:<6} {mae:<8}")

# Show a sample of genera without phylo data
no_phylo_genera = genus_table[~genus_table['has_phylo_data']]
if len(no_phylo_genera) > 0:
    print(f"\nFirst 10 genera without phylogenetic data ({len(no_phylo_genera)} total):")
    print(f"{'Genus':<18} {'Train':<6} {'Val':<5} {'Total':<6}")
    print("-" * 40)
    for _, row in no_phylo_genera.head(10).iterrows():
        print(f"{row['genus']:<18} {row['training_count']:<6} {row['validation_count']:<5} {row['total_samples']:<6}")

print(f"\nTable saved to: {output_file}")
print(f"{'='*70}")

# Last expression of the cell: the notebook displays the saved table
final_table

### Final table for manuscript

In [None]:
# Re-read the saved appendix table and discard any row that has no genus
# (e.g. an empty trailing row)
genus_table = pd.read_csv(
    APPENDIX_PATH / 'genus_appendix_table.csv'
).dropna(subset=['genus'])

# Create LaTeX table
# Replacement text for characters that LaTeX treats as markup in body text.
_LATEX_ESCAPES = {
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
}


def _escape_latex(text):
    """Return *text* with LaTeX special characters escaped.

    Substitution is done character by character, so the backslashes and
    braces introduced by one replacement are never escaped a second time.
    """
    return "".join(_LATEX_ESCAPES.get(ch, ch) for ch in str(text))


def create_latex_table(df):
    """Render the genus appendix table as a LaTeX ``table``/``tabularx`` string.

    Args:
        df: DataFrame with the columns ``genus``, ``training_count``,
            ``validation_count``, ``total_samples``, ``phylo_count`` and
            ``phylo_mean_abs_error``. One table row is emitted per DataFrame
            row, in the given order.

    Returns:
        The complete LaTeX source as a single newline-joined string. The
        output uses ``tabularx`` and the booktabs rules (``\\toprule`` etc.),
        so the including document must load those packages.

    Genus names are italicised and LaTeX-escaped. A ``phylo_count`` that is
    not greater than zero and a missing ``phylo_mean_abs_error`` are both
    rendered as ``---``; MAE values are printed with two decimals.
    """
    latex_lines = []

    # Float environment, caption and label
    latex_lines.append("\\begin{table}[h!]")
    latex_lines.append("\\centering")
    latex_lines.append("\\caption{Genus-level representation across training, validation, and phylogenetic analysis datasets. Training and validation counts show the number of images per genus used for model development. Phylogenetic analysis includes three samples per genus where available, with mean absolute error (MAE) values representing prediction accuracy. Empty MAE values indicate genera not included in the phylogenetic analysis. Genera are ordered by phylogenetic data availability and total sample count.}")
    latex_lines.append("\\label{tab:genus_appendix}")

    # One left-aligned genus column followed by five centred X columns that
    # share the remaining text width
    latex_lines.append("\\begin{tabularx}{\\textwidth}{@{}l*{5}{>{\\centering\\arraybackslash}X}@{}}")
    latex_lines.append("\\toprule")

    # Column headers
    latex_lines.append("Genus & Training & Validation & Total & Phylo & MAE \\\\")
    latex_lines.append("\\midrule")

    # Data rows
    for _, row in df.iterrows():
        # Escape the name so characters such as "_" or "&" cannot break the table
        genus_name = f"\\textit{{{_escape_latex(row['genus'])}}}"
        training = int(row['training_count'])
        validation = int(row['validation_count'])
        total = int(row['total_samples'])
        phylo = int(row['phylo_count']) if row['phylo_count'] > 0 else "---"

        # Two-decimal MAE, or a dash when the genus has no phylogenetic result
        if pd.notna(row['phylo_mean_abs_error']):
            mae = f"{row['phylo_mean_abs_error']:.2f}"
        else:
            mae = "---"

        latex_lines.append(f"{genus_name} & {training} & {validation} & {total} & {phylo} & {mae} \\\\")

    # Table footer
    latex_lines.append("\\bottomrule")
    latex_lines.append("\\end{tabularx}")
    latex_lines.append("\\end{table}")

    return "\n".join(latex_lines)

# Render the appendix table as LaTeX source
latex_table = create_latex_table(genus_table)

# Write the .tex file as UTF-8 explicitly: without an encoding argument the
# platform's locale encoding is used, which can mangle or reject non-ASCII
# characters in genus names on some systems.
with open(APPENDIX_PATH / 'genus_appendix_table.tex', 'w', encoding='utf-8') as f:
    f.write(latex_table)

print("LaTeX table saved to genus_appendix_table.tex")
print("\nFirst few lines of the table:")
# Preview only the first 15 lines of the generated source
print("\n".join(latex_table.split('\n')[:15]))