## Data loading

In [None]:
import sys, types, importlib

# ---- Compatibility shims: make "numpy._core.*" module paths referenced by pickles resolve to "numpy.core.*" ----
def _install_numpy_compat_shims():
    """Make ``numpy._core.*`` module paths resolvable for unpickling.

    Pickles can reference NumPy internals by module path (for example
    ``numpy._core.multiarray``).  When the installed NumPy only provides the
    ``numpy.core`` spelling, unpickling fails with ``ModuleNotFoundError``.
    This function registers aliases in ``sys.modules`` so the ``numpy._core``
    paths resolve to their ``numpy.core`` counterparts.

    A path that the installed NumPy can already import natively is left
    untouched, so a genuine ``numpy._core`` package is never replaced.
    Every step is best-effort: anything missing in this build is skipped.
    """
    import numpy as np

    def _try_import(name):
        # Best-effort import: a missing or unusable module simply yields None.
        try:
            return importlib.import_module(name)
        except Exception:
            return None

    # Decide once whether this NumPy ships its own ``numpy._core`` package.
    # Alias names are only probed when it does: probing them through an
    # aliased parent would load duplicate copies of the numpy.core modules.
    native_core = _try_import("numpy._core")

    # Parent package first, then the submodules pickles commonly reference.
    for suffix in ("", ".numeric", ".multiarray", "._multiarray_umath"):
        alias_name = "numpy._core" + suffix
        if native_core is not None and _try_import(alias_name) is not None:
            continue  # provided natively; do not shadow it
        target = _try_import("numpy.core" + suffix)
        if target is not None:
            sys.modules[alias_name] = target

    # Some old pickles reference 'numpy.core.multiarray.number'; expose
    # np.number under that name when the module does not already have it.
    np_core_multiarray = _try_import("numpy.core.multiarray")
    try:
        if np_core_multiarray is not None and not hasattr(np_core_multiarray, "number"):
            setattr(np_core_multiarray, "number", np.number)
    except Exception:
        # Purely a convenience attribute; never let it break loading.
        pass


_install_numpy_compat_shims()

# ---- Normal loading starts here ----
import pickle
import pandas as pd

# Location of the pickled dataset (placeholder path).
file_path = "/Users/path"

print("Loading the pickle file...")

try:
    # NOTE: unpickling can execute arbitrary code embedded in the file;
    # only load pickles that come from a trusted source.
    # The latin1 encoding is passed explicitly as a safeguard.
    with open(file_path, "rb") as f:
        data = pickle.load(f, encoding="latin1")

    print("‚úÖ Loaded successfully!")
    print(f"Data type: {type(data)}")

    if not isinstance(data, pd.DataFrame):
        # Unknown payload: list mapping keys when available, then show a
        # truncated string form of the object.
        if hasattr(data, "keys"):
            try:
                print("Example keys:", list(data.keys())[:10])
            except Exception:
                pass
        print("Preview:", str(data)[:1000])
    else:
        # DataFrame payload: print its dimensions, leading columns and rows.
        print(f"Shape: {data.shape}")
        print(f"Number of columns: {len(data.columns)}")
        print(f"Column names (first 20): {list(data.columns)[:20]}")
        print("\n--- First 5 rows ---")
        print(data.head())
        print("\n--- DataFrame info ---")
        data.info()

except Exception as load_error:
    # Any failure (opening, unpickling or inspecting) ends up here; the
    # cell reports it and suggests follow-up actions instead of raising.
    print(f"‚ùå Error: {load_error}")
    print("\nüîß Next steps if it still fails:")
    print("A) Retry with dill:  pip install dill  ‚Üí  import dill; dill.load(open(path,'rb'))")
    print("B) Use a temporary venv dedicated to converting pkl ‚Üí CSV/Parquet (keeps your main env clean)")


## Preprocessing

In [None]:
# ---- Step 1: renumber the rows so the index runs 1..N ----
print("=" * 50)
print("Status before reindexing")
print("=" * 50)
print(f"Current index range: {data.index.min()} - {data.index.max()}")
print(f"Actual number of rows: {len(data)}")
print(f"Current index (first 10): {data.index[:10].tolist()}")

# Discard the old labels, then install a consecutive 1-based index.
data_reindexed = data.reset_index(drop=True)
data_reindexed.index = pd.RangeIndex(start=1, stop=len(data_reindexed) + 1)

print("\n" + "=" * 50)
print("Status after reindexing")
print("=" * 50)
print(f"New index range: {data_reindexed.index.min()} - {data_reindexed.index.max()}")
print(f"Number of rows: {len(data_reindexed)}")
print(f"New index (first 10): {data_reindexed.index[:10].tolist()}")

# From here on `data` refers to the reindexed frame.
data = data_reindexed
print("‚úÖ Index has been reset to a consecutive sequence starting from 1")

# ---- Step 2: per-column missing-value report ----
print("\n" + "=" * 50)
print("Detailed missing value inspection")
print("=" * 50)

# Per-column count of missing cells and the same figure as a percentage.
missing_count = data.isna().sum()
missing_percentage = (missing_count / len(data)) * 100

# One summary row per column of `data`.
missing_info = pd.DataFrame({
    'Column name': missing_count.index,
    'Missing count': missing_count.values,
    'Missing rate (%)': missing_percentage.values,
    'Data type': data.dtypes.values
})

# Keep only the columns that actually have gaps, worst first.
missing_columns = missing_info.loc[missing_info['Missing count'] > 0].sort_values(
    by='Missing count', ascending=False
)

if missing_columns.empty:
    print("‚úÖ No missing values detected!")
else:
    print("Columns with missing values:")
    print(missing_columns.to_string(index=False))

    print(f"\nNumber of columns with missing values: {len(missing_columns)}")
    print(f"Number of columns with complete data: {len(data.columns) - len(missing_columns)}")

    # Columns where at least half of the cells are missing.
    print("\nColumns with missing rate ‚â• 50%:")
    high_missing = missing_columns.loc[missing_columns['Missing rate (%)'] >= 50]
    if high_missing.empty:
        print("None")
    else:
        print(high_missing.to_string(index=False))

    # Columns with no usable value at all.
    print("\nCompletely missing columns (100% missing):")
    completely_missing = missing_columns.loc[missing_columns['Missing rate (%)'] == 100]
    if completely_missing.empty:
        print("None")
    else:
        print(completely_missing['Column name'].tolist())

# Whole-table totals (rows x columns).
print("\nOverall missing value statistics:")
print(f"Total number of cells: {data.size:,}")
print(f"Number of missing cells: {missing_count.sum():,}")
print(
    f"Overall missing rate: "
    f"{(missing_count.sum() / data.size) * 100:.2f}%"
)

# ---- Step 3: quick look at the reindexed dataset ----
print("\n" + "=" * 50)
print("Basic dataset information after reindexing")
print("=" * 50)
print(f"Data shape: {data.shape}")
print(f"Index range: {data.index.min()} to {data.index.max()}")
print("\nFirst 3 rows:")
# Trailing expression: rendered as the cell output by the notebook.
data.head(3)


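The inspection above confirms that some columns contain missing values. They are handled as follows:

Year ⇨ Missing values are kept and are reflected in the prompt.

Genus, Author ⇨ Not used when analyzing Species, so their missing values are left as they are.

Species ⇨ Missing values cannot be recovered, so the affected rows are deleted.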

In [None]:
print("=" * 50)
print("Removal of rows with missing Species")
print("=" * 50)

# Snapshot of the dataset before anything is dropped.
print("Dataset status before removal:")
print(f"Total number of records: {len(data):,}")

if 'Species' not in data.columns:
    # Nothing can be cleaned without the column; report what is available.
    print("‚ùå Error: 'Species' column does not exist")
    print(f"Available columns: {data.columns.tolist()}")
else:
    # How many Species cells are missing versus present.
    species_missing = data['Species'].isna().sum()
    species_valid = data['Species'].notna().sum()

    print("Status of the Species column:")
    print(f"  Valid entries: {species_valid:,}")
    print(f"  Missing entries: {species_missing:,}")
    print(f"  Missing rate: {(species_missing / len(data)) * 100:.2f}%")

    if species_missing == 0:
        print("\n‚úÖ No missing values found in the Species column")
    else:
        print(f"\nNumber of rows to be removed: {species_missing:,}")

        # Show a few of the rows that are about to be dropped.
        missing_species_rows = data.loc[data['Species'].isna()]
        print("\nSample of rows to be removed (first 3 rows):")
        print(missing_species_rows.head(3))

        # Keep only rows with a Species value and renumber them 1..N.
        data_cleaned = data.loc[data['Species'].notna()].copy()
        data_cleaned = data_cleaned.reset_index(drop=True)
        data_cleaned.index = pd.RangeIndex(start=1, stop=len(data_cleaned) + 1)

        # `data` still holds the pre-removal frame here, so the
        # before/after comparison below is valid.
        print("\nDataset status after removal:")
        print(f"Total number of records: {len(data_cleaned):,}")
        print(f"Number of removed records: {len(data) - len(data_cleaned):,}")
        print(f"Retention rate: {(len(data_cleaned) / len(data)) * 100:.2f}%")

        # Re-check the Species column of the cleaned frame.
        print("\nVerification of Species column after removal:")
        print(f"  Valid entries: {data_cleaned['Species'].notna().sum():,}")
        print(f"  Missing entries: {data_cleaned['Species'].isna().sum():,}")

        # From here on `data` refers to the cleaned frame.
        data = data_cleaned

        print("\n‚úÖ Removal of rows with missing Species completed")
        print(f"New index range: {data.index.min()} to {data.index.max()}")

        # Preview the cleaned rows.
        print("\nSample of cleaned data (first 3 rows):")
        print(data.head(3))

        # Preview a handful of distinct Species values.
        print("\nExample values from the Species column (first 10 unique values):")
        unique_species = data['Species'].dropna().unique()
        print(unique_species[:10])

print("\n" + "=" * 50)
print("Processing completed")
print("=" * 50)


## Label prediction with LLM

In [None]:
import openai
from openai import OpenAI
import pandas as pd
import re
from tqdm.auto import tqdm
import traceback
import concurrent.futures
import threading
import time
import random
import os

# Register tqdm with pandas; main_processing() below relies on the
# resulting `progress_apply` method when it builds the category flags.
tqdm.pandas()

# API key list (fill in one or more OpenAI API keys before running)
api_key_list = []

# Thread-local storage (each thread keeps its own API key / client)
thread_local = threading.local()

# Round-robin bookkeeping for handing out keys to new threads.
_client_key_lock = threading.Lock()
_client_keys_issued = 0


def get_client():
    """Return this thread's OpenAI client, creating it on first use.

    Each new thread is assigned the next key from ``api_key_list`` in
    round-robin order, so keys are spread evenly across worker threads.
    (Thread identifiers are opaque values and are not used for selection:
    taking them modulo the key count can map many threads to the same key.)

    Returns:
        The ``OpenAI`` client cached for the calling thread.

    Raises:
        RuntimeError: if ``api_key_list`` is empty.
    """
    global _client_keys_issued

    if not hasattr(thread_local, "client"):
        if not api_key_list:
            raise RuntimeError(
                "api_key_list is empty: add at least one OpenAI API key "
                "before calling get_client()"
            )
        # The lock keeps the counter consistent when several worker threads
        # request their first client at the same time.
        with _client_key_lock:
            key_index = _client_keys_issued % len(api_key_list)
            _client_keys_issued += 1
        thread_local.client = OpenAI(api_key=api_key_list[key_index])
    return thread_local.client

def categorize_scientific_name_six(species_name, year, max_retries=3):
    """Ask the LLM to classify one species epithet into the six categories.

    The categories are three kinds of Morphology, People (male/female
    eponym), Geography and Other.

    Args:
        species_name: The species epithet to classify.
        year: Description year; may be missing (None/NaN/empty) or numeric.
        max_retries: Number of API attempts before giving up.

    Returns:
        The model's reply with surrounding whitespace stripped, or a string
        starting with ``"Error"`` when every attempt failed.
    """
    # Normalise the year so the prompt never shows "nan" or "1847.0".
    year_text = "" if year is None else str(year).strip()
    if year is None or pd.isna(year) or year_text.lower() in ("nan", "none", ""):
        year_str = "unknown"
        year_context = "The description year is unknown, so focus on the etymological meaning of the epithet itself."
    else:
        try:
            # Allow a single decimal point so "1847.0" becomes "1847";
            # anything else (e.g. "1.2.3", "c. 1850") is passed through as is.
            is_plain_number = year_text.replace(".", "", 1).isdigit()
            year_str = str(int(float(year_text))) if is_plain_number else year_text
        except (ValueError, OverflowError):
            year_str = year_text
        year_context = f"The species was described in {year_str}, which may influence the etymology and classification."

    # Six-category prompt: People is split into male/female eponyms.
    # The normalised year and its context sentence are appended to the query.
    prompt = (
        "You are an expert in taxonomic nomenclature. Scientific species epithets are derived primarily from Latin or Latinized Greek, "
        "and they often reflect morphological, ecological, geographical, cultural, or personal aspects. Please analyze the following species epithet "
        "(do not include the genus) and assign it to one or more of the following six categories:\n\n"
        "1. Abstract Morphology: Referring to physical appearance in general terms (e.g., small, hairy, sharp, modest).\n"
        "2. Specific Morphology: Referring to concrete, visually verifiable traits such as color, pattern, number, or body parts (e.g., four-banded, yellow-haired, thin-legged).\n"
        "3. Conceptual Morphology: Referring to conceptual or evaluative traits derived from morphology (e.g., unique, devil-like, different).\n"
        "4. People: Dedicated to a person, with the final answer indicating Eponym (male) or Eponym (female). For example, 'darwini' (male) honors Charles Darwin, while 'mariae' (female) honors a woman named Maria; note that '-i' often indicates male, and '-ae' should always indicates female.\n"
        "5. Geography: Referring to a place of origin, region, or type locality (e.g., japonicus = of Japan, tibetensis = of Tibet).\n"
        "6. Other: If the epithet has a meaning but does not clearly fall into any of the above categories (including ecology, behavior, culture, or nonsensical names).\n\n"
        "Internally, perform a detailed chain-of-thought analysis explaining how you arrived at your classification. "
        "However, in your final output, provide only the final answer in the following format (do not include your internal reasoning):\n\n"
        "Format: Abstract_Morphology: [Yes/None], Specific_Morphology: [Yes/None], Conceptual_Morphology: [Yes/None], "
        "People: [Eponym (male)/Eponym (female)/None], Geography: [Yes/None], Other: [Yes/None]\n\n"
        "The species was described in the year, which may influence whether it references an older historical figure, a more modern cultural reference, "
        "or a classical Latin/Greek etymology. Consider this date in your classification.\n\n"
        "Few-shot examples:\n\n"
        "Example 1:\n"
        "Species epithet: \"pusillus\"\n"
        "Year: 1847\n"
        "Chain-of-Thought: \"The epithet 'pusillus' is Latin for 'very small', which clearly refers to a general size-related characteristic. "
        "This is a morphological descriptor of an abstract type (size) rather than a specific or conceptual trait. "
        "It is not related to geography, people, or other categories.\"\n"
        "Final Answer: Abstract_Morphology: Yes, Specific_Morphology: None, Conceptual_Morphology: None, People: None, Geography: None, Other: None\n\n"
        "Example 2:\n"
        "Species epithet: \"flavopilosus\"\n"
        "Year: 1901\n"
        "Chain-of-Thought: \"The epithet combines 'flavo-' (yellow) and 'pilosus' (hairy), referring directly to a visible color and body hair trait. "
        "This is a clear example of specific, visually verifiable morphology. "
        "It does not involve abstract description, conceptual imagery, geography, or dedication to a person.\"\n"
        "Final Answer: Abstract_Morphology: None, Specific_Morphology: Yes, Conceptual_Morphology: None, People: None, Geography: None, Other: None\n\n"
        "Example 3:\n"
        "Species epithet: \"unica\"\n"
        "Year: 1923\n"
        "Chain-of-Thought: \"The epithet 'unica' means 'unique' in Latin. This does not describe a direct visual feature but rather conveys a conceptual evaluation "
        "about the organism's distinctiveness. Thus, it falls into conceptual morphology. "
        "There is no geographical, personal, or cultural association.\"\n"
        "Final Answer: Abstract_Morphology: None, Specific_Morphology: None, Conceptual_Morphology: Yes, People: None, Geography: None, Other: None\n\n"
        "Example 4:\n"
        "Species epithet: \"darwini\"\n"
        "Year: 1845\n"
        "Chain-of-Thought: \"The epithet 'darwini' is formed in the genitive case, honoring Charles Darwin. The '-i' ending indicates a male eponym. "
        "This is a personal dedication, not related to morphology, geography, or cultural references.\"\n"
        "Final Answer: Abstract_Morphology: None, Specific_Morphology: None, Conceptual_Morphology: None, People: Eponym (male), Geography: None, Other: None\n\n"
        "Example 5:\n"
        "Species epithet: \"mariae\"\n"
        "Year: 1903\n"
        "Chain-of-Thought: \"The epithet 'mariae' is also in the genitive case, dedicated to a woman named Maria. The '-ae' ending is a strong indicator that the honoree is female. "
        "Therefore, this is classified as a female eponym.\"\n"
        "Final Answer: Abstract_Morphology: None, Specific_Morphology: None, Conceptual_Morphology: None, People: Eponym (female), Geography: None, Other: None\n\n"
        "Example 6:\n"
        "Species epithet: \"formosensis\"\n"
        "Year: 1935\n"
        "Chain-of-Thought: \"The epithet 'formosensis' refers to Formosa (Taiwan). The suffix '-ensis' is a standard indicator of geographic origin. "
        "It is not morphological, personal, or cultural, but clearly geographical.\"\n"
        "Final Answer: Abstract_Morphology: None, Specific_Morphology: None, Conceptual_Morphology: None, People: None, Geography: Yes, Other: None\n\n"
        "Example 7:\n"
        "Species epithet: \"nocturnus\"\n"
        "Year: 1901\n"
        "Chain-of-Thought: \"The epithet 'nocturnus' means 'active at night', describing the behavior or ecological niche of the species. "
        "It does not correspond to morphology, geography, or people, but instead represents an ecological/behavioral trait. "
        "Thus, it falls under 'Other'.\"\n"
        "Final Answer: Abstract_Morphology: None, Specific_Morphology: None, Conceptual_Morphology: None, People: None, Geography: None, Other: Yes\n\n"
        "Now, please analyze the following species epithet and output only the final answer.\n"
        f"Species epithet: {species_name}\n"
        f"Year: {year_str}\n"
        f"{year_context}\n\n"
        "Final Answer:"
    )

    client = get_client()

    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model="gpt-4.1-mini-2025-04-14",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150,
                temperature=0
            )
            result = response.choices[0].message.content.strip()
            return result
        except Exception as e:
            if attempt < max_retries - 1:
                # Exponential backoff with jitter before the next attempt
                # (1s, 2s, 4s, ... plus up to one extra second).
                wait_time = (2 ** attempt) + random.uniform(0, 1)
                time.sleep(wait_time)
                continue
            else:
                # Last attempt failed: report the error as the result text.
                return f"Error after {max_retries} attempts: {e}"

    # Only reached when max_retries is zero or negative.
    return "Error: Max retries exceeded"

# Matches "<Category>: <value>" anywhere in the reply; the value runs up to
# the next comma or line break.  The look-behind keeps a category name from
# matching inside a longer word.
_SIX_CATEGORY_FIELD_RE = re.compile(
    r"(?<![A-Za-z_])"
    r"(Abstract_Morphology|Specific_Morphology|Conceptual_Morphology|People|Geography|Other)"
    r"\s*:\s*([^,\n]*)"
)


def parse_llm_result_six(llm_str):
    """Parse the LLM reply into 0/1 flags for the six categories.

    People is reported as two flags (``People_Male`` / ``People_Female``).
    A leading ``"Final Answer:"`` prefix, surrounding brackets/quotes and a
    trailing period around a value are tolerated.

    Args:
        llm_str: Raw reply text; any object is accepted and converted with
            ``str()``, so error strings and missing values yield all zeros.

    Returns:
        dict mapping ``Abstract_Morphology``, ``Specific_Morphology``,
        ``Conceptual_Morphology``, ``People_Male``, ``People_Female``,
        ``Geography`` and ``Other`` to 0 or 1.
    """
    categories = {
        "Abstract_Morphology": 0,
        "Specific_Morphology": 0,
        "Conceptual_Morphology": 0,
        "People_Male": 0,
        "People_Female": 0,
        "Geography": 0,
        "Other": 0
    }

    for key, raw_value in _SIX_CATEGORY_FIELD_RE.findall(str(llm_str)):
        # Normalise "[Yes]", "None." and similar decorations.
        value_norm = raw_value.strip().strip("[]\"'.").strip().lower()

        if key == "People":
            # "female" must be tested first: it contains the substring "male".
            if "female" in value_norm:
                categories["People_Female"] = 1
            elif "male" in value_norm:
                categories["People_Male"] = 1
        elif value_norm not in ("", "none", "no"):
            # Morphology / Geography / Other: anything but a negative is a hit.
            categories[key] = 1

    return categories

def process_single_item(args):
    """Classify one ``(idx, species_name, year)`` tuple for the thread pool.

    Returns ``(idx, result_text)``.  Any exception raised by the
    classification call is converted into an ``"Error: ..."`` result so a
    single failing row does not abort the batch.
    """
    row_label, epithet, described_year = args
    try:
        return row_label, categorize_scientific_name_six(epithet, described_year)
    except Exception as exc:
        return row_label, f"Error: {exc}"

def process_batch_parallel(batch_data, batch_id, max_workers=15):
    """Classify every row of ``batch_data`` concurrently.

    Args:
        batch_data: DataFrame slice with ``Species`` and ``Year`` columns.
        batch_id: Batch number, used only in progress messages.
        max_workers: Size of the thread pool.

    Returns:
        List of result strings in the same order as ``batch_data.index``.
    """
    print(f"Starting batch {batch_id} ({len(batch_data)} records)")

    # One (index label, species, year) task per row.
    tasks = []
    for label, row in batch_data.iterrows():
        tasks.append((label, row['Species'], row['Year']))

    outcome_by_label = {}

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Remember which row each future belongs to, for error reporting.
        label_of = {}
        for task in tasks:
            label_of[pool.submit(process_single_item, task)] = task[0]

        finished = tqdm(
            concurrent.futures.as_completed(label_of),
            total=len(label_of),
            desc=f"Batch {batch_id}"
        )
        for done in finished:
            try:
                label, text = done.result(timeout=120)
            except Exception as exc:
                # The worker itself failed; record the error under its row.
                label = label_of[done]
                print(f"Error occurred (idx {label}): {exc}")
                text = f"Error: {exc}"
            outcome_by_label[label] = text

    # Futures complete in arbitrary order; emit results in row order.
    in_row_order = [outcome_by_label[label] for label in batch_data.index]

    print(f"Finished batch {batch_id}")
    return in_row_order

def main_processing():
    """Classify every row of the global ``data`` frame and save the results.

    Rows are processed in batches through ``process_batch_parallel``.  After
    each batch the cumulative results are written to an interim CSV.  When
    all batches are done, the raw LLM answers and one 0/1 flag column per
    category are added to ``data``, the final CSV is written and summary
    statistics are printed.

    Returns:
        The updated ``data`` frame, or ``None`` when a required column
        (``Species`` or ``Year``) is missing.
    """
    raw_column = 'LLM_Scientific_Categories_Final6'
    # Flag keys as returned by parse_llm_result_six, in output-column order.
    flag_keys = (
        "Abstract_Morphology",
        "Specific_Morphology",
        "Conceptual_Morphology",
        "People_Male",
        "People_Female",
        "Geography",
        "Other",
    )

    print("=" * 60)
    print("Starting six-category LLM classification for Animalia scientific epithets")
    print("=" * 60)

    # Data summary
    print(f"Total records to process: {len(data):,}")

    # Both input columns must exist before any API call is made.
    for required in ('Species', 'Year'):
        if required not in data.columns:
            print(f"‚ùå Error: '{required}' column not found")
            print(f"Available columns: {data.columns.tolist()}")
            return

    # Batch settings
    batch_size = 3000  # Rows per batch
    max_workers = 25   # Threads per batch
    total_batches = -(-len(data) // batch_size)  # ceiling division

    print(f"Batch size: {batch_size}")
    print(f"Total batches: {total_batches}")
    print(f"Parallel workers: {max_workers}")

    # Output directory for interim and final CSV files
    save_dir = "/Users/path"
    os.makedirs(save_dir, exist_ok=True)

    # Raw LLM answers for all rows processed so far, in row order
    all_results = []

    batch_starts = range(0, len(data), batch_size)
    for batch_id, i in enumerate(tqdm(batch_starts, desc="Overall progress"), start=1):
        batch_data = data.iloc[i:i+batch_size]

        print(f"\n{'='*40}")
        print(f"Starting batch {batch_id}/{total_batches}")
        print(f"Range: {i:,} - {min(i+batch_size-1, len(data)-1):,}")
        print(f"{'='*40}")

        # Classify this batch in parallel and append its answers.
        batch_results = process_batch_parallel(batch_data, batch_id, max_workers)
        all_results.extend(batch_results)

        # Print the last answer of the batch as a spot check.
        if batch_results:
            last_row = batch_data.iloc[-1]
            last_species = last_row['Species']
            last_year = last_row['Year']
            last_result = batch_results[-1]

            print(f"\nüìù Inference result at record {len(all_results)}:")
            print(f"   Species: {last_species}")
            print(f"   Year:    {last_year}")
            print(f"   Output:  {last_result}")

            # Show the parsed categories unless the answer is an error text.
            if 'Error' not in str(last_result):
                parsed = parse_llm_result_six(last_result)
                display_names = (
                    ("Abstract_Morphology", 'Abstract morphology'),
                    ("Specific_Morphology", 'Specific morphology'),
                    ("Conceptual_Morphology", 'Conceptual morphology'),
                    ("People_Male", 'People (male)'),
                    ("People_Female", 'People (female)'),
                    ("Geography", 'Geography'),
                    ("Other", 'Other'),
                )
                categories = [shown for key, shown in display_names if parsed[key]]

                cat_str = ', '.join(categories) if categories else 'No category'
                print(f"   Categories: {cat_str}")

        # Interim save: all rows processed so far, with their raw answers.
        print(f"\nüìä Interim save: processed {len(all_results):,} records")

        temp_data = data.iloc[:len(all_results)].copy()
        temp_data[raw_column] = all_results

        interim_path = os.path.join(save_dir, f"LLM_interim_6cat_{len(all_results):07d}.csv")
        temp_data.to_csv(interim_path, index=False)
        print(f"üíæ Saved interim file: {interim_path}")

        # Short pause between batches to ease rate limits.
        if batch_id < total_batches:
            print("‚è≥ Waiting briefly to mitigate rate limits...")
            time.sleep(2)

    print(f"\nüéâ Completed all records: {len(all_results):,}")

    # Attach the raw answers to the global frame.
    data[raw_column] = all_results

    # One 0/1 flag column per category, each with its own progress bar.
    print("üè∑Ô∏è  Creating category flags...")

    for key in flag_keys:
        tqdm.pandas(desc=key)
        data[f"LLM_Scientific_{key}"] = data[raw_column].progress_apply(
            lambda x, key=key: parse_llm_result_six(x)[key]
        )

    # Save final results
    final_path = os.path.join(save_dir, "animalia_LLM_classified_final_6categories.csv")
    data.to_csv(final_path, index=False)

    print(f"\nüíæ Saved final results: {final_path}")

    # Summary
    print("\n" + "=" * 60)
    print("üìà Summary")
    print("=" * 60)

    result_columns = ['Species', 'Year', raw_column]
    result_columns.extend(f"LLM_Scientific_{key}" for key in flag_keys)

    print("\nüìã First 10 results:")
    print(data[result_columns].head(10))

    print("\nüìä Category statistics:")
    total_count = len(data)
    stat_labels = (
        "Abstract Morphology:",
        "Specific Morphology:",
        "Conceptual Morphology:",
        "People (Male):",
        "People (Female):",
        "Geography:",
        "Other:",
    )
    for label, key in zip(stat_labels, flag_keys):
        flags = data[f"LLM_Scientific_{key}"]
        # Labels are left-aligned in a 24-character column.
        print(f"{label:<24}{flags.sum():,} ({flags.mean()*100:.1f}%)")

    # Rows whose raw answer contains the word "Error".
    error_count = data[raw_column].str.contains('Error', na=False).sum()
    print(f"\n‚ùå Errors: {error_count:,} ({error_count/total_count*100:.2f}%)")

    print("\n‚úÖ All done!")

    return data

# Convenience runner
def run_classification():
    """Interactive entry point for the six-category classification.

    Prints a short overview, asks for confirmation on stdin and, only when
    the answer is ``y`` (case-insensitive), runs ``main_processing()`` and
    returns its result.  Any other answer cancels and returns ``None``.
    """
    print("üöÄ Starting LLM-based scientific epithet classification for Animalia...")
    print(f"üìä Total records: {len(data):,}")
    print("üìù The last inference result will be printed periodically.")
    print("üìä Six categories: Abstract morphology, Specific morphology, Conceptual morphology, People (male/female), Geography, Other")

    # Ask before starting: the run issues one API request per record.
    answer = input("Start processing? (y/n): ")
    if answer.lower() == 'y':
        return main_processing()

    print("Cancelled.")
    return

# End-of-cell banner: the pipeline is only defined at this point; nothing is
# classified until run_classification() is called.  `data` must already
# exist, because its length is reported here.
print("\nüîß Ready!")
print("To run: call run_classification()")
print(f"Target: {len(data):,} records")
print("üìä Six categories: Abstract morphology, Specific morphology, Conceptual morphology, People (male/female), Geography, Other")


In [None]:
# Start processing (run_classification() asks for a y/n confirmation on
# stdin before it calls main_processing()).
run_classification()