In [1]:
#
# -----------------------------------------------------------------------------
#
#                           ATLAS v3: ITS Data Preparation (Fungi)
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       Refine the data preparation pipeline for the ITS (Internal Transcribed
#       Spacer) region, the primary barcode for fungi identification, using
#       the UNITE database.
#
#   METHODOLOGY:
#
#       1.  Create a Development Sample: Extract a small, manageable sample
#           from the full UNITE database (`.tgz` archive) to enable rapid,
#           interactive development.
#       2.  Develop a Custom Parser: Create a new taxonomy parser specifically
#           designed for the UNITE database's `k__Fungi;p__Ascomycota;...`
#           header format.
#       3.  Clean and Process: Apply the full data cleaning and feature
#           engineering workflow (k-mer counting, vectorizing, splitting).
#       4.  Save Artifacts: Save the final, model-ready artifacts for the ITS
#           pipeline.
#
# -----------------------------------------------------------------------------
#

# --- Imports ---
import pandas as pd
import numpy as np
from Bio import SeqIO
from tqdm.auto import tqdm
from pathlib import Path
import sys
import tarfile # Required for reading from .tgz archives
import io      # Required for stream handling

# --- Setup Project Path ---
# This ensures that the notebook can find the project's root directory
try:
    project_root = Path(__file__).parent.parent
except NameError:
    project_root = Path.cwd().parent
print(f"Project Root: {project_root}")

# --- Define Core Directories ---
RAW_DATA_DIR = project_root / "data" / "raw"
PROCESSED_DATA_DIR = project_root / "data" / "processed"
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# --- Define ITS Specific File Paths ---

# --- NOTE: Update this filename if you download a different UNITE release ---
FULL_UNITE_PATH = RAW_DATA_DIR / "sh_general_release_19.02.2025.tgz"

# Path to the small sample file we will create for development
SAMPLE_UNITE_PATH = RAW_DATA_DIR / "UNITE_sample_10k.fasta"

# --- Verification Step ---
if FULL_UNITE_PATH.exists():
    print(f"\nSource UNITE database archive found.")
    print(f"  - Location: {FULL_UNITE_PATH}")
else:
    print(f"\n[ERROR] The source UNITE database archive was not found.")
    print(f"  - Expected: {FULL_UNITE_PATH}")
    print("        Please ensure the file is downloaded and correctly named in the 'data/raw' directory.")

Project Root: C:\Users\jampa\Music\atlas

Source UNITE database archive found.
  - Location: C:\Users\jampa\Music\atlas\data\raw\sh_general_release_19.02.2025.tgz


In [2]:
#
# -----------------------------------------------------------------------------
#
#       STEP 1: CREATE A DEVELOPMENT SAMPLE FROM THE UNITE ARCHIVE
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To extract the first 10,000 sequences from the main FASTA file
#       contained within the UNITE `.tgz` archive.
#
#   RATIONALE:
#
#       Working with a smaller sample file (`UNITE_sample_10k.fasta`) allows
#       for rapid, interactive development and debugging of the subsequent
#       parsing and cleaning steps. This script is designed to be run only
#       once; if the sample file is found, this step will be skipped.
#
# -----------------------------------------------------------------------------
#

# --- Configuration ---
SAMPLE_SIZE = 10000

# --- Main Logic ---
# This check prevents us from re-running this process unnecessarily.
if not SAMPLE_UNITE_PATH.exists():
    print(f"Creating a sample of {SAMPLE_SIZE} sequences from the UNITE archive...")
    print("This may take a moment...")

    sample_records = []
    try:
        # Open the .tgz archive for reading
        with tarfile.open(FULL_UNITE_PATH, "r:gz") as tar:
            # Find the main FASTA file within the archive
            fasta_member = None
            for member in tar.getmembers():
                if member.name.endswith('.fasta'):
                    fasta_member = member
                    break

            if fasta_member:
                print(f"  - Found FASTA file in archive: {fasta_member.name}")
                # Extract the file content into an in-memory text stream
                fasta_file = tar.extractfile(fasta_member)
                fasta_stream = io.TextIOWrapper(fasta_file, encoding="utf-8")

                # Parse the stream and collect the sample records with a progress bar
                records_iterator = SeqIO.parse(fasta_stream, "fasta")
                for i, record in tqdm(enumerate(records_iterator), total=SAMPLE_SIZE, desc="  - Sampling records"):
                    if i >= SAMPLE_SIZE:
                        break
                    sample_records.append(record)

                # Write the collected records to our new sample file
                with open(SAMPLE_UNITE_PATH, "w") as handle_out:
                    SeqIO.write(sample_records, handle_out, "fasta")

                print(f"\n[SUCCESS] Created sample file with {len(sample_records)} sequences.")
                print(f"  - Location: {SAMPLE_UNITE_PATH}")
            else:
                print("\n[ERROR] No .fasta file was found inside the .tgz archive.")

    except Exception as e:
        print(f"\n[ERROR] An error occurred while processing the archive: {e}")

else:
    print(f"UNITE sample file already exists. No action needed.")
    print(f"  - Location: {SAMPLE_UNITE_PATH}")

Creating a sample of 10000 sequences from the UNITE archive...
This may take a moment...
  - Found FASTA file in archive: sh_general_release_dynamic_19.02.2025.fasta


  - Sampling records:   0%|          | 0/10000 [00:00<?, ?it/s]


[SUCCESS] Created sample file with 10000 sequences.
  - Location: C:\Users\jampa\Music\atlas\data\raw\UNITE_sample_10k.fasta


In [3]:
#
# -----------------------------------------------------------------------------
#
#               STEP 2: DEVELOP A CUSTOM PARSER FOR UNITE TAXONOMY
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To create and validate a Python function that can accurately parse
#       the unique FASTA header format used by the UNITE database.
#
#   RATIONALE:
#
#       The UNITE database uses a format like `k__Fungi;p__Ascomycota;...`
#       which is distinct from both SILVA and BOLD. A dedicated parser is
#       required to correctly map the prefixes (k, p, c, o, f, g, s) to their
#       corresponding taxonomic ranks (kingdom, phylum, class, etc.).
#
# -----------------------------------------------------------------------------
#

# --- A list to hold our structured data ---
parsed_data = []

# --- Define the new, UNITE-specific parsing function ---
def parse_unite_taxonomy(description):
    """
    Parses the UNITE database header format (e.g., "ACCESSION|k__Fungi;p__...").
    """
    # Initialize a dictionary with all ranks set to None
    parsed_ranks = {
        'kingdom': None, 'phylum': None, 'class': None, 'order': None,
        'family': None, 'genus': None, 'species': None
    }
    
    try:
        # The taxonomy is the second part of the pipe-separated header
        taxonomy_str = description.split('|')[1]
        
        # Split the taxonomy string into individual rank components
        ranks = taxonomy_str.split(';')
        
        for rank_str in ranks:
            # Each component is like "k__Fungi". Split by the double underscore.
            parts = rank_str.split('__')
            if len(parts) == 2:
                prefix, name = parts
                if not name: continue # Skip if the name is empty

                # Map the prefix to the correct rank
                if   prefix == 'k': parsed_ranks['kingdom'] = name
                elif prefix == 'p': parsed_ranks['phylum'] = name
                elif prefix == 'c': parsed_ranks['class'] = name
                elif prefix == 'o': parsed_ranks['order'] = name
                elif prefix == 'f': parsed_ranks['family'] = name
                elif prefix == 'g': parsed_ranks['genus'] = name
                elif prefix == 's': parsed_ranks['species'] = name
    except IndexError:
        # This will catch headers that don't have the expected '|' separator
        pass
        
    return parsed_ranks

# --- Apply the new parser to our sample data ---
print("Applying UNITE taxonomy parser to sample data...")

# Loop through the records in our sample file with a progress bar
with open(SAMPLE_UNITE_PATH, "r") as handle:
    for record in tqdm(SeqIO.parse(handle, "fasta"), total=SAMPLE_SIZE, desc="  - Parsing headers"):
        # Parse the description line using our new function
        taxonomy_dict = parse_unite_taxonomy(record.description)
        
        # Store the essential sequence information
        taxonomy_dict['id'] = record.id
        taxonomy_dict['sequence'] = str(record.seq)
        
        parsed_data.append(taxonomy_dict)

# --- Create and Verify the DataFrame ---
# Convert the list of dictionaries into a pandas DataFrame
df = pd.DataFrame(parsed_data)

print(f"\n[SUCCESS] Parsing complete. Created a DataFrame with {len(df)} rows.")

# Display the first 5 rows for a preliminary check
print("\n--- ASCII PREVIEW: First 5 Rows ---")
display(df.head())

# Display 5 random rows to check for consistency across the dataset
print("\n--- ASCII PREVIEW: 5 Random Rows ---")
display(df.sample(5))

Applying UNITE taxonomy parser to sample data...


  - Parsing headers:   0%|          | 0/10000 [00:00<?, ?it/s]


[SUCCESS] Parsing complete. Created a DataFrame with 10000 rows.

--- ASCII PREVIEW: First 5 Rows ---


Unnamed: 0,kingdom,phylum,class,order,family,genus,species,id,sequence
0,,,,,,,,Abrothallus_subhalei|MT153946|SH1227328.10FU|r...,CAACCCTTGCTTACCTACCACGTTGCTTCGGCGGGCCCGGGGCAAG...
1,,,,,,,,Mucor_inaequisporus|JN206177|SH1227742.10FU|re...,ATCATTAAATAATTTGATAATTAYACAATTATCTAATTTACTGTGA...
2,,,,,,,,Candida_vrieseae|KY102517|SH1232203.10FU|refs|...,CAGTTAGTTTATGTTCTCTCTGCCTGCGCTTAGTTGCGCGGCGAGG...
3,,,,,,,,Exophiala_lecanii-corni|AY857528|SH1233462.10F...,ATCATTAACGAGTTAGGGTCTTTTATAGGCTCGACCTCCCAACCCT...
4,,,,,,,,Johansonia_chapadensis|HQ423449|SH1236832.10FU...,CCGAGTGAGGGTCCTCGTGGCCCAACCTCCAACCCCCTGTGAGACC...



--- ASCII PREVIEW: 5 Random Rows ---


Unnamed: 0,kingdom,phylum,class,order,family,genus,species,id,sequence
7201,,,,,,,,Coprotus_sp|UDB04190379|SH1077527.10FU|reps|k_...,CCAAATACTGCTTGCTTGGATGGACTTGTCTGTTCTGCAAACAAAA...
2705,,,,,,,,Preussia_sp|UDB05236033|SH1341051.10FU|reps|k_...,TCGTGGGGCTTCGGCCCTATCGAGATAGAACCCTTGCCTTTATAGT...
1284,,,,,,,,Usnea_pygmoidea|MW741877|SH1199509.10FU|reps|k...,CCGAGAGAGGGGCCTCGCGCTCCCGGGGGTTTCGGCCTCCACCTCT...
7742,,,,,,,,Pyronemataceae_sp|UDB05845757|SH1089705.10FU|r...,ACATGTTGGAGTGGCCTTCGGGTCGCGACCTCCATAAACCCACCTC...
3402,,,,,,,,Polyplosphaeria_sp|OR427324|SH1242162.10FU|rep...,CCGTGGGGGCCTCGTGGCCCCTACCGAGATAGCACCCTCTGTCTTC...
