In [1]:
#
# -----------------------------------------------------------------------------
#
#                           ATLAS v3: ITS Data Preparation (Fungi)
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       Refine the data preparation pipeline for the ITS (Internal Transcribed
#       Spacer) region, the primary barcode for fungi identification, using
#       the UNITE database.
#
#   METHODOLOGY:
#
#       1.  Create a Development Sample: Extract a small, manageable sample
#           from the full UNITE database (`.tgz` archive) to enable rapid,
#           interactive development.
#       2.  Develop a Custom Parser: Create a new taxonomy parser specifically
#           designed for the UNITE database's `k__Fungi;p__Ascomycota;...`
#           header format.
#       3.  Clean and Process: Apply the full data cleaning and feature
#           engineering workflow (k-mer counting, vectorizing, splitting).
#       4.  Save Artifacts: Save the final, model-ready artifacts for the ITS
#           pipeline.
#
# -----------------------------------------------------------------------------
#

# --- Imports ---
import pandas as pd
import numpy as np
from Bio import SeqIO
from tqdm.auto import tqdm
from pathlib import Path
import sys
import tarfile # Required for reading from .tgz archives
import io      # Required for stream handling

# --- Setup Project Path ---
# This ensures that the notebook can find the project's root directory
try:
    project_root = Path(__file__).parent.parent
except NameError:
    project_root = Path.cwd().parent
print(f"Project Root: {project_root}")

# --- Define Core Directories ---
RAW_DATA_DIR = project_root / "data" / "raw"
PROCESSED_DATA_DIR = project_root / "data" / "processed"
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# --- Define ITS Specific File Paths ---

# --- NOTE: Update this filename if you download a different UNITE release ---
FULL_UNITE_PATH = RAW_DATA_DIR / "sh_general_release_19.02.2025.tgz"

# Path to the small sample file we will create for development
SAMPLE_UNITE_PATH = RAW_DATA_DIR / "UNITE_sample_10k.fasta"

# --- Verification Step ---
if FULL_UNITE_PATH.exists():
    print(f"\nSource UNITE database archive found.")
    print(f"  - Location: {FULL_UNITE_PATH}")
else:
    print(f"\n[ERROR] The source UNITE database archive was not found.")
    print(f"  - Expected: {FULL_UNITE_PATH}")
    print("        Please ensure the file is downloaded and correctly named in the 'data/raw' directory.")

Project Root: C:\Users\jampa\Music\atlas

Source UNITE database archive found.
  - Location: C:\Users\jampa\Music\atlas\data\raw\sh_general_release_19.02.2025.tgz


In [2]:
#
# -----------------------------------------------------------------------------
#
#       STEP 1: CREATE A DEVELOPMENT SAMPLE FROM THE UNITE ARCHIVE
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To extract the first 10,000 sequences from the main FASTA file
#       contained within the UNITE `.tgz` archive.
#
#   RATIONALE:
#
#       Working with a smaller sample file (`UNITE_sample_10k.fasta`) allows
#       for rapid, interactive development and debugging of the subsequent
#       parsing and cleaning steps. This script is designed to be run only
#       once; if the sample file is found, this step will be skipped.
#
# -----------------------------------------------------------------------------
#

# --- Configuration ---
SAMPLE_SIZE = 10000

# --- Main Logic ---
# This check prevents us from re-running this process unnecessarily.
if not SAMPLE_UNITE_PATH.exists():
    print(f"Creating a sample of {SAMPLE_SIZE} sequences from the UNITE archive...")
    print("This may take a moment...")

    sample_records = []
    try:
        # Open the .tgz archive for reading
        with tarfile.open(FULL_UNITE_PATH, "r:gz") as tar:
            # Find the main FASTA file within the archive
            fasta_member = None
            for member in tar.getmembers():
                if member.name.endswith('.fasta'):
                    fasta_member = member
                    break

            if fasta_member:
                print(f"  - Found FASTA file in archive: {fasta_member.name}")
                # Extract the file content into an in-memory text stream
                fasta_file = tar.extractfile(fasta_member)
                fasta_stream = io.TextIOWrapper(fasta_file, encoding="utf-8")

                # Parse the stream and collect the sample records with a progress bar
                records_iterator = SeqIO.parse(fasta_stream, "fasta")
                for i, record in tqdm(enumerate(records_iterator), total=SAMPLE_SIZE, desc="  - Sampling records"):
                    if i >= SAMPLE_SIZE:
                        break
                    sample_records.append(record)

                # Write the collected records to our new sample file
                with open(SAMPLE_UNITE_PATH, "w") as handle_out:
                    SeqIO.write(sample_records, handle_out, "fasta")

                print(f"\n[SUCCESS] Created sample file with {len(sample_records)} sequences.")
                print(f"  - Location: {SAMPLE_UNITE_PATH}")
            else:
                print("\n[ERROR] No .fasta file was found inside the .tgz archive.")

    except Exception as e:
        print(f"\n[ERROR] An error occurred while processing the archive: {e}")

else:
    print(f"UNITE sample file already exists. No action needed.")
    print(f"  - Location: {SAMPLE_UNITE_PATH}")

Creating a sample of 10000 sequences from the UNITE archive...
This may take a moment...
  - Found FASTA file in archive: sh_general_release_dynamic_19.02.2025.fasta


  - Sampling records:   0%|          | 0/10000 [00:00<?, ?it/s]


[SUCCESS] Created sample file with 10000 sequences.
  - Location: C:\Users\jampa\Music\atlas\data\raw\UNITE_sample_10k.fasta


In [4]:
#
# -----------------------------------------------------------------------------
#
#           STEP 2 (REVISED): CORRECT AND RE-RUN THE UNITE PARSER
#
# -----------------------------------------------------------------------------
#
#   DIAGNOSIS:
#
#       The previous step produced a DataFrame with all 'None' values for the
#       taxonomic ranks. This indicates the parser logic was incorrect. It was
#       attempting to read the second element of the header, but the actual
#       taxonomy string is the LAST element.
#
#   ACTION:
#
#       We will define a corrected `v2` of the parser that correctly targets
#       the last element of the pipe-separated string and re-process the
#       sample data to create a valid DataFrame.
#
# -----------------------------------------------------------------------------
#

# --- A fresh list to hold our structured data ---
parsed_data = []

# --- Define the CORRECTED (v2) UNITE-specific parsing function ---
def parse_unite_taxonomy_v2(description):
    """
    Parses the UNITE database header format, correctly targeting the LAST
    pipe-separated element for the taxonomy string.
    """
    parsed_ranks = {
        'kingdom': None, 'phylum': None, 'class': None, 'order': None,
        'family': None, 'genus': None, 'species': None
    }
    try:
        # --- FIX: Target the last element, not the second ---
        taxonomy_str = description.split('|')[-1]
        
        ranks = taxonomy_str.split(';')
        for rank_str in ranks:
            parts = rank_str.split('__')
            if len(parts) == 2:
                prefix, name = parts
                if not name: continue
                if   prefix == 'k': parsed_ranks['kingdom'] = name
                elif prefix == 'p': parsed_ranks['phylum'] = name
                elif prefix == 'c': parsed_ranks['class'] = name
                elif prefix == 'o': parsed_ranks['order'] = name
                elif prefix == 'f': parsed_ranks['family'] = name
                elif prefix == 'g': parsed_ranks['genus'] = name
                elif prefix == 's': parsed_ranks['species'] = name
    except IndexError:
        pass
    return parsed_ranks

# --- Apply the CORRECTED parser to our sample data ---
print("Applying corrected UNITE taxonomy parser (v2) to sample data...")
with open(SAMPLE_UNITE_PATH, "r") as handle:
    for record in tqdm(SeqIO.parse(handle, "fasta"), total=SAMPLE_SIZE, desc="  - Parsing headers"):
        taxonomy_dict = parse_unite_taxonomy_v2(record.description)
        taxonomy_dict['id'] = record.id
        taxonomy_dict['sequence'] = str(record.seq)
        parsed_data.append(taxonomy_dict)

# --- Create and Verify the DataFrame ---
df = pd.DataFrame(parsed_data)
print(f"\n[SUCCESS] Re-parsing complete. Created a DataFrame with {len(df)} rows.")
print("\n--- ASCII PREVIEW: First 5 Rows (Corrected) ---")
display(df.head())

#
# -----------------------------------------------------------------------------
#
#                 STEP 3: CLEAN AND FILTER THE DATAFRAME
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To prepare the parsed data for feature engineering by removing unusable
#       rows. We will perform our standard two-step cleaning process.
#
#   WORKFLOW:
#
#       1.  Remove Missing Targets: Drop any rows that do not have a 'genus'
#           label, as they cannot be used for supervised training.
#       2.  Remove Rare Classes: Filter out any genera represented by fewer
#           than 3 sequences to ensure data quality and stable model training.
#
# -----------------------------------------------------------------------------
#

# --- Configuration for this phase ---
TARGET_RANK = 'genus'
MIN_CLASS_MEMBERS = 3

# --- 1. Remove Missing Targets ---
print(f"\n--- Data Cleaning Initiated ---")
initial_rows = len(df)
df_cleaned = df.dropna(subset=[TARGET_RANK]).copy()
rows_after_dropna = len(df_cleaned)
print(f"  - Step 1: Removed {initial_rows - rows_after_dropna} rows with missing '{TARGET_RANK}' labels.")

# --- 2. Remove Rare Classes ---
class_counts = df_cleaned[TARGET_RANK].value_counts()
classes_to_keep = class_counts[class_counts >= MIN_CLASS_MEMBERS].index
df_filtered = df_cleaned[df_cleaned[TARGET_RANK].isin(classes_to_keep)].copy()
rows_after_filter = len(df_filtered)
print(f"  - Step 2: Removed {rows_after_dropna - rows_after_filter} rows for rare genera (less than {MIN_CLASS_MEMBERS} members).")

# --- Final Verification ---
print("-----------------------------------")
print(f"[SUCCESS] Cleaning complete.")
print(f"          Final DataFrame has {len(df_filtered)} sequences ready for feature engineering.")

Applying corrected UNITE taxonomy parser (v2) to sample data...


  - Parsing headers:   0%|          | 0/10000 [00:00<?, ?it/s]


[SUCCESS] Re-parsing complete. Created a DataFrame with 10000 rows.

--- ASCII PREVIEW: First 5 Rows (Corrected) ---


Unnamed: 0,kingdom,phylum,class,order,family,genus,species,id,sequence
0,Fungi,Ascomycota,Dothideomycetes,Abrothallales,Abrothallaceae,Abrothallus,Abrothallus_subhalei,Abrothallus_subhalei|MT153946|SH1227328.10FU|r...,CAACCCTTGCTTACCTACCACGTTGCTTCGGCGGGCCCGGGGCAAG...
1,Fungi,Mucoromycota,Mucoromycetes,Mucorales,Mucoraceae,Mucor,Mucor_inaequisporus,Mucor_inaequisporus|JN206177|SH1227742.10FU|re...,ATCATTAAATAATTTGATAATTAYACAATTATCTAATTTACTGTGA...
2,Fungi,Ascomycota,Saccharomycetes,Saccharomycetales,Saccharomycetales_fam_Incertae_sedis,Candida,Candida_vrieseae,Candida_vrieseae|KY102517|SH1232203.10FU|refs|...,CAGTTAGTTTATGTTCTCTCTGCCTGCGCTTAGTTGCGCGGCGAGG...
3,Fungi,Ascomycota,Eurotiomycetes,Chaetothyriales,Herpotrichiellaceae,Exophiala,Exophiala_lecanii-corni,Exophiala_lecanii-corni|AY857528|SH1233462.10F...,ATCATTAACGAGTTAGGGTCTTTTATAGGCTCGACCTCCCAACCCT...
4,Fungi,Ascomycota,Dothideomycetes,Capnodiales,Johansoniaceae,Johansonia,Johansonia_chapadensis,Johansonia_chapadensis|HQ423449|SH1236832.10FU...,CCGAGTGAGGGTCCTCGTGGCCCAACCTCCAACCCCCTGTGAGACC...



--- Data Cleaning Initiated ---
  - Step 1: Removed 0 rows with missing 'genus' labels.
  - Step 2: Removed 1630 rows for rare genera (less than 3 members).
-----------------------------------
[SUCCESS] Cleaning complete.
          Final DataFrame has 8370 sequences ready for feature engineering.


In [5]:
#
# -----------------------------------------------------------------------------
#
#       STEPS 4-7: FEATURE ENGINEERING, VECTORIZING, AND SAVING
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To execute the final data transformation and persistence steps,
#       converting our cleaned DataFrame into model-ready numerical artifacts.
#
#   WORKFLOW:
#
#       4.  Engineer Features: Calculate k-mer counts for each sequence.
#           A k-mer size of 7 is chosen for the variable ITS region to
#           balance specificity and generality.
#       5.  Vectorize Data: Convert k-mer counts and text labels into
#           numerical matrices (X and y).
#       6.  Split Data: Partition the dataset into training (80%) and
#           testing (20%) sets for model development and evaluation.
#       7.  Save Artifacts: Save all processed data and encoders to disk
#           with unique 'its' filenames.
#
# -----------------------------------------------------------------------------
#

# --- Imports for this final phase ---
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.sparse import save_npz
import pickle

# --- Configuration for this phase ---
KMER_SIZE = 7 # Using a k-mer size of 7 for the variable ITS region
TEST_SPLIT_SIZE = 0.2
RANDOM_STATE = 42
MODELS_DIR = project_root / "models"
MODELS_DIR.mkdir(exist_ok=True)

# --- Step 4: Feature Engineering (K-mer Counting) ---
print(f"--- Step 4: Engineering {KMER_SIZE}-mer features ---")
def get_kmer_counts(sequence, k):
    """A reusable function to calculate k-mer counts for a sequence."""
    counts = Counter()
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i+k]
        if "N" not in kmer.upper():
            counts[kmer] += 1
    return dict(counts)

df_filtered['kmer_counts'] = list(tqdm((get_kmer_counts(seq, KMER_SIZE) for seq in df_filtered['sequence']), total=len(df_filtered), desc="  - Calculating k-mers"))
print("-----------------------------------------")


# --- Step 5: Vectorize Features and Labels ---
print("--- Step 5: Vectorizing data ---")
vectorizer = DictVectorizer(sparse=True)
X = vectorizer.fit_transform(df_filtered['kmer_counts'])
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_filtered[TARGET_RANK])
print(f"  - Feature matrix shape: {X.shape}")
print(f"  - Label vector shape:   {y.shape}")
print("-----------------------------------------")


# --- Step 6: Split Data ---
print("--- Step 6: Splitting data into training/testing sets ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SPLIT_SIZE, random_state=RANDOM_STATE, stratify=y)
print(f"  - Training set shape: {X_train.shape}")
print(f"  - Testing set shape:  {X_test.shape}")
print("-----------------------------------------")


# --- Step 7: Save All Processed Artifacts ---
print("--- Step 7: Saving all ITS artifacts to disk ---")
# Save data matrices with unique 'its' names
save_npz(PROCESSED_DATA_DIR / "X_train_its.npz", X_train)
save_npz(PROCESSED_DATA_DIR / "X_test_its.npz", X_test)
np.save(PROCESSED_DATA_DIR / "y_train_its.npy", y_train)
np.save(PROCESSED_DATA_DIR / "y_test_its.npy", y_test)

# Save encoders with unique 'its' names
with open(MODELS_DIR / "its_genus_vectorizer.pkl", 'wb') as f:
    pickle.dump(vectorizer, f)
with open(MODELS_DIR / "its_genus_label_encoder.pkl", 'wb') as f:
    pickle.dump(label_encoder, f)
print("  - All artifacts saved successfully.")
print("-----------------------------------------")
print("\n[SUCCESS] ITS DATA PREPARATION COMPLETE.")

--- Step 4: Engineering 7-mer features ---


  - Calculating k-mers:   0%|          | 0/8370 [00:00<?, ?it/s]

-----------------------------------------
--- Step 5: Vectorizing data ---
  - Feature matrix shape: (8370, 18837)
  - Label vector shape:   (8370,)
-----------------------------------------
--- Step 6: Splitting data into training/testing sets ---
  - Training set shape: (6696, 18837)
  - Testing set shape:  (1674, 18837)
-----------------------------------------
--- Step 7: Saving all ITS artifacts to disk ---
  - All artifacts saved successfully.
-----------------------------------------

[SUCCESS] ITS DATA PREPARATION COMPLETE.
