In [1]:
#
# -----------------------------------------------------------------------------
#
#             ATLAS v3: "Explorer" Unsupervised Pipeline Development
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To develop the "Explorer" pipeline, the unsupervised learning component
#       of ATLAS. This pipeline is responsible for processing sequences that
#       were NOT classified by the "Filter" models, discovering novel taxonomic
#       groups, and providing a "best guess" annotation for them.
#
#   METHODOLOGY:
#
#       1.  Simulate Input: Create a sample FASTA file of "unclassified"
#           sequences for development purposes.
#       2.  Sequence Vectorization: Implement the Doc2Vec algorithm to convert
#           raw DNA sequences into meaningful numerical vectors (embeddings).
#           This involves creating a "corpus" of k-mers and training a model.
#       3.  Clustering: Apply the HDBSCAN algorithm to the sequence vectors
#           to group them into clusters of related organisms. HDBSCAN is
#           chosen for its ability to handle noise and find clusters of
#           varying shapes.
#       4.  Interpretation: For each discovered cluster, select a representative
#           sequence and (conceptually) outline how a BLAST search would be
#           used to provide a taxonomic hypothesis.
#
# -----------------------------------------------------------------------------
#

# --- Imports ---
import pandas as pd
import numpy as np
from Bio import SeqIO
from tqdm.auto import tqdm
from pathlib import Path
import sys
from collections import Counter

# Gensim for Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# HDBSCAN for clustering
import hdbscan

# Scikit-learn for helper functions
from sklearn.preprocessing import normalize

# --- Resolve the project root ---
# Run as a script: `__file__` is defined and the root is the parent of the
# directory that contains this file.
# Run in a notebook kernel: `__file__` is undefined (NameError), so the root
# is taken to be the parent of the current working directory instead.
try:
    this_file = Path(__file__)
except NameError:
    project_root = Path.cwd().parent
else:
    project_root = this_file.parent.parent

print(f"Project Root: {project_root}")

# --- Directory layout ---
# Both paths hang off the project root. Only the models directory is created
# here (idempotently); the raw-data path is merely computed.
MODELS_DIR = project_root.joinpath("models")
RAW_DATA_DIR = project_root.joinpath("data", "raw")
MODELS_DIR.mkdir(exist_ok=True, parents=True)

# --- Ready message ---
print("\nEnvironment is set up. Ready to begin Explorer pipeline development.")

Project Root: C:\Users\jampa\Music\atlas

Environment is set up. Ready to begin Explorer pipeline development.


In [4]:
#
# -----------------------------------------------------------------------------
#
#                  STEP 1 (REVISED): SIMULATE THE INPUT DATA
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To create a sample FASTA file that represents the "unclassified"
#       sequences that would be the output of the "Filter" pipelines.
#
#   RATIONALE (UPDATED):
#
#       To keep the simulation rigorous, we source our "unclassified"
#       sequences from a database that was NOT used to train our most recent
#       (ITS) model. The full SILVA database provides a diverse set of
#       16S and 18S sequences that are novel from the perspective of the ITS
#       classifier. This avoids data leakage and creates a more realistic
#       development environment for the Explorer pipeline.
#       Note: the sample is the first NUM_SEQUENCES_TO_SIMULATE records in
#       file order, not a random draw, so it may be less diverse than the
#       database as a whole.
#
# -----------------------------------------------------------------------------
#

# --- Configuration ---
# Cached sample that the rest of the Explorer pipeline reads.
SIMULATED_INPUT_PATH = RAW_DATA_DIR / "unclassified_sample_for_explorer.fasta"
# The full SILVA database is used as the source.
SOURCE_FILE_PATH = RAW_DATA_DIR / "SILVA_138.1_SSURef_NR99_tax_silva.fasta"
NUM_SEQUENCES_TO_SIMULATE = 5000

# --- Main Logic ---
# The sample is only built when the cached file is missing, so re-running the
# cell is cheap. Because "the file exists" is the only validity check, the
# sample is written to a temporary sibling first and renamed into place once
# it is complete: an interrupted or failed write can then never leave a
# truncated file behind that later runs would silently reuse.
if not SIMULATED_INPUT_PATH.exists():
    print("Simulating input data for the Explorer pipeline...")
    print(f"  - Source: {SOURCE_FILE_PATH.name}")
    print(f"  - Destination: {SIMULATED_INPUT_PATH.name}")

    partial_output_path = SIMULATED_INPUT_PATH.with_name(SIMULATED_INPUT_PATH.name + ".tmp")
    try:
        with open(SOURCE_FILE_PATH, "r") as handle_in:
            records_iterator = SeqIO.parse(handle_in, "fasta")
            # Pairing with `range` caps the read at the first
            # NUM_SEQUENCES_TO_SIMULATE records (file order, not a random
            # draw) without pulling an extra record from the parser.
            capped_records = zip(range(NUM_SEQUENCES_TO_SIMULATE), records_iterator)
            # tqdm shows progress, as reading the large file can take a moment.
            simulated_records = [
                record
                for _, record in tqdm(capped_records, total=NUM_SEQUENCES_TO_SIMULATE, desc="  - Sampling records")
            ]

        if not simulated_records:
            # An empty cache file would satisfy the `exists()` guard on every
            # later run, so report the problem instead of writing one.
            print(f"\n[ERROR] No sequences could be read from: {SOURCE_FILE_PATH}")
        else:
            # Write the collected records to the temporary file, then move it
            # to its final name only after the write has fully succeeded.
            with open(partial_output_path, "w") as handle_out:
                SeqIO.write(simulated_records, handle_out, "fasta")
            partial_output_path.replace(SIMULATED_INPUT_PATH)

            print(f"\n[SUCCESS] Created simulated input file with {len(simulated_records)} sequences.")

    except FileNotFoundError:
        print(f"\n[ERROR] Source file not found: {SOURCE_FILE_PATH}")
        print("        Please ensure the full SILVA FASTA file exists in `data/raw`.")
    except Exception as e:
        # Deliberate best-effort: report the problem and let the load step
        # below fall back to an empty list rather than aborting the notebook.
        print(f"\n[ERROR] An error occurred: {e}")
    finally:
        # Runs on failure and on kernel interrupt as well; after a successful
        # rename the temporary file no longer exists and this is a no-op.
        if partial_output_path.exists():
            partial_output_path.unlink()

else:
    print("Simulated input file already exists. No action needed.")
    print(f"  - Location: {SIMULATED_INPUT_PATH}")

# --- Load the sequences into memory for the next steps ---
# `unclassified_sequences` is always defined after this cell: it is the list
# of parsed records, or an empty list when the sample file is missing.
print("\nLoading simulated sequences into memory...")
try:
    unclassified_sequences = list(SeqIO.parse(SIMULATED_INPUT_PATH, "fasta"))
    print(f"  - Successfully loaded {len(unclassified_sequences)} sequences.")
except FileNotFoundError:
    print(f"[ERROR] Could not load sequences. Please check for the file at {SIMULATED_INPUT_PATH}")
    unclassified_sequences = []

Simulating input data for the Explorer pipeline...
  - Source: SILVA_138.1_SSURef_NR99_tax_silva.fasta
  - Destination: unclassified_sample_for_explorer.fasta


  - Sampling records:   0%|          | 0/5000 [00:00<?, ?it/s]


[SUCCESS] Created simulated input file with 5000 sequences.

Loading simulated sequences into memory...
  - Successfully loaded 5000 sequences.
