In [1]:
#
# -----------------------------------------------------------------------------
#
#             ATLAS v3: "Explorer" Unsupervised Pipeline Development
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To develop the "Explorer" pipeline, the unsupervised learning component
#       of ATLAS. This pipeline is responsible for processing sequences that
#       were NOT classified by the "Filter" models, discovering novel taxonomic
#       groups, and providing a "best guess" annotation for them.
#
#   METHODOLOGY:
#
#       1.  Simulate Input: Create a sample FASTA file of "unclassified"
#           sequences for development purposes.
#       2.  Sequence Vectorization: Implement the Doc2Vec algorithm to convert
#           raw DNA sequences into meaningful numerical vectors (embeddings).
#           This involves creating a "corpus" of k-mers and training a model.
#       3.  Clustering: Apply the HDBSCAN algorithm to the sequence vectors
#           to group them into clusters of related organisms. HDBSCAN is
#           chosen for its ability to handle noise and find clusters of
#           varying shapes.
#       4.  Interpretation: For each discovered cluster, select a representative
#           sequence and (conceptually) outline how a BLAST search would be
#           used to provide a taxonomic hypothesis.
#
# -----------------------------------------------------------------------------
#

# --- Imports ---
import pandas as pd
import numpy as np
from Bio import SeqIO
from tqdm.auto import tqdm
from pathlib import Path
import sys
from collections import Counter

# Gensim for Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# HDBSCAN for clustering
import hdbscan

# Scikit-learn for helper functions
from sklearn.preprocessing import normalize

# --- Setup Project Path ---
# When this code runs as a script, `__file__` is defined and its directory is
# the anchor; in a notebook kernel the name does not exist (NameError), so the
# current working directory is used instead. Either way the project root is
# the anchor directory's parent.
try:
    _anchor_dir = Path(__file__).parent
except NameError:
    _anchor_dir = Path.cwd()
project_root = _anchor_dir.parent

print(f"Project Root: {project_root}")

# --- Define Directories ---
# Reuse the project's existing layout: raw inputs under data/raw, trained
# artifacts under models/ (created on demand, no error if already present).
RAW_DATA_DIR = project_root.joinpath("data", "raw")
MODELS_DIR = project_root.joinpath("models")
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# --- Verification ---
print("\nEnvironment is set up. Ready to begin Explorer pipeline development.")

Project Root: C:\Users\jampa\Music\atlas

Environment is set up. Ready to begin Explorer pipeline development.


In [4]:
#
# -----------------------------------------------------------------------------
#
#                  STEP 1 (REVISED): SIMULATE THE INPUT DATA
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To create a sample FASTA file that represents the "unclassified"
#       sequences that would be the output of the "Filter" pipelines.
#
#   RATIONALE (UPDATED):
#
#       Based on a more rigorous approach, we will source our "unclassified"
#       sequences from a database that was NOT used to train our most recent
#       (ITS) model. The full SILVA database contains a diverse set of 16S
#       and 18S sequences that are novel from the perspective of the ITS
#       classifier. This avoids data leakage and creates a more realistic
#       development environment for the Explorer pipeline. Note that the
#       code below keeps only the first NUM_SEQUENCES_TO_SIMULATE records
#       of the source file (not a random sample), so the simulated input
#       reflects the file's record order.
#
# -----------------------------------------------------------------------------
#

# --- Configuration ---
SIMULATED_INPUT_PATH = RAW_DATA_DIR / "unclassified_sample_for_explorer.fasta"
# --- FIX: Use the full SILVA database as the source ---
SOURCE_FILE_PATH = RAW_DATA_DIR / "SILVA_138.1_SSURef_NR99_tax_silva.fasta"
NUM_SEQUENCES_TO_SIMULATE = 5000

# --- Main Logic ---
# The existence check acts as a cache: the sample file is only built once.
# Because of that, the file must never be left half-written. The records are
# therefore written to a sibling ".partial" file that is renamed onto the
# final path only after the write has fully succeeded; a failed run leaves no
# file behind, so the next run retries instead of loading a truncated sample.
if not SIMULATED_INPUT_PATH.exists():
    print("Simulating input data for the Explorer pipeline...")
    print(f"  - Source: {SOURCE_FILE_PATH.name}")
    print(f"  - Destination: {SIMULATED_INPUT_PATH.name}")

    simulated_records = []
    partial_output_path = SIMULATED_INPUT_PATH.with_name(SIMULATED_INPUT_PATH.name + ".partial")
    try:
        with open(SOURCE_FILE_PATH, "r") as handle_in:
            records_iterator = SeqIO.parse(handle_in, "fasta")
            # Take the FIRST N records of the file (this is a head slice, not
            # a random sample). Pairing with range(N) stops the loop after N
            # records without parsing an extra one from the large source file.
            for _, record in tqdm(
                zip(range(NUM_SEQUENCES_TO_SIMULATE), records_iterator),
                total=NUM_SEQUENCES_TO_SIMULATE,
                desc="  - Sampling records",
            ):
                simulated_records.append(record)

        # Write the collected records to the temp file, then move it into place
        with open(partial_output_path, "w") as handle_out:
            SeqIO.write(simulated_records, handle_out, "fasta")
        partial_output_path.replace(SIMULATED_INPUT_PATH)

        print(f"\n[SUCCESS] Created simulated input file with {len(simulated_records)} sequences.")

    except FileNotFoundError:
        print(f"\n[ERROR] Source file not found: {SOURCE_FILE_PATH}")
        print("        Please ensure the full SILVA FASTA file exists in `data/raw`.")
    except Exception as e:
        # Best-effort cell: report the problem and let the notebook continue.
        print(f"\n[ERROR] An error occurred: {e}")
    finally:
        # After a successful rename the temp file no longer exists; after a
        # failure this removes whatever was partially written.
        if partial_output_path.exists():
            partial_output_path.unlink()

else:
    print("Simulated input file already exists. No action needed.")
    print(f"  - Location: {SIMULATED_INPUT_PATH}")

# --- Load the sequences into memory for the next steps ---
# Parse the whole simulated FASTA file into a list of records. If the file is
# missing, report it and fall back to an empty list so the name is still
# defined for later cells.
print("\nLoading simulated sequences into memory...")
try:
    unclassified_sequences = [record for record in SeqIO.parse(SIMULATED_INPUT_PATH, "fasta")]
except FileNotFoundError:
    unclassified_sequences = []
    print(f"[ERROR] Could not load sequences. Please check for the file at {SIMULATED_INPUT_PATH}")
else:
    print(f"  - Successfully loaded {len(unclassified_sequences)} sequences.")

Simulating input data for the Explorer pipeline...
  - Source: SILVA_138.1_SSURef_NR99_tax_silva.fasta
  - Destination: unclassified_sample_for_explorer.fasta


  - Sampling records:   0%|          | 0/5000 [00:00<?, ?it/s]


[SUCCESS] Created simulated input file with 5000 sequences.

Loading simulated sequences into memory...
  - Successfully loaded 5000 sequences.


In [5]:
#
# -----------------------------------------------------------------------------
#
#                   STEP 2: SEQUENCE VECTORIZATION (DOC2VEC)
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To convert our list of 5,000 "unclassified" DNA sequences into
#       high-quality numerical vectors using the Doc2Vec algorithm.
#
#   RATIONALE:
#
#       Unlike simple k-mer counting, Doc2Vec learns the contextual
#       relationships between k-mers within a sequence. This produces a
#       dense vector embedding for each sequence where similar sequences are
#       mapped to nearby points in vector space, making them ideal for
#       clustering algorithms.
#
#   WORKFLOW:
#
#       1.  Prepare a "corpus" by converting each DNA sequence into a list of
#           its constituent k-mers (our "words").
#       2.  Tag each document (sequence) with its unique ID.
#       3.  Define and train a Gensim Doc2Vec model on this corpus.
#       4.  Save the trained model for future use.
#       5.  Extract the final 100-dimensional vector for each sequence.
#
# -----------------------------------------------------------------------------
#

# --- Configuration for this phase ---
# Length of each k-mer "word"; a smaller k-mer is good for finding general patterns.
KMER_SIZE = 6
# The dimensionality of our final sequence vectors.
VECTOR_SIZE = 100
# Destination file for the trained Doc2Vec model.
DOC2VEC_MODEL_PATH = MODELS_DIR.joinpath("explorer_doc2vec.model")

# --- 1. Prepare the Corpus for Doc2Vec ---
print("--- Step 2.1: Preparing the Doc2Vec Corpus ---")

def sequence_to_kmers(sequence_str, k):
    """Split a sequence string into its overlapping k-mers.

    A window of width ``k`` slides over ``sequence_str`` one character at a
    time, so a sequence of length ``n >= k`` yields ``n - k + 1`` k-mers in
    left-to-right order.

    Args:
        sequence_str: The sequence as a plain string.
        k: Width of each k-mer; must be a positive integer.

    Returns:
        A list of k-mer substrings. The list is empty when the sequence is
        shorter than ``k``.

    Raises:
        ValueError: If ``k`` is less than 1. Without this check, ``k == 0``
            would return a list of empty strings and a negative ``k`` would
            return arbitrary slices, silently producing meaningless tokens.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    return [sequence_str[start:start + k] for start in range(len(sequence_str) - k + 1)]

# Gensim's Doc2Vec consumes TaggedDocument objects. For every sequence record,
# the document's "words" are the record's k-mers and its single "tag" is the
# record ID, which is later used to look the trained vector up again.
def _to_tagged_document(record):
    """Wrap one sequence record as a TaggedDocument of its k-mers."""
    kmer_words = sequence_to_kmers(str(record.seq), KMER_SIZE)
    return TaggedDocument(words=kmer_words, tags=[record.id])


corpus = list(
    map(_to_tagged_document, tqdm(unclassified_sequences, desc="  - Processing sequences"))
)

print(f"  - Corpus prepared with {len(corpus)} documents.")


# --- 2. Define and Train the Doc2Vec Model ---
print("\n--- Step 2.2: Training the Doc2Vec Model ---")
# All hyperparameters are gathered in one mapping so they can be read (and
# tuned) at a glance before the model is constructed.
# NOTE(review): training uses several worker threads; gensim's documentation
# indicates multi-threaded runs are not exactly repeatable - confirm whether
# bit-for-bit reproducible embeddings are required here.
doc2vec_params = dict(
    vector_size=VECTOR_SIZE,  # size of each sequence embedding
    dm=1,                     # 'distributed memory' (PV-DM) algorithm
    min_count=3,              # ignore rare k-mers to reduce noise
    window=8,                 # context of 8 k-mers on either side
    epochs=40,                # training passes over the corpus
    workers=4,                # use 4 CPU cores for training
)
doc2vec_model = Doc2Vec(**doc2vec_params)

# The vocabulary has to be built from the corpus before training can start.
doc2vec_model.build_vocab(corpus)
vocabulary_size = len(doc2vec_model.wv.key_to_index)
print(f"  - Vocabulary built with {vocabulary_size} unique k-mers.")

# Train using the document count and epoch count the model already holds.
print("  - Starting training (this may take a minute)...")
doc2vec_model.train(
    corpus,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)
print("  - Training complete.")


# --- 3. Save the Trained Model ---
# Persist the trained model to DOC2VEC_MODEL_PATH. A failure is reported but
# does not stop the notebook, because the in-memory model is still usable.
print("\n--- Step 2.3: Saving the Model ---")
model_destination = str(DOC2VEC_MODEL_PATH)
try:
    doc2vec_model.save(model_destination)
except Exception as e:
    print(f"[ERROR] Could not save the model: {e}")
else:
    print(f"  - Model saved successfully to: {DOC2VEC_MODEL_PATH}")


# --- 4. Extract the Final Vectors ---
print("\n--- Step 2.4: Extracting Sequence Vectors ---")
# `doc2vec_model.dv` is indexed by document tag (the sequence ID). Vectors are
# gathered in the same order as `unclassified_sequences`, so row i of the
# matrix belongs to sequence i.
raw_sequence_vectors = np.array(
    [doc2vec_model.dv[record.id] for record in unclassified_sequences]
)

# Rescale with scikit-learn's `normalize` (default settings) before clustering.
sequence_vectors = normalize(raw_sequence_vectors)
print("  - Vectors extracted and normalized.")

# --- Final Verification ---
separator = "=" * 45
print("\n" + separator)
print("    VECTORIZATION COMPLETE")
print(separator)
print(f"  - Final shape of our vector matrix: {sequence_vectors.shape}")
print(f"  - This corresponds to {sequence_vectors.shape[0]} sequences, each with {sequence_vectors.shape[1]} features.")
print(separator)

--- Step 2.1: Preparing the Doc2Vec Corpus ---


  - Processing sequences:   0%|          | 0/5000 [00:00<?, ?it/s]

  - Corpus prepared with 5000 documents.

--- Step 2.2: Training the Doc2Vec Model ---
  - Vocabulary built with 5793 unique k-mers.
  - Starting training (this may take a minute)...
  - Training complete.

--- Step 2.3: Saving the Model ---
  - Model saved successfully to: C:\Users\jampa\Music\atlas\models\explorer_doc2vec.model

--- Step 2.4: Extracting Sequence Vectors ---
  - Vectors extracted and normalized.

    VECTORIZATION COMPLETE
  - Final shape of our vector matrix: (5000, 100)
  - This corresponds to 5000 sequences, each with 100 features.


In [6]:
#
# -----------------------------------------------------------------------------
#
#                      STEP 3: CLUSTERING WITH HDBSCAN
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To apply the HDBSCAN (Hierarchical Density-Based Spatial Clustering
#       of Applications with Noise) algorithm to our sequence vectors to
#       identify clusters of potentially related, novel organisms.
#
#   RATIONALE:
#
#       HDBSCAN is the ideal choice for this biological discovery task. It
#       excels at identifying clusters of varying densities and, crucially,
#       can mark data points as 'noise' if they do not belong to any distinct
#       group. This allows us to separate potentially novel families of
#       organisms from truly unique, singleton sequences.
#
#   WORKFLOW:
#
#       1.  Instantiate the HDBSCAN clusterer with parameters optimized for
#           our data (e.g., a minimum cluster size of 5).
#       2.  Fit the clusterer to our `sequence_vectors` matrix.
#       3.  Analyze and print a summary of the results, including the number
#           of clusters discovered and the number of sequences classified as noise.
#
# -----------------------------------------------------------------------------
#

# --- 1. Instantiate and Fit the HDBSCAN Clusterer ---
print("--- Step 3.1: Performing HDBSCAN Clustering ---")

# Clustering settings, kept together for easy tuning:
#   - min_cluster_size=5: a group needs at least 5 related sequences to count
#     as a distinct cluster, which filters out very small groupings.
#   - min_samples=1: helps find clusters in less dense regions.
#   - metric='euclidean': standard distance metric for vector spaces.
#   - cluster_selection_method='eom': Excess of Mass cluster selection.
hdbscan_params = {
    "min_cluster_size": 5,
    "min_samples": 1,
    "metric": "euclidean",
    "cluster_selection_method": "eom",
}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)

# Fit on the sequence vectors and get one integer label per sequence.
cluster_labels = clusterer.fit_predict(sequence_vectors)
print("  - Clustering complete.")


# --- 2. Analyze the Results ---
print("\n--- Step 3.2: Analyzing Clustering Results ---")

# The cluster labels are an array of integers; -1 marks noise and every other
# value is a cluster ID. Clusters are counted by excluding -1 explicitly
# rather than subtracting 1 from the number of unique labels: the subtraction
# is only right when at least one point is noise, and under-counts by one
# when every sequence was assigned to a cluster.
distinct_labels = np.unique(cluster_labels)
num_clusters = int(np.count_nonzero(distinct_labels != -1))
num_noise_points = np.sum(cluster_labels == -1)

# --- Final Verification ---
print("\n" + "="*45)
print("    CLUSTERING COMPLETE")
print("="*45)
print(f"  - Number of new clusters discovered: {num_clusters}")
print(f"  - Number of sequences labeled as noise: {num_noise_points}")
print(f"  - Total sequences processed: {len(cluster_labels)}")
print("="*45)

# --- Store the results in our main DataFrame for easy access ---
# One row per input sequence, in input order: its ID and the cluster label
# assigned to it (-1 for noise).
sequence_ids = [record.id for record in unclassified_sequences]
df_results = pd.DataFrame({'sequence_id': sequence_ids}).assign(
    cluster_label=cluster_labels
)

print("\n--- ASCII PREVIEW: Clustering Results (First 10 Rows) ---")
display(df_results.head(10))

--- Step 3.1: Performing HDBSCAN Clustering ---




  - Clustering complete.

--- Step 3.2: Analyzing Clustering Results ---

    CLUSTERING COMPLETE
  - Number of new clusters discovered: 2
  - Number of sequences labeled as noise: 4935
  - Total sequences processed: 5000

--- ASCII PREVIEW: Clustering Results (First 10 Rows) ---


Unnamed: 0,sequence_id,cluster_label
0,AY846379.1.1791,-1
1,AB001445.1.1538,-1
2,AY929368.1.1768,-1
3,KM209255.204.1909,-1
4,AY955002.1.1727,-1
5,HL281554.1.1313,-1
6,AB002515.1.1332,-1
7,AB002523.1.1496,-1
8,LF644976.16.1783,-1
9,KY857824.1.1808,-1


In [7]:
#
# -----------------------------------------------------------------------------
#
#                     STEP 4: CLUSTER INTERPRETATION
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To analyze the newly discovered clusters and extract a single,
#       representative sequence from each one. This sequence can then be used
#       to infer the potential taxonomic identity of the entire cluster.
#
#   RATIONALE:
#
#       A cluster is just a group of numbers until we can assign a biological
#       hypothesis to it. By finding the sequence closest to the cluster's
#       center in vector space, we select the member that best represents the
#       group's shared characteristics. This sequence is the ideal candidate
#       for a BLAST search to find its nearest relatives in public databases.
#
#   WORKFLOW:
#
#       1.  Iterate through each unique cluster label found by HDBSCAN (ignoring -1).
#       2.  For each cluster, calculate its vector centroid (the mean vector).
#       3.  Find the sequence within the cluster whose vector is closest to the
#           centroid using Euclidean distance.
#       4.  Print a summary report for each cluster, including its size and the
#           ID and full sequence of its representative member.
#
# -----------------------------------------------------------------------------
#

# --- 1. Find and Analyze Each Cluster ---
print("--- Step 4.1: Analyzing Discovered Clusters and Extracting Representatives ---")

# Real cluster IDs in ascending order (np.unique returns its values sorted),
# with the noise label (-1) left out.
unique_cluster_ids = [label for label in np.unique(cluster_labels) if label != -1]

# Horizontal rules reused by every cluster report.
hash_rule = "#" * 70
dash_rule = "-" * 70

# For each cluster: gather its members, compute the mean vector (centroid),
# pick the member closest to that centroid as the representative, and print
# a short report with the representative's ID and full sequence.
for cluster_id in unique_cluster_ids:
    # Positions, in the full dataset, of the sequences assigned to this cluster
    cluster_indices = np.flatnonzero(cluster_labels == cluster_id)
    cluster_vectors = sequence_vectors[cluster_indices]

    # Centroid = mean of all member vectors
    centroid = cluster_vectors.mean(axis=0)

    # Euclidean distance of every member to the centroid; the smallest wins
    distances = [np.linalg.norm(member_vector - centroid) for member_vector in cluster_vectors]
    representative_index_in_cluster = np.argmin(distances)

    # Translate the within-cluster position back to the full-dataset position
    original_index = cluster_indices[representative_index_in_cluster]
    representative_sequence_record = unclassified_sequences[original_index]

    # --- Print the Report for this Cluster ---
    print("\n" + hash_rule)
    print(f"####### CLUSTER {cluster_id} ANALYSIS")
    print(hash_rule)
    print(f"\n  - Number of sequences in cluster: {cluster_indices.size}")
    print(f"  - Representative Sequence ID: {representative_sequence_record.id}")
    print("\n  - Full Representative Sequence:")
    print("    " + str(representative_sequence_record.seq))
    print("\n" + dash_rule)
    print("  ACTION: To identify this novel group, the above sequence should be")
    print("          submitted to the NCBI BLAST web tool (blastn suite).")
    print("          https://blast.ncbi.nlm.nih.gov/Blast.cgi")
    print(dash_rule)

print("\n\n[SUCCESS] Explorer pipeline development and simulation complete.")

--- Step 4.1: Analyzing Discovered Clusters and Extracting Representatives ---

######################################################################
####### CLUSTER 0 ANALYSIS
######################################################################

  - Number of sequences in cluster: 51
  - Representative Sequence ID: JN639430.1.1262

  - Full Representative Sequence:
    ACGGGUGAGUAACGCGUAGGUAACCUACCUCAUAGCGGGGGAUAACUAUUGGAAACGAUAGCUAAUACCGCAUAAAAGUGUUUAACCCAUGUUAAACAUUUAAAAGGUGCAACUGCAUCACUAUGAGAUGGACCUGCGUUGUAUUAGCUAGUUGGUGAGGUAACGGCUCACCAAGGCGACGAUACAUAGCCGACCUGAGAGGGUGAUCGGCCACACUGGGACUGAGACACGGCCCAGACUCCUACGGGAGGCAGCAGUAGGGAAUCUUCGGCAAUGGACGGAAGUCUGACCGAGCAACGCCGCGUGAGUGAAGAAGGUUUUCGGAUCGUAAAGCUCUGUUGUUAGAGAAGAAUGAUGGUGGGAGUGGAAAAUCCACCAUGUGACGGUAACUAACCAGAAAGGGACGGCUAACUACGUGCCAGCAGCCGCGGUAAUACGUAGGUCCCGAGCGUUGUCCGGAUUUAUUGGGCGUAAAGCGAGCGCAGGCGGUUCUUUAAGUCUGAAGUUAAAGGCAGUGGCUCAACCAUUGUACGCUUUGGAAACUGGAGAACUUGAGUGCAGAAGGGGAGAGUGGAAUUCCAUGUGUAGCGGUGAAAUGCGUAGAUAUAUGGAGGAACACCGGUG