# In [1]:
#
# -----------------------------------------------------------------------------
#
#             ATLAS v3: "Explorer" Unsupervised Pipeline Development
#
# -----------------------------------------------------------------------------
#
#   OBJECTIVE:
#
#       To develop the "Explorer" pipeline, the unsupervised learning component
#       of ATLAS. This pipeline is responsible for processing sequences that
#       were NOT classified by the "Filter" models, discovering novel taxonomic
#       groups, and providing a "best guess" annotation for them.
#
#   METHODOLOGY:
#
#       1.  Simulate Input: Create a sample FASTA file of "unclassified"
#           sequences for development purposes.
#       2.  Sequence Vectorization: Implement the Doc2Vec algorithm to convert
#           raw DNA sequences into meaningful numerical vectors (embeddings).
#           This involves creating a "corpus" of k-mers and training a model.
#       3.  Clustering: Apply the HDBSCAN algorithm to the sequence vectors
#           to group them into clusters of related organisms. HDBSCAN is
#           chosen for its ability to handle noise and find clusters of
#           varying shapes.
#       4.  Interpretation: For each discovered cluster, select a representative
#           sequence and (conceptually) outline how a BLAST search would be
#           used to provide a taxonomic hypothesis.
#
# -----------------------------------------------------------------------------
#

# --- Imports ---
import pandas as pd
import numpy as np
from Bio import SeqIO
from tqdm.auto import tqdm
from pathlib import Path
import sys
from collections import Counter

# Gensim for Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# HDBSCAN for clustering
import hdbscan

# Scikit-learn for helper functions
from sklearn.preprocessing import normalize

# --- Setup Project Path ---
# When this code runs as a script, the project root is the parent of the
# directory that contains this file. In an interactive session (such as a
# notebook) `__file__` is not defined, so the parent of the current working
# directory is used instead.
try:
    # resolve() makes the path absolute before walking up. With a relative
    # `__file__` such as "script.py", `.parent.parent` would otherwise
    # collapse to "." and point at the wrong directory.
    project_root = Path(__file__).resolve().parent.parent
except NameError:
    project_root = Path.cwd().parent

print(f"Project Root: {project_root}")

# --- Define Directories ---
# We will use the existing directory structure
# RAW_DATA_DIR is only defined here; it is not created by this cell.
RAW_DATA_DIR = project_root / "data" / "raw"
MODELS_DIR = project_root / "models"
# Create the models directory (and any missing parents); a no-op if it exists.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# --- Verification ---
print("\nEnvironment is set up. Ready to begin Explorer pipeline development.")

# --- Output ---
# Project Root: C:\Users\jampa\Music\atlas
#
# Environment is set up. Ready to begin Explorer pipeline development.
