In [1]:
# =============================================================================
# ATLAS - ITS PIPELINE - SETUP AND PATH DEFINITIONS
# =============================================================================
#
# OBJECTIVE:
#   To configure the notebook environment for the ITS pipeline, define all
#   necessary file paths, and verify that the source data is accessible.
#
# =============================================================================

# --- Imports ---
import pandas as pd
import numpy as np
from Bio import SeqIO
from tqdm.auto import tqdm
from pathlib import Path
import sys
import tarfile # Required for reading from .tgz archives
import io      # Required for stream handling

# --- Setup Project Path ---
# The project root is taken to be the parent of the current working directory.
# NOTE(review): presumably the notebook is launched from a folder one level
# below the project root (e.g. a notebooks/ directory) — confirm.
project_root = Path.cwd().parent
print(f"Project Root: {project_root}")

# --- Define Core Directories ---
RAW_DATA_DIR = project_root / "data" / "raw"
PROCESSED_DATA_DIR = project_root / "data" / "processed"
# Create the output directory up front; this is a no-op when it already exists.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# --- Define ITS Specific File Paths ---

# NOTE: The exact filename for the UNITE release may change.
# Update the filename below to match your downloaded file.
FULL_UNITE_PATH = RAW_DATA_DIR / "sh_general_release_dynamic_29.11.2022.tgz"

# Path to the small sample file we will create for development
SAMPLE_UNITE_PATH = RAW_DATA_DIR / "UNITE_sample_10k.fasta"

# --- Verification Step ---
# Use is_file() rather than exists(): a directory that happens to carry the
# archive's name (e.g. an extracted folder) would satisfy exists() but cannot
# be opened as an archive, so it must be reported as missing.
if FULL_UNITE_PATH.is_file():
    print("\nSource UNITE database archive found.")
    print(f"  - Location: {FULL_UNITE_PATH}")
else:
    print("\nERROR: The source UNITE database archive was not found.")
    print(f"  - Expected: {FULL_UNITE_PATH}")
    print("Please ensure the file is downloaded and correctly named in the 'data/raw' directory.")

Project Root: C:\Users\jampa\Music\atlas

ERROR: The source UNITE database archive was not found.
  - Expected: C:\Users\jampa\Music\atlas\data\raw\sh_general_release_dynamic_29.11.2022.tgz
Please ensure the file is downloaded and correctly named in the 'data/raw' directory.
