### Preprocess Data

In [7]:
from pathlib import Path

from gtp.dataloading.data_preprocessors import ButterflyPatternizePreprocessor

ROOT_PHENOTYPE_INPUT_DIR = Path("/local/scratch/carlyn.1/dna/colors/")
ROOT_PHENOTYPE_OUTPUT_DIR = Path("/local/scratch/carlyn.1/dna/colors/processed")

preprocessor = ButterflyPatternizePreprocessor(
    input_dir = ROOT_PHENOTYPE_INPUT_DIR,
    output_dir = ROOT_PHENOTYPE_OUTPUT_DIR
)

for species in ["erato", "melpomene"]:
    for wing in ["forewings", "hindwings"]:
        for color in ["color_1", "color_2", "color_3", "total"]:
            suffix_path = f"{species}_{wing}_PCA/PCA_{color}_loadings.csv"
            preprocessor.process(pca_csv_path_suffix=suffix_path)
            preprocessor.save_result(f"{species}_{wing}_{color}")


In [1]:
import os
from pathlib import Path
from tqdm import tqdm

from gtp.dataloading.data_preprocessors import ButterflyGenePreprocessor

ROOT_GENOTYPE_INPUT_DIR = Path("/local/scratch/carlyn.1/dna/vcfs/")
ROOT_GENOTYPE_OUTPUT_DIR = Path("/local/scratch/carlyn.1/dna/vcfs/processed")

preprocessor = ButterflyGenePreprocessor(
    input_dir = ROOT_GENOTYPE_INPUT_DIR,
    output_dir = ROOT_GENOTYPE_OUTPUT_DIR
)
for species in ["erato", "melpomene"]:
    species_genome_path = Path(f"{species}/genome")
    for root, dirs, files in os.walk(ROOT_GENOTYPE_INPUT_DIR / species_genome_path):
        for i, f in enumerate(tqdm(files, desc=f"Processing: {species}")):
            if species == "erato" and i < 9:
                print("continuing")
                continue # Do this for now. Delete after
            fname = f.split(".")[0]
            preprocessor.process(pca_csv_path_suffix=species_genome_path / f)
            preprocessor.save_result(f"genome/{species}/{fname}")






  """
Processing: erato:   0%|          | 0/195 [00:00<?, ?it/s]

continuing
continuing
continuing
continuing
continuing
continuing
continuing
continuing
continuing


Processing: erato:   9%|▉         | 18/195 [10:27<55:40, 18.87s/it]  

In [1]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from pathlib import Path

from gtp.dataloading.data_preprocessors import ButterflyGenePreprocessor

ROOT_GENOTYPE_INPUT_DIR = Path("/local/scratch/carlyn.1/dna/vcfs/")
ROOT_GENOTYPE_OUTPUT_DIR = Path("/local/scratch/carlyn.1/dna/vcfs/processed")

def process_fn(pca_csv_path_suffix, save_dir):
    preprocessor = ButterflyGenePreprocessor(
        input_dir = ROOT_GENOTYPE_INPUT_DIR,
        output_dir = ROOT_GENOTYPE_OUTPUT_DIR
    )
    preprocessor.process(pca_csv_path_suffix=pca_csv_path_suffix)
    preprocessor.save_result(save_dir)
    return True

futures = []
process_data = []
pool = ThreadPoolExecutor(16)

for species in ["melpomene"]:
    species_genome_path = Path(f"{species}/genome")
    for root, dirs, files in os.walk(ROOT_GENOTYPE_INPUT_DIR / species_genome_path):
        for i, f in enumerate(tqdm(files, desc=f"Processing: {species}")):
            fname = f.split(".")[0]
            if os.path.exists(ROOT_GENOTYPE_OUTPUT_DIR / f"genome/{species}/{fname}/ml_ready.npy"): 
                print("Continue")
                continue # Delete this after
            future = pool.submit(process_fn, species_genome_path / f, f"genome/{species}/{fname}")
            futures.append(future)
            process_data.append([species_genome_path / f, f"genome/{species}/{fname}"])
            
progress_bar = tqdm(len(process_data))
progress_bar.set_description("Processing Genotype data")

for future in as_completed(futures):
    success = future.result()
    progress_bar.update(1)
    progress_bar.set_postfix(completed=f":)")

  """
  from .autonotebook import tqdm as notebook_tqdm
Processing: melpomene:   0%|          | 0/38 [00:00<?, ?it/s]


NameError: name 'fname' is not defined