### Preprocess Data

In [1]:
from pathlib import Path

from gtp.dataloading.data_preprocessors import ButterflyPatternizePreprocessor

# Input and output roots handed to the phenotype preprocessor below.
ROOT_PHENOTYPE_INPUT_DIR = Path("/local/scratch/carlyn.1/dna/colors/")
ROOT_PHENOTYPE_OUTPUT_DIR = Path("/local/scratch/carlyn.1/dna/colors/processed")

preprocessor = ButterflyPatternizePreprocessor(
    input_dir=ROOT_PHENOTYPE_INPUT_DIR,
    output_dir=ROOT_PHENOTYPE_OUTPUT_DIR,
)

# Every (species, wing, color) combination, listed in the same order a
# species -> wing -> color nested loop would visit them.
phenotype_combinations = [
    (species, wing, color)
    for species in ("erato", "melpomene")
    for wing in ("forewings", "hindwings")
    for color in ("color_1", "color_2", "color_3", "total")
]

# One shared preprocessor handles each loadings CSV in turn: process it,
# then save the result under a name built from the same three parts.
for species, wing, color in phenotype_combinations:
    suffix_path = f"{species}_{wing}_PCA/PCA_{color}_loadings.csv"
    preprocessor.process(pca_csv_path_suffix=suffix_path)
    preprocessor.save_result(f"{species}_{wing}_{color}")


  """


(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(480, 481)
(188, 189)
(188, 189)
(188, 189)
(188, 189)
(188, 189)
(188, 189)
(188, 189)
(188, 189)
(185, 186)
(185, 186)
(185, 186)
(185, 186)
(185, 186)
(185, 186)
(185, 186)
(185, 186)


In [2]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from pathlib import Path

from gtp.dataloading.data_preprocessors import ButterflyGenePreprocessor

# Root directory of the genotype inputs; process_fn resolves its relative input paths against it.
ROOT_GENOTYPE_INPUT_DIR = Path("/local/scratch/carlyn.1/dna/vcfs/")
# Output root passed to ButterflyGenePreprocessor; the driver loop also checks it for already-saved results.
ROOT_GENOTYPE_OUTPUT_DIR = Path("/local/scratch/carlyn.1/dna/vcfs/processed")

def convert_bytes(num):
    """
    Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 and is then
    rendered with one decimal place and the matching unit, e.g.
    ``1536`` -> ``"1.5 KB"``.

    Args:
        num: Size in bytes (int or float).

    Returns:
        str: The scaled value followed by one of ``bytes``, ``KB``, ``MB``,
        ``GB`` or ``TB``. Sizes of 1024 TB and above are reported in TB
        (e.g. ``"1024.0 TB"``) instead of running past the end of the unit
        list and returning ``None``.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    # Largest unit: format unconditionally so every input yields a string.
    return "%3.1f %s" % (num, units[-1])

def process_fn(pca_csv_path_suffix, save_dir):
    """
    Preprocess one genotype input file and save the result.

    Args:
        pca_csv_path_suffix: Path of the input file relative to
            ``ROOT_GENOTYPE_INPUT_DIR``; forwarded to the preprocessor's
            ``process`` call under the same keyword.
        save_dir: Destination passed to ``preprocessor.save_result``.

    Returns:
        bool: ``True`` once processing and saving have finished.

    Raises:
        OSError: If the input file's size cannot be read (for example
            because the file does not exist).
    """
    size = convert_bytes(os.path.getsize(ROOT_GENOTYPE_INPUT_DIR / pca_csv_path_suffix))
    # `size` already carries its unit (e.g. "1.5 MB"), so the messages do not
    # append a second "bytes" suffix.
    print(f"Processing {pca_csv_path_suffix}: {size}")
    # A new preprocessor is built for every file.
    preprocessor = ButterflyGenePreprocessor(
        input_dir = ROOT_GENOTYPE_INPUT_DIR,
        output_dir = ROOT_GENOTYPE_OUTPUT_DIR
    )
    preprocessor.process(pca_csv_path_suffix=pca_csv_path_suffix)
    preprocessor.save_result(save_dir)
    print(f"Completed {pca_csv_path_suffix}: {size}")
    return True

# Submit one process_fn job per genotype input file that has no saved result
# yet, then wait for the jobs while reporting progress.
futures = []
process_data = []
pool = ThreadPoolExecutor(1)  # one worker: submitted files are processed one at a time

try:
    for species in ["erato", "melpomene"]:
        species_genome_path = Path(f"{species}/genome")
        # NOTE(review): os.walk recurses into subdirectories, but inputs are
        # addressed as species_genome_path / f, so this assumes the genome
        # directory is flat - confirm.
        for root, dirs, files in os.walk(ROOT_GENOTYPE_INPUT_DIR / species_genome_path):
            for f in tqdm(files, desc=f"Processing: {species}"):
                fname = f.split(".")[0]
                # Skip inputs whose result file already exists so the cell can be re-run.
                if os.path.exists(ROOT_GENOTYPE_OUTPUT_DIR / f"genome/{species}/{fname}/ml_ready.npy"):
                    continue # Delete this after
                future = pool.submit(process_fn, species_genome_path / f, f"genome/{species}/{fname}")
                futures.append(future)
                process_data.append([species_genome_path / f, f"genome/{species}/{fname}"])

    # The job count must be passed as `total=`; given positionally it is taken
    # as the iterable and the bar is drawn without a total or percentage.
    with tqdm(total=len(process_data)) as progress_bar:
        progress_bar.set_description(f"Processing Genotype data. Total: {len(process_data)}")

        for future in as_completed(futures):
            # result() re-raises any exception raised inside process_fn.
            success = future.result()
            progress_bar.update(1)
finally:
    # Stop accepting new work without blocking the cell: on success every job
    # is already done; on failure queued jobs keep running in the background.
    pool.shutdown(wait=False)


Processing: erato: 100%|██████████| 195/195 [00:00<00:00, 75437.12it/s]
Processing: melpomene: 100%|██████████| 38/38 [00:00<00:00, 63172.24it/s]
Processing Genotype data. Total: 9: : 9it [22:32, 150.33s/it]
