### Preprocess Data

In [1]:
from pathlib import Path

from gtp.dataloading.data_preprocessors import ButterflyPatternizePreprocessor

# Locations of the raw phenotype (wing colour pattern) PCA files and of the
# directory handed to the preprocessor as its output location.
ROOT_PHENOTYPE_INPUT_DIR = Path("/local/scratch/david/geno-pheno-data/colors")
ROOT_PHENOTYPE_OUTPUT_DIR = Path("/local/scratch/david/geno-pheno-data/colors/processed")

preprocessor = ButterflyPatternizePreprocessor(
    input_dir=ROOT_PHENOTYPE_INPUT_DIR,
    output_dir=ROOT_PHENOTYPE_OUTPUT_DIR,
)

# Enumerate every (species, wing, color) combination up front; the ordering
# matches a species -> wing -> color nested loop.
pca_jobs = [
    (species, wing, color)
    for species in ("erato", "melpomene")
    for wing in ("forewings", "hindwings")
    for color in ("color_1", "color_2", "color_3", "total")
]

for species, wing, color in pca_jobs:
    # One PCA loadings CSV per combination, saved under a matching name.
    suffix_path = f"{species}_{wing}_PCA/PCA_{color}_loadings.csv"
    result_name = f"{species}_{wing}_{color}"
    preprocessor.process(pca_csv_path_suffix=suffix_path)
    preprocessor.save_result(result_name)


In [12]:
import os
from enum import Enum
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from pathlib import Path

from gtp.dataloading.data_preprocessors import ButterflyGenePreprocessor

class DNA_SCOPE(Enum):
    """Granularity at which genotype data can be selected for preprocessing.

    Each member's value is a string. The value chosen via THIS_DNA_SCOPE is
    used as a directory-name component when building the genotype input and
    output paths in this notebook.
    """
    # NOTE(review): only GENE is selected in this notebook; the other two
    # values presumably name sibling directories on disk -- confirm.
    GENOME = "genome"
    CHROMOSOME = "chromosomes"
    GENE = "genes"
    
# Scope string used for this run; it appears in both the input sub-directory
# ("<species>/<scope>") and the output sub-directory ("<scope>/<species>/<name>").
THIS_DNA_SCOPE = DNA_SCOPE.GENE.value
# Root directory of the raw genotype files and the directory handed to the
# gene preprocessor as its output location.
ROOT_GENOTYPE_INPUT_DIR = Path("/local/scratch/david/geno-pheno-data/dna/")
ROOT_GENOTYPE_OUTPUT_DIR = Path("/local/scratch/david/geno-pheno-data/dna/processed")


def convert_bytes(num):
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 or the largest
    supported unit (TB) is reached, then rendered with one decimal place.

    Args:
        num: Size in bytes (int or float).

    Returns:
        A string such as "366.1 MB". Values of 1024 TB or more are still
        reported in TB (e.g. "1024.0 TB") instead of falling through the
        loop and returning None.
    """
    units = ['bytes', 'KB', 'MB', 'GB', 'TB']
    # Every unit except the last can be "outgrown"; the last one is the fallback.
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    return "%3.1f %s" % (num, units[-1])

def process_fn(pca_csv_path_suffix, save_dir):
    """Preprocess one genotype file and save the result.

    A fresh ButterflyGenePreprocessor is created for every call, so the
    function can be submitted to an executor without sharing a preprocessor
    between jobs.

    Args:
        pca_csv_path_suffix: Path of the input file relative to
            ROOT_GENOTYPE_INPUT_DIR. The name mirrors the keyword expected by
            the preprocessor's ``process`` method.
        save_dir: Destination passed to ``save_result``.
            NOTE(review): presumably relative to ROOT_GENOTYPE_OUTPUT_DIR -- confirm.

    Returns:
        True once processing and saving have finished.

    Raises:
        OSError: If the input file does not exist or its size cannot be read.
    """
    input_path = ROOT_GENOTYPE_INPUT_DIR / pca_csv_path_suffix
    # convert_bytes already appends the unit, so the messages must not add
    # another "bytes" suffix (previously printed e.g. "366.1 MB bytes").
    size = convert_bytes(os.path.getsize(input_path))
    print(f"Processing {pca_csv_path_suffix}: {size}")
    preprocessor = ButterflyGenePreprocessor(
        input_dir=ROOT_GENOTYPE_INPUT_DIR,
        output_dir=ROOT_GENOTYPE_OUTPUT_DIR,
    )
    preprocessor.process(pca_csv_path_suffix=pca_csv_path_suffix)
    preprocessor.save_result(save_dir)
    print(f"Completed {pca_csv_path_suffix}: {size}")
    return True

futures = []
process_data = []

# One worker means the submitted jobs run one at a time. The context manager
# shuts the pool down when the cell finishes, even if a job raises.
with ThreadPoolExecutor(1) as pool:
    for species in ["erato", "melpomene"]:
        species_genome_path = Path(f"{species}/{THIS_DNA_SCOPE}")
        # NOTE(review): os.walk also descends into sub-directories, but the
        # paths below are always built from the top-level directory; files in
        # nested folders would get a wrong input path -- confirm the layout is flat.
        for root, dirs, files in os.walk(ROOT_GENOTYPE_INPUT_DIR / species_genome_path):
            for f in tqdm(files, desc=f"Processing: {species}"):
                fname = f.split(".")[0]
                save_dir = f"{THIS_DNA_SCOPE}/{species}/{fname}"
                # Resume guard: skip inputs whose output file already exists.
                if os.path.exists(ROOT_GENOTYPE_OUTPUT_DIR / save_dir / "ml_ready.npy"):
                    continue
                future = pool.submit(process_fn, species_genome_path / f, save_dir)
                futures.append(future)
                process_data.append([species_genome_path / f, save_dir])

    # tqdm needs the job count as `total=`; passing the int positionally makes
    # it the "iterable" and leaves the bar without a known length.
    with tqdm(total=len(process_data)) as progress_bar:
        progress_bar.set_description(f"Processing Genotype data. Total: {len(process_data)}")
        for future in as_completed(futures):
            # result() re-raises any exception raised inside process_fn.
            future.result()
            progress_bar.update(1)


['Herato1505_cortex.tsv', 'Herato1801_optix.tsv', 'Herato1001_wntA.tsv', 'Herato1301_vvl.tsv', 'Herato1003_elf1a.tsv']


Processing: erato: 100%|██████████| 5/5 [00:00<00:00, 1923.64it/s]


Processing erato/genes/Herato1505_cortex.tsv: 366.1 MB bytes
['Hmel210001o_elf1a.tsv', 'Hmel215003o_cortex.tsv', 'Hmel213001o_vvl.tsv', 'Hmel218003o_optix.tsv', 'Hmel210001o_wntA.tsv']


Processing: melpomene: 100%|██████████| 5/5 [00:00<00:00, 3492.92it/s]
Processing Genotype data. Total: 10: : 10it [00:31,  3.11s/it]
Processing Genotype data. Total: 10: : 0it [00:00, ?it/s]

read_df exe time: 00:00:02


Pandas Apply:   0%|          | 0/484 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:05
create_ml_ready exe time: 00:00:14
_process exe time: 00:00:25


Processing Genotype data. Total: 10: : 1it [00:34, 34.64s/it]

_save_result exe time: 00:00:08
Completed erato/genes/Herato1505_cortex.tsv: 366.1 MB bytes
Processing erato/genes/Herato1801_optix.tsv: 372.4 MB bytes
read_df exe time: 00:00:01


Pandas Apply:   0%|          | 0/484 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:05
create_ml_ready exe time: 00:00:14
_process exe time: 00:00:25


Processing Genotype data. Total: 10: : 2it [01:09, 34.82s/it]

_save_result exe time: 00:00:08
Completed erato/genes/Herato1801_optix.tsv: 372.4 MB bytes
Processing erato/genes/Herato1001_wntA.tsv: 558.6 MB bytes
read_df exe time: 00:00:02


Pandas Apply:   0%|          | 0/484 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:08
create_ml_ready exe time: 00:00:21
_process exe time: 00:00:37


Processing Genotype data. Total: 10: : 3it [02:00, 42.30s/it]

_save_result exe time: 00:00:12
Completed erato/genes/Herato1001_wntA.tsv: 558.6 MB bytes
Processing erato/genes/Herato1301_vvl.tsv: 342.2 MB bytes
read_df exe time: 00:00:01


Pandas Apply:   0%|          | 0/484 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:05
create_ml_ready exe time: 00:00:13
_process exe time: 00:00:23


Processing Genotype data. Total: 10: : 4it [02:32, 38.29s/it]

_save_result exe time: 00:00:07
Completed erato/genes/Herato1301_vvl.tsv: 342.2 MB bytes
Processing erato/genes/Herato1003_elf1a.tsv: 566.6 MB bytes
read_df exe time: 00:00:02


Pandas Apply:   0%|          | 0/484 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:08
create_ml_ready exe time: 00:00:21
_process exe time: 00:00:39


Processing Genotype data. Total: 10: : 5it [03:25, 43.61s/it]

_save_result exe time: 00:00:12
Completed erato/genes/Herato1003_elf1a.tsv: 566.6 MB bytes
Processing melpomene/genes/Hmel210001o_elf1a.tsv: 227.9 MB bytes
read_df exe time: 00:00:00


Pandas Apply:   0%|          | 0/187 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:03
create_ml_ready exe time: 00:00:10
_process exe time: 00:00:16


Processing Genotype data. Total: 10: : 6it [03:50, 37.16s/it]

_save_result exe time: 00:00:07
Completed melpomene/genes/Hmel210001o_elf1a.tsv: 227.9 MB bytes
Processing melpomene/genes/Hmel215003o_cortex.tsv: 75.7 MB bytes
read_df exe time: 00:00:00


Pandas Apply:   0%|          | 0/187 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:01
create_ml_ready exe time: 00:00:03
_process exe time: 00:00:05


Processing Genotype data. Total: 10: : 7it [03:58, 27.74s/it]

_save_result exe time: 00:00:02
Completed melpomene/genes/Hmel215003o_cortex.tsv: 75.7 MB bytes
Processing melpomene/genes/Hmel213001o_vvl.tsv: 105.0 MB bytes
read_df exe time: 00:00:00


Pandas Apply:   0%|          | 0/187 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:01
create_ml_ready exe time: 00:00:04
_process exe time: 00:00:07


Processing Genotype data. Total: 10: : 8it [04:10, 22.53s/it]

_save_result exe time: 00:00:03
Completed melpomene/genes/Hmel213001o_vvl.tsv: 105.0 MB bytes
Processing melpomene/genes/Hmel218003o_optix.tsv: 124.5 MB bytes
read_df exe time: 00:00:00


Pandas Apply:   0%|          | 0/187 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:01
create_ml_ready exe time: 00:00:05
_process exe time: 00:00:09


Processing Genotype data. Total: 10: : 9it [04:23, 19.62s/it]

_save_result exe time: 00:00:03
Completed melpomene/genes/Hmel218003o_optix.tsv: 124.5 MB bytes
Processing melpomene/genes/Hmel210001o_wntA.tsv: 103.4 MB bytes
read_df exe time: 00:00:00


Pandas Apply:   0%|          | 0/187 [00:00<?, ?it/s]

df_extract_states exe time: 00:00:01
create_ml_ready exe time: 00:00:04
_process exe time: 00:00:07


Processing Genotype data. Total: 10: : 10it [04:34, 16.99s/it]

_save_result exe time: 00:00:03
Completed melpomene/genes/Hmel210001o_wntA.tsv: 103.4 MB bytes
