### Preprocess Data

In [1]:
from pathlib import Path
import socket
from enum import Enum


class Hosts(Enum):
    """Known hostnames that this notebook compares against the current machine."""

    # Compared with socket.gethostname() to decide which data roots to use.
    IMAGEOMICS_SERVER = "cse-cnc196909s.coeit.osu.edu"


# Choose the phenotype/genotype data roots for the machine running this notebook.
hostname = socket.gethostname()
if hostname != Hosts.IMAGEOMICS_SERVER.value:
    # Every host other than the Imageomics server uses this scratch layout.
    ROOT_PHENOTYPE_INPUT_DIR = Path("/local/scratch/david/geno-pheno-data/colors")
    ROOT_PHENOTYPE_OUTPUT_DIR = Path(
        "/local/scratch/david/geno-pheno-data/colors/processed"
    )
    ROOT_GENOTYPE_INPUT_DIR = Path("/local/scratch/david/geno-pheno-data/dna/")
    ROOT_GENOTYPE_OUTPUT_DIR = Path(
        "/local/scratch/david/geno-pheno-data/dna/processed"
    )
else:
    # Layout used on the Imageomics server.
    ROOT_PHENOTYPE_INPUT_DIR = Path("/local/scratch/carlyn.1/dna/colors")
    ROOT_PHENOTYPE_OUTPUT_DIR = Path("/local/scratch/carlyn.1/dna/colors/processed")
    ROOT_GENOTYPE_INPUT_DIR = Path("/local/scratch/carlyn.1/dna/vcfs")
    ROOT_GENOTYPE_OUTPUT_DIR = Path("/local/scratch/carlyn.1/dna/processed")

In [2]:
from gtp.dataloading.data_preprocessors import ButterflyPatternizePreprocessor

# A single preprocessor instance is reused for every loadings CSV below.
preprocessor = ButterflyPatternizePreprocessor(
    input_dir=ROOT_PHENOTYPE_INPUT_DIR,
    output_dir=ROOT_PHENOTYPE_OUTPUT_DIR,
)

# Walk every (species, wing, color) combination; species varies slowest and
# color fastest, matching a three-level nested loop.
for species, wing, color in (
    (s, w, c)
    for s in ["erato", "melpomene"]
    for w in ["forewings", "hindwings"]
    for c in ["color_1", "color_2", "color_3", "total"]
):
    suffix_path = f"{species}_{wing}_PCA/PCA_{color}_loadings.csv"
    preprocessor.process(pca_csv_path_suffix=suffix_path)
    preprocessor.save_result(f"{species}_{wing}_{color}")


In [None]:
import os
from enum import Enum
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from pathlib import Path

from gtp.dataloading.data_preprocessors import ButterflyGenePreprocessor


class DNA_SCOPE(Enum):
    """Selectable scopes of genotype data to preprocess."""

    # Each value is interpolated into the input and output paths built in this
    # cell, so it doubles as a directory name.
    GENOME = "genome"
    CHROMOSOME = "chromosomes"
    GENE = "genes"


# Scope handled by this run; stored as the plain string so it can be used
# directly inside f-string paths.
THIS_DNA_SCOPE = DNA_SCOPE.GENOME.value


def convert_bytes(num):
    """Format a byte count as a human-readable string.

    The value is divided by 1024 until it drops below 1024 and is then
    rendered with one decimal place and the matching unit, e.g. ``"1.5 KB"``.

    Args:
        num: Size in bytes (int or float).

    Returns:
        A string of the form ``"<value> <unit>"`` where the unit is one of
        bytes, KB, MB, GB, TB or PB.
    """
    for unit in ["bytes", "KB", "MB", "GB", "TB"]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    # Anything of 1024 TB or more used to fall off the loop and return None;
    # report it in petabytes instead.
    return "%3.1f %s" % (num, "PB")


def process_fn(pca_csv_path_suffix, save_dir):
    """Preprocess one genotype input file and save the result.

    Args:
        pca_csv_path_suffix: Path of the input file relative to
            ``ROOT_GENOTYPE_INPUT_DIR``. It is forwarded unchanged to
            ``ButterflyGenePreprocessor.process``.
        save_dir: Destination passed to ``ButterflyGenePreprocessor.save_result``.

    Returns:
        ``True`` once processing and saving have finished.
    """
    # convert_bytes already appends the unit, so the messages must not add a
    # second " bytes" suffix (previously printed e.g. "11.4 GB bytes").
    size = convert_bytes(os.path.getsize(ROOT_GENOTYPE_INPUT_DIR / pca_csv_path_suffix))
    print(f"Processing {pca_csv_path_suffix}: {size}")
    # A fresh preprocessor is created for every file.
    preprocessor = ButterflyGenePreprocessor(
        input_dir=ROOT_GENOTYPE_INPUT_DIR, output_dir=ROOT_GENOTYPE_OUTPUT_DIR
    )
    preprocessor.process(pca_csv_path_suffix=pca_csv_path_suffix)
    preprocessor.save_result(save_dir)
    print(f"Completed {pca_csv_path_suffix}: {size}")
    return True


futures = []
process_data = []
# Single worker thread: input files are processed one at a time.
pool = ThreadPoolExecutor(1)

# Collect every input file that still needs processing.
for species in ["erato", "melpomene"]:
    species_genome_path = Path(f"{species}/{THIS_DNA_SCOPE}")
    print(ROOT_GENOTYPE_INPUT_DIR / species_genome_path)
    for root, dirs, files in os.walk(ROOT_GENOTYPE_INPUT_DIR / species_genome_path):
        # os.walk also visits subdirectories, so build each file's path from
        # the directory currently being walked. Joining onto the top-level
        # species directory would point at a non-existent file for anything
        # nested and make os.path.getsize raise.
        relative_dir = Path(root).relative_to(ROOT_GENOTYPE_INPUT_DIR)
        for f in files:
            # Text before the first "." names the per-file output directory.
            fname = f.split(".")[0]
            # Skip files whose output already exists so that an interrupted
            # run can be resumed. Originally marked as a temporary shortcut
            # ("Delete this after").
            if os.path.exists(
                ROOT_GENOTYPE_OUTPUT_DIR
                / f"{THIS_DNA_SCOPE}/{species}/{fname}/ml_ready.npy"
            ):
                continue

            genome_file_path = relative_dir / f
            size = os.path.getsize(ROOT_GENOTYPE_INPUT_DIR / genome_file_path)
            process_data.append(
                [size, species, genome_file_path, f"{THIS_DNA_SCOPE}/{species}/{fname}"]
            )

# Order the work by species, then by ascending file size.
process_data = sorted(process_data, key=lambda x: (x[1], x[0]))

# Queue one task per pending file, in the order given by process_data.
for size, species, genome_file_path, save_dir in process_data:
    future = pool.submit(
        process_fn,
        genome_file_path,
        save_dir,
    )
    futures.append(future)

# The count must be passed as `total`. Passed positionally it is taken as the
# iterable argument, and the bar then shows neither a total nor an ETA.
progress_bar = tqdm(total=len(process_data))
progress_bar.set_description(f"Processing Genotype data. Total: {len(process_data)}")

try:
    for future in as_completed(futures):
        # result() re-raises any exception raised inside process_fn.
        success = future.result()
        progress_bar.update(1)
finally:
    progress_bar.close()
    # wait=False: on success every task is already finished; on failure this
    # does not block, so the error surfaces immediately as before.
    pool.shutdown(wait=False)


/local/scratch/david/geno-pheno-data/dna/erato/genome
/local/scratch/david/geno-pheno-data/dna/melpomene/genome
Processing erato/genome/Herato1301.tsv: 11.4 GB bytes


Processing Genotype data. Total: 18: : 0it [00:00, ?it/s]

read_df exe time: 00:00:56
df_extract_states exe time: 00:02:32
create_ml_ready exe time: 00:08:38
_process exe time: 00:13:54


Processing Genotype data. Total: 18: : 1it [18:07, 1087.42s/it]

_save_result exe time: 00:03:37
Completed erato/genome/Herato1301.tsv: 11.4 GB bytes
Processing melpomene/genome/Hmel204001o.tsv: 2.1 GB bytes
read_df exe time: 00:00:12
df_extract_states exe time: 00:00:29
create_ml_ready exe time: 00:01:43
_process exe time: 00:02:43


Processing Genotype data. Total: 18: : 2it [22:01, 585.29s/it] 

_save_result exe time: 00:01:03
Completed melpomene/genome/Hmel204001o.tsv: 2.1 GB bytes
Processing melpomene/genome/Hmel215003o.tsv: 2.2 GB bytes
read_df exe time: 00:00:10
df_extract_states exe time: 00:00:29
create_ml_ready exe time: 00:01:45
_process exe time: 00:02:44


Processing Genotype data. Total: 18: : 3it [25:57, 425.99s/it]

_save_result exe time: 00:01:04
Completed melpomene/genome/Hmel215003o.tsv: 2.2 GB bytes
Processing melpomene/genome/Hmel221001o.tsv: 2.2 GB bytes
read_df exe time: 00:00:10
df_extract_states exe time: 00:00:30
create_ml_ready exe time: 00:01:47
_process exe time: 00:02:47
