### Preprocess Data

In [None]:
from pathlib import Path
import socket
from enum import Enum


class Hosts(Enum):
    """Hostnames of known machines, compared against ``socket.gethostname()``."""

    # Machine whose data lives under /local/scratch/carlyn.1 (see the path
    # selection that follows this class).
    IMAGEOMICS_SERVER = "cse-cnc196909s.coeit.osu.edu"


# Choose the phenotype/genotype input and output roots for the current machine.
# Any host other than the Imageomics server falls back to the "david" layout.
hostname = socket.gethostname()
if hostname != Hosts.IMAGEOMICS_SERVER.value:
    _root_dirs = (
        "/local/scratch/david/geno-pheno-data/colors",
        "/local/scratch/david/geno-pheno-data/colors/processed",
        "/local/scratch/david/geno-pheno-data/dna/",
        "/local/scratch/david/geno-pheno-data/dna/processed",
    )
else:
    _root_dirs = (
        "/local/scratch/carlyn.1/dna/colors",
        "/local/scratch/carlyn.1/dna/processed/phenotypes",
        "/local/scratch/carlyn.1/dna/vcfs",
        "/local/scratch/carlyn.1/dna/processed",
    )

# Order: phenotype input, phenotype output, genotype input, genotype output.
(
    ROOT_PHENOTYPE_INPUT_DIR,
    ROOT_PHENOTYPE_OUTPUT_DIR,
    ROOT_GENOTYPE_INPUT_DIR,
    ROOT_GENOTYPE_OUTPUT_DIR,
) = (Path(p) for p in _root_dirs)

In [None]:
from gtp.dataloading.data_preprocessors import ButterflyPatternizePreprocessor

# Run the phenotype preprocessor over every species / wing / colour PCA
# loadings file and save each result under a matching name.
preprocessor = ButterflyPatternizePreprocessor(
    output_dir=ROOT_PHENOTYPE_OUTPUT_DIR,
    input_dir=ROOT_PHENOTYPE_INPUT_DIR,
)

# Flattened job list; iteration order is species, then wing, then colour.
_phenotype_jobs = [
    (species, wing, color)
    for species in ("erato", "melpomene")
    for wing in ("forewings", "hindwings")
    for color in ("color_1", "color_2", "color_3", "total", "bioclip")
]

for species, wing, color in _phenotype_jobs:
    loadings_csv = f"{species}_{wing}_PCA/PCA_{color}_loadings.csv"
    preprocessor.process(pca_csv_path_suffix=loadings_csv)
    preprocessor.save_result(f"{species}_{wing}_{color}")


In [None]:
import os
from enum import Enum
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from pathlib import Path

from gtp.dataloading.data_preprocessors import ButterflyGenePreprocessor
from gtp.tools.simple import convert_bytes, get_filesize


class DNA_SCOPE(Enum):
    """Named DNA scopes (genome / chromosomes / genes).

    The member values are strings; the selected one is used below as a
    path component for both input and output locations.
    """

    GENOME = "genome"
    CHROMOSOME = "chromosomes"
    GENE = "genes"


# Scope handled by the rest of this cell, stored as its plain string value.
THIS_DNA_SCOPE = DNA_SCOPE.GENOME.value


def process_fn(pca_csv_path_suffix, save_dir):
    """Preprocess one genotype input file and save the result.

    Args:
        pca_csv_path_suffix: Path of the input file relative to
            ``ROOT_GENOTYPE_INPUT_DIR``.
        save_dir: Destination handed unchanged to the preprocessor's
            ``save_result``.

    Returns:
        Always ``True`` once processing and saving have finished.
    """
    source_file = ROOT_GENOTYPE_INPUT_DIR / pca_csv_path_suffix
    # Formatted file size; currently not consumed by anything in this function.
    size = convert_bytes(get_filesize(source_file))

    gene_preprocessor = ButterflyGenePreprocessor(
        input_dir=ROOT_GENOTYPE_INPUT_DIR,
        output_dir=ROOT_GENOTYPE_OUTPUT_DIR,
        save_format="parquet",
    )
    gene_preprocessor.process(
        pca_csv_path_suffix=pca_csv_path_suffix,
        processor="pandas",
    )
    gene_preprocessor.save_result(save_dir)
    return True


# Discover every genotype input file for each species, then process them one
# at a time, smallest first within each species.
#
# Each entry of process_data is [size_in_bytes, species, input path relative to
# ROOT_GENOTYPE_INPUT_DIR, save directory passed to process_fn].
process_data = []

for species in ["erato", "melpomene"]:
    species_genome_path = Path(f"{species}/{THIS_DNA_SCOPE}")
    for root, dirs, files in os.walk(ROOT_GENOTYPE_INPUT_DIR / species_genome_path):
        # os.walk descends into sub-directories, so the relative path has to be
        # derived from the directory actually being visited; otherwise files in
        # nested folders would be looked up in the top-level folder.
        relative_dir = Path(root).relative_to(ROOT_GENOTYPE_INPUT_DIR)
        for f in files:
            # Text before the first "." names the per-file output directory.
            fname = f.split(".")[0]
            genome_file_path = relative_dir / f
            size = os.path.getsize(ROOT_GENOTYPE_INPUT_DIR / genome_file_path)
            process_data.append(
                [size, species, genome_file_path, f"{THIS_DNA_SCOPE}/{species}/{fname}"]
            )

# Group by species, then ascending file size.
process_data = sorted(process_data, key=lambda x: (x[1], x[0]))


pbar = tqdm(process_data, desc="Processing Genotype data.")
for size, species, genome_file_path, save_dir in pbar:
    # The byte count was already collected during discovery; just format it.
    pbar.set_description_str(f"Processing file of size: {convert_bytes(size)}")
    process_fn(genome_file_path, save_dir)

In [None]:
import numpy as np
from collections import defaultdict
from enum import Enum
from tqdm import tqdm

class DNA_SCOPE(Enum):
    """Named DNA scopes (genome / chromosomes / genes) with string values.

    Declares the same members as the DNA_SCOPE enum defined earlier in this
    file.
    """

    GENOME = "genome"
    CHROMOSOME = "chromosomes"
    GENE = "genes"


# Split the per-scaffold genotype arrays into one file per individual (CAMID).
#
# Every scaffold directory under data_dir that contains "ml_ready.npy" is
# visited in ascending scaffold number, and each individual's row is appended
# to "<output_dir>/<camid>.npy".
THIS_DNA_SCOPE = DNA_SCOPE.GENOME.value

species = "melpomene"
data_dir = Path(ROOT_GENOTYPE_OUTPUT_DIR, "genotypes", "genome", species)

# Collect [scaffold directory, scaffold number] pairs.
path_data = []
for root, dirs, files in data_dir.walk():
    if "ml_ready.npy" not in files:
        continue

    scaffold = root.parts[-1]
    if species == "erato":
        chrom_info = scaffold.replace("Herato", "")
        scaf_num = int(chrom_info)
    elif species == "melpomene":
        chrom_info = scaffold.replace("Hmel2", "")
        # The last character of the directory name is not part of the number.
        scaf_num = int(chrom_info[:-1])
    else:
        # Previously an unknown species left scaf_num unbound (or stale).
        raise ValueError(f"Unsupported species: {species!r}")

    path_data.append([root, scaf_num])

path_data = sorted(path_data, key=lambda x: x[1])

# Derived from the host-specific root instead of a hard-coded user path; on the
# Imageomics server this resolves to the same directory as before.
output_dir = Path(ROOT_GENOTYPE_OUTPUT_DIR, "genotypes", "genome_individuals", species)
output_dir.mkdir(parents=True, exist_ok=True)

prev_camids = None
# Targets already written during this run. A file left over from an earlier
# run is overwritten on first touch so that re-running does not duplicate data.
written_this_run = set()

# path_data only holds directories known to contain "ml_ready.npy", so no
# further filtering is needed here.
for root, _ in tqdm(path_data, desc="Splitting data per individual", colour="#9E2776"):
    # Every scaffold must list the same individuals in the same order.
    camids = np.load(root / "camids.npy")
    if prev_camids is not None and not np.array_equal(camids, prev_camids):
        raise ValueError(f"CAMIDs in {root} are not aligned with the previous scaffold")
    prev_camids = camids

    ml_data = np.load(root / "ml_ready.npy")
    if len(camids) != len(ml_data):
        raise ValueError(
            f"{root}: {len(camids)} CAMIDs but {len(ml_data)} rows in ml_ready.npy"
        )

    for camid, row in zip(camids, ml_data):
        tgt_file = output_dir / f"{camid}.npy"
        if tgt_file in written_this_run:
            ind_data = np.load(tgt_file)
            ind_data = np.concatenate((ind_data, row), axis=0)
            np.save(tgt_file, ind_data)
        else:
            np.save(tgt_file, row)
            written_this_run.add(tgt_file)
        
            