In [1]:
from pathlib import Path
import os

import numpy as np
from tqdm import tqdm

root_path = Path("/local/scratch/carlyn.1/dna/training_output")
exp_path = root_path / "base"

data = []
species = set()
wings = set()
colors = set()
chromosomes = set()
for root, dirs, files in tqdm(os.walk(exp_path), desc="extracting data"):
    if "test_attributions.npy" in files:
        x = np.load(root + "/test_attributions.npy")[
            :, 0
        ]  # Only grab first pca attributions
        parts = root.split(os.path.sep)[-1].split("_")
        if len(parts) == 6:
            sp, wing, c, n, _, chromosome = parts
            color = f"{c}_{n}"
        else:
            sp, wing, color, _, chromosome = parts

        chromosome = int(chromosome)
        species.add(sp)
        wings.add(wing)
        colors.add(color)
        chromosomes.add(chromosome)

        data.append(
            [
                sp,
                wing,
                color,
                chromosome,
                x,
            ]
        )

extracting data: 337it [00:02, 124.87it/s]


In [2]:
chromosomes = sorted(list(chromosomes))
chromosomes

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [3]:
data = sorted(data, key=lambda x: (x[0], x[1], x[2], x[3]))

In [None]:
from gtp.dataloading.tools import collect_chromosome_position_metadata

genome_folder = "/local/scratch/carlyn.1/dna/processed/genome"

groups = []

gn = None
group = []
positions = []
group_chromosomes = []
prev_chromosome = None
for row in tqdm(data, desc="processing-rows"):
    sp = row[0]
    # if sp == "erato":
    #    if row[1] == "forewings":
    #        continue
    #    if row[1] == "hindwings" and row[2] != "total":
    #        continue
    chromosome = int(row[3])
    new_gn = "-".join(row[:3])
    if gn is None:
        gn = new_gn
    elif gn != new_gn:
        cat_positions = np.concatenate([np.array(x) for x in positions])
        groups.append(
            (
                gn,
                np.concatenate(group_chromosomes),
                cat_positions,
                np.concatenate(group),
            )
        )
        group = []
        positions = []
        group_chromosomes = []
        prev_chromosome = None
        gn = new_gn
    else:
        assert prev_chromosome == chromosome - 1

    prev_chromosome = chromosome
    group.append(row[-1])
    positions.append(
        collect_chromosome_position_metadata(genome_folder, sp, chromosome)
    )
    group_chromosomes.append(np.ones(row[-1].shape[0]) * chromosome)


processing-rows: 100%|██████████| 336/336 [03:44<00:00,  1.50it/s]


In [5]:
import pandas as pd

outdir = Path("/home/carlyn.1/dna-trait-analysis/tmp/csv_att_data")

for group in tqdm(groups, desc="saving data"):
    gn = group[0]
    print(gn)
    chromosomes = group[1]
    scaffold_positions = group[2]
    attributions = group[3]

    all_data = np.stack(
        [
            chromosomes,
            scaffold_positions[:, 0],
            scaffold_positions[:, 1],
            attributions,
        ],
        axis=1,
    )

    df = pd.DataFrame(
        all_data, columns=["chromosome", "scaffold", "position", "attribution"]
    )
    df.to_csv(Path(outdir, f"{gn}.csv"))

saving data:   0%|          | 0/8 [00:00<?, ?it/s]

erato-hindwings-total


saving data:  12%|█▎        | 1/8 [01:35<11:06, 95.15s/it]

melpomene-forewings-color_1


saving data:  25%|██▌       | 2/8 [03:02<09:04, 90.74s/it]

melpomene-forewings-color_2


saving data:  38%|███▊      | 3/8 [04:42<07:54, 94.81s/it]

melpomene-forewings-color_3


saving data:  50%|█████     | 4/8 [06:34<06:47, 101.81s/it]

melpomene-forewings-total


saving data:  62%|██████▎   | 5/8 [08:21<05:10, 103.36s/it]

melpomene-hindwings-color_1


saving data:  75%|███████▌  | 6/8 [09:39<03:09, 94.96s/it] 

melpomene-hindwings-color_2


saving data:  88%|████████▊ | 7/8 [10:53<01:28, 88.17s/it]

melpomene-hindwings-color_3


saving data: 100%|██████████| 8/8 [11:58<00:00, 89.81s/it]


In [6]:
# create rows
# species, wing, color, chromosome