In [6]:
from pathlib import Path
import numpy as np

# Load a saved NumPy array (path is relative to the notebook's working directory)
# and print its shape as a quick sanity check.
# NOTE(review): presumably the specimen IDs of the melpomene test split — confirm.
# NOTE(review): the name `camids` is rebound to a plain Python list in the later
# "Stacking States" cell, so this array is not what the save cell writes out.
camid_path = Path("../../data/melpomene_test.npy")
camids = np.load(camid_path)
print(camids.shape)

(20,)


In [None]:
from pathlib import Path

import pandas as pd

from gtp.tools.simple import convert_bytes, get_filesize

genome_file_dir = Path("/local/scratch/carlyn.1/dna/vcfs/erato/genome")

# Collect every file under the genome directory (recursively), in walk order.
file_paths = [
    folder / name
    for folder, _subdirs, names in genome_file_dir.walk()
    for name in names
]

# One row per file: bare filename, raw size, and the formatted size string.
file_sizes = [get_filesize(path) for path in file_paths]
file_data = [
    [path.name, size, convert_bytes(size)]
    for path, size in zip(file_paths, file_sizes)
]

df = pd.DataFrame(file_data, columns=["filename", "bytes", "bytes_human"])

df.head()

In [None]:
# Order the files from smallest to largest so the biggest ones sit at the tail.
df = df.sort_values(by="bytes", ascending=True)
df.head()

In [None]:
# First few files whose formatted size string contains "GB".
df.loc[df["bytes_human"].str.contains("GB")].head()

In [None]:
# Last row of the frame. This is the largest file only while `df` is still in the
# ascending-by-"bytes" order produced by the sort cell.
df.iloc[-1]

In [None]:
# Single TSV inside the genome directory, used as the worked example that the
# raw-parsing cell opens and reads line by line.
example_gene_path = genome_file_dir / "Herato0821.tsv"

In [None]:
from gtp.tools.timing import ExecutionTimer
from gtp.dataloading.data_preprocessors import ButterflyGenePreprocessor

# Time one full process-and-save run of the library preprocessor on a single TSV.
timer = ExecutionTimer(name="genome_processor")

preprocessor = ButterflyGenePreprocessor(
    input_dir=genome_file_dir,
    output_dir="/local/scratch/carlyn.1/tmp",
    save_format="parquet",
)
preprocessor.process(
    pca_csv_path_suffix="Herato0821.tsv",
    processor="pandas",
)
preprocessor.save_result("test_dir")

# Stop the clock only after the result has been saved, then report.
timer.stop()
timer.print_elapsed_time()

In [None]:
from collections import defaultdict

from tqdm import tqdm
import numpy as np


timer = ExecutionTimer(name="raw_genome_processor")

# Two-element boolean encoding of each genotype string: one bit for the value
# on each side of the "|" separator.
state_to_bits_map = {
    "0|0": np.array([0, 0], dtype=np.bool_),
    "0|1": np.array([0, 1], dtype=np.bool_),
    "1|0": np.array([1, 0], dtype=np.bool_),
    "1|1": np.array([1, 1], dtype=np.bool_),
}

# camid -> list of 2-bit arrays, one per TSV row, in file order.
specimen_states = defaultdict(list)
# One [scaffold, position, reference, alternative] record per TSV row.
metadata = []

with open(example_gene_path, "r") as f:
    # Read everything up front so tqdm knows the total and can show a real bar.
    lines = f.readlines()
    for line in tqdm(lines, desc="Processing TSV"):
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line has no columns; skip it rather than
            # crash on the tuple unpacking below.
            continue
        # Row layout: four tab-separated metadata columns followed by one
        # "camid=state" column per specimen.
        scaffold_name, position, reference, alternative, *states = line.split("\t")
        for s in states:
            camid, state = s.split("=")
            # Treat "/"-separated states the same as "|"-separated ones.
            state = state.replace("/", "|")
            # NOTE(review): a state outside the four keys above raises KeyError
            # here — confirm the input never contains other values.
            specimen_states[camid].append(state_to_bits_map[state])
        metadata.append([scaffold_name, position, reference, alternative])


timer.stop()
# Called bare, as in the other timing cells; wrapping it in print() would
# presumably just echo a stray "None" (assumes the method prints and returns nothing).
timer.print_elapsed_time()

# Number of distinct specimens and number of parsed rows.
print(len(specimen_states), len(metadata))

In [None]:
timer = ExecutionTimer(name="raw_genome_processor_stacking")

# Dicts iterate in insertion order, so the i-th ID labels the i-th stacked block.
camids = list(specimen_states)
per_specimen_arrays = [
    np.stack(bit_pairs)
    for bit_pairs in tqdm(specimen_states.values(), desc="Stacking States")
]

# Combine the per-specimen arrays along a new leading axis.
final_states = np.stack(per_specimen_arrays)
timer.stop()

print(len(camids))
print(final_states.shape)
timer.print_elapsed_time()

In [None]:
# Inspect the element type of the stacked array (the per-state arrays it was
# built from were created as np.bool_).
final_states.dtype

In [None]:
tmp_save_dir = Path("/local/scratch/carlyn.1/tmp/test_dir")
# Create the target directory here so this cell does not silently depend on an
# earlier cell having created it as a side effect.
tmp_save_dir.mkdir(parents=True, exist_ok=True)

# Per-row metadata collected while parsing the TSV.
metadata_df = pd.DataFrame(
    metadata, columns=["Scaffold", "Position", "Reference", "Alternative"]
)
# Positions were read as text; store them as unsigned 32-bit integers.
# Bracket assignment is used because attribute assignment only works for
# columns that already exist and is easy to misread.
metadata_df["Position"] = metadata_df["Position"].astype(np.uint32)
metadata_df.to_parquet(tmp_save_dir / "metadata.parquet")

# Stacked state array.
np.save(tmp_save_dir / "states.npy", final_states)

# Specimen IDs, one per line, in the same order as the first axis of the array.
with open(tmp_save_dir / "camids.txt", "w") as f:
    f.write("\n".join(camids))

# Report the on-disk size of the ID file.
fs = get_filesize(tmp_save_dir / "camids.txt")
print(convert_bytes(fs))

In [None]:
# Read the ID file back (written one ID per line, no header row) and show its
# shape to confirm how many IDs were saved.
pd.read_csv(tmp_save_dir / "camids.txt", header=None).shape

In [None]:
# Reload the array written with np.save so it can be compared to the in-memory one.
tmp_load = np.load(tmp_save_dir / "states.npy")

In [None]:
# Round-trip check: the reloaded array must match the in-memory one exactly.
# np.array_equal also returns False when the shapes differ, whereas an
# element-wise `==` followed by `.all()` would fail on a shape mismatch.
np.array_equal(tmp_load, final_states)

In [None]:
# NOTE(review): `ex` was not defined anywhere in the visible notebook, so this
# cell raised NameError on a fresh kernel. Any specimen serves as the example,
# so the first key is used — swap in a specific ID if one was intended.
ex = next(iter(specimen_states))

# Stack this specimen's per-row bit pairs into one array.
ex_row = np.stack(specimen_states[ex])
print(ex_row.shape)

# Sum the two bits of each row: a count of 0, 1 or 2.
values = ex_row.sum(-1)

# One-hot encode the counts: row k of the 3x3 boolean identity is the encoding
# of count k, so indexing by `values` appends a length-3 one-hot axis.
ml_ready = np.eye(3, dtype=np.bool_)[values]

In [None]:
# NOTE(review): `states`, `pl` and `str_pos` are neither defined nor imported in
# the visible notebook (this looks like a polars expression; `states` elsewhere
# in the notebook is a plain list). They must be supplied before this cell can
# run on a fresh kernel.
# Split each "a|b" string, cast the parts to integers and sum them per row.
values = states.select(
    pl.col(str_pos).str.split("|").cast(pl.List(pl.Int32)).list.sum()
).rows()
np_values = np.array(values)

# The largest observed sum fixes the width of the one-hot axis.
one_hot_size = np_values.max() + 1

# Allocate the one-hot array as bool directly, instead of as float64 followed by
# a cast, which would temporarily need eight times the memory.
ml_ready = np.zeros(np_values.shape + (one_hot_size,), dtype=np.bool_)
# reshape() of a freshly allocated array is a view, so this writes into ml_ready.
ml_ready.reshape(-1, one_hot_size)[
    np.arange(np_values.size), np_values.reshape(-1)
] = True