# In [2]:
from pathlib import Path
import os
import json

import numpy as np
import pandas as pd


def read_json(path):
    """Load and return the JSON document stored at ``path``.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's locale encoding.

    Args:
        path: Filesystem path (``str`` or path-like) of the JSON file.

    Returns:
        The decoded JSON value, as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# Directory tree that is searched (recursively) for JSON files to convert.
root_dir = Path("/home/carlyn.1/dna-trait-analysis/tmp/osc_edit_plots_ws_5000")

# (CSV column prefix, key under attributions["test"]) for every exported group.
_ATTRIBUTION_GROUPS = (
    ("AA", "AA"),
    ("Aa_aA", "Aa/aA"),
    ("aa", "aa"),
    ("zeroed", "zero-out"),
)

# Column order of every exported CSV.
_CSV_COLUMNS = [
    "species",
    "wing",
    "color",
    "chromosome",
    "window_size",
    "nominal_position",
    "scaffold",
    "real_position",
    "AA_mean",
    "AA_std",
    "Aa_aA_mean",
    "Aa_aA_std",
    "aa_mean",
    "aa_std",
    "zeroed_mean",
    "zeroed_std",
]


def _parse_stem(stem):
    """Split a ``<species>_<wing>_<color...>`` file stem into its three fields.

    Every token after the second one belongs to the color, so colors made of
    any number of underscore-separated tokens are kept whole.

    Raises:
        ValueError: If the stem has fewer than three underscore-separated
            tokens.
    """
    sp, wing, *color_parts = stem.split("_")
    if not color_parts:
        raise ValueError(
            "expected '<species>_<wing>_<color>' in file name, got %r" % stem
        )
    return sp, wing, "_".join(color_parts)


def _chromosome_frame(sp, wing, color, chromosome, chrom_data):
    """Build the typed DataFrame rows for one chromosome entry of a JSON file."""
    test_attributions = chrom_data["attributions"]["test"]
    real_positions = chrom_data["real_position_metadata"]
    # The number of rows is defined by the "AA" means, as in the JSON layout
    # the other per-position lists are expected to have the same length.
    row_length = len(test_attributions["AA"]["means"])

    # Scalars are broadcast over the index; per-position lists must match it
    # (pandas raises ValueError otherwise).  Building column-wise keeps each
    # column's own dtype instead of coercing everything to one common dtype.
    columns = {
        "species": sp,
        "wing": wing,
        "color": color,
        "chromosome": chromosome,
        "window_size": chrom_data["window_size"],
        "nominal_position": list(chrom_data["nominal_positions"]),
        # Each metadata entry is indexed as (scaffold, real position, ...).
        "scaffold": [entry[0] for entry in real_positions],
        "real_position": [entry[1] for entry in real_positions],
    }
    for prefix, key in _ATTRIBUTION_GROUPS:
        columns[prefix + "_mean"] = list(test_attributions[key]["means"])
        columns[prefix + "_std"] = list(test_attributions[key]["stds"])

    return pd.DataFrame(
        columns, index=pd.RangeIndex(row_length), columns=_CSV_COLUMNS
    )


def _build_group_frame(sp, wing, color, data, chromosomes=range(1, 22)):
    """Concatenate the per-chromosome rows of one loaded JSON document.

    ``data`` is keyed by the chromosome number as a string; chromosomes are
    emitted in the order given by ``chromosomes`` (1 through 21 by default).
    """
    frames = [
        _chromosome_frame(sp, wing, color, chromosome, data[str(chromosome)])
        for chromosome in chromosomes
    ]
    return pd.concat(frames, ignore_index=True)


for root, _dirs, files in os.walk(root_dir):
    for f in files:
        if not f.endswith(".json"):
            continue

        # Everything before the first dot names both the parsed fields and
        # the output CSV.
        stem = f.split(".")[0]
        sp, wing, color = _parse_stem(stem)
        data = read_json(Path(root, f))

        df = _build_group_frame(sp, wing, color, data)

        # NOTE: the CSV is written into root_dir itself, regardless of which
        # subdirectory the JSON file was found in.
        new_fn = Path(root_dir, stem + ".csv")
        # na_rep keeps NaN values written out as the text "nan".
        df.to_csv(new_fn, na_rep="nan")
# Species, wing, color, chromosome, window size, nominal position, scaffold, real_position, AA mean, AA std, ...
