In [1]:
# Parameters
filename = "6h"


In [2]:
import pandas as pd
import json
import re
from collections import defaultdict

In [3]:
#filename = '6h'

In [4]:
excel_path = 'public/data/raw/' + filename + '.xlsx'
xlsx = pd.ExcelFile(excel_path)

for sheet_name in xlsx.sheet_names:
    print(sheet_name)

DEGs-all
SPD
MITO
Plastids
TF family


In [5]:
# Sheets to exclude
exclude_sheet = 'all DEGs'

# Columns to include in "Details"
details_keys = [
    "Significantly expressed in NO. of clusters",
    "GeneName",
    "Description",
    "TF family"
]

In [6]:
# Expression-level headers in order
expression_categories = [
    "Mesophyll",
    "Leaf pavement cell",
    "Phloem parenchyma",
    "Epidermis",
    "Xylem",
    "G2/M phase",
    "Companion cell",
    "S phase",
    "Leaf guard cell",
    "Unknown"
]

In [7]:
# Dictionary to store sheet data
output = {}

In [8]:
for sheet_name in xlsx.sheet_names:
    if sheet_name == 'all DEGs':
        continue

    df = pd.read_excel(excel_path, sheet_name=sheet_name, header=[0, 1])
    df.columns = pd.MultiIndex.from_tuples([
        (str(a).strip(), str(b).strip()) for a, b in df.columns
    ])

    # Find GeneID and Genenotype columns
    geneid_col = next((col for col in df.columns if 'geneid' in (col[0].lower(), col[1].lower())), None)
    genotype_col = next((col for col in df.columns if 'genenotype' in (col[0].lower(), col[1].lower())), None)

    if not geneid_col or not genotype_col:
        print(f"⚠️ GeneID or Genenotype column not found in {sheet_name}")
        continue

    sheet_data = {}
    for _, row in df.iterrows():
        gene_id = row[geneid_col]
        genotype = row[genotype_col]

        if pd.isna(gene_id) or pd.isna(genotype):
            continue

        gene_id = str(gene_id)
        genotype = str(genotype)

        if gene_id not in sheet_data:
            sheet_data[gene_id] = {
                "Details": {}
            }

        # Extract Details (only once per GeneID)
        if not sheet_data[gene_id]["Details"]:
            details = {}
            for key in details_keys:
                matching_col = next((col for col in df.columns if col[1] == key), None)
                if matching_col and matching_col in row:
                    details[key] = row[matching_col]
            sheet_data[gene_id]["Details"] = details

        # Extract grouped expression data
        expression_by_category = {}
        for category in expression_categories:
            group_data = {
                col[1]: row[col]
                for col in df.columns
                if col[0] == category and pd.notna(row[col])
            }
            if group_data:
                expression_by_category[category] = group_data

        # Add under the genotype level
        sheet_data[gene_id][genotype] = expression_by_category

    output[sheet_name] = sheet_data


⚠️ GeneID or Genenotype column not found in DEGs-all
⚠️ GeneID or Genenotype column not found in SPD


⚠️ GeneID or Genenotype column not found in MITO


⚠️ GeneID or Genenotype column not found in Plastids


⚠️ GeneID or Genenotype column not found in TF family


In [9]:

# Save JSON
json_path = 'public/data/processed/' + filename + '.json'
with open(json_path, 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=4, ensure_ascii=False)

print(f"✅ JSON file created at: {json_path}")

✅ JSON file created at: public/data/processed/6h.json
