## ScenicPlus

In [None]:
import subprocess
import tempfile
# Bash scripts launched from this notebook (initialisation first, then the run).
script1_path, script2_path = (
    './3.init_snakemake.sh',
    './4.scenicplus.sh',
)

# ==== CHANGE ONLY THIS ====
# Genome assembly to work with.
species_in = "hg38"   # or "mm10"
# ==========================

In [None]:
# Convert the script to Unix line endings using dos2unix. The conversion is a
# convenience step: if the dos2unix binary is not installed, warn and carry on
# instead of aborting the cell (the script may already use Unix line endings).
try:
    subprocess.run(['dos2unix', script1_path])
except FileNotFoundError:
    print("Warning: dos2unix not found; running the script without converting line endings")

# Run the bash script, capturing stdout and stderr as text
result = subprocess.run(['bash', script1_path], capture_output=True, text=True)

# Print the output and any errors
print("Output:", result.stdout)
print("Errors:", result.stderr)

# Fail loudly (CalledProcessError) on a non-zero exit status so that later
# cells are not run on top of an incomplete initialisation.
result.check_returncode()

In [None]:
# ==== SPECIES ====
# Genome assembly to prepare: "hg38" or "mm10".
# A value of species_in that was already set in an earlier cell is kept, so the
# two settings cannot silently disagree; "hg38" is only the fallback used when
# this cell is run on its own.
species_in = globals().get("species_in", "hg38")
# =================

import pathlib
import pandas as pd
from pybiomart import Server  # pip install pybiomart

# Output folder
outdir = pathlib.Path("outs/scplus_pipeline/Snakemake/outs/")
outdir.mkdir(parents=True, exist_ok=True)
genome_annotation_out = outdir / "genome_annotation.tsv"
chromsizes_out = outdir / "chromsizes.tsv"

# Map species to BioMart dataset & UCSC assembly
species_map = {"hg38": ("hsapiens_gene_ensembl", "hg38"),
               "mm10": ("mmusculus_gene_ensembl", "mm10")}
# BioMart host per assembly. The main Ensembl site serves the current mouse
# build (GRCm39), whose coordinates do not match the UCSC mm10 chromsizes
# fetched below; the Nov 2020 archive (release 102) is the last GRCm38/mm10
# release, so mm10 is pinned to it.
biomart_hosts = {"hg38": "http://www.ensembl.org",
                 "mm10": "http://nov2020.archive.ensembl.org"}
if species_in not in species_map:
    raise ValueError("species_in must be 'hg38' or 'mm10'")
dataset_name, ucsc_assembly = species_map[species_in]

# Chromosomes kept in both output tables (UCSC-style names)
chrom_pattern = r"^chr[0-9XYM]+$"

# 1) Get gene annotation from BioMart
server = Server(host=biomart_hosts[species_in], use_cache=False)
mart = server["ENSEMBL_MART_ENSEMBL"]
dataset = mart[dataset_name]

# Attribute names differ between Ensembl releases; fall back when the
# preferred one is not offered by this dataset.
available_attributes = dataset.attributes
external_gene_name_query = "external_gene_name" if "external_gene_name" in available_attributes else "hgnc_symbol"
tss_query = "transcription_start_site" if "transcription_start_site" in available_attributes else "transcript_start"

annot = dataset.query(
    attributes=[
        "chromosome_name", "start_position", "end_position",
        "strand", external_gene_name_query, tss_query, "transcript_biotype"
    ]
)
# Rename positionally: the order matches the attribute list queried above
annot.columns = [
    "Chromosome", "Start", "End", "Strand",
    "Gene", "Transcription_Start_Site", "Transcript_type"
]
annot = annot[annot.Transcript_type == "protein_coding"].copy()
# Ensembl encodes strand as 1 / -1; convert to "+" / "-"
annot["Strand"] = ["+" if int(s) == 1 else "-" for s in annot["Strand"]]

# Convert Ensembl chromosome names to UCSC style, then filter.
# Ensembl calls the mitochondrial chromosome "MT" while UCSC uses "chrM";
# without this rename "chrMT" fails the filter and every mitochondrial gene is
# dropped even though chrM is kept in the chromsizes table.
annot["Chromosome"] = "chr" + annot["Chromosome"].astype(str).replace({"MT": "M"})
annot = annot[annot["Chromosome"].str.match(chrom_pattern)].copy()

# Save gene annotation
annot.to_csv(genome_annotation_out, sep="\t", index=False)
print(f"✓ Saved genome annotation → {genome_annotation_out}")

# 2) Get chromsizes directly from UCSC
chromsizes = pd.read_table(
    f"http://hgdownload.cse.ucsc.edu/goldenPath/{ucsc_assembly}/bigZips/{ucsc_assembly}.chrom.sizes",
    header=None,
    names=["Chromosome", "End"]
)
# Every chromosome interval starts at 0; place Start between Chromosome and End
chromsizes.insert(1, "Start", 0)

# Keep only chr-prefixed chromosomes consistent with annotation
chromsizes = chromsizes[chromsizes["Chromosome"].str.match(chrom_pattern)].copy()

# Save chromsizes
chromsizes.to_csv(chromsizes_out, sep="\t", index=False)
print(f"✓ Saved chromsizes → {chromsizes_out}")

# Quick preview
print("\n== Annotation preview ==")
print(annot.head())
print("\n== Chromsizes preview ==")
print(chromsizes.head())


In [None]:
# Convert the script to Unix line endings using dos2unix. The conversion is a
# convenience step: if the dos2unix binary is not installed, warn and carry on
# instead of aborting the cell (the script may already use Unix line endings).
try:
    subprocess.run(['dos2unix', script2_path])
except FileNotFoundError:
    print("Warning: dos2unix not found; running the script without converting line endings")

# Run the bash script, capturing stdout and stderr as text
result = subprocess.run(['bash', script2_path], capture_output=True, text=True)

# Print the output and any errors
print("Output:", result.stdout)
print("Errors:", result.stderr)

# Fail loudly (CalledProcessError) on a non-zero exit status so a failed
# pipeline run is not mistaken for a successful one.
result.check_returncode()