# STRUCTURE con `taka_pop.str` (ipyrad)

Este notebook corre **STRUCTURE** directamente sobre tu archivo `.str` (sin importar `ipyrad` en el kernel).


In [None]:
# 1) Instalación (micromamba) + STRUCTURE
# Funciona en entornos tipo Colab/Jupyter (Linux). Si ya lo instalaste antes, puedes re-ejecutar: es idempotente.

import subprocess
from pathlib import Path

# Fixed locations for the micromamba binary and the environment root.
MAMBA = Path("/tmp/bin/micromamba")
PREFIX = Path("/tmp/micromamba")
ENV_NAME = "gen_evol"

# Download the micromamba binary only when it is not already present.
if not MAMBA.exists():
    MAMBA.parent.mkdir(parents=True, exist_ok=True)
    url = "https://micro.mamba.pm/api/micromamba/linux-64/latest"
    tgz = Path("/tmp/micromamba.tar.bz2")
    # Argument lists (no shell) avoid quoting problems; --fail makes curl exit
    # non-zero on an HTTP error instead of saving the error page as the archive.
    subprocess.run(["curl", "--fail", "-L", url, "-o", str(tgz)], check=True)
    # Extract only bin/micromamba, dropping the leading "bin/" component so the
    # binary lands directly in MAMBA.parent.
    subprocess.run(
        [
            "tar", "-xjf", str(tgz),
            "-C", str(MAMBA.parent),
            "--strip-components=1", "bin/micromamba",
        ],
        check=True,
    )

# Create the environment only when it does not exist yet (python=3.10 for
# compatibility with bioinformatics packages). Previously `create` ran on every
# execution, so re-running the cell rebuilt the prefix from scratch.
# NOTE(review): relies on conda-style environments containing a "conda-meta"
# directory -- confirm against the micromamba version in use.
env_prefix = PREFIX / ENV_NAME
if not (env_prefix / "conda-meta").is_dir():
    subprocess.run([str(MAMBA), "create", "-y", "-p", str(env_prefix), "python=3.10",
                    "-c", "conda-forge", "-c", "bioconda", "-c", "ipyrad"], check=True)

# Install STRUCTURE + CLUMPP into the environment.
subprocess.run([str(MAMBA), "install", "-y", "-p", str(env_prefix),
                "structure", "clumpp",
                "-c", "conda-forge", "-c", "bioconda"], check=True)

print("✅ Instalación lista.")



In [None]:
# 2) Define tu archivo .str (cambia INPUT_STR si hace falta)
from pathlib import Path

# Locations searched, in priority order, for the STRUCTURE input file.
CANDIDATES = [
    Path("taka_pop.str"),
    Path("taka_pop") / "taka_pop.str",
    Path("/content") / "taka_pop.str",  # typical Colab location
]

# The first candidate that exists on disk wins; when the loop finishes without
# a hit the `else` branch reports every location that was tried.
INPUT_STR = None
for p in CANDIDATES:
    if p.exists():
        INPUT_STR = p
        break
else:
    raise FileNotFoundError(
        "No encuentro 'taka_pop.str'. Ponlo junto al notebook, o en ./taka_pop/taka_pop.str, "
        "o en /content/taka_pop.str (Colab)."
    )

print("✅ Usando:", INPUT_STR.resolve())



In [None]:
# 3) Construir mainparams/extraparams (inferimos EXTRACOLS y NUMLOCI desde el .str)
import re, statistics
from pathlib import Path

def is_int(tok: str) -> bool:
    """Return True when *tok* consists solely of digits with an optional leading minus."""
    matched = re.fullmatch(r"-?\d+", tok)
    return matched is not None

# Read the .str file and keep only non-blank rows (whitespace-trimmed).
lines = INPUT_STR.read_text().splitlines()
all_lines = [ln.strip() for ln in lines if ln.strip()]  # blank lines removed

nrows = len(all_lines)
if nrows == 0:
    # statistics.mode() below would otherwise fail with an opaque StatisticsError.
    raise ValueError(f"El archivo {INPUT_STR} no contiene filas con datos.")

# The most common token count among the first 500 rows is taken as the row width.
col_counts = [len(ln.split()) for ln in all_lines[:500]]
ncols = statistics.mode(col_counts)

print(f"DEBUG: rows={nrows}, modal_cols={ncols}")

# The generated mainparams always declares LABEL=1 and POPDATA=1, so two of the
# leading columns are already accounted for there. EXTRACOLS must only count the
# columns *beyond* those two; otherwise STRUCTURE expects two more entries per
# row than the file actually has.
FIXED_LEADING_COLS = 2

# Try 0..10 leading (non-genotype) columns. A candidate is viable only when the
# remaining columns split evenly into diploid allele pairs; it is scored by the
# median fraction of integer tokens in the genotype part of the first 200 rows.
best = None
for leading in range(0, 11):
    n_geno = ncols - leading
    if n_geno <= 0 or n_geno % 2 != 0:
        continue
    scores = []
    for ln in all_lines[:200]:
        toks = ln.split()
        if len(toks) != ncols:
            continue  # ignore rows whose width differs from the modal width
        geno = toks[leading:]
        scores.append(sum(is_int(t) for t in geno) / len(geno))
    if not scores:
        continue
    med = statistics.median(scores)
    # Strict ">" keeps the smallest leading-column count among ties.
    if (best is None) or (med > best[0]):
        best = (med, leading)

if best is None or best[0] < 0.95:
    preview = all_lines[0].split()[:12]
    raise ValueError(
        f"No pude inferir EXTRACOLS con confianza. best={best}. "
        f"Primera fila (12 tokens): {preview}"
    )

best_med, leading_cols = best
if leading_cols < FIXED_LEADING_COLS:
    # Fail early with an actionable message instead of letting STRUCTURE reject
    # (or silently misalign) the data.
    raise ValueError(
        f"El archivo parece tener {leading_cols} columna(s) antes de los genotipos, "
        f"pero mainparams declara LABEL=1 y POPDATA=1 (se requieren al menos "
        f"{FIXED_LEADING_COLS}). Revisa el formato del .str."
    )

EXTRACOLS = leading_cols - FIXED_LEADING_COLS
NUMLOCI = (ncols - leading_cols) // 2

# Per the original author's note the input has one row per individual
# (128 rows), so ONEROWPERIND is forced to 1 and every non-blank row counts as
# one individual.
ONEROWPERIND = 1
NUMINDS = nrows

print(f"✅ Inferred: EXTRACOLS={EXTRACOLS} (mediana enteros={best_med:.2f}), NUMLOCI={NUMLOCI}, NUMINDS={NUMINDS}, ONEROWPERIND={ONEROWPERIND}")

def _render_params(title: str, settings) -> str:
    """Render (keyword, value) pairs as a STRUCTURE parameter file body."""
    # STRUCTURE's stock parameter files use "#define KEY value" entries and "//"
    # comments; the previous bare "KEY value" lines did not follow that format.
    body = "".join(f"#define {key:<16} {value}\n" for key, value in settings)
    return f"// STRUCTURE {title} (autogenerado)\n{body}"


def write_params(K: int, outdir: Path, burnin=50000, numreps=100000):
    """Write the ``mainparams`` and ``extraparams`` files for one value of K.

    The data-format settings (NUMINDS, NUMLOCI, ONEROWPERIND, EXTRACOLS) are
    read from the module-level values of the same names.

    Args:
        K: Number of populations, written as MAXPOPS.
        outdir: Directory that receives both files; created when missing.
        burnin: Value written as BURNIN.
        numreps: Value written as NUMREPS.
    """
    outdir.mkdir(parents=True, exist_ok=True)
    main_settings = [
        ("MAXPOPS", K),
        ("NUMINDS", NUMINDS),
        ("NUMLOCI", NUMLOCI),
        ("PLOIDY", 2),
        ("MISSING", -9),
        ("ONEROWPERIND", ONEROWPERIND),
        ("LABEL", 1),
        ("POPDATA", 1),
        ("POPFLAG", 0),
        ("LOCDATA", 0),
        ("PHENOTYPE", 0),
        ("EXTRACOLS", EXTRACOLS),
        ("MARKERNAMES", 0),
        ("RECESSIVEALLELES", 0),
        ("MAPDISTANCES", 0),
        ("PHASED", 0),
        ("PHASEINFO", 0),
        ("USEPOPINFO", 0),
    ]
    extra_settings = [
        ("BURNIN", burnin),
        ("NUMREPS", numreps),
        ("NOADMIX", 0),
        ("ADMBURNIN", 0),
        # NOTE(review): the original wrote "ALPHAPRIOR 1.0", which is not a
        # STRUCTURE keyword; ALPHA (admixture Dirichlet parameter) appears to be
        # what was intended -- confirm against the STRUCTURE manual.
        ("ALPHA", 1.0),
        ("FREQSCORR", 1),
        ("INFERALPHA", 1),
    ]
    (outdir / "mainparams").write_text(_render_params("mainparams", main_settings))
    (outdir / "extraparams").write_text(_render_params("extraparams", extra_settings))

print("✅ Listo para correr. (Se escriben mainparams/extraparams por cada K en results/K*/)")



In [None]:
# 4) Run STRUCTURE para un rango de K
import subprocess
from pathlib import Path

# Same micromamba locations as the installation cell, repeated here so this
# cell can be executed on its own.
MAMBA = Path("/tmp/bin/micromamba")
PREFIX = Path("/tmp/micromamba")
ENV_PATH = PREFIX.joinpath("gen_evol")

# Root directory that collects one sub-directory per K; created if missing.
RESULTS = Path("results")
RESULTS.mkdir(exist_ok=True)

def run_structure(K: int, burnin=20000, numreps=50000):
    """Run STRUCTURE once for the given K inside the micromamba environment.

    Parameter files are written to ``RESULTS/K<K>/`` via ``write_params``; the
    process's stdout and stderr are saved there as ``stdout.txt`` and
    ``stderr.txt`` and their last lines are echoed.

    Args:
        K: Number of populations passed to STRUCTURE via ``-K``.
        burnin: Forwarded to ``write_params``.
        numreps: Forwarded to ``write_params``.

    Raises:
        RuntimeError: If the STRUCTURE process exits with a non-zero status.
    """
    run_dir = RESULTS / f"K{K}"
    write_params(K, run_dir, burnin=burnin, numreps=numreps)

    # `micromamba run -p <env>` executes the program inside the environment.
    launcher = [str(MAMBA), "run", "-p", str(ENV_PATH), "structure"]
    structure_args = [
        "-K", str(K),
        "-i", str(INPUT_STR),
        "-o", str(run_dir / "out"),
        "-m", str(run_dir / "mainparams"),
        "-e", str(run_dir / "extraparams"),
    ]
    argv = launcher + structure_args
    print(">>", " ".join(argv))
    proc = subprocess.run(argv, text=True, capture_output=True)

    # Persist the complete output before anything can raise.
    stdout_log = run_dir / "stdout.txt"
    stderr_log = run_dir / "stderr.txt"
    stdout_log.write_text(proc.stdout)
    stderr_log.write_text(proc.stderr)

    stdout_tail = proc.stdout.splitlines()[-20:]
    print("---- stdout tail ----")
    print("\n".join(stdout_tail))

    if proc.stderr.strip():
        stderr_tail = proc.stderr.splitlines()[-40:]
        print("---- stderr tail ----")
        print("\n".join(stderr_tail))

    if proc.returncode != 0:
        raise RuntimeError(f"STRUCTURE falló para K={K}. Revisa {stdout_log} y {stderr_log}")

    print(f"✅ Finished K={K} -> {run_dir}")

# Values of K to sweep. Runs are sequential; run_structure raises on a non-zero
# exit status, which stops the sweep at the first failing K.
K_values = [2, 3, 4, 5]  # change to [2,3,4,5,6,7,8,9,10] if you want
for K in K_values:
    run_structure(K)

print("🎉 Done.")

