In [None]:
import random
from cyvcf2 import VCF
import polars as pl

In [None]:
# Read every genome name listed in the halStats summary file, then keep only
# the genomes that should be treated as seabird samples.
genome_names = pl.read_csv("seabird_alignment_halstats", skip_lines=4)['GenomeName'].to_list()

# Name prefixes that are dropped:
#   Anc                 - presumably the reconstructed ancestor genomes (confirm)
#   c90                 - subantarctic islands; already covered by the regular population
#   a9                  - a9 is already present in the SNP data
#   Megadyptesantipodes - removed explicitly
excluded_prefixes = ("Anc", "c90", "a9", "Megadyptesantipodes")
seabirds = [
    name
    for name in genome_names
    if name is not None and not name.startswith(excluded_prefixes)
]

# Sanity check: a9 must not have survived the filtering
assert "a9" not in seabirds, "a9 should not be in seabirds"
len(seabirds)


In [None]:

# Genus-level name prefixes used to pick the penguin genomes out of `seabirds`
penguin_prefixes = (
    "Aptenodytes",   # king & emperor
    "Spheniscus",    # banded penguins
    "Pygoscelis",    # brush‑tails
    "Eudyptula",     # little penguins
    "Eudyptes"       # crested penguins (includes Eudyptesmoseleyi)
)

# str.startswith accepts the whole tuple, so one pass over the list is enough
penguins = list(filter(lambda name: name.startswith(penguin_prefixes), seabirds))
print(penguins)


In [None]:
# aDNA
def extract_sites(filename):
    """Lazily scan a tab-separated site table and keep only diploid genotype calls.

    Args:
        filename: Path to a TSV file that contains a ``modern_GT`` column.

    Returns:
        The lazily scanned table filtered to rows whose ``modern_GT`` is one
        of ``0/0``, ``0/1``, ``1/0`` or ``1/1``; any other call (e.g.
        hemizygous sites) is dropped.
    """
    diploid_calls = ["0/0", "0/1", "1/0", "1/1"]
    sites = pl.scan_csv(filename, separator="\t", low_memory=True)
    return sites.filter(pl.col("modern_GT").is_in(diploid_calls))

# Build the filtered (still lazy) site tables for both aDNA samples, keeping
# only rows whose `reason` is 'lifted_and_genotyped'.
lifted_and_genotyped = pl.col('reason') == 'lifted_and_genotyped'
waitaha_sites = extract_sites('adna/waitaha/waitaha.tsv').filter(lifted_and_genotyped)
richdalei_sites = extract_sites('adna/richdalei/richdalei.tsv').filter(lifted_and_genotyped)

In [None]:
# Let's collect and then save the dataframes, to speed up downstream processing.
# NOTE(review): both names are rebound from the lazy query to its collected
# result, so re-running this cell without first re-running the cell that
# builds the lazy tables will presumably fail on the second .collect().
waitaha_sites = waitaha_sites.collect()
richdalei_sites = richdalei_sites.collect()


In [None]:
# Inspect the column names of the richdalei site table
richdalei_sites.columns

In [None]:
# Both tables have the same column names, so give each sample's genotype
# column its own name before joining.
# NOTE(review): after this cell runs, `modern_GT` no longer exists in either
# table, so the cell cannot simply be re-run on its own output.
waitaha_sites = waitaha_sites.rename({"modern_GT": "waitaha_GT"})
richdalei_sites = richdalei_sites.rename({"modern_GT": "richdalei_GT"})

# Find intersecting sites - inner join on modern_chrom and modern_pos, i.e.
# only sites present in BOTH tables are kept.
intersecting_sites = waitaha_sites.join(richdalei_sites, on=["modern_chrom", "modern_pos"], how="inner")


In [None]:
# Display the joined table of sites shared by waitaha and richdalei
intersecting_sites

In [None]:
# Persist the joined table and both per-sample tables as Parquet files
parquet_outputs = [
    (intersecting_sites, 'adna/intersecting_sites.parquet'),
    (waitaha_sites, 'adna/waitaha/waitaha_filtered.parquet'),
    (richdalei_sites, 'adna/richdalei/richdalei_filtered.parquet'),
]
for frame, parquet_path in parquet_outputs:
    frame.write_parquet(parquet_path)


# Use saved data

In [None]:
# Open the saved files (skip to this cell if we have already saved them).
# scan_parquet is used, so all three names are bound to lazy scans here and
# must be .collect()-ed before their rows can be read.
waitaha_sites = pl.scan_parquet('adna/waitaha/waitaha_filtered.parquet')
richdalei_sites = pl.scan_parquet('adna/richdalei/richdalei_filtered.parquet')
intersecting_sites = pl.scan_parquet('adna/intersecting_sites.parquet')

In [None]:
def _count_genotype(sites, gt_column, genotype):
    """Count the rows of a lazy site table whose `gt_column` equals `genotype`."""
    return sites.filter(pl.col(gt_column) == genotype).collect().shape[0]


# Genotype tallies for waitaha. The calls are not phased, so "1/0" is not
# expected, but counting it is a sanity check that costs nothing.
waitaha_00 = _count_genotype(waitaha_sites, "waitaha_GT", "0/0")
print(f"Waitaha 0/0: {waitaha_00}")
waitaha_01 = _count_genotype(waitaha_sites, "waitaha_GT", "0/1")
print(f"Waitaha 0/1: {waitaha_01}")
waitaha_11 = _count_genotype(waitaha_sites, "waitaha_GT", "1/1")
print(f"Waitaha 1/1: {waitaha_11}")
waitaha_10 = _count_genotype(waitaha_sites, "waitaha_GT", "1/0")
print(f"Waitaha 1/0: {waitaha_10}")

# The same tallies for richdalei
richdalei_00 = _count_genotype(richdalei_sites, "richdalei_GT", "0/0")
print(f"Richdalei 0/0: {richdalei_00}")
richdalei_01 = _count_genotype(richdalei_sites, "richdalei_GT", "0/1")
print(f"Richdalei 0/1: {richdalei_01}")
richdalei_11 = _count_genotype(richdalei_sites, "richdalei_GT", "1/1")
print(f"Richdalei 1/1: {richdalei_11}")
richdalei_10 = _count_genotype(richdalei_sites, "richdalei_GT", "1/0")
print(f"Richdalei 1/0: {richdalei_10}")

#richdalei_00 = richdalei_sites.filter(pl.col("GT") == "0/0").collect().shape[0]
#print(f"Waitaha 0/0: {waitaha_00}, Richdalei 0/0: {richdalei_00}")

In [None]:
# ==========
# Parameters
# ==========
# Settings read by the VCF-loading, sampling and export cells further down.
input_vcf = "merged.a9.filtered.qual99_fmissing0.2.maf0.05.biallelic.bcf"   # path to your filtered, biallelic VCF/BCF
num_snps = 200                 # how many SNPs to randomly sample (also the minimum the VCF must contain)
num_replicates = 10             # how many replicates to generate
output_prefix = "random_snps_adna_200"   # prefix for the output NEXUS / FASTA files
random_seed = None              # set to an integer for reproducible results, e.g. 42

In [None]:
# Seed Python's `random` module only when a seed was configured; with
# random_seed left as None the generator is not seeded here.
if random_seed is not None:
    random.seed(random_seed)

In [None]:
# ============================
# 1. Read in the VCF variants
# ============================
vcf = VCF(input_vcf)
records = list(vcf)  # keep every variant in memory
samples = vcf.samples

# One row per variant: coordinates plus the REF allele and the first ALT
# allele (None when the record has no ALT).
site_columns = {"chrom": [], "pos": [], "VCF_ref": [], "VCF_alt": []}
for rec in records:
    site_columns["chrom"].append(rec.CHROM)
    site_columns["pos"].append(rec.POS)
    site_columns["VCF_ref"].append(rec.REF)
    site_columns["VCF_alt"].append(rec.ALT[0] if rec.ALT else None)
vcf_sites = pl.DataFrame(site_columns)

print(f"Loaded {len(records)} variants from {input_vcf}.")
print(f"Samples in VCF: {samples}")

# Refuse to continue when the VCF holds fewer variants than requested
if not len(records) >= num_snps:
    raise ValueError(
        f"ERROR: The VCF has only {len(records)} variants, "
        f"but you requested {num_snps}."
    )

# Append the two aDNA samples to the sample list (in place)
samples += ["waitaha", "richdalei"]

In [None]:
# Inspect the column names of the waitaha site table
waitaha_sites.columns

In [None]:
# Restrict the VCF sites to those that are also present in the table of sites
# shared by both aDNA samples (inner join on chromosome + position).
intersection = vcf_sites.join(
    intersecting_sites.collect(),
    left_on=["chrom", "pos"],
    right_on=["modern_chrom", "modern_pos"],
    how="inner"
)

In [None]:
# Number of VCF sites that are also covered by both aDNA samples
len(intersection)

In [None]:
# Reduce the joined table to a list of {"CHROM": ..., "POS": ...} dicts.
# NOTE(review): `intersection` is rebound from a table to a plain list here,
# so this cell presumably cannot be run twice in a row.
intersection = intersection.select((pl.col("chrom").alias("CHROM"), pl.col("pos").alias("POS"))).to_dicts()

In [None]:
# Materialise the lazily scanned site tables; later cells filter them and
# index the result by column name, which needs collected data.
# NOTE(review): the names are rebound from lazy to collected frames, so
# re-running this cell on its own output will presumably fail.
waitaha_sites = waitaha_sites.collect()
richdalei_sites = richdalei_sites.collect()

In [None]:
# See if there's a SNP
# halSnps --tsv snps --refSequence ptg000057l --start 63660 --length 100 /mnt/data/seabirds.hal a9 Eudyptesmoseleyi_genomic

# If no SNP, see if it's a reference allele or missing sequence
# halAlignmentDepth /mnt/data/seabirds.hal a9 --targetGenomes Eudyptesmoseleyi_genomic --refSequence ptg000057l --start 63660 --length 100

# hal snp or missing function

hal_file = "/mnt/data/seabirds.hal"

def hal_snp_or_missing(chrom, pos, samples):
    """Classify each target genome at one a9 position using the HAL alignment.

    ``halSnps`` is run for the single base at ``chrom:pos`` and its TSV output
    (``snps.tmp`` in the working directory) is read back; a genome that has a
    column with a non-null first value gets that value, upper-cased. For the
    genomes still unresolved, ``halAlignmentDepth`` is run and, when the
    reported depth is at least 1, they are all marked ``"REF"``.

    Args:
        chrom: Reference sequence name in the a9 genome.
        pos: 1-based position; converted to the 0-based ``--start`` value.
        samples: Target genome names.

    Returns:
        A dict mapping every sample to the upper-cased halSnps value, to
        ``"REF"``, or to ``None`` when neither step resolved it. Returns
        ``None`` (not a dict) when a HAL command fails or cannot be started.
    """
    import subprocess

    hal_bin_dir = "/mnt/data/development/hoiho/wga/cactus/cactus-bin-v2.9.7/bin"

    samples_state = {sample: None for sample in samples}

    # HAL tools take a 0-based start coordinate
    start = pos - 1

    # Commands are passed as argument lists (no shell), so contig and genome
    # names are never interpreted by a shell.
    snp_command = [
        f"{hal_bin_dir}/halSnps",
        "--tsv", "snps.tmp",
        "--refSequence", str(chrom),
        "--start", str(start),
        "--length", "1",
        hal_file, "a9", ",".join(samples),
    ]
    try:
        # The tool's own output is not needed, only its exit status
        subprocess.run(snp_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing or non-executable binary, which the shell
        # used to turn into a non-zero exit status.
        print(f"Error running command: {e}")
        return None

    snp_data = pl.read_csv("snps.tmp", separator="\t")
    for sample in samples:
        if sample in snp_data.columns:
            # The column named after the sample holds its value at this site
            state = snp_data[sample].to_list()
            if len(state) > 0 and state[0] is not None:
                samples_state[sample] = state[0].upper()

    # Only the genomes halSnps did not resolve are passed to the depth query.
    # (The previous version queried ALL samples, so a genome resolved by
    # halSnps could make the depth >= 1 for the unresolved ones.)
    missing_samples = [sample for sample, state in samples_state.items() if state is None]
    if missing_samples:
        missing_command = [
            f"{hal_bin_dir}/halAlignmentDepth",
            hal_file, "a9",
            "--targetGenomes", ",".join(missing_samples),
            "--refSequence", str(chrom),
            "--start", str(start),
            "--length", "1",
            "--noAncestors",
        ]
        try:
            depth_out = subprocess.run(missing_command, check=True, capture_output=True, text=True)
        except (subprocess.CalledProcessError, OSError) as e:
            print(f"Error running command: {e}")
            return None

        # The first stdout line is skipped; the next one is read as the depth.
        depth_lines = depth_out.stdout.strip().split("\n")[1:]
        # NOTE(review): a single depth value is used for all unresolved
        # genomes together. If it counts how many of them align, depth >= 1
        # only shows that at least one does - confirm whether a per-genome
        # query is needed before trusting "REF" for each of them.
        if depth_lines and int(depth_lines[0]) >= 1:
            for sample in missing_samples:
                samples_state[sample] = "REF"
        # Otherwise (no depth line, or depth 0) leave them as None

    return samples_state

In [None]:
# Collect and display the table of sites shared by both aDNA samples
intersecting_sites.collect()

# Long-running: skip this section and use the saved data (if available)

In [None]:
# One dict per site: the coordinates plus, for every seabird genome, a
# "<sample>_GT" entry of "0/0" (REF), "1/1" (ALT) or None (missing / third allele).
cactus_samples_rows = []

for row in intersecting_sites.collect().iter_rows(named=True):
    chrom = row['modern_chrom']
    pos = row['modern_pos']

    # Look the site up in the VCF first: it is cheap, and without REF/ALT the
    # HAL states cannot be converted, so the HAL calls would be wasted.
    vcf_matches = vcf_sites.filter(
        (pl.col("chrom") == chrom) & (pl.col("pos") == pos)
    ).select(["VCF_ref", "VCF_alt"]).to_dicts()
    if not vcf_matches:
        print(f"WARNING: {chrom}:{pos} is not in the VCF. Skipping.")
        continue
    vcf_ref = vcf_matches[0]['VCF_ref']
    vcf_alt = vcf_matches[0]['VCF_alt']

    sample_states = hal_snp_or_missing(chrom, pos, seabirds)
    if sample_states is None:
        # The HAL query failed (already reported); no row is added for this site
        continue

    new_row = {
        "modern_chrom": chrom,
        "modern_pos": pos,
    }

    for sample, state in sample_states.items():
        if state == "REF":
            # Aligned but no SNP reported: the genome carries the reference
            # allele. Resolve it BEFORE the allele check below, otherwise the
            # literal "REF" is neither REF nor ALT and would be nulled.
            new_row[sample] = vcf_ref
            state = vcf_ref

        # Anything that is not the VCF REF or ALT allele (a third allele, "N",
        # or no data at all) is treated as missing.
        if state is None:
            genotype = None
        elif state == vcf_ref:
            genotype = "0/0"
        elif state == vcf_alt:
            genotype = "1/1"
        else:
            genotype = None

        new_row[f"{sample}_GT"] = genotype

    cactus_samples_rows.append(new_row)

# Create a new dict to turn into a dataframe for joining later
#cactus_samples_df = pl.DataFrame(cactus_samples_rows)
#cactus_samples_df

In [None]:
# Turn the per-site dicts into a table. infer_schema_length is raised to 20000
# rows - presumably so that columns which are null or absent in the first rows
# still get a proper type (confirm).
cactus_samples_df = pl.DataFrame(cactus_samples_rows, infer_schema_length=20000)
cactus_samples_df

In [None]:
# Save cactus_samples_df since it takes a lot of hal calls to generate
# cactus_samples_df.write_parquet('adna/cactus_samples.parquet')

# Use saved data

In [None]:
# Load the previously generated per-site genotype table.
# NOTE(review): the write_parquet call that would create this file is
# commented out in the cell above, so the file must already exist on disk.
cactus_samples_df = pl.read_parquet('adna/cactus_samples.parquet')

In [None]:
# Keep only the rows of cactus_samples_df where >= 80% of the seabird
# genomes have a genotype (a non-null "<sample>_GT" value).
num_samples = len(seabirds)
# Round the threshold UP using integer arithmetic. int(num_samples * 0.8)
# truncates, which lets through rows that are just under 80% (e.g. 26 of 33).
min_non_null = (num_samples * 4 + 4) // 5  # == ceil(0.8 * num_samples)
cactus_samples_df = cactus_samples_df.with_columns(
    pl.sum_horizontal([pl.col(f"{sample}_GT").is_not_null() for sample in seabirds]).alias("num_non_null")
)
cactus_samples_df = cactus_samples_df.filter(pl.col("num_non_null") >= min_non_null)

cactus_samples_df

In [None]:
# "ptg000057l"	99576
# vcf_sites.filter(pl.col("chrom") == "ptg000057l").filter(pl.col("pos") == 99576).select(["VCF_ref", "VCF_alt"]).to_dicts()[0]

In [None]:
num_replicates = 1

# IUPAC ambiguity symbol for each unordered pair of distinct bases. Keying on
# a frozenset makes the lookup independent of which allele is REF and which
# is ALT (the old if/elif chain only covered one ordering).
_IUPAC_HET = {
    frozenset("AC"): "M",
    frozenset("AG"): "R",
    frozenset("AT"): "W",
    frozenset("CG"): "S",
    frozenset("CT"): "Y",
    frozenset("GT"): "K",
}
# aDNA genotype string -> integer code
_ADNA_GT_CODES = {"0/0": "0", "0/1": "1", "1/0": "1", "1/1": "2"}
# (allele 1, allele 2) indices from the VCF -> integer code
_VCF_GT_CODES = {(0, 0): "0", (0, 1): "1", (1, 0): "1", (1, 1): "2"}


def _het_msa_symbol(ref_allele, alt_allele):
    """Return the FASTA symbol for a REF/ALT heterozygote.

    When one allele is 'N' the other base is returned; any pair that has no
    two-base IUPAC code (e.g. a missing or multi-base allele) yields 'N'.
    """
    if ref_allele == "N" or alt_allele == "N":
        known = alt_allele if ref_allele == "N" else ref_allele
        return known if known in ("A", "C", "G", "T") else "N"
    return _IUPAC_HET.get(frozenset((ref_allele, alt_allele)), "N")


def _adna_code(sites, gt_column, chrom, pos):
    """Return '0'/'1'/'2' for an aDNA sample at chrom:pos, or '-' when the
    site is absent from `sites` or its genotype is not a diploid 0/1 call."""
    gt_list = sites.filter(
        (pl.col("modern_chrom") == chrom) & (pl.col("modern_pos") == pos)
    )[gt_column].to_list()
    return _ADNA_GT_CODES.get(gt_list[0], "-") if gt_list else "-"


# Site table and genotype column for each aDNA sample
_adna_tables = {
    "waitaha": (waitaha_sites, "waitaha_GT"),
    "richdalei": (richdalei_sites, "richdalei_GT"),
}
# Hoisted out of the loops; only used for membership tests
_vcf_sample_names = set(vcf.samples)

# NOTE(review): `chosen_sites` is not assigned anywhere in this cell (the
# sampling lines that used to set it are commented out below). The loop
# expects dicts with "CHROM" and "POS" keys - presumably the `intersection`
# list built earlier, or a random sample of it. Confirm and assign it
# explicitly before relying on this cell after a kernel restart.

for rep_index in range(1, num_replicates + 1):
    snps_used = 0       # sites for which a VCF record was found
    sites_written = 0   # columns actually appended to every sequence
    print(f"Generating replicate {rep_index} of {num_replicates} ...")

    # 2a. Randomly sample indices from possible sites
    # chosen_indices = random.sample(range(len(possible_sites)), num_snps)
    # chosen_sites = possible_sites[chosen_indices]

    sample2seq = {sample: [] for sample in samples}
    sample2seq_msa = {sample: [] for sample in samples}

    # 2c. Process each chosen site
    for site in chosen_sites:
        chrom, pos = site['CHROM'], site['POS']
        if chrom is None or pos is None:
            # Skip just this site (the old `break` silently dropped every
            # remaining site as well).
            print(f"WARNING: Skipping site with None values: {site}")
            continue

        # Query exactly this position and insist on a matching POS: a bare
        # "chrom:pos" region is open-ended, so its first record may lie
        # further along the contig. A local name is used so the global
        # `records` list from the VCF-loading cell is not overwritten.
        record = next(
            (rec for rec in vcf(f"{chrom}:{pos}-{pos}") if rec.POS == pos),
            None,
        )

        if record is None:
            print(f"WARNING: No record found for {chrom}:{pos}. Skipping.")
            # Append a missing data placeholder for all samples for this site
            for sample in samples:
                sample2seq[sample].append('-')
                sample2seq_msa[sample].append('N')
            sites_written += 1
            continue

        # For FASTA, we need ref_allele and alt_allele
        ref_allele = record.REF
        alt_allele = record.ALT[0] if record.ALT else None
        genotypes = record.genotypes

        snps_used += 1
        sites_written += 1

        for i, sample in enumerate(samples):
            code = '-'  # Default to missing
            if sample in _adna_tables:
                sites_table, gt_column = _adna_tables[sample]
                code = _adna_code(sites_table, gt_column, record.CHROM, record.POS)
            elif sample in _vcf_sample_names:
                # `samples` is the VCF sample list with the aDNA names
                # appended, so index i lines up with record.genotypes here.
                code = _VCF_GT_CODES.get((genotypes[i][0], genotypes[i][1]), '-')

            sample2seq[sample].append(code)

            # And do the same for MSA - convert back to a base (or the IUPAC
            # ambiguity symbol for heterozygotes)
            if code == '0':
                msa_code = ref_allele
            elif code == '1':
                msa_code = _het_msa_symbol(ref_allele, alt_allele)
            elif code == '2':
                msa_code = alt_allele
            else:
                msa_code = '-'

            sample2seq_msa[sample].append(msa_code)

    # 2d. Write NEXUS for this replicate
    out_nexus = f"{output_prefix}_rep{rep_index}.nex"
    try:
        with open(out_nexus, "w") as out_f:
            # Header
            out_f.write("#NEXUS\n")
            out_f.write("[SNP matrix in integer format: 0=homREF, 1=het, 2=homALT, -=missing]\n\n")
            out_f.write("Begin data;\n")
            # nchar must equal the number of columns actually written, which
            # is not necessarily num_snps.
            out_f.write(f"\tDimensions ntax={len(samples)} nchar={sites_written};\n")
            out_f.write('\tFormat datatype=integerdata symbols="012" gap=-;\n')
            out_f.write("\tMatrix\n")

            # Matrix lines
            for sample in samples:
                seq_str = "".join(sample2seq[sample])
                out_f.write(f"{sample}\t{seq_str}\n")

            out_f.write("\t;\nEnd;\n")
        print(f"  -> Wrote replicate {rep_index} NEXUS: {out_nexus}")
    except IOError as e:
        print(f"Error writing to file {out_nexus}: {e}")

    # 2e. Write FASTA for this replicate
    out_fasta = f"{output_prefix}_rep{rep_index}.fasta"
    try:
        with open(out_fasta, "w") as out_f:
            for sample in samples:
                seq_str = "".join(sample2seq_msa[sample])
                out_f.write(f">{sample}\n{seq_str}\n")
        print(f"  -> Wrote replicate {rep_index} FASTA: {out_fasta}")
    except IOError as e:
        print(f"Error writing to file {out_fasta}: {e}")

    # Explicitly clear the dictionary to free memory, though it will be
    # garbage collected at the start of the next iteration anyway.
    del sample2seq

print(f"All replicates done! SNPs used: {snps_used}")

In [None]:
# Look up a single site (ptg000015l:13069202) in waitaha_sites
waitaha_sites.filter(
    (pl.col("modern_chrom") == "ptg000015l") & 
    (pl.col("modern_pos") == 13069202)
)



In [None]:
# See if there's a SNP
# halSnps --tsv snps --refSequence ptg000057l --start 63660 --length 100 /mnt/data/seabirds.hal a9 Eudyptesmoseleyi_genomic

# If no SNP, see if it's a reference allele or missing sequence
# halAlignmentDepth /mnt/data/seabirds.hal a9 --targetGenomes Eudyptesmoseleyi_genomic --refSequence ptg000057l --start 63660 --length 100

In [None]:
# Earlier version of the replicate generator: samples `num_snps` sites from
# `possible_sites`, fetches the matching VCF records and writes one NEXUS
# file per replicate.
# NOTE(review): `possible_sites` is not defined in any visible cell, and the
# column names used here ('chrom', 'pos(1‑based)', 'GT') differ from the
# modern_chrom / modern_pos / *_GT names used by the other cells - presumably
# this cell targets an older table layout. Confirm before running.
for rep_index in range(1, num_replicates + 1):
    print(f"Generating replicate {rep_index} of {num_replicates} ...")

    # 2a. Randomly sample SNPs from possible sites (waitaha sites, since it has the least)
    # chosen_records = random.sample(records, num_snps)
    # Get indices of random sites, we can't use the sample function on Polars DataFrame directly
    chosen_indices = random.sample(range(len(possible_sites)), num_snps)
    chosen_sites = possible_sites[chosen_indices]
    chosen_records = []
    for site in chosen_sites.iter_rows(named=True):
        chrom, pos = site['chrom'], site['pos(1‑based)']
        # Find the record that matches this chrom and pos
        record = vcf(f"{chrom}:{pos}")
        # Record is now a generator, let's get the first one
        record = next(record, None)  # Get the first record or None if not found

        print(record)

        if record is None:
            print(f"WARNING: No record found for {chrom}:{pos}. Skipping.")
            continue
        # Check if the record matches the chrom and pos; a record at any other
        # position is dropped without a warning
        if record.CHROM == chrom and record.POS == pos:
            chosen_records.append(record)

    # 2b. Prepare a structure to hold the integer-coded genotype for each sample
    sample2seq = {sample: [] for sample in samples}

    # 2c. Fill sequence data (0,1,2,-)
    for record in chosen_records:
        genotypes = record.genotypes  # [ [g1, g2, phased, ...], [g1, g2, ...], ...]
        for i, sample in enumerate(samples):

            # aDNA samples are looked up in their own site tables
            if sample in ["waitaha", "richdalei"]:
                loc = [record.CHROM, record.POS]
                if sample == "waitaha":
                    gt_list = waitaha_sites.filter(
                        (pl.col("chrom") == loc[0]) & (pl.col("pos(1‑based)") == loc[1])
                    )['GT'].to_list()
                else:  # sample == "richdalei"
                    gt_list = richdalei_sites.filter(
                        (pl.col("chrom") == loc[0]) & (pl.col("pos(1‑based)") == loc[1])
                    )['GT'].to_list()
                if gt_list:
                    gt = gt_list[0]
                    if gt == "0/0":
                        code = "0"
                    elif gt in ["0/1", "1/0"]:
                        code = "1"
                    elif gt == "1/1":
                        code = "2"
                    else:
                        code = "-"
                else:
                    # Site not present in the aDNA table => missing
                    code = "-"
            elif sample in vcf.samples:
                g1, g2 = genotypes[i][0], genotypes[i][1]

                # Missing genotype => '-'
                if g1 < 0 or g2 < 0:
                    code = '-'
                else:
                    # 0 = REF, 1 = ALT for each allele
                    # Biallelic, so valid combos: (0,0), (0,1)/(1,0), (1,1)
                    if g1 == 0 and g2 == 0:
                        code = '0'
                    elif (g1 == 0 and g2 == 1) or (g1 == 1 and g2 == 0):
                        code = '1'
                    elif g1 == 1 and g2 == 1:
                        code = '2'
                    else:
                        # Shouldn't happen in a well-filtered biallelic VCF,
                        # but just in case, treat as missing:
                        code = '-'

            sample2seq[sample].append(code)

    # 2d. Write NEXUS for this replicate
    out_nexus = f"{output_prefix}_rep{rep_index}.nex"
    with open(out_nexus, "w") as out_f:
        # Header
        out_f.write("#NEXUS\n")
        out_f.write("[SNP matrix in integer format: 0=homREF, 1=het, 2=homALT, -=missing]\n\n")
        out_f.write("Begin data;\n")
        # NOTE(review): nchar is written as num_snps, but each sequence has
        # len(chosen_records) characters, which is smaller whenever a site was
        # skipped above.
        out_f.write(f"\tDimensions ntax={len(samples)} nchar={num_snps};\n")
        out_f.write('\tFormat datatype=integerdata symbols="012" gap=-;\n')
        out_f.write("\tMatrix\n")

        # Matrix lines
        for sample in samples:
            seq_str = "".join(sample2seq[sample])
            out_f.write(f"{sample}\t{seq_str}\n")

        out_f.write("\t;\nEnd;\n")

    print(f"  -> Wrote replicate {rep_index} NEXUS: {out_nexus}")

print("All replicates done!")

In [None]:
# Replicate generator that samples `num_snps` sites from `possible_sites`,
# fetches the matching VCF records and writes one NEXUS file per replicate.
# NOTE(review): `possible_sites` is not defined in any visible cell, and the
# column names used here ('chrom', 'pos(1‑based)', 'GT') differ from the
# modern_chrom / modern_pos / *_GT names used by the other cells - presumably
# this cell targets an older table layout. Confirm before running.
for rep_index in range(1, num_replicates + 1):
    print(f"Generating replicate {rep_index} of {num_replicates} ...")

    # 2a. Randomly sample SNPs from possible sites (waitaha sites, since it has the least)
    # chosen_records = random.sample(records, num_snps)
    # Get indices of random sites, we can't use the sample function on Polars DataFrame directly
    chosen_indices = random.sample(range(len(possible_sites)), num_snps)
    chosen_sites = possible_sites[chosen_indices]
    chosen_records = []
    for site in chosen_sites.iter_rows(named=True):
        chrom, pos = site['chrom'], site['pos(1‑based)']
        # Find the record that matches this chrom and pos
        record = vcf(f"{chrom}:{pos}")
        # Record is now a generator, let's get the first one
        record = next(record, None)  # Get the first record or None if not found

        print(record)

        if record is None:
            print(f"WARNING: No record found for {chrom}:{pos}. Skipping.")
            continue
        # Check if the record matches the chrom and pos; a record at any other
        # position is dropped without a warning
        if record.CHROM == chrom and record.POS == pos:
            chosen_records.append(record)

    # 2b. Prepare a structure to hold the integer-coded genotype for each sample
    sample2seq = {sample: [] for sample in samples}

    # 2c. Fill sequence data (0,1,2,-)
    for record in chosen_records:
        genotypes = record.genotypes  # [ [g1, g2, phased, ...], [g1, g2, ...], ...]
        for i, sample in enumerate(samples):

            # aDNA samples are looked up in their own site tables
            if sample in ["waitaha", "richdalei"]:
                loc = [record.CHROM, record.POS]
                if sample == "waitaha":
                    gt_list = waitaha_sites.filter(
                        (pl.col("chrom") == loc[0]) & (pl.col("pos(1‑based)") == loc[1])
                    )['GT'].to_list()
                else:  # sample == "richdalei"
                    gt_list = richdalei_sites.filter(
                        (pl.col("chrom") == loc[0]) & (pl.col("pos(1‑based)") == loc[1])
                    )['GT'].to_list()
                if gt_list:
                    gt = gt_list[0]
                    if gt == "0/0":
                        code = "0"
                    elif gt in ["0/1", "1/0"]:
                        code = "1"
                    elif gt == "1/1":
                        code = "2"
                    else:
                        code = "-"
                else:
                    # Site not present in the aDNA table => missing
                    code = "-"
            elif sample in vcf.samples:
                g1, g2 = genotypes[i][0], genotypes[i][1]

                # Missing genotype => '-'
                if g1 < 0 or g2 < 0:
                    code = '-'
                else:
                    # 0 = REF, 1 = ALT for each allele
                    # Biallelic, so valid combos: (0,0), (0,1)/(1,0), (1,1)
                    if g1 == 0 and g2 == 0:
                        code = '0'
                    elif (g1 == 0 and g2 == 1) or (g1 == 1 and g2 == 0):
                        code = '1'
                    elif g1 == 1 and g2 == 1:
                        code = '2'
                    else:
                        # Shouldn't happen in a well-filtered biallelic VCF,
                        # but just in case, treat as missing:
                        code = '-'

            sample2seq[sample].append(code)

    # 2d. Write NEXUS for this replicate
    out_nexus = f"{output_prefix}_rep{rep_index}.nex"
    with open(out_nexus, "w") as out_f:
        # Header
        out_f.write("#NEXUS\n")
        out_f.write("[SNP matrix in integer format: 0=homREF, 1=het, 2=homALT, -=missing]\n\n")
        out_f.write("Begin data;\n")
        # NOTE(review): nchar is written as num_snps, but each sequence has
        # len(chosen_records) characters, which is smaller whenever a site was
        # skipped above.
        out_f.write(f"\tDimensions ntax={len(samples)} nchar={num_snps};\n")
        out_f.write('\tFormat datatype=integerdata symbols="012" gap=-;\n')
        out_f.write("\tMatrix\n")

        # Matrix lines
        for sample in samples:
            seq_str = "".join(sample2seq[sample])
            out_f.write(f"{sample}\t{seq_str}\n")

        out_f.write("\t;\nEnd;\n")

    print(f"  -> Wrote replicate {rep_index} NEXUS: {out_nexus}")

print("All replicates done!")

In [None]:
# See if there's a SNP
# halSnps --tsv snps --refSequence ptg000057l --start 63660 --length 100 /mnt/data/seabirds.hal a9 Eudyptesmoseleyi_genomic

# If no SNP, see if it's a reference allele or missing sequence
# halAlignmentDepth /mnt/data/seabirds.hal a9 --targetGenomes Eudyptesmoseleyi_genomic --refSequence ptg000057l --start 63660 --length 100

In [None]:
# Do per contig: sample num_snps random VCF records per replicate and write
# one NEXUS file per contig that received at least one SNP.

# Only genotypes stored in the VCF are used here, so iterate over the VCF's
# own sample list. The shared `samples` list may also contain the aDNA names,
# which have no column in record.genotypes and would raise an IndexError.
vcf_sample_names = list(vcf.samples)
# (allele 1, allele 2) -> integer code; anything else (missing alleles are
# negative, unexpected allele indices) is written as '-'.
gt_pair_to_code = {(0, 0): '0', (0, 1): '1', (1, 0): '1', (1, 1): '2'}

for rep_index in range(1, num_replicates + 1):
    print(f"Generating replicate {rep_index} of {num_replicates} ...")

    # 2a. Randomly sample SNPs
    # NOTE(review): this needs `records` to still be the in-memory list of
    # variants built when the VCF was loaded.
    chosen_records = random.sample(records, num_snps)

    # Sort the records by contig and position
    chosen_records.sort(key=lambda r: (r.CHROM, r.start))

    # 2b/2c. Collect the integer-coded genotypes (0,1,2,-) per contig and per
    # sample. Keeping each contig's sequences in its own dict ties every SNP
    # to its contig directly; the previous version sliced one long sequence
    # using counts iterated in set order, which does not have to match the
    # sorted record order.
    contig2sample2seq = {}
    for record in chosen_records:
        if record.CHROM not in contig2sample2seq:
            contig2sample2seq[record.CHROM] = {sample: [] for sample in vcf_sample_names}
        per_sample = contig2sample2seq[record.CHROM]

        genotypes = record.genotypes  # [ [g1, g2, phased, ...], [g1, g2, ...], ...]
        for i, sample in enumerate(vcf_sample_names):
            allele_pair = (genotypes[i][0], genotypes[i][1])
            per_sample[sample].append(gt_pair_to_code.get(allele_pair, '-'))

    # Number of SNPs that landed on each contig
    contig2num_snps = {
        contig: len(next(iter(per_sample.values()), []))
        for contig, per_sample in contig2sample2seq.items()
    }

    # 2d. Write NEXUS for this replicate per contig
    for contig, per_sample in contig2sample2seq.items():
        snpcount = contig2num_snps[contig]
        if snpcount == 0:
            continue
        out_nexus = f"{output_prefix}_rep{rep_index}_{contig}.nex"

        with open(out_nexus, "w") as out_f:
            # Header
            out_f.write("#NEXUS\n")
            out_f.write("[SNP matrix in integer format: 0=homREF, 1=het, 2=homALT, -=missing]\n\n")
            out_f.write("Begin data;\n")
            out_f.write(f"\tDimensions ntax={len(vcf_sample_names)} nchar={snpcount};\n")
            out_f.write('\tFormat datatype=integerdata symbols="012" gap=-;\n')
            out_f.write("\tMatrix\n")

            # Matrix lines
            for sample in vcf_sample_names:
                seq_str = "".join(per_sample[sample])
                out_f.write(f"{sample}\t{seq_str}\n")

            out_f.write("\t;\nEnd;\n")

        print(f"  -> Wrote replicate {rep_index} NEXUS: {out_nexus}")


print("All replicates done!")

In [None]:
from cyvcf2 import VCF

print("Generating SNP dataset from contig ptg000423c ...")

# Re-open the BCF and pull every record on the target contig.
# Note that this rebinds the notebook-level `vcf`, `records` and `samples`.
vcf = VCF(input_vcf)
records = list(vcf("ptg000423c"))

# Order the SNPs by position
chosen_records = sorted(records, key=lambda rec: rec.POS)

# One list of integer codes per sample
samples = vcf.samples
sample2seq = {sample: [] for sample in samples}

# (allele 1, allele 2) -> code; 0 = REF, 1 = ALT. Any other pair, including a
# missing (negative) allele, is written as '-'.
genotype_codes = {(0, 0): '0', (0, 1): '1', (1, 0): '1', (1, 1): '2'}

# Fill sequence data (0,1,2,-)
for record in chosen_records:
    calls = record.genotypes  # [ [g1, g2, phased, ...], [g1, g2, ...], ...]
    for idx, sample in enumerate(samples):
        allele_pair = (calls[idx][0], calls[idx][1])
        sample2seq[sample].append(genotype_codes.get(allele_pair, '-'))

# Write NEXUS file
out_nexus = f"{output_prefix}.nex"
header_lines = [
    "#NEXUS\n",
    "[SNP matrix in integer format: 0=homREF, 1=het, 2=homALT, -=missing]\n\n",
    "Begin data;\n",
    f"\tDimensions ntax={len(samples)} nchar={len(chosen_records)};\n",
    '\tFormat datatype=integerdata symbols="012" gap=-;\n',
    "\tMatrix\n",
]
with open(out_nexus, "w") as out_f:
    out_f.writelines(header_lines)

    # Matrix lines: sample name, a tab, then its code string
    for sample in samples:
        seq_str = "".join(sample2seq[sample])
        out_f.write(f"{sample}\t{seq_str}\n")

    out_f.write("\t;\nEnd;\n")

print(f"  -> Wrote NEXUS file: {out_nexus}")
print("All SNP data processing done!")
