In [None]:
import polars as pl
from needletail import parse_fastx_file, NeedletailError, reverse_complement, normalize_seq

In [None]:
# Path of the assembly FASTA whose records are measured.
# NOTE: the constant name is misspelled ("ASESMBLY"); it is kept as-is because
# the loading cell refers to it by this exact name.
ASESMBLY_FASTA_FILE = "../a9_genome_masked.fa"
# Records shorter than this many bases are left out of the karyotype.
MIN_LENGTH = 10_000_000
# File the karyotype lines are written to.
OUTPUT_FILENAME = "hoiho.karyotype.txt"
# Modulus applied to each sequence's 1-based rank when building its "c<N>" colour label.
TOTAL_COLORS = 6

In [None]:
# Map each record id in the assembly to the length of its sequence.
chr_lengths = {}

try:
    for record in parse_fastx_file(ASESMBLY_FASTA_FILE):
        chr_lengths[record.id] = len(record.seq)
except NeedletailError as err:
    # The input is a FASTA assembly, so report it as such and include the
    # path and the parser's own message to make the failure diagnosable.
    print(f"Invalid FASTA file {ASESMBLY_FASTA_FILE!r}: {err}")
    # Re-raise: carrying on would leave chr_lengths empty or only partially
    # filled, and the later cells would build a karyotype from incomplete data.
    raise

In [None]:
# Keep only the sequences whose length reaches the MIN_LENGTH threshold,
# then report how many survived the filter.
long_chrs = dict(
    filter(lambda entry: entry[1] >= MIN_LENGTH, chr_lengths.items())
)
print(f"Number of chromosomes >= {MIN_LENGTH:,} bp: {len(long_chrs)}")

In [None]:
# Summary statistics (longest, mean, total) for the retained sequences.
# Every figure falls back to 0 when nothing passed the length filter, so this
# cell still runs on an empty selection instead of max() raising ValueError.
total_length = sum(long_chrs.values())
max_length = max(long_chrs.values(), default=0)
mean_length = total_length / len(long_chrs) if long_chrs else 0
print(f"Max length: {max_length:,} bp")
print(f"Mean length: {mean_length:,.2f} bp")
print(f"Total length: {total_length:,} bp")

In [None]:
# Rank the retained sequences from longest to shortest (ties keep their
# original relative order).
ordered_chrs = {
    name: long_chrs[name]
    for name in sorted(long_chrs, key=long_chrs.get, reverse=True)
}

# Write one karyotype line per sequence, laid out as
#   chr - <name> <rank> 0 <length> c<rank % TOTAL_COLORS>
# for example:
#
# chr - S1 1 0 155644563 c1
# chr - S2 2 0 123023803 c2
# chr - S3 3 0 101869369 c3
# chr - S4 4 0 89082643 c4
# chr - S5 5 0 86460390 c5
# etc..
#
# NOTE(review): rank % TOTAL_COLORS produces "c0" whenever the rank is a
# multiple of TOTAL_COLORS - confirm the colour set used with this file
# defines c0.
with open(OUTPUT_FILENAME, "w") as f:
    f.writelines(
        f"chr - {chr_name} {i} 0 {length} c{i % TOTAL_COLORS}\n"
        for i, (chr_name, length) in enumerate(ordered_chrs.items(), start=1)
    )