# https://iprogramming.bacpop.org/print.html
# counting codons

# In [2]:
import time
import gzip

def read_fasta_sample(fp):
    """Yield ``(name, sequence)`` pairs from a FASTA-formatted line stream.

    Args:
        fp: Iterable of text lines (for example an open text-mode file).

    Yields:
        Tuples ``(name, sequence)`` where ``name`` is the header line with
        its leading ``>`` removed and ``sequence`` is the record's lines,
        stripped of surrounding whitespace, joined, and upper-cased.

    Lines that appear before the first header belong to no record and are
    discarded.
    """
    name, seq = None, []
    for line in fp:
        line = line.strip()
        if line.startswith(">"):
            # Compare against None rather than relying on truthiness so a
            # record with an empty header (a bare ">") is still emitted
            # instead of being silently dropped.
            if name is not None:
                yield (name, "".join(seq).upper())
            name, seq = line[1:], []
        else:
            seq.append(line)
    # Flush the final record, which has no following header to trigger it.
    if name is not None:
        yield (name, "".join(seq).upper())

def read_file(file_name):
    """Load every record from a plain or gzip-compressed FASTA file.

    All sequences must have the same length, and that length must be a
    multiple of three.

    Args:
        file_name: Path to the FASTA file.

    Returns:
        A tuple ``(seqs, names)`` of two parallel lists: the upper-cased
        sequences and their header names, in file order.

    Raises:
        RuntimeError: If a sequence's length is not a multiple of three, or
            differs from the length of the first sequence read.
    """
    # Detect gzip by its two-byte magic number rather than by file extension.
    with open(file_name, "rb") as test_f:
        zipped = test_f.read(2) == b"\x1f\x8b"
    opener = gzip.open if zipped else open

    seqs = []
    names = []
    with opener(file_name, "rt") as fasta:
        for h, s in read_fasta_sample(fasta):
            if len(s) % 3 != 0:
                raise RuntimeError(f"Sequence {h} is not a multiple of three")
            # The first accepted sequence fixes the expected length.
            if seqs and len(s) != len(seqs[0]):
                raise RuntimeError(f"Sequence {h} is length {len(s)}, expecting {len(seqs[0])}")
            seqs.append(s)
            names.append(h)
    return seqs, names

def main(file_name="/share/home/lsy_luzhen/software/Git/data/self-training/Python/materials/BIGSdb_024538_1190028856_31182.dna.aln.gz"):
    """Time how long ``read_file`` takes to load a FASTA file and print it.

    Args:
        file_name: Path of the FASTA file to read. Defaults to the
            alignment file this script was originally written against.
    """
    # perf_counter_ns is monotonic, so the measured interval cannot be
    # distorted by system clock adjustments the way wall-clock time can.
    start_t = time.perf_counter_ns()
    seqs, _names = read_file(file_name)
    end_t = time.perf_counter_ns()
    time_ms = (end_t - start_t) / 1000000  # nanoseconds -> milliseconds
    print(f"time to read {len(seqs)} samples: {time_ms} ms")

# Run the timing demo only when this file is executed directly, not on import.
if __name__== "__main__":
    main()


# Output: time to read 4889 samples: 120.72291 ms
