In [5]:
!conda list # solely to document the conda environment

In [6]:
# Resolve the file paths for this step from the `snakemake` object; the three
# names below are later passed to main() as input, output and log paths.
# NOTE(review): `snakemake` is not defined anywhere in this notebook --
# presumably it is injected by Snakemake's notebook integration; confirm
# before running the notebook standalone.
pdb_seqs = snakemake.input[0]  # first input entry
protein_seqs = snakemake.output.protein_seqs  # output entry named "protein_seqs"
log = snakemake.log[0]  # first log entry

In [7]:
import time

from Bio import SeqIO

def main(all_seqs_inpath, protein_seqs_outpath, log):
    """Copy only the protein records of a FASTA file into a new FASTA file.

    Every record's description line is split on whitespace and its second
    token is taken as the molecule type (headers are presumably of the form
    ``<pdbID> <molecule> <length> <name ...>``). Records whose molecule type
    is exactly ``"mol:protein"`` are written to the output file; all other
    records are dropped. A three-line summary of the counts is then written
    to the log file.

    Parameters
    ----------
    all_seqs_inpath : str or path-like
        FASTA file to read.
    protein_seqs_outpath : str or path-like
        FASTA file to write; created or overwritten.
    log : str or path-like
        Text file that receives the summary; created or overwritten.

    Returns
    -------
    None

    Notes
    -----
    * A record whose description has fewer than two whitespace-separated
      tokens has no molecule field; it is counted as non-protein rather than
      aborting the run part-way through writing the output.
    * An input with no records produces an empty output file and a log
      reporting zero records.
    """
    protein_record_count = 0
    nonprotein_record_count = 0

    # Stream the input record by record so the whole file never has to be
    # held in memory.
    with open(all_seqs_inpath, "r") as in_f, open(protein_seqs_outpath, "w") as out_f:
        for record in SeqIO.parse(in_f, "fasta"):
            fields = record.description.split()
            # Second token is the molecule type; tolerate truncated headers.
            molecule = fields[1] if len(fields) > 1 else None

            if molecule == "mol:protein":
                SeqIO.write(record, out_f, "fasta")
                protein_record_count += 1
            else:
                nonprotein_record_count += 1

    # Derive the total from the two counters instead of the loop index, which
    # is undefined when the input contains no records.
    total_record_count = protein_record_count + nonprotein_record_count

    # Write a quick summary.
    with open(log, "w") as f:
        f.write(
            f"Removed {nonprotein_record_count} non-protein records\n"
            f"Retained {protein_record_count} protein records\n"
            f"Parsed a total of {total_record_count} records\n"
        )
            

In [8]:
%%timeit -r 1 -n 1
# Run the protein filter on the paths resolved earlier in the notebook
# (timings were recorded on a Toshiba PC).
main(
    all_seqs_inpath=pdb_seqs,
    protein_seqs_outpath=protein_seqs,
    log=log,
)