In [2]:
# start coding here
# files
# `snakemake` is not defined anywhere in this notebook; it is presumably
# injected by a Snakemake `notebook:` rule -- TODO confirm.
input_ss_dis = snakemake.input.ss_dis  # source file read by create_dict()
output_ss_dis = snakemake.output.reformatted_ss_dis  # JSON file written by create_dict()
# Intermediate file that create_dict() writes (the input with every space
# replaced by "L") and then parses.
_tmp = "tmp/ss_dis_interim.txt"

# sourced: https://alexwlchan.net/2018/12/iterating-in-fixed-size-chunks/
import itertools


def chunked_iterable(iterable, size):
    """Lazily split ``iterable`` into consecutive tuples of ``size`` items.

    Args:
        iterable: Any iterable; it is consumed exactly once, front to back.
        size: Maximum number of items per tuple (passed to
            ``itertools.islice`` as the stop value).

    Yields:
        tuple: The next ``size`` items. The final tuple is shorter when the
        number of items is not a multiple of ``size``; an empty tuple is
        never yielded.
    """
    source = iter(iterable)

    def next_chunk():
        return tuple(itertools.islice(source, size))

    # Two-argument iter() keeps calling next_chunk() until it returns the
    # sentinel -- here the empty tuple, which signals the source is exhausted.
    yield from iter(next_chunk, ())

In [3]:
## Overall changes
# moved the secstr_disorder_merge function into this file
# changed hardcoded paths to variables (defined from snakemake in the cell above)

# %load scripts/ss_dis_convert.py
import json
import os

from Bio import SeqIO


def secstr_disorder_merge(secstr, disorder):
    """Overlay disorder markers onto a secondary-structure string.

    Every position of ``secstr`` is visited in order: where ``disorder``
    holds an "X" at the same index the output gets "X", otherwise the
    ``secstr`` character is kept unchanged. No other substitution is done
    here.

    Args:
        secstr: Secondary-structure characters, one per position.
        disorder: Disorder annotation, indexed in parallel with ``secstr``.
            Positions beyond ``len(secstr)`` are ignored.

    Returns:
        str: The merged annotation, the same length as ``secstr``.

    Raises:
        IndexError: If ``disorder`` is shorter than ``secstr``.
    """
    return ''.join(
        "X" if disorder[position] == "X" else symbol
        for position, symbol in enumerate(secstr)
    )

def create_dict():
    """Convert the ss_dis FASTA-style file into a JSON dictionary.

    Works entirely through the module-level names ``input_ss_dis``,
    ``output_ss_dis`` and ``_tmp``; it takes no arguments and returns None.

    Steps:
        1. Copy ``input_ss_dis`` to ``_tmp`` with every space replaced by
           "L", counting the header lines (those starting with ">") on the
           way.
        2. Parse ``_tmp`` with ``SeqIO.parse(..., "fasta")`` three records
           at a time; each group is treated as (sequence, secstr, disorder).
        3. Merge secstr and disorder with ``secstr_disorder_merge`` and store
           ``(sequence, merged)`` under a key made by concatenating the
           first two ":"-separated fields of the first record's id.
        4. Dump the dictionary to ``output_ss_dis`` as indented JSON (the
           tuples are serialised as two-element arrays).

    Raises:
        ValueError: If the number of records is not a multiple of three.
        IndexError: If a record id contains no ":" separator.
    """
    print(f"Starting to read {input_ss_dis} and write temporary file {_tmp}...")

    # The temporary file lives in a hard-coded relative directory that may not
    # exist yet (e.g. on a fresh checkout); create it instead of failing in
    # open(). The guard covers a _tmp without any directory component.
    tmp_dir = os.path.dirname(_tmp)
    if tmp_dir:
        os.makedirs(tmp_dir, exist_ok=True)

    # Replace all the spaces in the file with "L" so that no information goes
    # missing when parsing through the file. Both handles sit in one `with`
    # so they are closed even if the copy fails part-way through.
    record_count = 0
    with open(input_ss_dis, "r") as in_file, open(_tmp, "w") as tmp_file:
        for line in in_file:
            # Only a leading ">" marks a FASTA header line.
            if line.startswith(">"):
                record_count += 1
            tmp_file.write(line.replace(" ", "L"))
    print("Success")

    print("Now to parse through the whole thing....")
    # The records are read in a single lazy pass, three at a time, instead of
    # parsing the file twice and building an intermediate record dictionary.
    # NOTE(review): this relies on every entry appearing as consecutive
    # sequence / secstr / disorder records; that ordering is assumed, not
    # checked here -- confirm against the input format.
    expected_entries = record_count // 3  # loop-invariant, used for progress only
    data = dict()
    triplet_records = chunked_iterable(iterable=SeqIO.parse(_tmp, "fasta"), size=3)
    for i, triplet in enumerate(triplet_records):
        # A short final chunk means the file is truncated or malformed; report
        # that explicitly instead of failing on tuple unpacking.
        if len(triplet) != 3:
            raise ValueError(
                f"Expected records in groups of three (sequence, secstr, disorder), "
                f"but the group starting at record {3 * i + 1} has only "
                f"{len(triplet)} record(s)"
            )
        primary, secstruct, disorder = triplet
        # Trailing spaces plus "\r" overwrite the previous progress line.
        print(f"Coverting record {i+1} of {expected_entries} records {' '*8}", end="\r")
        merge = secstr_disorder_merge(secstr=str(secstruct.seq), disorder=str(disorder.seq))
        # Key = first two ":"-separated id fields joined without a separator.
        id_fields = primary.id.split(":")
        new_id = id_fields[0] + id_fields[1]
        data[new_id] = str(primary.seq), merge

    print("Success")

    print("Finally to create the JSON File...")
    with open(output_ss_dis, "w") as file:
        json.dump(data, file, indent=4)
    print("Finished!")






In [4]:
%%timeit -n 1 -r 1
# Run the whole conversion. The %%timeit flags above (-n 1 -r 1) make the cell
# execute exactly once while still reporting how long it took.

create_dict()
    