In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
import pandas as pd
import re

In [None]:
def write_new_genbank_file(qseqids, gbfile, newfilename):
    """Cut the region spanned by the selected proteins out of a GenBank record.

    Only the first record of ``gbfile`` is used. Every feature that carries a
    ``protein_id`` whose accession (version suffix after the dot removed) is
    contained in ``qseqids`` selects its ``locus_tag``; all features sharing
    one of those locus tags are kept. The sequence between the leftmost start
    and the rightmost end of the kept features is written to ``newfilename``
    together with the kept features, whose coordinates are shifted so that
    the extracted region begins at position 0.

    Selection is by accession only, so an accession that occurs more than
    once in the record pulls in every copy and widens the extracted region.

    Args:
        qseqids: Iterable of protein accessions without version suffix.
        gbfile: GenBank input, handed unchanged to ``SeqIO.parse``.
        newfilename: Path of the GenBank file to create (opened with "w").

    Raises:
        Exception: On any failure (unreadable input, no record, no matching
            protein id, missing ``locus_tag`` on a matching feature, write
            error). The original error is attached as ``__cause__``.
    """
    try:
        print("[+] START writing new genbank file")
        records = list(SeqIO.parse(gbfile, 'genbank'))
        if not records:
            raise ValueError("no GenBank record found in {}".format(gbfile))
        gb = records[0]

        # Translate the requested protein accessions into locus tags so that
        # sibling features without a protein_id (e.g. 'gene') are kept too.
        wanted_ids = set(qseqids)
        locus_tags = set()
        for feature in gb.features:
            protein_ids = feature.qualifiers.get('protein_id')
            if protein_ids and protein_ids[0].split('.')[0] in wanted_ids:
                locus_tags.add(feature.qualifiers['locus_tag'][0])
        if not locus_tags:
            raise ValueError("none of the requested protein ids occur in the record")

        relevant_features = []
        for feature in gb.features:
            tags = feature.qualifiers.get('locus_tag')
            if tags and tags[0] in locus_tags:
                relevant_features.append(feature)

        # Bounds are taken as min/max instead of first/last so the result
        # does not depend on the order of features in the record.
        region_start = min(int(f.location.start) for f in relevant_features)
        region_end = max(int(f.location.end) for f in relevant_features)

        # FeatureLocation is 0-based and half-open, exactly like the slice
        # below, so subtracting the region start is the whole re-basing.
        # Each feature keeps its own span; compound locations are reduced to
        # their outer start/end.
        for feature in relevant_features:
            old_location = feature.location
            feature.location = FeatureLocation(
                int(old_location.start) - region_start,
                int(old_location.end) - region_start,
                old_location.strand,
            )

        seq_record = SeqRecord(id=gb.id,
                               name=gb.name,
                               description=gb.description,
                               annotations=gb.annotations,
                               letter_annotations=gb.letter_annotations,
                               seq=gb.seq[region_start:region_end])
        seq_record.features.extend(relevant_features)

        with open(newfilename, "w") as handle:
            SeqIO.write(seq_record, handle, 'genbank')
        print("[+] DONE writing")
    except Exception as e:
        raise Exception("[-] ERROR during generation of new genbank file for clinker and synteny plots with Exception : {}".format(e)) from e

In [None]:
def extract_gene_cluster(start, end, gbfilepath):
    """Collect the protein accessions lying between two boundary proteins.

    The features of the first record in ``gbfilepath`` are scanned in file
    order. Collection begins at the first feature whose ``protein_id``
    accession (version suffix after the dot removed) equals ``start`` or
    ``end`` and stops, inclusively, at the other boundary, so the two
    boundaries may be given in either order. Features without a
    ``protein_id`` are ignored.

    Args:
        start: Accession (no version suffix) of one boundary protein.
        end: Accession (no version suffix) of the other boundary protein.
        gbfilepath: GenBank input, handed unchanged to ``SeqIO.parse``.

    Returns:
        list[str]: Accessions from the first boundary through the second
        one. If the second boundary is never reached, everything from the
        first boundary to the end of the record; an empty list if neither
        boundary occurs.

    Raises:
        Exception: On any failure while reading or scanning the file; the
            original error is attached as ``__cause__``.
    """
    try:
        print("[*] START parsing genbank file")
        records = list(SeqIO.parse(gbfilepath, 'genbank'))
        if not records:
            raise ValueError("no GenBank record found in {}".format(gbfilepath))
        gb = records[0]

        boundaries = {start, end}
        closing_id = None  # boundary that ends the cluster once it is open
        qseqids = []
        for feature in gb.features:
            protein_ids = feature.qualifiers.get('protein_id')
            if not protein_ids:
                continue
            accession = protein_ids[0].split('.')[0]

            if closing_id is None:
                if accession not in boundaries:
                    continue
                qseqids.append(accession)
                if start == end:
                    # A single protein is both boundaries: cluster complete.
                    break
                closing_id = end if accession == start else start
            else:
                qseqids.append(accession)
                if accession == closing_id:
                    # Stop here so a later repeat of a boundary accession
                    # cannot re-open the cluster.
                    break
        print("[+] DONE parsing")
        return qseqids
    except Exception as e:
        raise Exception("[-] ERROR during extraction of genelocus defined by start : {} and end : {} protein ids with Exception : {}".format(start, end, e)) from e

In [None]:
# Example run: collect the protein accessions between two boundary proteins
# in curvibacter.gbk, then write that region to its own GenBank file.
gbfilepath = '../data/clinker_synteny/curvibacter.gbk'
start, end = "WP_198301847", 'WP_087496569'

# The same input file is used for both steps, so the path is defined once.
qseqids = extract_gene_cluster(start, end, gbfilepath)
write_new_genbank_file(qseqids, gbfilepath, 'curvibacter_operon.gbk')