In [1]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation
import pandas as pd
import re

In [2]:
def write_new_genbank_file_ids(qseqids,gbfile,newfilename,ids):
    try:
        print("[*] START writing new genbank file")
        records = []
        for record in SeqIO.parse(gbfile,'genbank'):
            records.append(record)

        gb = records[ids]
        #print(len(records))
        locus_tags = []
        for feature in gb.features:
            if 'protein_id' in feature.qualifiers.keys():
                if feature.qualifiers['protein_id'][0].split('.')[0] in qseqids:
                    locus_tags.append(feature.qualifiers['locus_tag'][0])
        print("[*] Number of GENES in new GenBankFile: {}".format(len(locus_tags)))
        relevant_features = []
        for feature in gb.features:
            if 'locus_tag' in feature.qualifiers.keys():
                if feature.qualifiers['locus_tag'][0] in locus_tags:
                    relevant_features.append(feature)

        id_to_location = {}
        for feature in gb.features:
            if feature.type == 'gene' or feature.type == 'CDS':
                if feature.qualifiers['locus_tag'][0] in locus_tags:
                    id_to_location[feature.qualifiers['locus_tag'][0]] = [feature.location]
        qseq_location_dict = {}
        for index,qseq in enumerate(locus_tags):
            if index == 0:
                start = 1
                value = id_to_location[qseq][0].end - id_to_location[qseq][0].start + 1
                strand = id_to_location[qseq][0].strand
                end = value

                qseq_location_dict[qseq] = [start,end,strand]
            else:
                value = id_to_location[qseq][0].end - id_to_location[qseq][0].start
                #print(value)
                strand = id_to_location[qseq][0].strand
                new_start = end +(id_to_location[qseq][0].start - id_to_location[locus_tags[index-1]][0].end)
                end = new_start+value
                qseq_location_dict[qseq] = [new_start,end,strand]

        for feature in relevant_features:
            locus_tag = feature.qualifiers['locus_tag'][0]
            location = FeatureLocation(qseq_location_dict[locus_tag][0],qseq_location_dict[locus_tag][1],qseq_location_dict[locus_tag][2])
            feature.location = location

        sequence = gb.seq[id_to_location[locus_tags[0]][0].start:id_to_location[locus_tags[-1]][0].end]

        seq_record = SeqRecord(id=gb.id,
                           name=gb.name,
                           description=gb.description,
                           annotations=gb.annotations,
                           letter_annotations=gb.letter_annotations,
                           seq=sequence)

        for feature in relevant_features:

            seq_record.features.append(feature)

        with open(newfilename,"w") as gbfile:
            SeqIO.write(seq_record,gbfile,'genbank')
        print("[+] DONE writing")
    except Exception as e:
        raise Exception("[-] ERROR during generation of new genbank file for clinker and synteny plots with Exception : {}".format(e))

In [3]:
def extract_gene_cluster(start,end,gbfilepath):
    try:
        print("[*] START parsing genbank file")
        records = []
        for record in SeqIO.parse(gbfilepath,'genbank'):
            records.append(record)
        
        print("[*] Records Length is: {}".format(len(records)))
        locus_tags = []
        qseqids = []
        for ids,record in enumerate(records):
            gb = record
            switch = 0
            parser_switch = 0
            for feature in gb.features:
                if 'protein_id' in feature.qualifiers.keys():
                    if feature.qualifiers['protein_id'][0].split('.')[0] == start:
                        print("Found Start in Record: {}".format(ids))
                        switch = 1
                        locus_tags.append(feature.qualifiers['locus_tag'][0])
                        qseqids.append(start)
                        parser_switch = 1
                    elif feature.qualifiers['protein_id'][0].split('.')[0] == end:
                        print("Found End in Record: {}".format(ids))
                        switch = 0
                        qseqids.append(end)
                        locus_tags.append(feature.qualifiers['locus_tag'][0])
                    elif switch == 1:
                        qseqids.append(feature.qualifiers['protein_id'][0].split('.')[0])
                        locus_tags.append(feature.qualifiers['locus_tag'][0])
            if parser_switch == 1:
                print("[+] DONE parsing")
                return qseqids, ids
            else:
                continue
    except Exception as e:
        raise Exception("[-] ERROR during extraction of genelocus defined by start : {} and end : {} protein ids with Exception : {}".format(start, end, e))

In [4]:
def extract_gene_cluster_by_locus_tag(start,end,gbfilepath):
    try:
        print("[*] START parsing genbank file")
        records = []
        for record in SeqIO.parse(gbfilepath,'genbank'):
            records.append(record)
        
        print("[*] Records Length is: {}".format(len(records)))
        locus_tags = []
        qseqids = []
        for ids,record in enumerate(records):
            gb = record
            switch = 0
            parser_switch = 0
            for feature in gb.features:
                if 'locus_tag' in feature.qualifiers.keys():
                    if feature.qualifiers['locus_tag'][0] == start:
                        print("Found Start in Record: {}".format(ids))
                        switch = 1
                        locus_tags.append(feature.qualifiers['locus_tag'][0])
                        qseqids.append(start)
                        parser_switch = 1
                    elif feature.qualifiers['locus_tag'][0] == end:
                        print("Found End in Record: {}".format(ids))
                        switch = 0
                        qseqids.append(end)
                        locus_tags.append(feature.qualifiers['locus_tag'][0])
                    elif switch == 1:
                        qseqids.append(feature.qualifiers['locus_tag'][0].split('.')[0])
                        locus_tags.append(feature.qualifiers['locus_tag'][0])
            if parser_switch == 1:
                print("[+] DONE parsing")
                return qseqids, ids
            else:
                continue
    except Exception as e:
        raise Exception("[-] ERROR during extraction of genelocus defined by start : {} and end : {} protein ids with Exception : {}".format(start, end, e))

In [5]:
def write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,newfilename,ids):
    try:
        print("[*] START writing new genbank file")
        records = []
        for record in SeqIO.parse(gbfile,'genbank'):
            records.append(record)

        gb = records[ids]
        #print(len(records))
        locus_tags = []
        for feature in gb.features:
            if 'locus_tag' in feature.qualifiers.keys():
                if feature.qualifiers['locus_tag'][0] in qseqids and (feature.type == 'gene' or feature.type == 'CDS'):
                    locus_tags.append(feature.qualifiers['locus_tag'][0])
        print("[*] Number of GENES in new GenBankFile: {}".format(len(locus_tags)))
        relevant_features = []
        for feature in gb.features:
            if 'locus_tag' in feature.qualifiers.keys():
                if feature.qualifiers['locus_tag'][0] in locus_tags:
                    relevant_features.append(feature)

        id_to_location = {}
        for feature in gb.features:
            if feature.type == 'gene' or feature.type == 'CDS':
                if feature.qualifiers['locus_tag'][0] in locus_tags:
                    id_to_location[feature.qualifiers['locus_tag'][0]] = [feature.location]
        qseq_location_dict = {}
        for index,qseq in enumerate(locus_tags):
            if index == 0:
                start = 1
                value = id_to_location[qseq][0].end - id_to_location[qseq][0].start + 1
                strand = id_to_location[qseq][0].strand
                end = value

                qseq_location_dict[qseq] = [start,end,strand]
            else:
                value = id_to_location[qseq][0].end - id_to_location[qseq][0].start
                #print(value)
                strand = id_to_location[qseq][0].strand
                new_start = end +(id_to_location[qseq][0].start - id_to_location[locus_tags[index-1]][0].end)
                end = new_start+value
                qseq_location_dict[qseq] = [new_start,end,strand]

        for feature in relevant_features:
            locus_tag = feature.qualifiers['locus_tag'][0]
            location = FeatureLocation(qseq_location_dict[locus_tag][0],qseq_location_dict[locus_tag][1],qseq_location_dict[locus_tag][2])
            feature.location = location

        sequence = gb.seq[id_to_location[locus_tags[0]][0].start:id_to_location[locus_tags[-1]][0].end]

        seq_record = SeqRecord(id=gb.id,
                           name=gb.name,
                           description=gb.description,
                           annotations=gb.annotations,
                           letter_annotations=gb.letter_annotations,
                           seq=sequence)

        for feature in relevant_features:

            seq_record.features.append(feature)

        with open(newfilename,"w") as gbfile:
            SeqIO.write(seq_record,gbfile,'genbank')
        print("[+] DONE writing")
    except Exception as e:
        raise Exception("[-] ERROR during generation of new genbank file for clinker and synteny plots with Exception : {}".format(e))

In [20]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_gracilis.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_gracilis_clinker.gbk'
start = 'HGNGCGOJ_03019'
end = 'HGNGCGOJ_03049'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 73
Found Start in Record: 7
Found End in Record: 7
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 28
[+] DONE writing


In [6]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_gracilis.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/lps_cluster/curvibacter_gracilis.gbk'
start = 'HGNGCGOJ_03034'
end = 'HGNGCGOJ_03045'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 73
Found Start in Record: 7
Found End in Record: 7
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 12
[+] DONE writing


In [22]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_lanceolatus_1.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_lanceolatus_1_clinker.gbk'
start = 'JOPOJBGG_02301'
end = 'JOPOJBGG_02331'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 72
Found Start in Record: 4
Found End in Record: 4
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 28
[+] DONE writing


In [7]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_lanceolatus_1.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/lps_cluster/curvibacter_lanceolatus_1_clinker.gbk'
start = 'JOPOJBGG_02315'
end = 'JOPOJBGG_02324'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 72
Found Start in Record: 4
Found End in Record: 4
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 10
[+] DONE writing


In [23]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_lanceolatus_2.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_lanceolatus_2_clinker.gbk'
start = 'NFEFLOND_04396'
end = 'NFEFLOND_04426'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 100
Found Start in Record: 38
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 25
[+] DONE writing


In [14]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_lanceolatus_2.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/lps_cluster/curvibacter_lanceolatus_2_clinker.gbk'
start = 'NFEFLOND_04408'
end = 'NFEFLOND_04419'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 100
Found Start in Record: 38
Found End in Record: 38
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 10
[+] DONE writing


In [24]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/Curvibacter sp. GWA2_63_95.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_gwa2.gbk'
start = 'DFCICLJG_00801'
end = 'DFCICLJG_00831'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 40
Found Start in Record: 7
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 26
[+] DONE writing


In [16]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/Curvibacter sp. GWA2_63_95.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/lps_cluster/curvibacter_gwa2.gbk'
start = 'DFCICLJG_00788'
end = 'DFCICLJG_00800'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 40
Found Start in Record: 7
Found End in Record: 7
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 13
[+] DONE writing


In [15]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/Curvibacter sp. RIFCSPHIGHO2_12_FULL_63_18.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_rifcsphigho2.gbk'
start = 'EICPHKOE_01456'
end = 'EICPHKOE_01496'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 43
Found Start in Record: 19
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 36
[+] DONE writing


In [6]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/BG3.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_bg3.gbk'
start = 'BG3_03000'
end = 'BG3_03039'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 13
Found Start in Record: 10
Found Start in Record: 10
Found End in Record: 10
Found End in Record: 10
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 79
[+] DONE writing


In [30]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/Hma.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/lps_cluster/curvibacter_hma.gbk'
start = 'Hma_02877'
end = 'Hma_02886'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 3
Found Start in Record: 1
Found Start in Record: 1
Found End in Record: 1
Found End in Record: 1
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 20
[+] DONE writing


In [29]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/Hma.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_hma.gbk'
start = 'Hma_02855'
end = 'Hma_02889'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 3
Found Start in Record: 1
Found Start in Record: 1
Found End in Record: 1
Found End in Record: 1
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 70
[+] DONE writing


In [21]:

gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/Hvu.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_hvu.gbk'
start = 'Hvu_03069'
end = 'Hvu_03080'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 8
Found Start in Record: 7
Found Start in Record: 7
Found End in Record: 7
Found End in Record: 7
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 24
[+] DONE writing


In [10]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_sp.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_sp_1.gbk'
start = 'ALBLECJD_00552'
end = 'ALBLECJD_00559'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 59
Found Start in Record: 12
Found End in Record: 12
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 8
[+] DONE writing


In [11]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/Curvibacter sp. GWA2_63_95.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_gwa2_2.gbk'
start = 'DFCICLJG_00784'
end = 'DFCICLJG_00821'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 40
Found Start in Record: 7
Found End in Record: 7
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 37
[+] DONE writing


In [22]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/Curvibacter sp. GWA2_63_95.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/lps_cluster/curvibacter_gwa2_2.gbk'
start = 'DFCICLJG_00791'
end = 'DFCICLJG_00799'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 40
Found Start in Record: 7
Found End in Record: 7
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 9
[+] DONE writing


In [24]:
gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_sp_chrr16.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/lps_cluster/curvibacter_sp_chrr16.gbk'
start = 'ENNBNNNG_02890'
end = 'ENNBNNNG_02898'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 26
Found Start in Record: 5
Found End in Record: 5
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 9
[+] DONE writing


In [26]:


gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_sp_2.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/curvibacter_sp_2.gbk'
start = 'KANBEJMG_01703'
end = 'KANBEJMG_01717'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 600
Found Start in Record: 162
Found End in Record: 162
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 15
[+] DONE writing


In [27]:


gbfile = "../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/curvibacter_sp_2.gbk"
gbfilepath_output = '../data/curvibacter_dna_genomes_for_rec_blast/rec_blast_result/clinker/lps_cluster/curvibacter_sp_2.gbk'
start = 'KANBEJMG_01708'
end = 'KANBEJMG_01717'
qseqids, ids = extract_gene_cluster_by_locus_tag(start,end,gbfile)
write_new_genbank_file_by_locus_tags_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 600
Found Start in Record: 162
Found End in Record: 162
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 10
[+] DONE writing


# Result Processing RecBLAST - Other Symbionts

In [5]:
gbfile = "Duganella_sp_GN2-R2GCF_012849555.1.gbk"
gbfilepath_output = '../data/clinker_other_symbionts/duganella_gn2_r2.gbk'
start = 'WP_169433667'
end = 'WP_169433691'
qseqids, ids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 3
Found Start in Record: 0
Found End in Record: 0
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 34
[+] DONE writing


In [70]:
gbfile = "GCF_018474005.1_ASM1847400v1_genomic.gbff"
gbfilepath_output = '../data/clinker_other_symbionts/curvibacter_chrr_16.gbk'
start = 'WP_214123791'
end = 'WP_214123893'
qseqids, ids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 26
[+] DONE parsing
[+] DONE parsing
[+] DONE parsing
[+] DONE parsing
[+] DONE parsing
Found Start in Record: 5
Found End in Record: 5
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 25
[+] DONE writing


In [73]:
#WP_157256536
gbfile = "Pelomonas_sp_Root1217GCF_001425705.1.gbk"
gbfilepath_output = '../data/clinker_other_symbionts/Pelomonas_sp_Root1217.gbk'
start = 'WP_157256536'
end = 'WP_235538074'
qseqids, ids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 32
Found Start in Record: 29
Found End in Record: 29
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 30
[+] DONE writing


In [6]:
gbfile = "Undibacterium_parvumGCF_003955735.1.gbk"
gbfilepath_output = '../data/clinker_other_symbionts/undibacterium_parvum.gbk'
start = 'WP_126126332'
end = 'WP_126126355'
qseqids, ids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file_ids(qseqids,gbfile,gbfilepath_output, ids)

[*] START parsing genbank file
[*] Records Length is: 1
Found Start in Record: 0
Found End in Record: 0
[+] DONE parsing
[*] START writing new genbank file
[*] Number of GENES in new GenBankFile: 28
[+] DONE writing


# Result Processing RecBLAST - ALL

In [5]:
#example run
gbfile = "Curvibacter_sp_AEP1-3GCF_002163715.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Curvibacter_sp_AEP1-3.gbk'
start = "WP_087496534"#"WP_087496556" #WP_087496534
end = "WP_087496569"#'WP_087496564'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [12]:
gbfile = "Azoarcus_sp_M9-3-2GCF_010983895.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Azoarcus_sp_M9-3-2.gbk'
start = "WP_217424644"#"WP_173763925"
end = "WP_217424645"#'WP_173763933'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [10]:
gbfile = "Candidatus_Methylopumilus_turicensisGCF_000953015.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Candidatus_Methylopumilus_turicensis.gbk'
start = "WP_052661108"
end = 'WP_045752013'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [7]:
gbfile = "Candidatus_Symbiobacter_mobilis_CRGCF_000477435.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Candidatus_Symbiobacter_mobilis_CR.gbk'
start = "WP_022772778"#"WP_022772785"
end = "WP_022772814"#'WP_022772812'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [12]:
gbfile = "Chitinimonas_arcticaGCF_007431345.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Chitinimonas_arctica.gbk'
start = "WP_144278058"
end = 'WP_144278064'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [13]:
gbfile = "Dechloromonas_sp_HYN0024GCF_003441615.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Dechloromonas_sp_HYN0024.gbk'
start = "WP_117608894"
end = 'WP_117608899'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [18]:
gbfile = "Hydrogenophaga_sp_BA0156GCF_011388215.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Hydrogenophaga_sp_BA0156.gbk'
start = "WP_166222957"
end = 'WP_166222968'#'WP_166223006' #WP_166222968
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [19]:
gbfile = "Hydrogenophaga_sp_PBCGCF_000263795.2.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Hydrogenophaga_sp_PBC.gbk'
start = 'WP_009518443'#"WP_009518427" #WP_009518443
end = 'WP_009518449' #WP_009518449
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [11]:
gbfile = "Hydrogenophaga_sp_PBL-H3GCF_010104355.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Hydrogenophaga_sp_PBL.gbk'
start = "WP_201450059"#"WP_159604905" 
end = "WP_159604948"#'WP_159604918' #'WP_159604918' #epsD #WP_159604946 EpsA 
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [21]:
gbfile = "Janthinobacterium_sp_1_2014MBL_MicDivGCF_001865675.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Janthinobacterium_sp_1_2014MBL_MicDiv.gbk'
start = "WP_083411968" 
end = 'WP_071324847' 
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [22]:
gbfile = "Janthinobacterium_tructaeGCF_006517255.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Janthinobacterium_tructae.gbk'
start = "WP_141172142" 
end = 'WP_141172147' #WP_141172166 LuxR
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [23]:
gbfile = "Leptothrix_cholodnii_SP-6GCF_000019785.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Leptothrix_cholodnii_SP-6.gbk'
start = "WP_012347374" 
end = 'WP_012347392' 
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [24]:
gbfile = "Massilia_sp_NR_4-1GCF_001191005.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Massilia_sp_NR_4-1.gbk'
start = "WP_050408903" 
end = 'WP_050408907' 
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [25]:
gbfile = "Massilia_armeniacaGCF_003028855.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Massilia_armeniaca.gbk'
start = "WP_107144440" 
end = 'WP_107141327' 
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [26]:
gbfile = "Massilia_sp_WG5GCF_001412595.2.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Massilia_sp_WG5.gbk'
start = "WP_047823285" 
end = 'WP_047823267' 
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [14]:
gbfile = "Massilia_flavaGCF_009789595.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Massilia_flava.gbk'
start = "WP_145873161" 
end = "WP_145873322"#'WP_145873168' 
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [15]:
gbfile = "Methylobacillus_flagellatus_KTGCF_000013705.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Methylobacillus_flagellatus_KTG.gbk'
start = "WP_011480226"#"WP_011480243" 
end = "WP_195742185"#'WP_195741992' 
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [29]:
gbfile = "Methylophilus_sp_TWE2GCF_001183865.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Methylophilus_sp_TWE2.gbk'
start = "WP_049638639" 
end = 'WP_049638646'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [30]:
gbfile = "Methylotenera_mobilis_JLW8GCF_000023705.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Methylotenera_mobilis_JLW8.gbk'
start = "WP_015832731" 
end = 'WP_015832740'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [31]:
gbfile = "Methyloversatilis_sp_RAC08GCF_001713355.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Methyloversatilis_sp_RAC08.gbk'
start = "WP_069037629" 
end = 'WP_223300229'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [10]:
gbfile = "Methylovorus_sp_MP688GCF_000183115.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Methylovorus_sp_MP688.gbk'
start = "WP_013442598"#"WP_013442610" 
end = "WP_013442624"#'WP_193373555'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [9]:
gbfile = "Nitrosospira_multiformis_ATCC_25196GCF_000196355.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Nitrosospira_multiformis_ATCC_25196.gbk'
start = "WP_011379597"#"WP_011379600" 
end = 'WP_041352320'#'WP_041352730'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [34]:
gbfile = "Oryzomicrobium_terraeGCF_008274805.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Oryzomicrobium_terrae.gbk'
start = "WP_223115922" 
end = 'WP_149425636'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [35]:
gbfile = "Sulfuriferula_sp_AH1GCF_002162035.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Sulfuriferula_sp_AH1.gbk'
start = "WP_087447933" 
end = 'WP_087447940'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [36]:
gbfile = "Thauera_sp_K11GCF_002354895.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Thauera_sp_K11.gbk'
start = "WP_096448401" 
end = 'WP_096448413'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [37]:
gbfile = "Thiobacillus_denitrificans_ATCC_25259GCF_000012745.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Thiobacillus_denitrificans_ATCC_25259.gbk'
start = "WP_011312303" 
end = 'WP_187147213'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [13]:
gbfile = "Undibacterium_parvumGCF_003955735.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Undibacterium_parvum.gbk'
start = "WP_126126332"#"WP_126126347" 
end = "WP_126126355"#'WP_157984300' 
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing


In [8]:
gbfile = "Xylophilus_rhododendriGCF_009906855.1.gbk"
gbfilepath_output = '../data/clinker_38_organisms/Xylophilus_rhododendri.gbk'
start = "WP_160553782"#"WP_160553790" 
end = "WP_160553806"#'WP_160555442'
qseqids = extract_gene_cluster(start,end,gbfile)
write_new_genbank_file(qseqids,gbfile,gbfilepath_output)

[*] START parsing genbank file
[+] DONE parsing
[*] START writing new genbank file
[+] DONE writing
