In [1]:
from Bio import SeqIO as si
import csv
from collections import namedtuple
import re

## Create a dictionary containing seq IDs and their phenotypes

In [2]:
# Lookup table: gene name -> phenotype, one entry per row of the phenotype CSV.
name_phenos = {}
with open("../proc/phenotypes.csv", newline="") as infile:
    reader = csv.reader(infile)
    # The first CSV row holds the column headers; they become the namedtuple fields.
    header = next(reader)
    Data = namedtuple("Data", header)
    for row in reader:
        data = Data._make(row)
        # Only the "name" and "phenotype" columns are used; a repeated name
        # keeps the phenotype of its last row.
        name_phenos[data.name] = data.phenotype

## Append the phenotype info to the seq IDs (of the seq_record objects)

In [3]:
# Build the list of diagnostic-gene records whose ids carry the phenotype label.
diag_genes_list = []
# Try the longest (most specific) gene names first, so that e.g. "blaACT-10" is
# labelled via its own entry and not via the shorter "blaACT-1" it also contains.
# sorted() is stable, so equally long names keep their order from name_phenos.
keys_by_specificity = sorted(name_phenos, key=len, reverse=True)
for seq_record in si.parse("../proc/diagnostic_genes.fa", "fasta"):
    original_id = seq_record.id
    # This is necessary because ariba notes non-coding sequence ids with all "-":s converted to "_"
    curr_id = original_id.replace("-", "_")
    # Add d to demarcate that the id is a duplicate of another
    if re.search(" dupID$", seq_record.description):
        curr_id += "d"
    # Descriptions didn't bring anything new to the table so they are removed
    # This also makes parsing the fasta files easier
    seq_record.description = ""
    # Match gene names literally (they contain regex metacharacters such as ".")
    # and always against the untouched original id, never a partially rewritten one.
    matched_key = next(
        (key for key in keys_by_specificity if key in original_id), None
    )
    # Records without a known gene name keep their original id unchanged.
    if matched_key is not None:
        seq_record.id = curr_id + " |||" + name_phenos[matched_key]
    diag_genes_list.append(seq_record)

## Write a fasta file augmented with the phenotype info in the fasta headers

In [4]:
# Write every record in diag_genes_list to a single FASTA file.
# The call is the last expression of the cell, so its return value is what the
# cell displays (presumably the number of records written -- see Bio.SeqIO.write).
si.write(diag_genes_list, "../coding_non-coding.fa", "fasta")

578

## Do some sanity checks

In [5]:
# Display the first record to check that its id now ends with " |||<phenotype>".
diag_genes_list[0]

SeqRecord(seq=Seq('ATGAAACTGCTGCTCTACCCGTTATTGCTGTTCCTTGTCATTCCAGCCTTTGCC...TGA'), id='blaBEL_1 |||ESBL_A', name='blaBEL-1', description='', dbxrefs=[])

In [6]:
# Display the record at index 4 to inspect its rewritten id next to its unchanged name.
diag_genes_list[4]

SeqRecord(seq=Seq('ATGGTTAAAAAATCACTGCGTCAGTTCACGCTGATGGCGACGGCAACCGTCACG...TGA'), id='blaCTX_M_1_2d |||ESBL_A', name='blaCTX-M-1_2', description='', dbxrefs=[])

In [7]:
# Sanity check: look up one specific duplicate-flagged record by its rewritten id.
# Rewritten ids have every "-" in the gene name replaced by "_" (and a trailing
# "d" for duplicates), so the id to search for is "blaSHV_28_2d ...", not
# "blaSHV-28_2d ...".
for rec in diag_genes_list:
    if rec.id == "blaSHV_28_2d |||ESBL_A":
        print(rec)
        break
else:
    # Report a miss explicitly instead of leaving the cell silently empty.
    print("blaSHV_28_2d |||ESBL_A not found")

In [8]:
# Display the full gene-name -> phenotype mapping for a visual check.
name_phenos

{'blaACC-1': 'ESBL_M_(AmpC)',
 'blaACC-2': 'ESBL_M_(AmpC)',
 'blaACC-4': 'ESBL_M_(AmpC)',
 'blaACT-10': 'ESBL_M_(AmpC)',
 'blaACT-1': 'ESBL_M_(AmpC)',
 'blaACT-3': 'ESBL_M_(AmpC)',
 'blaACT-6': 'ESBL_M_(AmpC)',
 'blaACT-7': 'ESBL_M_(AmpC)',
 'blaACT-9': 'ESBL_M_(AmpC)',
 'blaBEL-1': 'ESBL_A',
 'blaBEL-2': 'ESBL_A',
 'blaBEL-3': 'ESBL_A',
 'blaCMY-10': 'ESBL_M_(AmpC)',
 'blaCMY-11': 'ESBL_M_(AmpC)',
 'blaCMY-12': 'ESBL_M_(AmpC)',
 'blaCMY-14': 'ESBL_M_(AmpC)',
 'blaCMY-16_1_FM995219': 'ESBL_M_(AmpC)',
 'blaCMY-16_2_FJ855437': 'ESBL_M_(AmpC)',
 'blaCMY-1': 'ESBL_M_(AmpC)',
 'blaCMY-2': 'ESBL_M_(AmpC)',
 'blaCMY-38': 'ESBL_M_(AmpC)',
 'blaCMY-3': 'ESBL_M_(AmpC)',
 'blaCMY-42_b': 'ESBL_M_(AmpC)',
 'blaCMY-42_CP023957': 'ESBL_M_(AmpC)',
 'blaCMY-42': 'ESBL_M_(AmpC)',
 'blaCMY-43': 'ESBL_M_(AmpC)',
 'blaCMY-44': 'ESBL_M_(AmpC)',
 'blaCMY-45': 'ESBL_M_(AmpC)',
 'blaCMY-49': 'ESBL_M_(AmpC)',
 'blaCMY-4': 'ESBL_M_(AmpC)',
 'blaCMY-50_FN645444.1': 'ESBL_M_(AmpC)',
 'blaCMY-51': 'ESBL_M_(AmpC)',
