# Gather sequences into FASTA files based on a sequence ID list

In [1]:
from Bio import SeqIO as si
import csv
from collections import namedtuple
import re

## Read in a list of IDs of non-coding sequences

In [2]:
# Collect the first column of every row in the ID list file.
with open("../proc/non-coding.txt", newline="") as infile:
    reader = csv.reader(infile)
    # csv.reader yields an empty list for a blank line; indexing it with
    # row[0] would raise IndexError, so such rows are skipped.
    nonc_ids = [row[0] for row in reader if row]

In [3]:
# Peek at the first ID and count how many IDs were read.
nonc_ids[0], len(nonc_ids)

('PVL_EU368820', 117)

In [4]:
# Show the complete list of IDs for visual inspection.
print(nonc_ids)

['PVL_EU368820', 'PVL_FJ821791', 'PVL_FJ895584', 'PVL_HM584701', 'PVL_HM584703', 'PVL_HM584704', 'PVL_HM584706', 'PVL_HM584708', 'PVL_JN635504', 'PVL_JN635506', 'PVL_JN635507', 'PVL_JN635508', 'PVL_JN635510', 'VanB_5_NG_048333_1', 'VanB_9_NG_048339_1', 'VanC_10_NG_048343_1', 'VanC_11_NG_048345_1', 'VanC_12_NG_048354_1', 'VanC_13_NG_048355_1', 'VanC_14_NG_048356_1', 'VanC_15_NG_048357_1', 'VanC_16', 'VanC_17', 'VanC_1_NG_048344_1', 'VanC_2_NG_048346_1', 'VanC_3_NG_048347_1', 'VanC_4_NG_048348_1', 'VanC_5_NG_048349_1', 'VanC_6_NG_048350_1', 'VanC_7_NG_048351_1', 'VanC_8_NG_048352_1', 'VanC_9_NG_048353_1', 'blaCMY_12', 'blaCMY_3', 'blaCTX_M_106', 'blaCTX_M_107_JF274244.1', 'blaCTX_M_108_JF274245.1', 'blaCTX_M_109_JF274248', 'blaCTX_M_110', 'blaKPC_9', 'blaPER_5', 'blaSHV_123_GQ390805', 'blaSHV_126_GQ390808', 'blaSHV_152', 'blaSHV_153', 'blaSHV_163', 'blaSHV_165', 'blaTEM_118_AY130285.1', 'blaTEM_199', 'blaTEM_205', 'blaTEM_21', 'blaTEM_22', 'blaTEM_42_X98047.1', 'blaTEM_89_AY039040', 'erm

## Read in all sequences

In [5]:
# Parse the combined FASTA file, then index the parsed records in a dictionary.
fasta_records = si.parse("../final/coding_non-coding.fa", "fasta")
seqs = si.to_dict(fasta_records)

In [6]:
# Number of entries in the `seqs` dictionary.
len(seqs)

641

In [7]:
# List the keys under which the records are stored in `seqs`.
seqs.keys()

dict_keys(['blaBEL_1', 'blaBEL_2', 'blaBEL_3', 'blaCTX_M_1', 'blaCTX_M_1_2d', 'blaCTX_M_10', 'blaCTX_M_101', 'blaCTX_M_102', 'blaCTX_M_103', 'blaCTX_M_104', 'blaCTX_M_105', 'blaCTX_M_106', 'blaCTX_M_11', 'blaCTX_M_110', 'blaCTX_M_111', 'blaCTX_M_112', 'blaCTX_M_113', 'blaCTX_M_114', 'blaCTX_M_116', 'blaCTX_M_117', 'blaCTX_M_12', 'blaCTX_M_121', 'blaCTX_M_123', 'blaCTX_M_124', 'blaCTX_M_126', 'blaCTX_M_13', 'blaCTX_M_131', 'blaCTX_M_132', 'blaCTX_M_134', 'blaCTX_M_136', 'blaCTX_M_139', 'blaCTX_M_14', 'blaCTX_M_14_2d', 'blaCTX_M_142', 'blaCTX_M_14b', 'blaCTX_M_14b_2d', 'blaCTX_M_15', 'blaCTX_M_15_2d', 'blaCTX_M_15_3d', 'blaCTX_M_16', 'blaCTX_M_17', 'blaCTX_M_17_2d', 'blaCTX_M_19', 'blaCTX_M_2', 'blaCTX_M_20', 'blaCTX_M_21', 'blaCTX_M_22', 'blaCTX_M_22_2d', 'blaCTX_M_23', 'blaCTX_M_24', 'blaCTX_M_24_2d', 'blaCTX_M_24_3d', 'blaCTX_M_25', 'blaCTX_M_26', 'blaCTX_M_27', 'blaCTX_M_28', 'blaCTX_M_29', 'blaCTX_M_3', 'blaCTX_M_3_2d', 'blaCTX_M_30', 'blaCTX_M_31', 'blaCTX_M_32', 'blaCTX_M_33', 'bl

## Form a list of non-coding sequences

... while removing them from the dictionary containing all sequences.

In [8]:
# Move every listed record out of `seqs` and into `non_coding`, keeping the
# order of `nonc_ids`.
#
# Validate before mutating: popping as we go would leave `seqs` partly
# emptied if an unknown ID turned up midway through the list.
missing_ids = [seq_id for seq_id in nonc_ids if seq_id not in seqs]
if missing_ids:
    raise KeyError("IDs not present in seqs: " + ", ".join(missing_ids))

non_coding = []
# The loop variable is `seq_id` rather than `id` so the builtin `id()` is not
# shadowed for the cells that follow.
for seq_id in nonc_ids:
    # An ID listed twice still raises KeyError here on its second occurrence.
    non_coding.append(seqs.pop(seq_id))

## Write coding sequences

In [9]:
# Everything still held in `seqs` at this point is written out as the coding set.
coding = [*seqs.values()]
si.write(coding, "../final/coding.fa", "fasta")

524

## Write non-coding sequences

In [10]:
# Write the records collected in `non_coding` to their own FASTA file.
si.write(non_coding, "../final/non-coding.fa", "fasta")

117

## Do some sanity checks

In [11]:
# The two lengths should agree: one record is collected per listed ID.
len(nonc_ids), len(non_coding)

(117, 117)

In [12]:
# First and last IDs from the list, for comparison with the collected records.
nonc_ids[0], nonc_ids[-1]

('PVL_EU368820', 'rrs_WT_intraC_chim')

In [13]:
# First and last collected records; their ids should match nonc_ids[0] and
# nonc_ids[-1] respectively.
non_coding[0], non_coding[-1]

(SeqRecord(seq=Seq('ATGGTCAAAAAAAGACTATTAGCTGCAACATTGTCGTTAGGAATAATCACTCCT...TAA'), id='PVL_EU368820', name='PVL_EU368820', description='PVL_EU368820 |||PVL', dbxrefs=[]),
 SeqRecord(seq=Seq('GCCTGTGTGCAGGTGGTGCATGGCTGTCGTCAGCTCGTGTCGTGAGATGTTGGG...ATC'), id='rrs_WT_intraC_chim', name='rrs_WT_intraC_chim', description='rrs_WT_intraC_chim |||Wildtype rrs', dbxrefs=[]))