In [1]:
pip install pyfaidx

Note: you may need to restart the kernel to use updated packages.


In [8]:
from pyfaidx import Fasta

genes = Fasta('GRCh38_latest_genomic.fna')
genes

Fasta("GRCh38_latest_genomic.fna")

In [9]:
genes.keys()

odict_keys(['NC_000001.11', 'NT_187361.1', 'NT_187362.1', 'NT_187363.1', 'NT_187364.1', 'NT_187365.1', 'NT_187366.1', 'NT_187367.1', 'NT_187368.1', 'NT_187369.1', 'NC_000002.12'])

In [10]:
genes['NT_187361.1'][200:230]

>NT_187361.1:201-230
AGCATTAGATTGCGAAGTTCTATTGATTGT

In [11]:
genes['NT_187361.1'][200:230].seq

'AGCATTAGATTGCGAAGTTCTATTGATTGT'

In [12]:
genes['NT_187361.1'][200:230].name

'NT_187361.1'

In [13]:
genes['NT_187361.1'][200:230].start

201

In [14]:
genes['NT_187361.1'][200:230].end

230

In [15]:
genes['NT_187361.1'][200:230].fancy_name

'NT_187361.1:201-230'

In [16]:
len(genes['NT_187361.1'])

175055

Indexes like a list 

In [20]:
genes[0][:50]

>NC_000001.11:1-50
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

In [18]:
genes['NT_187361.1'][200:230][::-1]

>NT_187361.1:230-201
TGTTAGTTATCTTGAAGCGTTAGATTACGA

In [19]:
genes['NT_187361.1'][200:230][::3]

>NT_187361.1
AAATCATAGT

Complements and reverse complements just like DNA

In [21]:
genes['NT_187361.1'][200:230].complement

>NT_187361.1:201-230 (complement)
TCGTAATCTAACGCTTCAAGATAACTAACA

In [22]:
genes['NT_187361.1'][200:230].reverse

>NT_187361.1:230-201
TGTTAGTTATCTTGAAGCGTTAGATTACGA

In [23]:
-genes['NT_187361.1'][200:230][200:230]

>NT_187361.1:230-231 (complement)

Fasta objects can also be accessed using methode calls 

In [26]:
genes.get_seq('NT_187361.1', 201, 210)

>NT_187361.1:201-210
AGCATTAGAT

In [27]:
genes.get_seq('NT_187361.1', 201, 210, rc = True)

>NT_187361.1:210-201 (complement)
ATCTAATGCT

Spliced sequences can be retrieved from a list of [start, end] coordinates

In [28]:
segments = [[1, 10], [50, 70]]
genes.get_spliced_seq('NT_187361.1', segments)

>NT_187361.1
GAATTCAGCTGAAGGGGCCTGAAAGTGTGGT

In [29]:
genes.keys()

odict_keys(['NC_000001.11', 'NT_187361.1', 'NT_187362.1', 'NT_187363.1', 'NT_187364.1', 'NT_187365.1', 'NT_187366.1', 'NT_187367.1', 'NT_187368.1', 'NT_187369.1', 'NC_000002.12'])

Can create custom key functions for cleaner access

In [31]:
from pyfaidx import Fasta

genes = Fasta('GRCh38_latest_genomic.fna', key_function = lambda x: x.split('.')[0])
genes.keys()

odict_keys(['NC_000001', 'NT_187361', 'NT_187362', 'NT_187363', 'NT_187364', 'NT_187365', 'NT_187366', 'NT_187367', 'NT_187368', 'NT_187369', 'NC_000002', 'NT_187370', 'NT_187371', 'NC_000003', 'NT_167215', 'NC_000004', 'NT_113793', 'NC_000005', 'NT_113948', 'NC_000006', 'NC_000007', 'NC_000008', 'NC_000009', 'NT_187372', 'NT_187373', 'NT_187374', 'NT_187375', 'NC_000010', 'NC_000011', 'NC_000012', 'NC_000013', 'NC_000014', 'NT_113796', 'NT_167219', 'NT_187377', 'NT_113888', 'NT_187378', 'NT_187379', 'NT_187380', 'NT_187381', 'NC_000015', 'NT_187382', 'NC_000016', 'NT_187383', 'NC_000017'])

In [32]:
genes['NC_000001'][:10]

>NC_000001:1-10
NNNNNNNNNN

You can specify a character to split names on, which will generate additional entries

In [33]:
from pyfaidx import Fasta

genes = Fasta('GRCh38_latest_genomic.fna', split_char='.', duplicate_action="first") # default duplicate_action="stop"
genes.keys()

odict_keys(['NC_000001', '11', 'NT_187361', '1', 'NT_187362', 'NT_187363', 'NT_187364', 'NT_187365', 'NT_187366', 'NT_187367', 'NT_187368', 'NT_187369', 'NC_000002', '12', 'NT_187370', 'NT_187371', 'NC_000003', 'NT_167215', 'NC_000004', 'NT_113793', '3', 'NC_000005', '10', 'NT_113948', 'NC_000006', 'NC_000007', '14', 'NC_000008', 'NC_000009', 'NT_187372', 'NT_187373', 'NT_187374', 'NT_187375', 'NC_000010', 'NC_000011', 'NC_000012', 'NC_000013', 'NC_000014', '9', 'NT_113796', 'NT_167219', 'NT_187377', 'NT_113888', 'NT_187378', 'NT_187379', 'NT_187380', 'NT_187381', 'NC_000015', 'NT_187382', 'NC_000016', 'NT_187383', 'NC_000017', 'NT_113930', '2', 'NT_187384', 'NT_187385', 'NC_000018', 'NC_000019', 'NC_000020', 'NC_000021', 'NC_000022', 'NT_187386', 'NT_187387', 'NT_187388', 'NT_187390', 'NT_187391', 'NT_187392', 'NT_187393', 'NT_187394', 'NC_000023', 'NC_000024', 'NT_187395', 'NT_187396', 'NT_187397', 'NT_187398', 'NT_187399', 'NT_187400', 'NT_187401', 'NT_187402', 'NT_187403', 'NT_1874

Filter functions (returning True) limit the index:

In [34]:
genes = Fasta('GRCh38_latest_genomic.fna', filt_function = lambda x: x[0] == 'N')
genes.keys()

odict_keys(['NC_000001.11', 'NT_187361.1', 'NT_187362.1', 'NT_187363.1', 'NT_187364.1', 'NT_187365.1', 'NT_187366.1', 'NT_187367.1', 'NT_187368.1', 'NT_187369.1', 'NC_000002.12', 'NT_187370.1', 'NT_187371.1', 'NC_000003.12', 'NT_167215.1', 'NC_000004.12', 'NT_113793.3', 'NC_000005.10', 'NT_113948.1', 'NC_000006.12', 'NC_000007.14', 'NC_000008.11', 'NC_000009.12', 'NT_187372.1', 'NT_187373.1', 'NT_187374.1', 'NT_187375.1', 'NC_000010.11', 'NC_000011.10', 'NC_000012.12', 'NC_000013.11', 'NC_000014.9', 'NT_113796.3', 'NT_167219.1', 'NT_187377.1', 'NT_113888.1', 'NT_187378.1', 'NT_187379.1', 'NT_187380.1', 'NT_187381.1', 'NC_000015.10', 'NT_187382.1', 'NC_000016.10', 'NT_187383.1', 'NC_000017.11', 'NT_113930.2', 'NT_187384.1', 'NT_187385.1', 'NC_000018.10', 'NC_000019.10', 'NC_000020.11', 'NC_000021.9', 'NC_000022.11', 'NT_187386.1', 'NT_187387.1', 'NT_187388.1', 'NT_187390.1', 'NT_187391.1', 'NT_187392.1', 'NT_187393.1', 'NT_187394.1', 'NC_000023.11', 'NC_000024.10', 'NT_187395.1', 'NT_18

You can also perform line-based iteration, receiving the sequence lines as they appear in the FAST file:

In [36]:
genes = Fasta('GRCh38_latest_genomic.fna')

for line in genes['NT_187361.1']:
    print(line)

GAATTCAGCTGAGAAGAACAGGCAAGGACTTAGGAAATATTCCTTATTTGAAGGGGCCTGAAAGTGTGGTCTGGGGTACA
GCAGTGACCTGTCATACTTGAGAGGATTAAAATACTCTCCAAACACAGTCCCATTCCTTCAACCTTAGCTCGTTTTTTCC
AGCGTCTGAGATATATTAAACCTAGTCCATCCCCAAATTTAGCATTAGATTGCGAAGTTCTATTGATTGTATTTGATTTG
TAATTTAAGATTTTCTCCCCCTACgtaattttgttaaaaacacagaAGTGAATTCTGTTCACTTAGGTGTAACAGTTAAT
ACTTGCTGTTTAAGGAACTAATTAAACCTTACTGGCTTATaaaaaacaaccaccattttattgGTTTGAAGTTCTACGGA
TCTGCATTTTGGTGTGGTGGATTCAGCTGGGTAGTTGATATATGTGTGTTGCCTGGATCATAAAAAGGCCTTAGTCACCT
GGTGCCTTGACTGAGCCTGGTTGGTTTAAGATAGTTTCCTTCACAATCTGGTGGTTTGTGGTGACTCTTGGCTAGGCCCT
GTGTCTCCAACAGGGTAGCTCCAGACCTCTTCACAATTTCCCCCAAAAAGGGAAGAACCAATGGATATTTGCATCACATT
TTCCATTGTCCATTCACTGGACAAGTCAGATGGAAAAGCCCAATTTATTGTCAGAGCATAATATGAGGGCTTGGATAGAA
GGAAAGGTGTTATTGGGAAACATGAGTAGAATGGTGTACTGCAGGAAATAcatattatgtacatttttaaaaacataatt
gtaGGCCAAAATTGCTGGTTTGCAAGAAGCACTTTCCATGATGTTCAGGTATAGAAAAGCAAGATGTACTGTCATGGGAA
CACTCTTATGAAGTTGTTTGTGGAATCTACATATTAATAGGAAAATAGCTAATATAGCCCAGTATAtttctataacattt
attttagtgaacttataatgtttctttg

In [38]:
 for record in genes:
    print(record.name)

NC_000001.11
NT_187361.1
NT_187362.1
NT_187363.1
NT_187364.1
NT_187365.1
NT_187366.1
NT_187367.1
NT_187368.1
NT_187369.1
NC_000002.12
NT_187370.1
NT_187371.1
NC_000003.12
NT_167215.1
NC_000004.12
NT_113793.3
NC_000005.10
NT_113948.1
NC_000006.12
NC_000007.14
NC_000008.11
NC_000009.12
NT_187372.1
NT_187373.1
NT_187374.1
NT_187375.1
NC_000010.11
NC_000011.10
NC_000012.12
NC_000013.11
NC_000014.9
NT_113796.3
NT_167219.1
NT_187377.1
NT_113888.1
NT_187378.1
NT_187379.1
NT_187380.1
NT_187381.1
NC_000015.10
NT_187382.1
NC_000016.10
NT_187383.1
NC_000017.11
NT_113930.2
NT_187384.1
NT_187385.1
NC_000018.10
NC_000019.10
NC_000020.11
NC_000021.9
NC_000022.11
NT_187386.1
NT_187387.1
NT_187388.1
NT_187390.1
NT_187391.1
NT_187392.1
NT_187393.1
NT_187394.1
NC_000023.11
NC_000024.10
NT_187395.1
NT_187396.1
NT_187397.1
NT_187398.1
NT_187399.1
NT_187400.1
NT_187401.1
NT_187402.1
NT_187403.1
NT_187404.1
NT_187405.1
NT_187406.1
NT_187407.1
NT_187408.1
NT_187409.1
NT_187410.1
NT_187411.1
NT_187412.1
NT_187

Sequence can be buffered in memory using a read-ahead buffer for fast sequential access

In [52]:
from timeit import timeit
fetch = "genes['NT_187361.1'][200:230]"
read_ahead = "import pyfaidx; genes = pyfaidx.Fasta('GRCh38_latest_genomic.fna', read_ahead=10000)"
no_read_ahead = "import pyfaidx; genes = pyfaidx.Fasta('GRCh38_latest_genomic.fna')"
string_slicing = "genes = {}; genes['NT_187361.1'] = 'N'*10000"

In [53]:
timeit(fetch, no_read_ahead, number=10000)

0.06439260998740792

In [54]:
timeit(fetch, read_ahead, number=10000)

0.048968147020787

In [55]:
timeit(fetch, string_slicing, number=10000)

0.0009471289813518524