In [1]:
from pyteomics import fasta
import pandas as pd
from collections import namedtuple

In [2]:
prots_str = ''
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/all data/NOD2_mouse_database.fasta'
prot_db = namedtuple('prot_db', ['sequences', 'table'])

In [3]:
# split the name on the OS value if it exists
get_name = lambda name: name[:name.index('OS=')-1] if 'OS=' in name else name

prots = []
prot_things = []

# go through each entry in the fasta and put it in memory
for i, entry in enumerate(fasta.read(fasta_file)):

    # take the description without the 'sp' value
    desc = entry.description.split('|')[1:] if '|' in entry.description else entry.description

    # if the id is in the description, take it
    if len(desc) > 1:
        id_ = desc[0]
        name = get_name(desc[1])

    # make the id just the number
    else:
        id_ = i
        name = get_name(desc[0])

    # get the sequence
    seq = entry.sequence

    # make the entry and add it to prots
    prots.append((name, id_, seq))
    
    prots_str += f'|{i}={seq}'
    prot_things.append((i, id_, name))
                 
prots_df = pd.DataFrame(prots, columns=['name', 'id', 'sequence'])
db = prot_db(prots_str, pd.DataFrame(prot_things, columns=['num', 'id', 'name']))
prots_df.head()

Unnamed: 0,name,id,sequence
0,5HT6R_MOUSE 5-hydroxytryptamine receptor 6,Q9R1C8,MVPEPGPVNSSTPAWGPGPPPAPGGSGWVAAALCVVIVLTAAANSL...
1,ACL7A_MOUSE Actin-like protein 7A,Q9QY84,MSLDGVWAPQTANIGDGPAKKASDQASMQTQVLQTASLKDGPAKRA...
2,ACO12_MOUSE Acetyl-coenzyme A thioesterase,Q9DBK0,MESMVAPGEVLMSQAIQPAHADSRGELSAGQLLKWMDTTACLAAEK...
3,5NTD_MOUSE 5'-nucleotidase,Q61503,MRPAAAKVPKWLLLALSALLPQWPAASAWELTILHTNDVHSRLEQT...
4,ALOX8_MOUSE Arachidonate 8S-lipoxygenase,O35936,MAKCRVRVSTGEACGAGTWDKVSVSIVGTHGESPLVPLDHLGKEFS...


In [4]:
def get_prots_df(substring):
    return list(prots_df[prots_df['sequence'].apply(lambda x: substring in x)]['name'])

In [5]:
import re
def get_prots(substring):
    def get_prot_num(pos):
        start_idx = db.sequences[:pos].rindex('|')+1
        end_idx = db.sequences[:pos].rindex('=')
        return int(db.sequences[start_idx:end_idx])
    return list(
        db.table[db.table['num'].isin([get_prot_num(i) for i in [s.start() for s in re.finditer(substring, db.sequences)]])]['name']
    )

In [6]:
y = get_prots_df('GGG')
z = get_prots('GGG')

all([x in y for x in z])
    

True

In [7]:
%%time 
get_prots_df('ABC')

CPU times: user 16.6 ms, sys: 6.1 ms, total: 22.7 ms
Wall time: 23.9 ms


[]

In [8]:
%%time
get_prots('ABC')

CPU times: user 17.3 ms, sys: 392 µs, total: 17.7 ms
Wall time: 18.9 ms


[]

In [9]:
sequences = ['MAL', 'GGG', 'QQQQ', 'KTAEN', 'EVE']

In [10]:
%%time
[get_prots_df(seq) for seq in sequences]

CPU times: user 68.8 ms, sys: 1.96 ms, total: 70.7 ms
Wall time: 72 ms


[["5NTD_MOUSE 5'-nucleotidase",
  'AGRB2_MOUSE Adhesion G protein-coupled receptor B2',
  'ARH37_MOUSE Rho guanine nucleotide exchange factor 37',
  'ANKH_MOUSE Progressive ankylosis protein',
  'AT2A3_MOUSE Sarcoplasmic/endoplasmic reticulum calcium ATPase 3',
  'AT8B3_MOUSE Phospholipid-transporting ATPase IK',
  'CP2W1_MOUSE Cytochrome P450 2W1',
  'ECEL1_MOUSE Endothelin-converting enzyme-like 1',
  'C99L2_MOUSE CD99 antigen-like protein 2',
  'CAC1A_MOUSE Voltage-dependent P/Q-type calcium channel subunit alpha-1A',
  'BUB1_MOUSE Mitotic checkpoint serine/threonine-protein kinase BUB1',
  'CAHM2_MOUSE Calcium homeostasis modulator protein 2',
  'CO6A2_MOUSE Collagen alpha-2(VI) chain',
  'CO4B_MOUSE Complement C4-B',
  'DCAF1_MOUSE DDB1- and CUL4-associated factor 1',
  'DCP1A_MOUSE mRNA-decapping enzyme 1A',
  'DTWD1_MOUSE DTW domain-containing protein 1',
  'AGRV1_MOUSE Adhesion G-protein coupled receptor V1',
  'AP5M1_MOUSE AP-5 complex subunit mu-1',
  'ARMC8_MOUSE Armadillo r

In [11]:
%%time
[get_prots(seq) for seq in sequences]

CPU times: user 23.4 s, sys: 16.4 s, total: 39.8 s
Wall time: 40.5 s


[["5NTD_MOUSE 5'-nucleotidase",
  'AGRB2_MOUSE Adhesion G protein-coupled receptor B2',
  'ARH37_MOUSE Rho guanine nucleotide exchange factor 37',
  'ANKH_MOUSE Progressive ankylosis protein',
  'AT2A3_MOUSE Sarcoplasmic/endoplasmic reticulum calcium ATPase 3',
  'AT8B3_MOUSE Phospholipid-transporting ATPase IK',
  'CP2W1_MOUSE Cytochrome P450 2W1',
  'ECEL1_MOUSE Endothelin-converting enzyme-like 1',
  'C99L2_MOUSE CD99 antigen-like protein 2',
  'CAC1A_MOUSE Voltage-dependent P/Q-type calcium channel subunit alpha-1A',
  'BUB1_MOUSE Mitotic checkpoint serine/threonine-protein kinase BUB1',
  'CAHM2_MOUSE Calcium homeostasis modulator protein 2',
  'CO6A2_MOUSE Collagen alpha-2(VI) chain',
  'CO4B_MOUSE Complement C4-B',
  'DCAF1_MOUSE DDB1- and CUL4-associated factor 1',
  'DCP1A_MOUSE mRNA-decapping enzyme 1A',
  'DTWD1_MOUSE DTW domain-containing protein 1',
  'AGRV1_MOUSE Adhesion G-protein coupled receptor V1',
  'AP5M1_MOUSE AP-5 complex subunit mu-1',
  'ARMC8_MOUSE Armadillo r