# Experiment with pandas dataframe usage
1. Load the FASTA file into a DataFrame and see whether building k-mers this way is faster than my for loop
2. Use the DataFrame the way we used the SQLite database and see how fast queries are

In [1]:
import os
import sys
# Put the parent and grandparent directories on sys.path (once each) so the
# project-local `src` package can be imported from this notebook's location.
for module_path in map(os.path.abspath, ('..', '../..')):
    if module_path not in sys.path:
        sys.path.append(module_path)
    
from src.sequence.gen_spectra import gen_spectrum, max_mass
from pyteomics import fasta
import pandas as pd
from collections import defaultdict
import swifter
import dask.dataframe as dd
from more_itertools import flatten

## 1. Make all kmers via loading a fasta into a df

In [2]:
fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/NOD2_mouse_database.fasta'
#fasta_file = '/Users/zacharymcgrath/Desktop/nod2 data/filteredNOD2.fasta'


def get_name(name):
    """Return `name` with any trailing 'OS=...' annotation (and the
    whitespace before it) removed; names without 'OS=' are returned as-is."""
    if 'OS=' not in name:
        return name
    return name[:name.index('OS=')].rstrip()


# Collect one {'name', 'id', 'sequence'} record per FASTA entry.
entries = []
# the context manager closes the underlying file once every entry is read
with fasta.read(fasta_file) as fasta_entries:
    for i, entry in enumerate(fasta_entries):

        # take the description without the 'sp' value. Always keep a list so
        # the indexing below picks whole fields, never single characters
        desc = entry.description.split('|')[1:] if '|' in entry.description else [entry.description]

        # if the id is in the description, take it
        if len(desc) > 1:
            id_ = desc[0]
            name = get_name(desc[1])

        # otherwise make the id just the entry's position in the file
        else:
            id_ = i
            name = get_name(desc[0])

        # get the sequence
        seq = entry.sequence

        # make the entry and add it to the list of records
        entries.append({'name': name, 'id': id_, 'sequence': seq})

In [3]:
# One row per FASTA entry, with the columns 'name', 'id' and 'sequence'
# taken from the keys of the dicts collected in `entries`.
proteins = pd.DataFrame(entries)

In [4]:
# Wrap the protein table in a dask dataframe split into two partitions,
# print how many rows it holds, and preview the first few rows.
ps = dd.from_pandas(proteins, npartitions=2)
print(len(ps))
ps.head()

17039


Unnamed: 0,name,id,sequence
0,5HT6R_MOUSE 5-hydroxytryptamine receptor 6,Q9R1C8,MVPEPGPVNSSTPAWGPGPPPAPGGSGWVAAALCVVIVLTAAANSL...
1,ACL7A_MOUSE Actin-like protein 7A,Q9QY84,MSLDGVWAPQTANIGDGPAKKASDQASMQTQVLQTASLKDGPAKRA...
2,ACO12_MOUSE Acetyl-coenzyme A thioesterase,Q9DBK0,MESMVAPGEVLMSQAIQPAHADSRGELSAGQLLKWMDTTACLAAEK...
3,5NTD_MOUSE 5'-nucleotidase,Q61503,MRPAAAKVPKWLLLALSALLPQWPAASAWELTILHTNDVHSRLEQT...
4,ALOX8_MOUSE Arachidonate 8S-lipoxygenase,O35936,MAKCRVRVSTGEACGAGTWDKVSVSIVGTHGESPLVPLDHLGKEFS...


In [None]:
%%time
def breakdown(s, min_len=3, max_len=30):
    """Return every substring (kmer) of `s` with min_len <= length <= max_len.

    Kmers are listed by start position, and by increasing length within each
    start position. Duplicates are kept; the caller de-duplicates.

    Args:
        s: the sequence to break down.
        min_len: shortest kmer to emit (default 3).
        max_len: longest kmer to emit, inclusive (default 30).

    Returns:
        A list of substrings; empty when `s` is shorter than `min_len`.
    """
    kmers = []
    n = len(s)
    for j in range(n - min_len + 1):

        # longest kmer that can start at j: capped by max_len and by the end of s
        longest = min(max_len, n - j)

        # the upper bound is inclusive, so kmers that end on the last residue
        # (and kmers of exactly max_len) are not dropped
        kmers.extend(s[j:j + k] for k in range(min_len, longest + 1))

    return kmers
        
    
# Run breakdown over every protein sequence (through swifter's apply), flatten
# the per-protein lists into one stream, de-duplicate with a set, and store the
# unique kmers in a single-column DataFrame.
# NOTE: row order follows set iteration order, which is not guaranteed to be
# the same from run to run.
kmers = pd.DataFrame(list(set(flatten(proteins['sequence'].swifter.apply(breakdown)))), columns =['sequence'])

kmers.head()

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=17039.0, style=ProgressStyle(descripti…

In [None]:
%%time
def spectrify(a):
    """Build one record of max_mass values for the sequence `a`.

    Keys 'bs'/'bd' hold max_mass(a, 'b', 1) and max_mass(a, 'b', 2); keys
    'ys'/'yd' hold the same for 'y'. The sequence itself is stored under
    'sequence'.
    # presumably the arguments are (sequence, ion type, charge), i.e.
    # singly/doubly charged b and y ions - confirm against src.sequence.gen_spectra
    """
    return {
        'bs': max_mass(a, 'b', 1),
        'bd': max_mass(a, 'b', 2),
        'ys': max_mass(a, 'y', 1),
        'yd': max_mass(a, 'y', 2),
        'sequence': a,
    }

# Build one row of masses per unique kmer and downcast the four mass columns
# to float32. astype returns a new frame, so its result has to be the one that
# gets bound to mass_sequences for the downcast to take effect.
mass_sequences = pd.DataFrame(list(kmers['sequence'].swifter.apply(spectrify))).astype(
    {'bs': 'float32', 'bd': 'float32', 'ys': 'float32', 'yd': 'float32'}
)
# the kmer-only table is no longer needed; release it to free memory
del kmers
mass_sequences.head()

## 2. Try fast queries

In [None]:
# Last value of the 'spectrum' entry returned for 'GGG' with ion='b', charge=2.
# NOTE(review): presumably the reference mass for the 'bd' range query in the
# next cell - confirm.
gen_spectrum('GGG', ion='b', charge=2)['spectrum'][-1]

In [None]:
%%time
# Range query on the 'bd' mass column. The masses live in mass_sequences;
# `kmers` only ever had a 'sequence' column and was deleted after the masses
# were computed.
mass_sequences[mass_sequences['bd'].between(86.539, 86.54)]

## 3. Search for substrings

In [None]:
%%time
# Names of every protein whose sequence contains the literal substring 'GGG'.
# regex=False makes this a plain substring test; na=False treats a missing
# sequence as "no match" instead of raising.
proteins.loc[proteins['sequence'].str.contains('GGG', regex=False, na=False), 'name'].tolist()