In [1]:
import pandas as pd
import numpy as np
import math
import re

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [2]:
df = pd.read_csv('AllergenOnline - Browse the Database.csv')

In [3]:
#get GI nums to query uniprot to obtain fasta. Results stored in allergens.fasta
gi_nums = df['GI#@'].dropna().apply(lambda x: str(int(x))).values.tolist()
gi_nums = ' '.join(gi_nums)

In [4]:
def get_sequences_from_fasta(fin):
    fasta_sequences = SeqIO.parse(fin,'fasta')
    return [str(fasta.seq) for fasta in fasta_sequences]

In [5]:
ao_sequences = get_sequences_from_fasta(open('allergens.fasta'))

In [6]:
ao_df = pd.DataFrame({'sequence': ao_sequences})

In [7]:
ao_df.shape

(1666, 1)

<h1> Uniprot Allergens </h1>

In [8]:
with open('allergens.txt', 'r') as fin:
    pattern = '\(([\w\d]{6})\)'
    accession_nums = [re.search(pattern, l).group(1) for l in fin.readlines()]

In [9]:
allergen_sequences = get_sequences_from_fasta(open('uniprot_allergens.fasta'))
non_allergen_sequences = get_sequences_from_fasta(open('uniprot_sprot.fasta'))

In [10]:
def create_balanced_df(allergen_sequences, non_allergen_sequences):
    allergen_set = set(allergen_sequences)
    non_allergen_sequences = [seq for seq in non_allergen_sequences if seq not in allergen_set]
    non_allergen_sequences = np.random.choice(non_allergen_sequences, len(allergen_sequences)).tolist()
    balanced_df = pd.DataFrame({'sequence': allergen_sequences + non_allergen_sequences,
                                'allergen': [1]*len(allergen_sequences) + [0]*len(non_allergen_sequences)})
    return balanced_df

In [12]:
def add_protein_characteristics(df):
    aa_list = ['A', 'C','E','D','G','F','I','H','K','M','L','N','Q','P','S','R','T','W','V','Y']
    aa_dict = {}
    for aa in aa_list:
        aa_dict[aa] = []
    prop_dict = {'aromaticity': [], 'instability_index': [], 'helix': [], 'turn': [], 'sheet': []}
    for i, s in enumerate(df['sequence']):
        pa = ProteinAnalysis(s)
        prop_dict['aromaticity'].append(pa.aromaticity())
        prop_dict['instability_index'].append(pa.aromaticity())
        for fraction, ss in zip(pa.secondary_structure_fraction(), ['helix', 'turn', 'sheet']):
            prop_dict[ss].append(fraction)
        for k, v in pa.get_amino_acids_percent().items():
            aa_dict[k].append(v)
    for k, v in aa_dict.items():
        df[k] = v
    for k, v in prop_dict.items():
        df[k] = v
    return df

uniprot_df = create_balanced_df(allergen_sequences, non_allergen_sequences)
uniprot_final_df = add_protein_characteristics(uniprot_df)    

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

X = uniprot_final_df.drop(['sequence','allergen'], axis=1)
y = uniprot_final_df['allergen']

rf = RandomForestClassifier()

print 'cross validation score: ' + str(np.mean(cross_val_score(rf, X, y, cv=5))) + '\n'

cross validation score: 0.808974358974



In [15]:
ao_df = create_balanced_df(ao_sequences, non_allergen_sequences)
ao_final_df = add_protein_characteristics(ao_df)

In [16]:
X = ao_final_df.drop(['sequence','allergen'], axis=1)
y = ao_final_df['allergen']

rf = RandomForestClassifier()

print 'cross validation score: ' + str(np.mean(cross_val_score(rf, X, y, cv=5))) + '\n'

cross validation score: 0.824439409469



## Dont forget to add n_grams

In [None]:
import urllib,urllib2

url = 'http://www.uniprot.org/uploadlists/'

params = {
'from':'ACC',
'to':'ACC',
'format':'fasta',
'query':'P31946 P62258',
'columns': 'organism, id, Sequence'
}

data = urllib.urlencode(params)
request = urllib2.Request(url, data)
contact = "jaredtjacobsen@gmail.com"
request.add_header('User-Agent', 'Python %s' % contact)
response = urllib2.urlopen(request)
page = response.read(200000)

In [None]:
print page

In [None]:
import requests
url = 'http://www.uniprot.org/uploadlists/'
params = {
'from':'ACC',
'to':'ACC',
'format':'fasta',
'query':'P31946 P62258',
}

response = requests.get(url, params)

In [None]:
[''.join(i.split('\n')[1:]) for i in response.text.split('>')[1:]]

In [None]:
s = [1,2,3]
'g'.join(map(str, s))

In [None]:
from Bio import SeqIO
fasta_sequences = SeqIO.parse(response.text,'fasta')
return [str(fasta.seq) for fasta in fasta_sequences]

In [None]:
'>g'.split('>')