In [1]:
import pandas as pd
import numpy as np
import math
import re

from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis

In [2]:
# Load the CSV named after AllergenOnline's "Browse the Database" page.
df = pd.read_csv(
    'AllergenOnline - Browse the Database.csv'
)

In [3]:
# Collect the GI numbers (missing values dropped, floats rendered as integer
# strings) into one space-separated string. That string is the query used to
# obtain the FASTA records from UniProt; the results are stored in
# allergens.fasta.
gi_nums = ' '.join(str(int(gi)) for gi in df['GI#@'].dropna())

In [4]:
def get_sequences_from_fasta(fin):
    """Return the sequence of every FASTA record in `fin` as a plain string.

    `fin` is passed unchanged to ``SeqIO.parse(fin, 'fasta')``. The records
    are consumed eagerly, so the returned list does not depend on `fin`
    staying open afterwards.
    """
    sequences = []
    for record in SeqIO.parse(fin, 'fasta'):
        sequences.append(str(record.seq))
    return sequences

In [5]:
# Read the sequences from allergens.fasta. The context manager closes the
# file as soon as parsing is done; get_sequences_from_fasta returns a fully
# built list, so nothing reads from the handle after the block exits.
with open('allergens.fasta') as fin:
    ao_sequences = get_sequences_from_fasta(fin)

In [6]:
# One-column frame holding the sequences read from allergens.fasta.
ao_df = pd.DataFrame(dict(sequence=ao_sequences))

In [7]:
ao_df.shape

(1666, 1)

<h1> Uniprot Allergens </h1>

In [63]:
with open('allergens.txt', 'r') as fin:
    # Raw string so that \( and \w reach the regex engine as written instead
    # of being treated as (invalid) string escapes.
    pattern = r'\(([\w\d]{6})\)'
    accession_nums = []
    # Iterate the file lazily; blank lines carry no accession and are skipped.
    for line_number, line in enumerate(fin, 1):
        if not line.strip():
            continue
        match = re.search(pattern, line)
        if match is None:
            # Fail with the offending line rather than an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError(
                'line %d of allergens.txt has no parenthesised 6-character '
                'accession: %r' % (line_number, line))
        accession_nums.append(match.group(1))

391


In [9]:
# Load both FASTA files through context managers so each handle is closed
# once its sequences have been read into a list.
with open('uniprot_allergens.fasta') as fin:
    allergen_sequences = get_sequences_from_fasta(fin)
# NOTE: uniprot_sprot.fasta must be present in the working directory; the
# recorded output of this cell shows it was missing on the last run.
with open('uniprot_sprot.fasta') as fin:
    non_allergen_sequences = get_sequences_from_fasta(fin)

IOError: [Errno 2] No such file or directory: 'uniprot_sprot.fasta'

In [None]:
def create_balanced_df(allergen_sequences, non_allergen_sequences, random_state=None):
    """Build a class-balanced DataFrame of allergen and non-allergen sequences.

    Parameters
    ----------
    allergen_sequences : iterable of str
        Positive examples; all of them are kept, in order, with label 1.
    non_allergen_sequences : iterable of str
        Pool of negative candidates. Any sequence that also appears in
        `allergen_sequences` is removed before sampling.
    random_state : int or None, optional
        Seed for a private ``np.random.RandomState``. With the default
        ``None`` the global NumPy generator is used, so ``np.random.seed``
        still controls the draw.

    Returns
    -------
    pandas.DataFrame
        Columns ``sequence`` and ``allergen`` (1 for allergens, 0 for the
        sampled non-allergens), with as many negatives as positives.

    Raises
    ------
    ValueError
        If negatives are needed but no candidate is left after filtering.
    """
    allergen_sequences = list(allergen_sequences)
    allergen_set = set(allergen_sequences)
    candidates = [seq for seq in non_allergen_sequences if seq not in allergen_set]

    n_needed = len(allergen_sequences)
    if n_needed == 0:
        sampled = []
    else:
        if not candidates:
            raise ValueError('no non-allergen sequences left to sample from')
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        # Draw indices rather than the strings themselves: handing a list of
        # strings to np.random.choice first copies it into a fixed-width
        # unicode array as wide as the longest sequence.
        # Sample without replacement whenever the pool is large enough, so the
        # negative class does not contain duplicated rows.
        replace = n_needed > len(candidates)
        picked = rng.choice(len(candidates), size=n_needed, replace=replace)
        sampled = [candidates[i] for i in picked]

    balanced_df = pd.DataFrame({'sequence': allergen_sequences + sampled,
                                'allergen': [1] * n_needed + [0] * len(sampled)})
    return balanced_df

In [None]:
def add_protein_characteristics(df):
    """Add per-sequence ProteinAnalysis features to `df` as new columns.

    For every value in ``df['sequence']`` a ``ProteinAnalysis`` object is
    built and the following columns are filled in:

    * one column per amino-acid letter in ``aa_list`` (values taken from
      ``get_amino_acids_percent()``),
    * ``aromaticity`` and ``instability_index``,
    * ``helix``, ``turn`` and ``sheet`` (the three values returned by
      ``secondary_structure_fraction()``, in that order).

    `df` is modified in place and also returned.
    """
    aa_list = ['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M',
               'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y']
    prop_names = ['aromaticity', 'instability_index', 'helix', 'turn', 'sheet']
    aa_dict = dict((aa, []) for aa in aa_list)
    prop_dict = dict((name, []) for name in prop_names)

    for s in df['sequence']:
        pa = ProteinAnalysis(s)
        prop_dict['aromaticity'].append(pa.aromaticity())
        # The original stored pa.aromaticity() here a second time, so the
        # 'instability_index' column duplicated 'aromaticity'.
        try:
            instability = pa.instability_index()
        except KeyError:
            # NOTE(review): instability_index() is expected to raise KeyError
            # for residues missing from its lookup table (e.g. 'X', 'U') --
            # confirm against the installed Biopython. Such rows get NaN
            # instead of aborting the run; drop or impute them before
            # fitting a model.
            instability = np.nan
        prop_dict['instability_index'].append(instability)
        for fraction, ss in zip(pa.secondary_structure_fraction(), ['helix', 'turn', 'sheet']):
            prop_dict[ss].append(fraction)
        for k, v in pa.get_amino_acids_percent().items():
            aa_dict[k].append(v)

    # Assign in a fixed order so the resulting column layout is deterministic.
    for aa in aa_list:
        df[aa] = aa_dict[aa]
    for name in prop_names:
        df[name] = prop_dict[name]
    return df

# Assemble the balanced UniProt data set, then attach the sequence features.
uniprot_df = create_balanced_df(
    allergen_sequences,
    non_allergen_sequences,
)
uniprot_final_df = add_protein_characteristics(uniprot_df)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Target is the allergen label; features are every remaining column except
# the raw sequence string.
y = uniprot_final_df['allergen']
X = uniprot_final_df.drop(['sequence', 'allergen'], axis=1)

rf = RandomForestClassifier()

# Mean score over 5 cross-validation folds. A single parenthesised argument
# prints the same text under the Python 2 print statement.
print('cross validation score: ' + str(np.mean(cross_val_score(rf, X, y, cv=5))) + '\n')

In [None]:
# Same pipeline for the sequences read from allergens.fasta. This rebinds
# ao_df to the balanced frame.
ao_df = create_balanced_df(
    ao_sequences,
    non_allergen_sequences,
)
ao_final_df = add_protein_characteristics(ao_df)

In [None]:
# Target is the allergen label; features are every remaining column except
# the raw sequence string.
y = ao_final_df['allergen']
X = ao_final_df.drop(['sequence', 'allergen'], axis=1)

rf = RandomForestClassifier()

# Mean score over 5 cross-validation folds. A single parenthesised argument
# prints the same text under the Python 2 print statement.
print('cross validation score: ' + str(np.mean(cross_val_score(rf, X, y, cv=5))) + '\n')

## TODO: Don't forget to add n-gram features

In [None]:
import urllib,urllib2

url = 'http://www.uniprot.org/uploadlists/'
contact = "jaredtjacobsen@gmail.com"

# Form fields sent to the endpoint.
params = {
    'from': 'ACC',
    'to': 'ACC',
    'format': 'fasta',
    'query': 'P31946 P62258',
    'columns': 'organism, id, Sequence',
}

# Supplying `data` to urllib2.Request turns the request into a POST.
data = urllib.urlencode(params)
request = urllib2.Request(url, data)
request.add_header('User-Agent', 'Python ' + contact)
response = urllib2.urlopen(request)
# Read at most 200000 bytes of the response body.
page = response.read(200000)

In [None]:
# Show the downloaded response body.
print(page)

In [10]:
import requests
# Same lookup as the urllib2 version, issued as a GET through requests.
url = 'http://www.uniprot.org/uploadlists/'
params = {
    'from': 'ACC',
    'to': 'ACC',
    'format': 'fasta',
    'query': 'P31946 P62258',
}

response = requests.get(url, params=params)

In [None]:
# Split the response text on '>' (one chunk per FASTA record), drop each
# chunk's header line, and join the remaining lines into one sequence string.
[''.join(record.split('\n')[1:]) for record in response.text.split('>')[1:]]

In [None]:
fasta_str = ''

In [51]:
from Bio import SeqIO
import StringIO

# Wrap the response text in an in-memory file-like object.
# NOTE(review): the later cells visible here rebind `fin` or read
# allergens.fasta directly, so this object looks unused -- confirm before
# relying on it.
fin = StringIO.StringIO(response.text)

def parse_description(d):
    """Split a UniProt-style FASTA description into its leading fields.

    Parameters
    ----------
    d : str
        Record description, e.g.
        ``'sp|P00785|ACTN_ACTCH Actinidain OS=Actinidia chinensis PE=1 SV=4'``.

    Returns
    -------
    tuple of str
        ``(database, accession, entry_name, protein_name, organism)``. The
        organism group captures exactly two words separated by a single
        space after ``OS=``.

    Raises
    ------
    ValueError
        If `d` does not begin with the expected layout.
    """
    # Raw string so the regex escapes (\w, \s, \|) are not interpreted as
    # string escapes.
    p = r'(\w{2})\s*\|\s*([\w\d]+)\s*\|\s*(.*?)\s(.*?)\sOS=([\w]* [\w]*)'
    group_object = re.match(p, d)
    if group_object is None:
        # Report the offending header instead of failing on None.group(...).
        raise ValueError('unrecognised FASTA description: %r' % (d,))
    return group_object.group(1, 2, 3, 4, 5)

# Parse every record header in allergens.fasta. The list is built inside the
# `with` block because SeqIO.parse reads from the handle while it is iterated;
# the file is closed once the block exits.
with open('allergens.fasta') as fasta_handle:
    fasta_sequences = SeqIO.parse(fasta_handle, 'fasta')
    s = [parse_description(fasta.description) for fasta in fasta_sequences]

In [52]:
s

[('tr', 'B0KZJ6', 'B0KZJ6_ACASI', 'Allergen Aca s 13', 'Acarus siro'),
 ('tr', 'B0KZK1', 'B0KZK1_ACASI', 'Alpha-amylase', 'Acarus siro'),
 ('tr', 'L7TV16', 'L7TV16_9ERIC', 'Kiwellin', 'Actinidia arguta'),
 ('tr', 'L7TT88', 'L7TT88_9ERIC', 'Kiwellin', 'Actinidia arguta'),
 ('tr', 'L7TY99', 'L7TY99_9ERIC', 'Kiwellin', 'Actinidia arguta'),
 ('sp', 'P00785', 'ACTN_ACTCH', 'Actinidain', 'Actinidia chinensis'),
 ('sp',
  'P85204',
  'NLTP1_ACTCH',
  'Non-specific lipid-transfer protein 1 (Fragment)',
  'Actinidia chinensis'),
 ('tr', 'L7TT83', 'L7TT83_ACTCH', 'Kiwellin', 'Actinidia chinensis'),
 ('tr',
  'D1YSM4',
  'D1YSM4_ACTCH',
  'Bet v 1 related allergen',
  'Actinidia chinensis'),
 ('sp',
  'P83958',
  'TLP_ACTCH',
  'Thaumatin-like protein (Fragment)',
  'Actinidia chinensis'),
 ('tr',
  'L7TRX2',
  'L7TRX2_ACTCH',
  'Thaumatin-like protein',
  'Actinidia chinensis'),
 ('sp', 'A5HII1', 'ACTN_ACTDE', 'Actinidain', 'Actinidia deliciosa'),
 ('tr',
  'D1YSM5',
  'D1YSM5_ACTDE',
  'Bet v 1

In [57]:
# Scratch check: a DataFrame cell can hold a tuple.
d = [
    [(1, 2), 2],
    [(3, 4), 4],
]
df_testing = pd.DataFrame(d, columns=['one', 'two'])
df_testing.loc[0, 'one']

(1, 2)

In [59]:
# Scratch check: extend a tuple's items with one more element, as a list.
a = (1, 2, 3)
list(a + ('hey',))

[1, 2, 3, 'hey']

In [62]:
# Row-oriented JSON: {row label -> {column -> value}}.
df_testing.to_json(orient='index')

'{"0":{"one":[1,2],"two":2},"1":{"one":[3,4],"two":4}}'

In [61]:
df_testing

Unnamed: 0,one,two
0,"(1, 2)",2
1,"(3, 4)",4


In [66]:
# Spot-check one parsed allergen sequence.
print(allergen_sequences[20])

MTSVKLSTPQTGEFEQPTGLFINNEFVKAVDGKTFDVINPSTEEVICSVQEATEKDVDIAVAAARKAFNGPWRKETPENRGKLLNKLADLFEKNADLIAAVEALDNGKAFSMAKNVDVPAAAGCLRYYGGWADKIEGKVVDTAPDSFNYIRKEPIGVCGQIIPWNFPILMWSWKIGPAIATGNTVVLKTAEQTPLSAYIACKLIQEAGFPPGVINVITGFGKIAGAAMSAHMDIDKIAFTGSTVVGRQIMKSAAGSNLKKVTLELGGKSPNIVFADADLDEAIHWVNFGIYFNHGQACCAGSRIYVQEEIYDKFIQRFKERAAQNAVGDPFAADTFQGPQVSQLQFDRIMGYIEEGKKSGATIETGGNRKGDKGYFIEPTIFSNVTEDMKIQQEEIFGPVCTISKFKTKADVIKIGNNTTYGLAAAVHTSNLTTAIEVANALRAGTVWVNSYNTLHWQLPFGGYKESGIGRELGEAALDNYIQTKTVSIRLGDVLFG


In [65]:
# Spot-check the first extracted accession number.
print(accession_nums[0])

O76821


In [67]:
# Peek at the first 20 raw lines of the UniProt allergen FASTA file.
with open('uniprot_allergens.fasta', 'r') as fin:
    first_lines = fin.readlines()[:20]
    print(first_lines)

['>sp|O76821|FABP_ACASI Fatty acid-binding protein (Fragment) OS=Acarus siro PE=1 SV=2\n', 'QINGSYKLEKSDNFDAFLKELGLNFVTRNLAKSATPTVEVSVNGDSYTIKTASTLKNTEI\n', 'SFKL\n', '>sp|P00785|ACTN_ACTCH Actinidain OS=Actinidia chinensis PE=1 SV=4\n', 'MGLPKSFVSMSLLFFSTLLILSLAFNAKNLTQRTNDEVKAMYESWLIKYGKSYNSLGEWE\n', 'RRFEIFKETLRFIDEHNADTNRSYKVGLNQFADLTDEEFRSTYLRFTSGSNKTKVSNRYE\n', 'PRVGQVLPSYVDWRSAGAVVDIKSQGECGGCWAFSAIATVEGINKIVTGVLISLSEQELI\n', 'DCGRTQNTRGCNGGYITDGFQFIINNGGINTEENYPYTAQDGECNVDLQNEKYVTIDTYE\n', 'NVPYNNEWALQTAVTYQPVSVALDAAGDAFKQYSSGIFTGPCGTAVDHAVTIVGYGTEGG\n', 'IDYWIVKNSWDTTWGEEGYMRILRNVGGAGTCGIATMPSYPVKYNNQNHPKPYSSLINPP\n', 'AFSMSKDGPVGVDDGQRYSA\n', '>sp|P83958|TLP_ACTCH Thaumatin-like protein (Fragment) OS=Actinidia chinensis GN=tlp PE=1 SV=1\n', 'ATFNFINNCPFTVWAAAVPG\n', '>sp|P85261|KIWEL_ACTCH Kiwellin (Fragments) OS=Actinidia chinensis PE=1 SV=1\n', 'ISSCNGPCRDLNDCDGQLICGTTHSHQPGGCKPS\n', '>sp|P81370|TLP_ACTDE Thaumatin-like protein OS=Actinidia deliciosa GN=tlp PE=1 SV=2\n', 'MS