In [88]:
import pandas as pd
import nltk
from tqdm import tqdm_notebook
from collections import Iterable
from itertools import chain
import os
import sqlite3
import xmltodict
from tqdm import tqdm_notebook
from fuzzywuzzy import fuzz

In [2]:
conn = sqlite3.connect("human_protein.db")
prot_feat = pd.read_sql('select * from protein_features',con=conn)
amy_df = pd.read_sql('select * from amyloid', con=conn)
conn.close()

In [3]:
prot_feat.head()

Unnamed: 0,protein,accession,sequence,Nucleosome core,Calmodulin-binding,Cardiomyopathy,Electron transport,Glycerol metabolism,Heparin-binding,DNA integration,...,Iron-sulfur,Autism,Translocation,Signal transduction inhibitor,Ligand-gated ion channel,Phenylketonuria,Neuropathy,Hypogonadotropic hypogonadism,Aminotransferase,Multifunctional enzyme
0,RL37A_HUMAN,"P61513,P12751,Q6FGF5",MAKRTKKVGIVGKYGTRYGASLRKMVKKIEISQHAKYTCSFCGKTK...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,PYRG1_HUMAN,"P17812,B4DR64,D3DPW1,Q5VW67,Q96GK6",MKYILVTGGVISGIGKGIIASSVGTILKSCGLHVTSIKIDPYINID...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,RL6_HUMAN,"Q02878,Q2M3Q3,Q8WW97",MAGEKVEKPDTKEKKPEAKKVDAGGKVKKGNLKAKKPKKGKPHCSR...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,RAB10_HUMAN,"P61026,D6W538,O88386,Q6IA52,Q9D7X6,Q9H0T3",MAKKTYDLLFKLLLIGDSGVGKTCVLFRFSDDAFNTTFISTIGIDF...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,RAB30_HUMAN,"Q15771,Q6FGK1,Q6MZH2,Q96CI8",MSMEDYDFLFKIVLIGNAGVGKTCLVRRFTQGLFPPGQGATIGVDF...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
amy_df.head()

Unnamed: 0,amy_name,sequence,species,uniprot_id,pmid,category
0,AP00001,LPICPGGAARCQVTLRDLFDRAVVLSHYIHNLSSEMFSEFDKRYTH...,Homo sapiens,P01236,9006323,pathogenic
1,AP00002,CGNLSTCMLGTYTQDFNKFHTFPQTAIGVGAP,Homo sapiens,P01258,120959971746874723395606,pathogenic
2,AP00003,DEPPQSPWDRVKDLATVYVDVLKDSGRDYVSQFEGSALGKQLNLKL...,Homo sapiens,P02647,124218242018270922609356,pathogenic
3,AP00004,KNTMEHVSSSEESIISQETYKQEKNMAINPSKENLCSTFCKEVVRN...,Bos taurus,P02663,10611944,pathogenic
4,AP00005,RSFFSFLGEAFDGARDMWRAYSDMREANYIGSDKYFHARGNYDAAK...,Homo sapiens,P0DJI8,173178721569756,pathogenic


# Checking amyloid status against uniprot proteins

In [5]:
def idChecker(accessions, uni_id):
    if uni_id in accessions:
        return True
    return False
# Split the accession strings to a list
split_accs = prot_feat['accession'].str.split(',').values.tolist()
# Match a list out of the set to check against in the checker func
acc_list = list(set([item for sublist in split_accs for item in sublist]))
# Add column to hold info on match for each amyloid
amy_df.loc[:,'uni_match'] = amy_df.uniprot_id.apply(lambda x: idChecker(acc_list, x)).values
# Amyloids with no match
no_uni_amys = amy_df[~amy_df.uni_match]
# Amyloids with a match
match_amys = amy_df[amy_df.uni_match]
# Num amyloids that have a match, num amyloids with no match
amy_df.shape[0]-no_uni_amys.shape[0], amy_df.shape[0]-match_amys.shape[0]

(71, 72)

So there are 72 proteins in the amyloid table that have no representation in the protein list. Those proteins are going to be missing their structural information and other annotations from uniprot.

I want to avoid missing out on all the really nice annotation information held in each of the annotated proteins. So I'm going to request the uniprot XML information for each of the amyloid proteins that are missing in the reviewed section of uniprot.

In [6]:
missing_ids = no_uni_amys.uniprot_id.values.tolist()

In [7]:
copy_str = " ".join(missing_ids)
copy_str

'P02663 P02754 P63159 P32081 P0C805 P68082 Q52546 P05453 P23202 Q03689 P23727 P28307 P0ABK7 Q9AD92 Q9Z4N4 Q04571 P25367 P14922 P09547 P02846 P04002 Q9PWC8 P07884 P54785 Q08972 Q02629 P40070 P52912 P38996 Q967R6 A8Z0V1 Q53643 P55090 P01144 P01145 P10636-8 A0MVU0 P11657 Q9VSR3 P0A734 Q59L12 P82042 P32194 P59637 P25822 P03275 P0ADA7 P61825 Q4ZHU1 Q1EN15 P01012 P00698 J7GMN2 P03036 P86706 P15703 P54507 D2YW43 P52750 P32588 P38741 C4IN70 C4IN69 A0B829 P0C0U1 A0A1A0G6M5 Q99109 Q934F8 P53617 P08615 P08611 '

# Check matched amyloid sequence against annotated sequence.

If my amyloid protein sequences are the same as the protein with the same uniprot ID in my protein table - then I can avoid having to add the amyloid sequence into the db.

If the amyloid sequence is different, I will make a 'copy' row that contains the variant amyloid sequence.

In [28]:
# First lets get the table of protein name and accession melted on accesion.
prot_feat.loc[:,'accession_split'] = prot_feat['accession'].str.split(',')
prot_feat_extended = prot_feat[['protein','sequence','accession_split']] \
    .set_index(['protein','sequence'])['accession_split'].apply(pd.Series).stack()
prot_feat_extended = prot_feat_extended.reset_index()
prot_feat_extended.columns=['protein','sequence','id_num','uniprot_id']

In [29]:
prot_feat_extended.head()

Unnamed: 0,protein,sequence,id_num,uniprot_id
0,RL37A_HUMAN,MAKRTKKVGIVGKYGTRYGASLRKMVKKIEISQHAKYTCSFCGKTK...,0,P61513
1,RL37A_HUMAN,MAKRTKKVGIVGKYGTRYGASLRKMVKKIEISQHAKYTCSFCGKTK...,1,P12751
2,RL37A_HUMAN,MAKRTKKVGIVGKYGTRYGASLRKMVKKIEISQHAKYTCSFCGKTK...,2,Q6FGF5
3,PYRG1_HUMAN,MKYILVTGGVISGIGKGIIASSVGTILKSCGLHVTSIKIDPYINID...,0,P17812
4,PYRG1_HUMAN,MKYILVTGGVISGIGKGIIASSVGTILKSCGLHVTSIKIDPYINID...,1,B4DR64


In [32]:
# Merge on uniprot ID
prot_amy_merge = prot_feat_extended[['protein','sequence','uniprot_id']] \
    .merge(amy_df[['amy_name','sequence','uniprot_id','species']],
          on='uniprot_id',how='inner')

In [71]:
prot_amy_merge.head()
# Check for sequence equivalence
seq_equals = prot_amy_merge['sequence_x'].str.strip()==prot_amy_merge['sequence_y'].str.strip()
print(seq_equals.shape[0])
# Num sequences equivalent subtracted by sequences with no equivalence
prot_amy_merge[seq_equals].shape[0],prot_amy_merge[~seq_equals].shape[0]

71


(15, 56)

In [72]:
prot_amy_merge[~seq_equals].head()

Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species
0,PSPC_HUMAN,MDVGSKEVLMESPPDYSAAPRGRFGIPCCPVHLKRLLIVVVVVVLI...,P11686,AP00130,FGIPCCPVHLKRLLIVVVVVVLIVVVIVGALLMGL,Homo sapiens
1,SODC_HUMAN,MATKAVCVLKGDGPVQGIINFEQKESNGPVKVWGSIKGLTEGLHGF...,P00441,AP00084,ATKAVCVLKGDGPVQGIINFEQKESNGPVKVWGSIKGLTEGLHGFH...,Homo sapiens
2,SAA1_HUMAN,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...,P0DJI8,AP00005,RSFFSFLGEAFDGARDMWRAYSDMREANYIGSDKYFHARGNYDAAK...,Homo sapiens
4,SEMG1_HUMAN,MKPNIIFVLSLLLILEKQAAVMGQKGGSKGRLPSEFSQFPHGQKGQ...,P04279,AP00009,QKGGSKGRLPSEFSQFPHGQKGQHYSGQKGKQQTESKGSFSIQYTY...,Homo sapiens
5,SOMA_HUMAN,MATGSRTSLLLAFGLLCLPWLQEGSAFPTIPLSRLFDNAMLRAHRL...,P01241,AP00135,FPTIPLSRLFDNAMLRAHRLHQLAFDTYQEFEEAYIPKEQKYSFLQ...,Homo sapiens


So I've got 41 more uniprot-amyloid matches with no sequence similarity. Lets see if the sequence is simply truncated and exists within the uniprot sequence.

In [76]:
prot_merge_no_match = prot_amy_merge[~seq_equals]
print(prot_merge_no_match.shape)
def stringIncluded(str1, str2):
    if str1 in str2 or str1 == str2:
        return True
    return False

string_inside_index = prot_merge_no_match.apply(lambda row: stringIncluded(row['sequence_y'],row['sequence_x']) ,axis=1)
string_inside = prot_merge_no_match[string_inside_index]
string_not_inside = prot_merge_no_match[~string_inside_index]
string_inside.shape

(56, 6)


(48, 6)

In [83]:
for x, y in string_not_inside[['sequence_x','sequence_y']].values:
    print(x==y)
    print(fuzz.ratio(x,y))

False
100
False
98
False
88
False
19
False
17
False
98
False
97
False
78


In [89]:
string_not_inside.head()

Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species
9,TADBP_HUMAN,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,Q13148,AP00077,MSEYIRVTEDENDEPIEIPSEDDGTVLLSTVTAQFPGACGLRYRNP...,Homo sapiens
12,BGH3_HUMAN,MALFVRLLALALALALGPAATLAGPAKSPYQLVLQHSRLRGRQHGP...,Q15582,AP00095,GPAKSPYQLVLQHSRLRGRQHGPNVCAVQKVIGTNRKYFTNCKQWY...,Homo sapiens
14,APOC3_HUMAN,MQPRVLLVVALLALLASARASEAEDASLLSFMQGYMKHATKTAKDA...,P02656,AP00142,SEAEDASLLSFMQGYMKHATKTAKVALSSVQESQVAQQARGWVTDG...,Homo sapiens
17,ITM2B_HUMAN,MVKVTFNSALAQKEAKKDEPKSGEEALIIPPDAVAVDCKDPDDVVP...,Q9Y287,AP00107,EASNCFAIRHFENKFAVETLICSRTVKKNIIEEN,Homo sapiens
18,ITM2B_HUMAN,MVKVTFNSALAQKEAKKDEPKSGEEALIIPPDAVAVDCKDPDDVVP...,Q9Y287,AP00108,EASNCFAIRHFENKFAVETLICFNLFLNSQEKHY,Homo sapiens


I'm going to just avoid using these in any train/test set. I can use them after the fact to check how well the model did in predicting the uniprot annotated version.

In [90]:
conn = sqlite3.connect('human_protein.db')
string_not_inside[['protein','uniprot_id','amy_name']].to_sql("amy_ignore",con=conn,index=False,if_exists='replace')
conn.close()

# Correct amyloid_prion mapping

For the amyloids that exist for humans and are simply noted as truncated, alter the coordinates of their prions so that they map to the uniprot sequence.

In [163]:
# Alter amyloid_prion such that;
# All entries have uniprot_id as their primary key
conn = sqlite3.connect('human_protein.db')
amy_uni_ids = pd.read_sql("select amy_name, uniprot_id from amyloid",con=conn)
amy_prion = pd.read_sql('select * from amyloid_prion',con=conn)
print(amy_prion.shape)
amy_joined = amy_uni_ids.merge(amy_prion, on='amy_name', how='right')
print(amy_joined.shape)
amy_joined.drop('amy_name',axis=1,inplace=True)
# There is one entry of a synthetic protein. Drop it.
amy_joined = amy_joined[amy_joined['uniprot_id'].str.len()!=0]
amy_joined.to_sql("amyloid_prion_id",con=conn,index=False,if_exists='replace')
conn.close()

(174, 3)
(174, 4)


In [164]:
# Ensure human proteins that are truncated, get mapped to uniprot sequence.
conn = sqlite3.connect('human_protein.db')
amy_prion_df = pd.read_sql("select * from amyloid_prion_id",con=conn)
conn.close()
amy_prion_df.head(), amy_prion_df.shape

(  uniprot_id begin end
 0     P01236     7  34
 1     P01236    43  57
 2     P01258     6  11
 3     P01258    15  20
 4     P02647    46  59, (173, 3))

In [169]:
display(string_inside.head()), string_inside.shape

Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species,seq_diff
0,PSPC_HUMAN,MDVGSKEVLMESPPDYSAAPRGRFGIPCCPVHLKRLLIVVVVVVLI...,P11686,AP00130,FGIPCCPVHLKRLLIVVVVVVLIVVVIVGALLMGL,Homo sapiens,23
1,SODC_HUMAN,MATKAVCVLKGDGPVQGIINFEQKESNGPVKVWGSIKGLTEGLHGF...,P00441,AP00084,ATKAVCVLKGDGPVQGIINFEQKESNGPVKVWGSIKGLTEGLHGFH...,Homo sapiens,1
2,SAA1_HUMAN,MKLLTGLVFCSLVLGVSSRSFFSFLGEAFDGARDMWRAYSDMREAN...,P0DJI8,AP00005,RSFFSFLGEAFDGARDMWRAYSDMREANYIGSDKYFHARGNYDAAK...,Homo sapiens,18
4,SEMG1_HUMAN,MKPNIIFVLSLLLILEKQAAVMGQKGGSKGRLPSEFSQFPHGQKGQ...,P04279,AP00009,QKGGSKGRLPSEFSQFPHGQKGQHYSGQKGKQQTESKGSFSIQYTY...,Homo sapiens,23
5,SOMA_HUMAN,MATGSRTSLLLAFGLLCLPWLQEGSAFPTIPLSRLFDNAMLRAHRL...,P01241,AP00135,FPTIPLSRLFDNAMLRAHRLHQLAFDTYQEFEEAYIPKEQKYSFLQ...,Homo sapiens,26


(None, (48, 7))

In [171]:
string_inside[string_inside.duplicated()]

Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species,seq_diff


In [166]:
def prionAnnotMapper(seq1, seq2):
    """Return difference between the start of seq1 and where seq2 resides
    within seq1. This will be used to alter the begin and end columns so 
    that each amyloid annotated region can be mapped to its uniprot
    counterpart."""
    return len(seq1.split(seq2)[0])

# Apply prion annot mapper to each row in amy_prion_df that has an entry in
# the string_inside dataframe.
string_inside.loc[:,'seq_diff'] = string_inside.apply(lambda row: prionAnnotMapper(row['sequence_x'],row['sequence_y']),axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [184]:
checked_id = set()
for uni_id in string_inside['uniprot_id']:
    if uni_id not in checked_id:
        checked_id.add(uni_id)
        id_slice = string_inside[string_inside['uniprot_id']==uni_id]
        if id_slice.shape[0]>1:
            display(id_slice)

Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species,seq_diff
19,GLUC_HUMAN,MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQ...,P01275,AP00041,HSQGTFTSDYSKYLDSRRAQDFVQWLMNT,Homo sapiens,52
20,GLUC_HUMAN,MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQ...,P01275,AP00065,HDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRG,Home sapiens,91
21,GLUC_HUMAN,MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQ...,P01275,AP00066,HADGSFSDEMNTILDNLAARDFINWLIQTKITD,Home sapiens,145


Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species,seq_diff
22,INS_HUMAN,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...,P01308,AP00023,FVNQHLCGSHLVEALYLVCGERGFFYTPKT,Homo sapiens,24
23,INS_HUMAN,MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGER...,P01308,AP00024,GIVEQCCTSICSLYQLENYCN,Homo sapiens,89


Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species,seq_diff
53,PPAP_HUMAN,MRAAPLLLARAASLSLGFLFLLFFWLDRSVLAKELKFVTLVFRHGD...,P15309,AP00091,GIHKQKEKSRLQGGVLVNEILNHMKRATQIPSYKKLIMY,Homo sapiens,247
54,PPAP_HUMAN,MRAAPLLLARAASLSLGFLFLLFFWLDRSVLAKELKFVTLVFRHGD...,P15309,AP00092,IRKRYRKFLNESYKHEQVYIRSTDVDRTLMSAMTNL,Homo sapiens,84


Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species,seq_diff
62,ANF_HUMAN,MSSFSTTTVSFLLLLAFQLLGQTRANPMYNAVSNADLMDFKNLLDH...,P01160,AP00081,NPMYNAVSNADLMDFKNLLDHLEEKMPLEDEVVPPQVLSEPNEEAG...,Homo sapiens,25
63,ANF_HUMAN,MSSFSTTTVSFLLLLAFQLLGQTRANPMYNAVSNADLMDFKNLLDH...,P01160,AP00082,SLRRSSCFGGRMDRIGAQSGLGCNSFRY,Homo sapiens,123


Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species,seq_diff
66,ANFB_HUMAN,MDPQTAPSRALLLLLFLHLAFLGGRSHPLGSPGSASDLETSGLQEQ...,P16860,AP00132,HPLGSPGSASDLETSGLQEQRNHLQGKLSELQVEQTSLEPLQESPR...,Homo sapiens,26
67,ANFB_HUMAN,MDPQTAPSRALLLLLFLHLAFLGGRSHPLGSPGSASDLETSGLQEQ...,P16860,AP00019,SPKMVQGSGCFGRKMDRISSSSGLGCKVLRRH,Homo sapiens,102


Unnamed: 0,protein,sequence_x,uniprot_id,amy_name,sequence_y,species,seq_diff
68,A4_HUMAN,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,P05067,AP00010,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVVIA,Homo sapiens,671
69,A4_HUMAN,MLPGLALLLLAAWTARALEVPTDGNAGLLAEPQIAMFCGRLNMHMN...,P05067,AP00011,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVV,Homo sapiens,671


In [189]:
amy_prion_df_alt = amy_prion_df.merge(string_inside[['uniprot_id','seq_diff']],on='uniprot_id',how='left').drop_duplicates()
amy_prion_df_alt.fillna(0,inplace=True)
print(amy_prion_df.shape,amy_prion_df_alt.shape)
amy_prion_df_alt.head()

(173, 3) (187, 4)


Unnamed: 0,uniprot_id,begin,end,seq_diff
0,P01236,7,34,28.0
1,P01236,43,57,28.0
2,P01258,6,11,84.0
3,P01258,15,20,84.0
4,P02647,46,59,24.0


In [186]:
amy_prion_df_alt.loc[:,'begin_diff'] = amy_prion_df_alt['begin'].astype(int) + amy_prion_df_alt['seq_diff'].astype(int)
amy_prion_df_alt.loc[:,'end_diff'] = amy_prion_df_alt['end'].astype(int) + amy_prion_df_alt['seq_diff'].astype(int)

In [188]:
amy_prion_df_change = amy_prion_df_alt.drop(['begin','end','seq_diff'],axis=1)
amy_prion_df_change.columns = ['uniprot_id','begin','end']
print(amy_prion_df_change.shape)
amy_prion_df_change.head()

(187, 3)


Unnamed: 0,uniprot_id,begin,end
0,P01236,35,62
1,P01236,71,85
2,P01258,90,95
3,P01258,99,104
4,P02647,70,83


In [192]:
conn = sqlite3.connect('human_protein.db')
amy_prion_df_change.to_sql('amyloid_prion_id_mapped',con=conn,index=False,if_exists='replace')
conn.close()

If you're wondering how/why the amy_prion table expanded...in the amyloid database, if a protein is annotated as producing multiple amyloid proteins - they make a new entry for it in the database.

My doing the above, I correctly mapped these extra amyloids to the original uniprot protein sequence. So my list of amyloid sections expanded due to those proteins having multiple entries.

# Parse amyloids not yet present in protein db

In [193]:
conn = sqlite3.connect("human_protein.db")
with open('data/backfill_amyloids.xml') as f:
    amyloid_protein_dict = xmltodict.parse(f.read())

In [198]:
def commentAlgo(comment,locs,):
    """I'm having a problem with entries sometimes being lists or flat entries"""

def entryCommentParser(entry_name, comments):
    function_list = []
    disease_list = []
    tissue_list = []
    subcellular_list = []
    if type(comments)!=list:
        comments = [comments]
    for comment in comments:
        comm_arr = [entry_name]
        comm_type = comment.get("@type",None)
        value = comment.get("text")
        if comm_type in ['function','pathway','activity regulation','similarity']:
            if value!=None and type(value)!=str:
                value = value.get("#text")
                comm_arr.extend([comm_type, value])
                function_list.append(comm_arr)
        elif comm_type=='disease':
            comm_disease = comment.get('disease')
            if type(comm_disease)!='str' and comm_disease!=None:
                comm_arr.append(comment.get('disease').get("name"))
                disease_list.append(comm_arr)
        elif comm_type=='tissue specificity':
            value = comment.get('text')
            if type(value)!=str:
                value = value.get("#text")
            comm_arr.append(value)
            tissue_list.append(comm_arr)
        # This is a really messily annotated section in these entries...sheesh!
        elif comm_type=='subcellular location':
            locs = []
            loc_source = comment.get('subcellularLocation')
            if type(loc_source)!=list:
                loc_source = [loc_source]
            for loc_head in loc_source:
                if loc_head == None:
                    pass
                else:
                    sub_locs = loc_head.get('location')
                    if type(sub_locs)==list:
                        for sub_sub_locs in sub_locs:
                            if type(sub_sub_locs)==str:
                                value = sub_sub_locs
                            else:
                                value = sub_sub_locs.get('#text')
                            if value not in locs:
                                locs.append(value)
                                subcellular_list.append([entry_name, value])
                    elif type(sub_locs)==str:
                        value = sub_locs
                    else:
                        value = sub_locs.get('#text')
                    if value not in locs:
                        locs.append(value)
                        subcellular_list.append([entry_name, value])
        else:
            pass
    return function_list, disease_list, tissue_list, subcellular_list

def featureParser(entry_name, features):
    """Right now I only want features that are annotations of;
    
    -strand (beta sheet)
    -helix  (alpha helix)
    -turn   (highly structured secondary structure)
    
    """
    sec_struc_list = []
    modified_residues = []
    feat_arr = []
    for feature in features:
        if type(feature)==str:
            pass
        else:
            feat_type = feature.get("@type")
            if feat_type in ['strand','helix','turn']:
                feat_locations = feature.get('location')
                feat_begin = feat_locations.get('begin').get('@position')
                feat_end = feat_locations.get('end').get('@position')
                sec_struc_list.append([entry_name, feat_type, feat_begin, feat_end])
            elif feat_type in ['modified residue','non-standard residue','lipidation',
                               'glycosylation','disulfide bond','cross-link']:
                feat_desc = feature.get("@description")
                feat_posi = feature.get("location")
                if 'begin' in feat_posi.keys():
                    begin = feat_posi.get('begin').get("@position")
                    end = feat_posi.get('end').get("@position")
                    if begin==None:
                        feat_posi = end
                    elif end==None:
                        feat_posi = begin
                    elif begin==None and end==None:
                        feat_posi = None
                    else:
                        feat_posi = begin+"-"+end
                else:
                    feat_posi = feat_posi.get('position').get("@position")
                modified_residues.append([entry_name, feat_type, feat_desc, feat_posi])

    return sec_struc_list, modified_residues

def keywordParser(keyword):
    text = []
    for x in keyword:
        if type(x)==str:
            text.append(x)
        elif x!=None and type(x)!=str:
            text.append(x.get('#text'))
    return ', '.join(text)

def entryParser(entry, conn, print_option=False):
    # Get name
    entry_name = entry.get('name')
    # Get accession IDs
    entry_accessions = entry.get('accession')
    if type(entry_accessions)==list:
        entry_accessions = ",".join(entry_accessions)
    # Get sequence
    entry_seq = entry.get('sequence').get('#text').replace("\n","").strip()
    # Parse keywords
    entry_keywords = ""
    if entry.get('keyword')!=None:
        entry_keywords = keywordParser(entry.get('keyword'))
    # Parse features
    entry_features = entry.get('feature')
    if entry_features!=None:
        sec_struc_list, modified_residues = featureParser(entry_name, entry_features)
    else:
        sec_struc_list = []
        modified_residues = []
    # Parse comments
    entry_comments = entry.get('comment', None)
    comm_list = []
    disease_list = []
    tissue_list = []
    subcellular_list = []
    if entry_comments != None:
        comm_list, disease_list, tissue_list, subcellular_list \
        = entryCommentParser(entry_name, entry_comments)
    
    # One row each protein: protein name, sequence and keyword string
    protein_df = pd.DataFrame([[entry_name, entry_accessions, entry_seq, entry_keywords]],
                              columns=["protein","accession","sequence","keywords"])
    # Multiple rows each protein for secondary structure: protein name, secondary structure type,
    # where it starts, where it ends
    sec_struc_df = pd.DataFrame(sec_struc_list,
                                columns=["protein","sec_struc_type","begin","end"])
    # Multiple rows each protein: protein name, modification type, description of modification,
    # position of modification
    aa_mod_df = pd.DataFrame(modified_residues,
                             columns=['protein','modification','description','position'])
    # Multiple rows each protein for comments: protein name, comment type, comment value
    comment_df = pd.DataFrame(comm_list, columns=["protein","comm_type","value"])
    
    # Multiple rows each protein for disease associations: protein name, disease name
    disease_df = pd.DataFrame(disease_list, columns=["protein","disease"])
    
    # Multiple rows each protein for tissue expression: protein name, tissue
    tissue_df = pd.DataFrame(tissue_list, columns=['protein','disease'])
    
    # Multiple rows each protein for subcellular localization: protein name, subcellular loc
    subcellular_df = pd.DataFrame(subcellular_list, columns=['protein','subcellular_loc'])
    
    if print_option:
        if protein_df.shape[0]!=0:
            print("Protein DataFrame")
            display(protein_df)
        if sec_struc_df.shape[0]!=0:
            print("Secondary structure DataFrame")
            display(sec_struc_df)
        if aa_mod_df.shape[0]!=0:
            print("Amino Acid Modifications")
            display(aa_mod_df)
        if comment_df.shape[0]!=0:
            print("Comment DataFrame")
            display(comment_df)
        if disease_df.shape[0]!=0:
            print("Disease DataFrame")
            display(disease_df)
        if tissue_df.shape[0]!=0:
            print("Tissue DataFrame")
            display(tissue_df)
        if subcellular_df.shape[0]!=0:
            print("Subcellular Location DataFrame")
            display(subcellular_df)
            
    # Record entries in sqlite
    if protein_df.shape[0]!=0:
        protein_df.to_sql('protein',conn,if_exists='append',index=False)
    if sec_struc_df.shape[0]!=0:
        sec_struc_df.to_sql('protein_secondary_structure',conn,if_exists='append',index=False)
    if aa_mod_df.shape[0]!=0:
        aa_mod_df.to_sql('protein_amino_acid_modifications',conn,if_exists='append',index=False)
    if comment_df.shape[0]!=0:
        comment_df.to_sql('protein_comments',conn,if_exists='append',index=False)
    if disease_df.shape[0]!=0:
        disease_df.to_sql('protein_diseases',conn,if_exists='append',index=False)
    if tissue_df.shape[0]!=0:
        tissue_df.to_sql('protein_tissue_expression',conn,if_exists='append',index=False)
    if subcellular_df.shape[0]!=0:
        subcellular_df.to_sql('protein_subcellular_localization',conn,if_exists='append',index=False)


In [199]:
# 
for entry in tqdm_notebook(amyloid_protein_dict['uniprot']['entry']):
    entryParser(entry,conn)

HBox(children=(IntProgress(value=0, max=70), HTML(value='')))

In [203]:
# Drop duplicates in each table.
for table in ['protein','protein_secondary_structure',
              'protein_amino_acid_modifications', 'protein_comments',
              'protein_diseases','protein_tissue_expression',
              'protein_subcellular_localization']:
    loaded_df = pd.read_sql("select * from "+table,con=conn)
    shape_bef = loaded_df.shape[0]
    loaded_df.drop_duplicates(inplace=True)
    shape_aft = loaded_df.shape[0]
    if shape_bef > shape_aft:
        print("Dropped "+str(shape_bef-shape_aft)+" rows from "+table)
    loaded_df.to_sql(table,con=conn,index=False,if_exists='replace')

Dropped 63 rows from protein
Dropped 499 rows from protein_secondary_structure
Dropped 246 rows from protein_amino_acid_modifications
Dropped 95 rows from protein_comments
Dropped 4 rows from protein_diseases
Dropped 9 rows from protein_tissue_expression
Dropped 1084 rows from protein_subcellular_localization


In [204]:
conn.close()

In [11]:
def nGrammer(protein, seq, n):
    return [[protein, i+1]+list(x) for i, x in enumerate(nltk.ngrams(seq, n))]

def dfNGrammer(protein_df, num_grams):
    gram_cols = ["protein","gram_num"]+["gram_"+str(i+1) for i in range(num_grams)]
    seq_grams = []
    seq_grams.extend([val for sublist in protein_df.apply(lambda row: nGrammer(row['protein'], row['sequence'], num_grams),axis=1) for val in sublist])
    gram_df = pd.DataFrame(seq_grams, columns=gram_cols)
    return gram_df
        
#ngram_df = dfNGrammer(prot_feat,5)

In [None]:
#prot_feat.shape,ngram_df.shape

In [None]:
#ngram_df.head()

In [None]:
#conn.close()