---
**API requests to UniProt**

---

Resources:
- https://www.uniprot.org/help/api_queries
- https://www.uniprot.org/help/query-fields
 - https://www.uniprot.org/help/api_idmapping
 - https://www.uniprot.org/docs/dbxref
 - https://www.uniprot.org/taxonomy/
  
- https://www.uniprot.org/help/uniprotkb_column_names

- https://www.uniprot.org/docs/userman.htm#linetypes

In [1]:
import requests
import os
import pandas as pd
import re
import regex as re2
from sklearn.feature_extraction.text import CountVectorizer
from io import StringIO
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from toolbox import *

In [2]:
# Project configuration; later cells read the 'rawDataUniProt' and
# 'outputPreprocessingUniprot' entries from it to build file paths.
cfg = load_cfg()

# Version log; the 'UniProt' entry is updated and written back further down.
logVersions = load_LogVersions()

# Download data

- v3.4 was downloaded on 09/11/2021

In [3]:
# Version tag of this raw download: embedded in the export file names and
# recorded under logVersions['UniProt']['rawData'] in a later cell.
version_uniprot = "3-4"

In [4]:
# Endpoint the query string gets appended to.
# NOTE(review): this is the legacy www.uniprot.org/uniprot endpoint — confirm it is still served.
baseRequest = "https://www.uniprot.org/uniprot/?"

# Filters, combined with "+AND+" when the URL is assembled
queryClauses = [
    "active:yes",     # not obsolete
    "reviewed:yes",   # Swiss-prot
    "organism:9606",  # Human only
]

# Columns requested from the service, sent as one comma-separated value
requestedColumns = [
    "id",
    "go(biological process)",
    "go(cellular component)",
    "go(molecular function)",
    # "database(ensembl)",
    "database(bgee)",
    "feature(DOMAIN EXTENT)",
    "feature(MOTIF)",
    "sequence",
]

requestParameters = {
    "query": queryClauses,
    "format": ["tab"],
    "columns": [",".join(requestedColumns)],
}

# One "key=value" chunk per parameter, chunks separated by '&'
baseRequest = baseRequest + '&'.join(
    '%s=%s' % (paramName, "+AND+".join(paramValues))
    for paramName, paramValues in requestParameters.items()
)

print(baseRequest)

https://www.uniprot.org/uniprot/?query=active:yes+AND+reviewed:yes+AND+organism:9606&format=tab&columns=id,go(biological process),go(cellular component),go(molecular function),database(bgee),feature(DOMAIN EXTENT),feature(MOTIF),sequence


In [5]:
# Download the full result table in a single GET.
# The (connect, read) timeout makes a stalled connection raise instead of
# blocking the kernel indefinitely; the read limit is generous because the
# response is large.
results = requests.get(baseRequest, timeout=(10, 600))

In [6]:
# Sanity check: stop here on any HTTP error status.
# raise_for_status() reports the status code and URL, and unlike a bare
# `assert` it is not skipped when Python runs with -O.
results.raise_for_status()

In [7]:
# Decode the response body and parse it as a tab-separated table
responseText = results.content.decode("utf-8")
uniprotEnriched1 = pd.read_csv(StringIO(responseText), sep="\t")

glance(uniprotEnriched1)

DataFrame: 20,386 rows 	 8 columns


Unnamed: 0,Entry,Gene ontology (biological process),Gene ontology (cellular component),Gene ontology (molecular function),Cross-reference (bgee),Domain [FT],Motif,Sequence
0,Q00266,methionine catabolic process [GO:0009087]; met...,cytosol [GO:0005829],ATP binding [GO:0005524]; identical protein bi...,ENSG00000151224;,,,MNGPVDGLCDHSLSEGVFMFTSESVGEGHPDKICDQISDAVLDAHL...
1,Q8NB16,activation of JUN kinase activity [GO:0007257]...,cell junction [GO:0030054]; cytoplasm [GO:0005...,ATP binding [GO:0005524]; identical protein bi...,ENSG00000168404;,"DOMAIN 194..469; /note=""Protein kinase""; /ev...",,MENLKHIITLGQVIHKRCEEMKYCKKQCRRLGHRVLGLIKPLEMLQ...
2,O94851,actin filament depolymerization [GO:0030042]; ...,nucleus [GO:0005634],actin binding [GO:0003779]; FAD binding [GO:00...,ENSG00000133816;,"DOMAIN 516..619; /note=""Calponin-homology (CH...","MOTIF 660..681; /note=""Nuclear localization s...",MGENEDEKQAQAGQVFENFVQASTCKGTLQAFNILTRHLDLDPLDH...
3,Q8TDZ2,actin filament bundle assembly [GO:0051017]; a...,actin cytoskeleton [GO:0015629]; cytoplasm [GO...,actin binding [GO:0003779]; actin filament bin...,ENSG00000135596;,"DOMAIN 508..612; /note=""Calponin-homology (CH...",,MASPTSTNPAHAHFESFLQAQLCQDVLSSFQELCGALGLEPGGGLP...
4,Q9NPJ6,"positive regulation of transcription, DNA-temp...",core mediator complex [GO:0070847]; mediator c...,nuclear receptor coactivator activity [GO:0030...,ENSG00000136146;,,,MAASSSGEKEKERLGGGLGVAGGNSTRERLLSALEDLEVLSRELIE...


---
**Clean columns**

In [8]:
# Remove the ';' characters from the Bgee cross-reference column.
# regex=False makes the literal match explicit: the default of `regex`
# differs between pandas 1.x and 2.x.
# NOTE(review): if an entry lists several Bgee IDs ("ID1;ID2;"), this fuses
# them into one string — confirm each entry carries at most one ID.
uniprotEnriched1['Cross-reference (bgee)'] = (
    uniprotEnriched1['Cross-reference (bgee)'].str.replace(';', '', regex=False)
)

uniprotEnriched1['Cross-reference (bgee)'][:5]

0    ENSG00000151224
1    ENSG00000168404
2    ENSG00000133816
3    ENSG00000135596
4    ENSG00000136146
Name: Cross-reference (bgee), dtype: object

In [9]:
# Show the full sequence of the first row (the table preview truncates it)
print(uniprotEnriched1["Sequence"][0])

MNGPVDGLCDHSLSEGVFMFTSESVGEGHPDKICDQISDAVLDAHLKQDPNAKVACETVCKTGMVLLCGEITSMAMVDYQRVVRDTIKHIGYDDSAKGFDFKTCNVLVALEQQSPDIAQCVHLDRNEEDVGAGDQGLMFGYATDETEECMPLTIILAHKLNARMADLRRSGLLPWLRPDSKTQVTVQYMQDNGAVIPVRIHTIVISVQHNEDITLEEMRRALKEQVIRAVVPAKYLDEDTVYHLQPSGRFVIGGPQGDAGVTGRKIIVDTYGGWGAHGGGAFSGKDYTKVDRSAAYAARWVAKSLVKAGLCRRVLVQVSYAIGVAEPLSISIFTYGTSQKTERELLDVVHKNFDLRPGVIVRDLDLKKPIYQKTACYGHFGRSEFPWEVPRKLVF


----
**Sort by alphabetical order**

In [10]:
# Order rows by accession, then renumber the index from 0
sortedByEntry = uniprotEnriched1.sort_values(by="Entry")
uniprotEnriched2 = sortedByEntry.reset_index(drop=True)
glance(uniprotEnriched2)

DataFrame: 20,386 rows 	 8 columns


Unnamed: 0,Entry,Gene ontology (biological process),Gene ontology (cellular component),Gene ontology (molecular function),Cross-reference (bgee),Domain [FT],Motif,Sequence
0,A0A024RBG1,adenosine 5'-(hexahydrogen pentaphosphate) cat...,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,bis(5'-adenosyl)-hexaphosphatase activity [GO:...,ENSG00000173598,"DOMAIN 18..145; /note=""Nudix hydrolase""; /ev...","MOTIF 51..72; /note=""Nudix box""; /evidence=""...",MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
1,A0A075B6H7,adaptive immune response [GO:0002250]; immune ...,extracellular space [GO:0005615]; immunoglobul...,,ENSG00000243063,"DOMAIN 22..>116; /note=""Ig-like""; /evidence=...",,MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRAS...
2,A0A075B6H8,adaptive immune response [GO:0002250]; immune ...,extracellular space [GO:0005615]; immunoglobul...,,ENSG00000211633,"DOMAIN 23..>117; /note=""Ig-like""; /evidence=...",,MDMRVPAQLLGLLLLWLPGVRFDIQMTQSPSFLSASVGDRVSIICW...
3,A0A075B6H9,adaptive immune response [GO:0002250]; immune ...,extracellular space [GO:0005615]; immunoglobul...,,ENSG00000211637,"DOMAIN 21..>119; /note=""Ig-like""; /evidence=...",,MAWTPLLFLTLLLHCTGSLSQLVLTQSPSASASLGASVKLTCTLSS...
4,A0A075B6I0,adaptive immune response [GO:0002250]; immune ...,extracellular space [GO:0005615]; immunoglobul...,,ENSG00000211638,"DOMAIN 25..>122; /note=""Ig-like""; /evidence=...",,MSVPTMAWMMLLLGLLAYGSGVDSQTVVTQEPSFSVSPGGTVTLTC...


---
**Test uniprot matching**

In [11]:
# Distinct accessions of the downloaded table
uniprotkbIdsList = list(set(uniprotEnriched2["Entry"]))

# Map the accessions with fromID='ACC'; the result exposes From / To columns
uniprotMapping = mappingUniprotIDs(fromID='ACC', listIDs=uniprotkbIdsList)

# Every returned row must map an accession onto itself
mismatchedRows = uniprotMapping.loc[uniprotMapping.From != uniprotMapping.To]
assert len(mismatchedRows) == 0

---
**Export**

In [12]:
# Second name for the sorted table (plain assignment, no copy is made);
# the export cells below write it to disk.
uniprotEnriched_export = uniprotEnriched2

In [13]:
# Record the raw-data version in the version log and write the log back.
# The 'UniProt' section is created on first use so this cell no longer
# needs a manually un-commented initialisation line.
if 'UniProt' not in logVersions:
    logVersions['UniProt'] = dict()
logVersions['UniProt']['rawData'] = version_uniprot

dump_LogVersions(logVersions)

In [14]:
# All three raw exports go to the same directory
rawDir = cfg['rawDataUniProt']

# Export raw enriched data
enrichedPath = os.path.join(
    rawDir, "uniprot_allProteinsEnriched_Human_v{}.pkl".format(version_uniprot)
)
uniprotEnriched_export.to_pickle(enrichedPath)

# Export protein list
# NOTE(review): despite the .pkl suffix this file is plain text, one
# accession per line — confirm readers of the file expect that.
proteinListPath = os.path.join(
    rawDir, "uniprot_allProteins_Human_v{}.pkl".format(version_uniprot)
)
with open(proteinListPath, 'w') as f:
    f.writelines("%s\n" % item for item in uniprotEnriched_export.Entry)

# Export UniProt/Bgee matching
bgeePath = os.path.join(
    rawDir, "uniprot_allProteinsBgee_Human_v{}.pkl".format(version_uniprot)
)
uniprotEnriched_export.loc[:, ['Entry', 'Cross-reference (bgee)']].to_pickle(bgeePath)

In [15]:
# Preview of the accession / Bgee ID pairs that were exported
uniprotEnriched_export[['Entry', 'Cross-reference (bgee)']]

Unnamed: 0,Entry,Cross-reference (bgee)
0,A0A024RBG1,ENSG00000173598
1,A0A075B6H7,ENSG00000243063
2,A0A075B6H8,ENSG00000211633
3,A0A075B6H9,ENSG00000211637
4,A0A075B6I0,ENSG00000211638
...,...,...
20381,S4R3Y5,ENSG00000270188
20382,U3KPV4,ENSG00000184389
20383,W5XKT8,
20384,W6CW81,


In [16]:
# Read the Bgee matching back from disk, locating it through the version
# stored in the log rather than the local version variable
bgeeMatchingPath = os.path.join(
    cfg['rawDataUniProt'],
    "uniprot_allProteinsBgee_Human_v{}.pkl".format(logVersions['UniProt']['rawData']),
)
uniprotBgeeMatching = pd.read_pickle(bgeeMatchingPath)
glance(uniprotBgeeMatching)

DataFrame: 20,386 rows 	 2 columns


Unnamed: 0,Entry,Cross-reference (bgee)
0,A0A024RBG1,ENSG00000173598
1,A0A075B6H7,ENSG00000243063
2,A0A075B6H8,ENSG00000211633
3,A0A075B6H9,ENSG00000211637
4,A0A075B6I0,ENSG00000211638


# Create features datasets
- v2.0 is the current preprocessing version

In [3]:
# Re-read the version log; its ['UniProt']['rawData'] entry is used below
# to locate the raw export to load.
logVersions = load_LogVersions()

In [4]:
# Tag of the preprocessing version; it ends up in every output file name
myVersionUniprot = '2-0'

# Store the tag in the UniProt section of the log, then write the log back
uniprotLog = logVersions['UniProt']
uniprotLog['preprocessed'] = myVersionUniprot

dump_LogVersions(logVersions)

In [5]:
# Load the raw enriched table whose version is recorded in the log
enrichedFile = "uniprot_allProteinsEnriched_Human_v{}.pkl".format(
    logVersions['UniProt']['rawData']
)
uniprotEnriched = pd.read_pickle(os.path.join(cfg['rawDataUniProt'], enrichedFile))

glance(uniprotEnriched)

DataFrame: 20,386 rows 	 8 columns


Unnamed: 0,Entry,Gene ontology (biological process),Gene ontology (cellular component),Gene ontology (molecular function),Cross-reference (bgee),Domain [FT],Motif,Sequence
0,A0A024RBG1,adenosine 5'-(hexahydrogen pentaphosphate) cat...,cytoplasm [GO:0005737]; cytosol [GO:0005829]; ...,bis(5'-adenosyl)-hexaphosphatase activity [GO:...,ENSG00000173598,"DOMAIN 18..145; /note=""Nudix hydrolase""; /ev...","MOTIF 51..72; /note=""Nudix box""; /evidence=""...",MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
1,A0A075B6H7,adaptive immune response [GO:0002250]; immune ...,extracellular space [GO:0005615]; immunoglobul...,,ENSG00000243063,"DOMAIN 22..>116; /note=""Ig-like""; /evidence=...",,MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRAS...
2,A0A075B6H8,adaptive immune response [GO:0002250]; immune ...,extracellular space [GO:0005615]; immunoglobul...,,ENSG00000211633,"DOMAIN 23..>117; /note=""Ig-like""; /evidence=...",,MDMRVPAQLLGLLLLWLPGVRFDIQMTQSPSFLSASVGDRVSIICW...
3,A0A075B6H9,adaptive immune response [GO:0002250]; immune ...,extracellular space [GO:0005615]; immunoglobul...,,ENSG00000211637,"DOMAIN 21..>119; /note=""Ig-like""; /evidence=...",,MAWTPLLFLTLLLHCTGSLSQLVLTQSPSASASLGASVKLTCTLSS...
4,A0A075B6I0,adaptive immune response [GO:0002250]; immune ...,extracellular space [GO:0005615]; immunoglobul...,,ENSG00000211638,"DOMAIN 25..>122; /note=""Ig-like""; /evidence=...",,MSVPTMAWMMLLLGLLAYGSGVDSQTVVTQEPSFSVSPGGTVTLTC...


In [6]:
# Column dtypes and non-null counts of the loaded table
uniprotEnriched.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20386 entries, 0 to 20385
Data columns (total 8 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   Entry                               20386 non-null  object
 1   Gene ontology (biological process)  17048 non-null  object
 2   Gene ontology (cellular component)  18621 non-null  object
 3   Gene ontology (molecular function)  15834 non-null  object
 4   Cross-reference (bgee)              19090 non-null  object
 5   Domain [FT]                         8571 non-null   object
 6   Motif                               2283 non-null   object
 7   Sequence                            20386 non-null  object
dtypes: object(8)
memory usage: 1.2+ MB


In [10]:
# Number of missing values in each column
uniprotEnriched.isnull().sum()

Entry                                     0
Gene ontology (biological process)     3338
Gene ontology (cellular component)     1765
Gene ontology (molecular function)     4552
Cross-reference (bgee)                 1296
Domain [FT]                           11815
Motif                                 18103
Sequence                                  0
dtype: int64

In [20]:
# Replace missing annotations with empty strings before they are parsed below
uniprotEnriched2 = uniprotEnriched.fillna(value='')

## Biological process

In [21]:
# regex0 matches the digits enclosed in each "[GO:...]" tag
goIdPattern = r"(?<=\[GO:)[\d]+(?=\])"
bioProcessTerms = createGOlist(
    GOcol=uniprotEnriched2["Gene ontology (biological process)"],
    regex0=goIdPattern,
)

# Bag-of-words matrix plus the vectorizer holding the column vocabulary
bow, vectorizer = createBoW(bioProcessTerms)
bow

20386 20386
Shape BoW: (20386, 12248)


<20386x12248 sparse matrix of type '<class 'numpy.int64'>'
	with 135438 stored elements in Compressed Sparse Row format>

In [22]:
# This one takes a while to run: the sparse matrix is expanded to a dense table.
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` when available and fall back on older installs.
# (assumes `vectorizer` is the scikit-learn vectorizer returned by createBoW)
if hasattr(vectorizer, "get_feature_names_out"):
    featureNames = list(vectorizer.get_feature_names_out())
else:
    featureNames = vectorizer.get_feature_names()

# toarray() gives a plain ndarray rather than the discouraged np.matrix
bioProcessUniprot_BoW = pd.DataFrame(bow.toarray(), columns=featureNames)
# presumably the rows of `bow` follow the row order of uniprotEnriched2 — verify in createGOlist
bioProcessUniprot_BoW['uniprotID'] = uniprotEnriched2['Entry']

glance(bioProcessUniprot_BoW)

DataFrame: 20,386 rows 	 12,249 columns


Unnamed: 0,0000002,0000003,0000012,0000017,0000018,0000019,0000022,0000023,0000027,0000028,...,2001287,2001288,2001294,2001295,2001301,2001302,2001303,2001306,2001311,uniprotID
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A024RBG1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H7
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H8
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H9
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6I0


---
**Export**

In [27]:
# File name carries both the raw-data and the preprocessing version
bioProcessFile = "bioProcessUniprot_v{}--{}.pkl".format(
    logVersions['UniProt']['rawData'],
    logVersions['UniProt']['preprocessed'],
)
bioProcessUniprot_BoW.to_pickle(
    os.path.join(cfg['outputPreprocessingUniprot'], bioProcessFile)
)

## Cellular component

In [28]:
# regex0 matches the digits enclosed in each "[GO:...]" tag
goIdPattern = r"(?<=\[GO:)[\d]+(?=\])"
cellCompTerms = createGOlist(
    GOcol=uniprotEnriched2["Gene ontology (cellular component)"],
    regex0=goIdPattern,
)

# Bag-of-words matrix plus the vectorizer holding the column vocabulary
bow, vectorizer = createBoW(cellCompTerms)
bow

20386 20386
Shape BoW: (20386, 1754)


<20386x1754 sparse matrix of type '<class 'numpy.int64'>'
	with 81474 stored elements in Compressed Sparse Row format>

In [29]:
# Expand the sparse bag-of-words into a dense table, one column per term.
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` when available and fall back on older installs.
# (assumes `vectorizer` is the scikit-learn vectorizer returned by createBoW)
if hasattr(vectorizer, "get_feature_names_out"):
    featureNames = list(vectorizer.get_feature_names_out())
else:
    featureNames = vectorizer.get_feature_names()

# toarray() gives a plain ndarray rather than the discouraged np.matrix
cellCompUniprot_BoW = pd.DataFrame(bow.toarray(), columns=featureNames)
# presumably the rows of `bow` follow the row order of uniprotEnriched2 — verify in createGOlist
cellCompUniprot_BoW['uniprotID'] = uniprotEnriched2['Entry']

glance(cellCompUniprot_BoW)

DataFrame: 20,386 rows 	 1,755 columns


Unnamed: 0,0000015,0000109,0000110,0000111,0000112,0000118,0000120,0000123,0000124,0000125,...,1990876,1990879,1990904,1990907,1990909,1990913,1990917,1990923,1990971,uniprotID
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A024RBG1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H7
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H8
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H9
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6I0


---
**Export**

In [33]:
# File name carries both the raw-data and the preprocessing version
cellCompFile = "cellCompUniprot_v{}--{}.pkl".format(
    logVersions['UniProt']['rawData'],
    logVersions['UniProt']['preprocessed'],
)
cellCompUniprot_BoW.to_pickle(
    os.path.join(cfg['outputPreprocessingUniprot'], cellCompFile)
)

## Molecular function

In [35]:
# regex0 matches the digits enclosed in each "[GO:...]" tag
goIdPattern = r"(?<=\[GO:)[\d]+(?=\])"
molFuncTerms = createGOlist(
    GOcol=uniprotEnriched2["Gene ontology (molecular function)"],
    regex0=goIdPattern,
)

# Bag-of-words matrix plus the vectorizer holding the column vocabulary
bow, vectorizer = createBoW(molFuncTerms)
bow

20386 20386
Shape BoW: (20386, 4346)


<20386x4346 sparse matrix of type '<class 'numpy.int64'>'
	with 56227 stored elements in Compressed Sparse Row format>

In [36]:
# Expand the sparse bag-of-words into a dense table, one column per term.
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` when available and fall back on older installs.
# (assumes `vectorizer` is the scikit-learn vectorizer returned by createBoW)
if hasattr(vectorizer, "get_feature_names_out"):
    featureNames = list(vectorizer.get_feature_names_out())
else:
    featureNames = vectorizer.get_feature_names()

# toarray() gives a plain ndarray rather than the discouraged np.matrix
molFuncUniprot_BoW = pd.DataFrame(bow.toarray(), columns=featureNames)
# presumably the rows of `bow` follow the row order of uniprotEnriched2 — verify in createGOlist
molFuncUniprot_BoW['uniprotID'] = uniprotEnriched2['Entry']

glance(molFuncUniprot_BoW)

DataFrame: 20,386 rows 	 4,347 columns


Unnamed: 0,0000009,0000010,0000014,0000016,0000026,0000030,0000033,0000035,0000036,0000048,...,1990932,1990935,1990939,1990948,1990955,1990984,2001065,2001069,2001070,uniprotID
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A024RBG1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H7
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H8
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H9
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6I0


---
**Export**

In [37]:
# File name carries both the raw-data and the preprocessing version
molFuncFile = "molFuncUniprot_v{}--{}.pkl".format(
    logVersions['UniProt']['rawData'],
    logVersions['UniProt']['preprocessed'],
)
molFuncUniprot_BoW.to_pickle(
    os.path.join(cfg['outputPreprocessingUniprot'], molFuncFile)
)

## Domain

In [38]:
# regex0 matches the text between the quotes of each note="..." field
notePattern = r"(?<=note=\")[^\"]+(?=\")"
domainTerms = createGOlist(
    GOcol=uniprotEnriched2["Domain [FT]"],
    regex0=notePattern,
)

# Bag-of-words matrix plus the vectorizer holding the column vocabulary
bow, vectorizer = createBoW(domainTerms)
bow

20386 20386
Shape BoW: (20386, 2313)


<20386x2313 sparse matrix of type '<class 'numpy.int64'>'
	with 20663 stored elements in Compressed Sparse Row format>

In [39]:
# This one takes a while to run: the sparse matrix is expanded to a dense table.
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` when available and fall back on older installs.
# (assumes `vectorizer` is the scikit-learn vectorizer returned by createBoW)
if hasattr(vectorizer, "get_feature_names_out"):
    featureNames = list(vectorizer.get_feature_names_out())
else:
    featureNames = vectorizer.get_feature_names()

# toarray() gives a plain ndarray rather than the discouraged np.matrix
domain_BoW = pd.DataFrame(bow.toarray(), columns=featureNames)
# presumably the rows of `bow` follow the row order of uniprotEnriched2 — verify in createGOlist
domain_BoW['uniprotID'] = uniprotEnriched2['Entry']

glance(domain_BoW)

DataFrame: 20,386 rows 	 2,314 columns


Unnamed: 0,10,11,12,2Fe2Sferredoxintype,2SPRY,2SPRY1,2SPRY2,2SPRY3,3ligatedtype,4Fe4SHis,...,tSNAREcoiledcoilhomology2,trtypeG,truncated,uDENN,uDENNC9ORF72type,uDENNFLCNSMCR8type,uDENNFNIP12type,vSNAREcoiledcoilhomology,xRRM,uniprotID
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A024RBG1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H7
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H8
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H9
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6I0


---
**Test of the parsing method**

In [40]:
# Put the raw annotation (a) next to what the parser extracts from it (b)
rawDomains = uniprotEnriched2["Domain [FT]"]
parsedDomains = createGOlist(
    GOcol=uniprotEnriched2["Domain [FT]"],
    regex0=r"(?<=note=\")[^\"]+(?=\")",
)
temp1 = pd.DataFrame({'a': rawDomains, 'b': parsedDomains})

# Only rows that actually carry an annotation
temp1.loc[temp1.a != '']

20386 20386


Unnamed: 0,a,b
0,"DOMAIN 18..145; /note=""Nudix hydrolase""; /ev...",Nudixhydrolase
1,"DOMAIN 22..>116; /note=""Ig-like""; /evidence=...",Iglike
2,"DOMAIN 23..>117; /note=""Ig-like""; /evidence=...",Iglike
3,"DOMAIN 21..>119; /note=""Ig-like""; /evidence=...",Iglike
4,"DOMAIN 25..>122; /note=""Ig-like""; /evidence=...",Iglike
...,...,...
20372,"DOMAIN 873..953; /note=""IPT/TIG""; DOMAIN 1547...",IPTTIG IQ1 IQ2 IQ3
20373,"DOMAIN 644..707; /note=""SAM""; DOMAIN 779..989...",SAM DDHD
20378,"DOMAIN 53..112; /note=""Collagen-like""; DOMAIN...",Collagenlike Ctypelectin
20383,"DOMAIN 150..236; /note=""Ig-like""; /evidence=...",Iglike


In [41]:
# Sanity check: no row may have an annotation but an empty parsed result
hasAnnotation = temp1.a != ''
parsedNothing = temp1.b == ''
foo = temp1.loc[hasAnnotation & parsedNothing]
assert foo.shape[0] == 0

---
**Export**

In [43]:
# File name carries both the raw-data and the preprocessing version
domainFile = "domainFT_v{}--{}.pkl".format(
    logVersions['UniProt']['rawData'],
    logVersions['UniProt']['preprocessed'],
)
domain_BoW.to_pickle(
    os.path.join(cfg['outputPreprocessingUniprot'], domainFile)
)

## Motif

In [44]:
# regex0 matches the text between the quotes of each note="..." field
notePattern = r"(?<=note=\")[^\"]+(?=\")"
motifTerms = createGOlist(
    GOcol=uniprotEnriched2["Motif"],
    regex0=notePattern,
)

# Bag-of-words matrix plus the vectorizer holding the column vocabulary
bow, vectorizer = createBoW(motifTerms)
bow

20386 20386
Shape BoW: (20386, 819)


<20386x819 sparse matrix of type '<class 'numpy.int64'>'
	with 3584 stored elements in Compressed Sparse Row format>

In [45]:
# This one takes a while to run: the sparse matrix is expanded to a dense table.
# `get_feature_names()` was removed in scikit-learn 1.2; use
# `get_feature_names_out()` when available and fall back on older installs.
# (assumes `vectorizer` is the scikit-learn vectorizer returned by createBoW)
if hasattr(vectorizer, "get_feature_names_out"):
    featureNames = list(vectorizer.get_feature_names_out())
else:
    featureNames = vectorizer.get_feature_names()

# toarray() gives a plain ndarray rather than the discouraged np.matrix
motif_BoW = pd.DataFrame(bow.toarray(), columns=featureNames)
# presumably the rows of `bow` follow the row order of uniprotEnriched2 — verify in createGOlist
motif_BoW['uniprotID'] = uniprotEnriched2['Entry']

glance(motif_BoW)

DataFrame: 20,386 rows 	 820 columns


Unnamed: 0,2SPRYdomainbindingmotif,4EHPbindingmotif,4Smotif,4motif,9aaTAD,9aaTAD1,9aaTAD2,AAASmotif,AAD,ABS2,...,requiredforinteractionwithCTTN,requiredforinteractionwithHSC20,requiredforinteractionwithMAPRE1,requiredforinteractionwithSLC9A3R1,requiredforpropernuclearlocalization,signalforcargopackagingintoCOPIIcoatedvesicles,toFYN,toLCP2,xCxx,uniprotID
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A024RBG1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H7
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H8
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6H9
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,A0A075B6I0


---
**Test of the parsing method**

In [46]:
# Put the raw annotation (a) next to what the parser extracts from it (b)
rawMotifs = uniprotEnriched2["Motif"]
parsedMotifs = createGOlist(
    GOcol=uniprotEnriched2["Motif"],
    regex0=r"(?<=note=\")[^\"]+(?=\")",
)
temp1 = pd.DataFrame({'a': rawMotifs, 'b': parsedMotifs})

# Only rows that actually carry an annotation
temp1.loc[temp1.a != '']

20386 20386


Unnamed: 0,a,b
0,"MOTIF 51..72; /note=""Nudix box""; /evidence=""...",Nudixbox
67,"MOTIF 275..277; /note=""POLO box domain (PBD)-...",POLOboxdomain(PBD)binding
237,"MOTIF 48..57; /note=""AxLyCxL""; /evidence=""EC...",AxLyCxL
385,"MOTIF 28..43; /note=""Engrailed homology 1 rep...",Engrailedhomology1repressor
413,"MOTIF 25..35; /note=""Cx9C motif 1""; /evidenc...",Cx9Cmotif1 Cx9Cmotif2
...,...,...
20335,"MOTIF 1059..1061; /note=""Cell attachment site...",Cellattachmentsite
20345,"MOTIF 685..689; /note=""LXXLL motif 1""; MOTIF ...",LXXLLmotif1 LXXLLmotif2 LXXLLmotif3
20357,"MOTIF 2..30; /note=""Q motif""; MOTIF 152..155;...",Qmotif DEADbox
20364,"MOTIF 19..23; /note=""LXXLL motif""",LXXLLmotif


In [47]:
# Sanity check: no row may have an annotation but an empty parsed result
hasAnnotation = temp1.a != ''
parsedNothing = temp1.b == ''
foo = temp1.loc[hasAnnotation & parsedNothing]
assert foo.shape[0] == 0

---
**Export**

In [49]:
# File name carries both the raw-data and the preprocessing version
motifFile = "motif_v{}--{}.pkl".format(
    logVersions['UniProt']['rawData'],
    logVersions['UniProt']['preprocessed'],
)
motif_BoW.to_pickle(
    os.path.join(cfg['outputPreprocessingUniprot'], motifFile)
)

## Sequence

In [50]:
# Full sequence string of the first row
uniprotEnriched2["Sequence"][0]

'MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQWIVPGGGMEPEEEPGGAAVREVYEEAGVKGKLGRLLGIFEQNQDRKHRTYVYVLTVTEILEDWEDSVNIGRKREWFKVEDAIKVLQCHKPVHAEYLEKLKLGCSPANGNSTVPSLPDNNALFVTAAQTSGLPSSVR'

In [51]:
# Keep only accession and sequence, under the column names used by the
# other exported tables ('uniprotID') and a lower-case 'sequence'
sequenceData = uniprotEnriched2[['Entry', 'Sequence']].rename(
    columns={'Entry': 'uniprotID', 'Sequence': 'sequence'}
)

glance(sequenceData)

DataFrame: 20,386 rows 	 2 columns


Unnamed: 0,uniprotID,sequence
0,A0A024RBG1,MMKFKPNQTRTYDREGFKKRAACLCFRSEQEDEVLLVSSSRYPDQW...
1,A0A075B6H7,MEAPAQLLFLLLLWLPDTTREIVMTQSPPTLSLSPGERVTLSCRAS...
2,A0A075B6H8,MDMRVPAQLLGLLLLWLPGVRFDIQMTQSPSFLSASVGDRVSIICW...
3,A0A075B6H9,MAWTPLLFLTLLLHCTGSLSQLVLTQSPSASASLGASVKLTCTLSS...
4,A0A075B6I0,MSVPTMAWMMLLLGLLAYGSGVDSQTVVTQEPSFSVSPGGTVTLTC...


---
**Export**

In [52]:
# File name carries both the raw-data and the preprocessing version
sequenceFile = "sequenceData_v{}--{}.pkl".format(
    logVersions['UniProt']['rawData'],
    logVersions['UniProt']['preprocessed'],
)
sequenceData.to_pickle(
    os.path.join(cfg['outputPreprocessingUniprot'], sequenceFile)
)