In [32]:
import requests
import pandas as pd
import json

In [33]:
patric = pd.read_table("patric1.tbl", nrows = 1000)
patric = patric.rename(columns = {'genome.genome_id':'genome_id', 
    'feature.patric_id':'patric_id', 
    'feature.aa_sequence_md5':'md5',
    'feature.pgfam_id':'pgfam_id',
    'feature.plfam_id':'plfam_id'
})
patric.dropna(subset = ['md5'], inplace = True)
patric['species'] = (patric['genome_id'].astype(str).apply(lambda x: x.split('.')[0])).astype(int)

patricmd5 = patric['md5'].values
patricspecies = patric['species'].drop_duplicates().values

In [34]:
patric

Unnamed: 0,genome_id,patric_id,md5,pgfam_id,plfam_id,species
0,1478221.3,fig|1478221.3.peg.329,d7d58cc8dc87a0859d34f29a0d3dfb8b,PGF_01671660,,1478221
1,1478221.3,fig|1478221.3.peg.178,f37f77ec6ed4833af23f052ef2bb13c8,,,1478221
2,1478221.3,fig|1478221.3.peg.1715,0842c842419f20909f282500bcf9576e,,,1478221
3,1478221.3,fig|1478221.3.peg.1546,2a6eadada00a8d879ad2bc775e90e154,PGF_00012969,,1478221
4,1478221.3,fig|1478221.3.peg.2514,b86b786e2067b0717c927b484e5dae52,,,1478221
...,...,...,...,...,...,...
995,1478221.3,fig|1478221.3.peg.485,6f2e1dd1eada2333aa01bc25f7ee8556,,,1478221
996,1478221.3,fig|1478221.3.peg.2031,40fd4d24142a8ec274d9b44bb5e9ea9a,,,1478221
997,1478221.3,fig|1478221.3.peg.2572,625a94adc7dfe97cca3d92da2415c6e6,,,1478221
998,1478221.3,fig|1478221.3.peg.4460,c6bc746737b9289a869bf61aebf89edd,PGF_08225224,,1478221


In [35]:
def md5toString(md5array):
    s = '('
    for md5 in md5array:
        s = s + '"' + md5 + '",'
    s = s[:-1] + ')'
    return s
def speciestoString(species):
    s = '('
    for org in species:
        s = s + 'taxon:' + str(org) + ','
    s = s[:-1] + ')'
    return s

In [36]:
search = f"""
    PREFIX up: <http://purl.uniprot.org/core/>
    PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
        SELECT DISTINCT
            (CONCAT(SUBSTR(STR(?protein), 33)) AS ?uniprot)
            ?md5 
            ?aa_sequence
            (CONCAT(SUBSTR(STR(?taxon), 34)) AS ?species)
            (CONCAT(SUBSTR(STR(?goTerm), 32)) AS ?GO)
            ?goLabel
        WHERE {{
    VALUES (?go) {{("GO_0008150") ("GO_0005575") ("GO_0003674")}}
    BIND (IRI(CONCAT("http://purl.obolibrary.org/obo/", ?go)) AS ?aspect)
            ?protein a up:Protein ;
                up:organism ?taxon ;
                up:classifiedWith ?goTerm ;
                up:sequence ?sequence .
            ?sequence up:md5Checksum ?md5 ;
                      rdf:value ?aa_sequence .
            ?goTerm rdfs:subClassOf ?aspect ;
                    rdfs:label ?goLabel .
        FILTER (?md5 in {md5toString(patricmd5)})
        FILTER (?taxon in {speciestoString(patricspecies)})
        }}
"""
r1 = requests.post("https://sparql.uniprot.org", headers={'accept': 'application/sparql-results+json'}, data={'query': search})
results = r1.json()

In [37]:
uniprot = pd.DataFrame(columns=('md5', 'uniprotID', 'species', 'aa_sequence', 'goTerm', 'goLabel'))
for row in results['results']['bindings']:
    uniprot = uniprot.append({
        'md5': row['md5']['value'], 
        'uniprotID': row['uniprot']['value'],
        'species': int(row['species']['value']),
        'aa_sequence': row['aa_sequence']['value'],
        'goTerm': row['GO']['value'],
        'goLabel': row['goLabel']['value']},
        ignore_index=True)

In [38]:
patricGO = pd.merge(uniprot, patric)
crosslinks = patricGO[['uniprotID', 'patric_id']].drop_duplicates()
crosslinks.to_csv("linkout.tbl", index=False, sep = '\t')
patricGO

Unnamed: 0,md5,uniprotID,species,aa_sequence,goTerm,goLabel,genome_id,patric_id,pgfam_id,plfam_id
0,68ed1e7c2fd73db33789b97b25661e68,A0A267MF46,1478221,MSIKVENLKHIYNPNTPFETIALNNVSFNIEKGEFIGLIGHTGSGK...,GO_0006824,cobalt ion transport,1478221.3,fig|1478221.3.peg.1527,PGF_00843068,
1,68ed1e7c2fd73db33789b97b25661e68,A0A267MF46,1478221,MSIKVENLKHIYNPNTPFETIALNNVSFNIEKGEFIGLIGHTGSGK...,GO_0005524,ATP binding,1478221.3,fig|1478221.3.peg.1527,PGF_00843068,
2,68ed1e7c2fd73db33789b97b25661e68,A0A267MF46,1478221,MSIKVENLKHIYNPNTPFETIALNNVSFNIEKGEFIGLIGHTGSGK...,GO_0005886,plasma membrane,1478221.3,fig|1478221.3.peg.1527,PGF_00843068,
3,68ed1e7c2fd73db33789b97b25661e68,A0A267MF46,1478221,MSIKVENLKHIYNPNTPFETIALNNVSFNIEKGEFIGLIGHTGSGK...,GO_0042626,ATPase-coupled transmembrane transporter activity,1478221.3,fig|1478221.3.peg.1527,PGF_00843068,
4,080aed5d192e3b528b13921cf2cc26d3,A0A267MHJ5,1478221,MEKIMDFLFGLKMTVVSGIFLALSLICMMVGIEVPVDFAWGAVLIS...,GO_0019829,ATPase-coupled cation transmembrane transporte...,1478221.3,fig|1478221.3.peg.793,PGF_07109475,
...,...,...,...,...,...,...,...,...,...,...
1606,1091ea6fed8ec4c7440215f77352ce97,A0A267MLZ1,1478221,MPNIICAGFGGQGVLTAGLIIAKTGMNNNKNVTWIPSYGSEMRGGT...,GO_0016625,"oxidoreductase activity, acting on the aldehyd...",1478221.3,fig|1478221.3.peg.1750,PGF_03197417,
1607,36f21670edfc46943914da58345fef9f,A0A267MH52,1478221,MRIIFMGTPDFAVPCLDVVAKEHELLAVVTQPDRPKGRGKKLAAPP...,GO_0004479,methionyl-tRNA formyltransferase activity,1478221.3,fig|1478221.3.peg.915,PGF_05165078,
1608,b44f04c9b33b565b80e6504ebdab0cb1,A0A267MK93,1478221,MERYERNMSMLSEDENKKLSSFKVCVVGSGGLGGYVIEMLGRLGIG...,GO_0008641,ubiquitin-like modifier activating enzyme acti...,1478221.3,fig|1478221.3.peg.3669,PGF_00062085,
1609,99b3655fd2a72c9daa8c17d3a38981c6,A0A267MJH0,1478221,MGLSLIDLSQEIFQGMSVFPMHQPTFIMVNMTHEENMKVTGSKTLG...,GO_0004061,arylformamidase activity,1478221.3,fig|1478221.3.peg.3946,PGF_09583846,


In [42]:
fasta = []
for md5 in patricGO['md5'].unique():
    frame = patricGO.loc[patricGO['md5'] == md5]
    data = {
        'md5': md5,
        'sequence': frame['aa_sequence'].iloc[0],
        'sequence_type': 'protein'
    }
    data['patric'] = []
    for id in frame['patric_id'].unique():
        frame2 = frame.loc[frame['patric_id'] == id]
        p_ids = {
            'patric_id': id,
            'pgfam_id': frame2['pgfam_id'].iloc[0],
            'plfam_id': frame2['plfam_id'].iloc[0]}
        p_ids['GO'] = []
        for i in range(len(frame2)):
            p_ids['GO'].append({
                'GO_id': frame2['goTerm'].iloc[i],
                'GO_label': frame2['goLabel'].iloc[i]
            })
        data['patric'].append(dict(p_ids))
    fasta.append(dict(data))
with open("patricfasta.json", 'w') as f:
    json.dump(fasta, f)

TypeError: Object of type ndarray is not JSON serializable

In [None]:
out = json.dumps(fasta, indent = 4)
print(out)

[
    {
        "md5": "68ed1e7c2fd73db33789b97b25661e68",
        "sequence": "MSIKVENLKHIYNPNTPFETIALNNVSFNIEKGEFIGLIGHTGSGKSTLVQHLNGILKPHSGKIIINGVDITKPKMNLREIRQKVGLVFQYPEHQLFEETVHRDVAFGPMNLGLDEDEVNSRVKEAIKWVGLNYEEIKDKSPFELSGGQKRRVAIAGVVAMKPEVLILDEPTAGLDPKARDDILNQIKKLHNEYKMNIILVSHSMEDISRLVDRIIVMHKGQVALMGRAKEVFKESKLLTEIGLGVPQITVLVNKLNEKGINIKDDIFTIDEAKEEILKWMRSK",
        "sequence_type": "protein",
        "uniprot": [
            {
                "uniprot_id": "A0A267MF46"
            }
        ],
        "patric": [
            {
                "patric_id": "fig|1478221.3.peg.1527",
                "pgfam_id": "PGF_00843068",
                "plfam_id": NaN,
                "GO": [
                    {
                        "GO_id": "GO_0006824",
                        "GO_label": "cobalt ion transport"
                    },
                    {
                        "GO_id": "GO_0005524",
                        "GO_label": "ATP binding"
                    },
           