## API Calls to Obtain GO Terms from UniProt

This notebook queries the UniProt REST API to obtain Gene Ontology (GO) terms for each UniProt accession in the data set.

Authors: Nicolas Jonsson and Giulio Tesei

Contact: giulio.tesei@bio.ku.dk

In [1]:
import urllib.parse
import urllib.request
import json
import pandas as pd
import numpy as np

def _extract_go_ids(entry):
    """Return the ids of all GO cross-references found in a parsed UniProtKB entry."""
    if not isinstance(entry, dict):
        return []
    # A missing or null cross-reference section is treated as "no references".
    cross_refs = entry.get('uniProtKBCrossReferences') or []
    return [
        ref['id']
        for ref in cross_refs
        # Skip malformed records instead of aborting the whole entry.
        if isinstance(ref, dict) and ref.get('database') == 'GO' and ref.get('id')
    ]


def uniprot_api(uniprot):
    """Fetch the Gene Ontology (GO) term ids annotated for a UniProt entry.

    Parameters
    ----------
    uniprot : str
        UniProt accession; it is inserted into the UniProtKB REST URL.

    Returns
    -------
    list of str
        The ids of every cross-reference whose database is "GO", in the
        order they appear in the entry. If the entry has no usable GO
        cross-reference, the list is ``["unknown"]``.

    Raises
    ------
    urllib.error.URLError
        Propagated unchanged from ``urlopen`` when the request fails
        (``HTTPError`` is a subclass).
    json.JSONDecodeError
        If the response body is not valid JSON.
    """
    # Percent-encode the accession so unexpected characters cannot alter the URL path.
    url_uniprot = "https://rest.uniprot.org/uniprotkb/{}.json".format(
        urllib.parse.quote(str(uniprot), safe=''))

    # Only the download needs the open connection; parsing happens after it is closed.
    with urllib.request.urlopen(url_uniprot) as link:
        data_uniprot = json.loads(link.read().decode())

    go_list = _extract_go_ids(data_uniprot)

    # Callers expect a non-empty list; "unknown" marks entries without GO terms.
    if not go_list:
        go_list.append("unknown")

    return go_list

In [5]:
# Load the IDR table and index it by "<uniprot>_<first>_<last>".
# '{:g}' renders a whole-number float such as 12.0 as "12"
# (presumably the residue indices may be parsed as floats — TODO confirm).
df_idrome = (
    pd.read_csv('idr_selection/idr_pLDDT.csv.gz', header=0, sep=';')
    .sort_values('uniprot')
    .assign(
        seq_name=lambda frame: (
            frame['uniprot']
            + '_' + frame['first'].apply('{:g}'.format)
            + '_' + frame['last'].apply('{:g}'.format)
        )
    )
    .set_index('seq_name')
    # Drop the columns that are not used further on in this notebook.
    .drop(columns=['category', 'presplit'])
    .rename(columns={
        'uniprot': 'UniProt_ID',
        'nres_unip': 'N_FL',
        'nres_seg': 'N',
        'sequence': 'fasta',
    })
)

In [54]:
# Load the previously pickled object and turn it into a plain dict
# (presumably a mapping from UniProt accession to GO-term list — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
cached_go_terms = pd.read_pickle('go_analysis/uniprot_dict.pkl')
GO_terms = pd.Series(cached_go_terms).to_dict()

In [55]:
# Query UniProt once per distinct accession and store the result in GO_terms.
# NOTE(review): accessions already present in GO_terms are fetched again and
# overwritten, and an error raised by uniprot_api stops the loop at that point.
unique_accessions = df_idrome.UniProt_ID.unique()
for idx in range(len(unique_accessions)):
    accession = unique_accessions[idx]
    GO_terms[accession] = uniprot_api(accession)
    # Progress counter printed on a single line, separated by dashes.
    print(idx, end='-')

0-1-2-3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31-32-33-34-35-36-37-38-39-40-41-42-43-44-45-46-47-48-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-69-70-71-72-73-74-75-76-77-78-79-80-81-82-83-84-85-86-87-88-89-90-91-92-93-94-95-96-97-98-99-100-101-102-103-104-105-106-107-108-109-110-111-112-113-114-115-116-117-118-119-120-121-122-123-124-125-126-127-128-129-130-131-132-133-134-135-136-137-138-139-140-141-142-143-144-145-146-147-148-149-150-151-152-153-154-155-156-157-158-159-160-161-162-163-164-165-166-167-168-169-170-171-172-173-174-175-176-177-178-179-180-181-182-183-184-185-186-187-188-189-190-191-192-193-194-195-196-197-198-199-200-201-202-203-204-205-206-207-208-209-210-211-212-213-214-215-216-217-218-219-220-221-222-223-224-225-226-227-228-229-230-231-232-233-234-235-236-237-238-239-240-241-242-243-244-245-246-247-248-249-250-251-252-253-254-255-256-257-258-259-260-261-262-263-264-265-266-267-268-269-270-271-272-273-274-275-276-27

In [78]:
# Persist the GO-term dictionary as a pickled pandas Series.
# NOTE(review): this writes under 'idr_selection/', whereas the dictionary was
# loaded from 'go_analysis/' — confirm the differing directory is intended.
go_terms_series = pd.Series(GO_terms)
go_terms_series.to_pickle('idr_selection/uniprot_dict.pkl')