In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import sys
import os

<h1>map_UniProt</h1>

In [2]:
import re
import time
import json
import zlib
from xml.etree import ElementTree
from urllib.parse import urlparse, parse_qs, urlencode
import requests
from requests.adapters import HTTPAdapter, Retry

# Seconds to wait between two polls of the ID-mapping job status.
POLLING_INTERVAL = 3

# Base URL of the UniProt REST API; every endpoint below is built from it.
API_URL = "https://rest.uniprot.org"


# Shared HTTP session: requests to https:// URLs are retried (up to 5 times,
# with backoff) when the server answers with one of the listed 5xx status codes.
retries = Retry(total=5, backoff_factor=0.25, status_forcelist=[500, 502, 503, 504])
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))


def submit_id_mapping(from_db, to_db, ids):
    """Submit an ID-mapping job and return the job id reported by the server.

    Parameters
    ----------
    from_db, to_db : str
        Source and target database names, sent as the ``from`` / ``to`` fields.
    ids : iterable of str
        Identifiers to map; they are sent as one comma-separated string.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    payload = {"from": from_db, "to": to_db, "ids": ",".join(ids)}
    response = requests.post(f"{API_URL}/idmapping/run", data=payload)
    response.raise_for_status()
    job = response.json()
    return job["jobId"]

def get_next_link(headers):
    """Return the URL of the next result page from a ``Link`` header.

    Returns ``None`` when there is no ``Link`` header or when the header does
    not start with a ``<url>; rel="next"`` entry.
    """
    if "Link" not in headers:
        return None
    found = re.match(r'<(.+)>; rel="next"', headers["Link"])
    return found.group(1) if found else None


def check_id_mapping_results_ready(job_id):
    """Poll the job-status endpoint until the mapping job has finished.

    While the server reports the job as ``NEW`` or ``RUNNING`` the function
    sleeps ``POLLING_INTERVAL`` seconds and polls again.

    Returns
    -------
    bool
        True when the final payload contains results or failed ids.

    Raises
    ------
    requests.HTTPError
        If a status request fails.
    Exception
        Carrying the reported job status when it is neither ``NEW`` nor
        ``RUNNING``.
    """
    while True:
        response = session.get(f"{API_URL}/idmapping/status/{job_id}")
        response.raise_for_status()
        status = response.json()
        if "jobStatus" not in status:
            # No status field: the payload is the result document itself.
            return bool(status.get("results") or status.get("failedIds"))
        # NOTE(review): "NEW" is treated as still-pending, following the
        # UniProt example client; confirm against the API documentation.
        if status["jobStatus"] in ("NEW", "RUNNING"):
            print(f"Retrying in {POLLING_INTERVAL}s")
            time.sleep(POLLING_INTERVAL)
        else:
            # The status lives in the decoded JSON, not on the Response object.
            raise Exception(status["jobStatus"])


def get_batch(batch_response, file_format, compressed):
    """Yield the decoded content of every page that follows ``batch_response``.

    Pages are discovered through the "next" link of each response's headers;
    iteration stops when a response carries no such link.
    """
    next_url = get_next_link(batch_response.headers)
    while next_url:
        page = session.get(next_url)
        page.raise_for_status()
        yield decode_results(page, file_format, compressed)
        next_url = get_next_link(page.headers)


def combine_batches(all_results, batch_results, file_format):
    """Merge one decoded page into the accumulated results.

    * ``json``: the ``results`` and ``failedIds`` lists of the page are appended
      to ``all_results`` (which is modified in place and returned). A key that
      is missing from ``all_results`` is created instead of raising KeyError.
    * ``tsv``: the page's first line (header) is dropped before concatenation.
    * anything else: plain concatenation.
    """
    if file_format == "json":
        for key in ("results", "failedIds"):
            batch_items = batch_results.get(key)
            if not batch_items:
                continue
            if key in all_results:
                all_results[key] += batch_items
            else:
                # Key first appears in a later page: start a fresh list.
                all_results[key] = list(batch_items)
        return all_results
    if file_format == "tsv":
        return all_results + batch_results[1:]
    return all_results + batch_results


def get_id_mapping_results_link(job_id):
    """Return the ``redirectURL`` listed in the details of a mapping job.

    Raises ``requests.HTTPError`` if the details request fails.
    """
    response = session.get(f"{API_URL}/idmapping/details/{job_id}")
    response.raise_for_status()
    details = response.json()
    return details["redirectURL"]


def decode_results(response, file_format, compressed):
    """Decode a response body according to its format.

    Returns
    -------
    * ``json``  -> the parsed JSON object
    * ``tsv``   -> list of non-empty lines
    * ``xlsx``  -> one-element list holding the raw bytes
    * ``xml``   -> one-element list holding the text
    * otherwise -> the text itself

    When ``compressed`` is true the body is gzip-decompressed first and text
    formats are decoded as UTF-8.
    """
    if not compressed:
        if file_format == "json":
            return response.json()
        if file_format == "xlsx":
            return [response.content]
        body = response.text
        if file_format == "tsv":
            return [row for row in body.split("\n") if row]
        if file_format == "xml":
            return [body]
        return body

    # 16 + MAX_WBITS tells zlib to expect a gzip header and trailer.
    raw = zlib.decompress(response.content, 16 + zlib.MAX_WBITS)
    if file_format == "xlsx":
        return [raw]
    text = raw.decode("utf-8")
    if file_format == "json":
        return json.loads(text)
    if file_format == "tsv":
        return [row for row in text.split("\n") if row]
    if file_format == "xml":
        return [text]
    return text


def get_xml_namespace(element):
    """Return the namespace URI embedded in ``element.tag`` (``{uri}name``).

    Returns an empty string when the tag does not start with a ``{...}`` part.
    """
    found = re.match(r"\{(.*)\}", element.tag)
    if found is None:
        return ""
    return found.group(1)


def merge_xml_results(xml_results):
    """Merge several XML result pages into one serialized document.

    The first page provides the root element. Every ``entry`` element (UniProt
    namespace) of the remaining pages is inserted just before the root's last
    child. The result is returned as UTF-8 bytes with an XML declaration.
    """
    entry_tag = "{http://uniprot.org/uniprot}entry"
    merged_root = ElementTree.fromstring(xml_results[0])
    for page in xml_results[1:]:
        page_root = ElementTree.fromstring(page)
        for entry in page_root.findall(entry_tag):
            # Index -1 keeps the root's current last child at the end.
            merged_root.insert(-1, entry)
    # Serialize with the document namespace as the default (prefix-less) one.
    namespace = get_xml_namespace(merged_root[0])
    ElementTree.register_namespace("", namespace)
    return ElementTree.tostring(merged_root, encoding="utf-8", xml_declaration=True)


def print_progress_batches(batch_index, size, total):
    """Progress hook called after each fetched page.

    The number of fetched results is computed, but the actual print statement
    is disabled, so the function produces no output and returns ``None``.
    """
    pages_done = batch_index + 1
    n_fetched = min(pages_done * size, total)
    # print(f"Fetched: {n_fetched} / {total}")


def get_id_mapping_results_search(url):
    """Fetch all pages of a paginated ID-mapping result and merge them.

    ``format``, ``size`` and ``compressed`` are read from the URL's query
    string (defaults: ``json``, 500, not compressed). When ``size`` is absent
    it is added to the request. XML results are merged into one document;
    other formats are combined page by page.
    """
    parts = urlparse(url)
    query = parse_qs(parts.query)

    file_format = query.get("format", ["json"])[0]
    if "size" not in query:
        size = 500
        query["size"] = size
    else:
        size = int(query["size"][0])
    compressed = "compressed" in query and query["compressed"][0].lower() == "true"

    first_url = parts._replace(query=urlencode(query, doseq=True)).geturl()
    first_page = session.get(first_url)
    first_page.raise_for_status()

    results = decode_results(first_page, file_format, compressed)
    total = int(first_page.headers["x-total-results"])
    print_progress_batches(0, size, total)

    following_pages = get_batch(first_page, file_format, compressed)
    for page_number, page in enumerate(following_pages, 1):
        results = combine_batches(results, page, file_format)
        print_progress_batches(page_number, size, total)

    if file_format == "xml":
        return merge_xml_results(results)
    return results


def get_id_mapping_results_stream(url):
    """Fetch an ID-mapping result in one request through the stream endpoint.

    A ``/results/`` URL is rewritten to its ``/stream/`` counterpart. The
    format and compression flag are read from the URL's query string
    (defaults: ``json``, not compressed).
    """
    stream_url = url if "/stream/" in url else url.replace("/results/", "/stream/")
    response = session.get(stream_url)
    response.raise_for_status()

    query = parse_qs(urlparse(stream_url).query)
    file_format = query.get("format", ["json"])[0]
    compressed = "compressed" in query and query["compressed"][0].lower() == "true"
    return decode_results(response, file_format, compressed)


def map_UniProtID(UniProtIDs, identifier_from, identifier_to):
    """Map identifiers between two databases and return a ``{from: to}`` dict.

    Parameters
    ----------
    UniProtIDs : iterable of str
        Identifiers to map.
    identifier_from, identifier_to : str
        Source and target database names passed to the ID-mapping service.

    Returns
    -------
    dict
        One entry per mapped source identifier. When a source identifier
        appears several times in the results, the last occurrence wins.
        An empty dict is returned (with a warning) when the job fails or
        yields no results, so the mapping stays best-effort.
    """
    import warnings

    try:
        job_id = submit_id_mapping(
            from_db=identifier_from, to_db=identifier_to, ids=UniProtIDs
        )
        if not check_id_mapping_results_ready(job_id):
            return {}
        link = get_id_mapping_results_link(job_id)
        results = get_id_mapping_results_search(link)
        # Equivalently using the stream endpoint which is more demanding
        # on the API and so is less stable:
        # results = get_id_mapping_results_stream(link)
    except Exception as err:
        # Previously a bare `except` fell through to an undefined `results`
        # (NameError); report the failure and return an empty mapping instead.
        warnings.warn(
            f"ID mapping {identifier_from} -> {identifier_to} failed: {err!r}"
        )
        return {}

    return {entry['from']: entry['to'] for entry in results.get('results', [])}


<h1>get_ESNG_PTM</h1>

In [3]:
def get_ESNG_PTM(df):
    """Add an ``ESNG`` column with one Ensembl id per entry of ``Proteins``.

    Every ``;``-separated identifier of ``df['Proteins']`` is sent in a single
    request to the mapping service; the tab-formatted answer is parsed as
    alternating source/target tokens. The new column holds, per row, a list
    aligned with the row's protein identifiers ('' where no mapping exists;
    the first mapping wins when an identifier occurs several times).

    ``df`` is modified in place and returned.

    NOTE(review): this targets the ``uploadlists`` endpoint, while the pipeline
    cell of this notebook uses ``map_UniProtID`` instead - confirm that the
    endpoint is still served before relying on this function.
    """
    import urllib.parse
    import urllib.request

    url = 'https://www.uniprot.org/uploadlists/'

    # Split every row once and reuse the result for query and lookup.
    protein_groups = [proteins.split(';') for proteins in df['Proteins']]
    query = ''.join(str(protein) + ' ' for group in protein_groups for protein in group)

    params = {
        'from': 'ACC+ID',
        'to': 'ENSEMBL_ID',
        'format': 'tab',
        'query': query,
    }
    data = urllib.parse.urlencode(params).encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
        response = f.read()

    anno = response.decode('utf-8').split()
    # Drop the header tokens if present (an empty answer has none).
    for header in ('To', 'From'):
        if header in anno:
            anno.remove(header)

    # Tokens alternate source id / target id; keep the first target per source.
    mapping = {}
    for uniprot_id, ensembl_id in zip(anno[0::2], anno[1::2]):
        mapping.setdefault(uniprot_id, ensembl_id)

    df['ESNG'] = [[mapping.get(protein, '') for protein in group] for group in protein_groups]
    return df

<h1>get_Entrez_PTM</h1>

In [4]:
def get_Entrez_PTM(df):
    """Add an ``Entrez_GeneID`` column with one gene id per entry of ``Proteins``.

    Every ``;``-separated identifier of ``df['Proteins']`` is sent in a single
    request to the mapping service; the tab-formatted answer is parsed as
    alternating source/target tokens. The new column holds, per row, a list
    aligned with the row's protein identifiers ('' where no mapping exists;
    the first mapping wins when an identifier occurs several times).

    ``df`` is modified in place and returned.

    NOTE(review): this targets the ``uploadlists`` endpoint, while the pipeline
    cell of this notebook uses ``map_UniProtID`` instead - confirm that the
    endpoint is still served before relying on this function.
    """
    import urllib.parse
    import urllib.request

    url = 'https://www.uniprot.org/uploadlists/'

    # Split every row once and reuse the result for query and lookup.
    protein_groups = [proteins.split(';') for proteins in df['Proteins']]
    query = ''.join(str(protein) + ' ' for group in protein_groups for protein in group)

    params = {
        'from': 'ACC+ID',
        # The target code previously carried stray leading whitespace.
        'to': 'P_ENTREZGENEID',
        'format': 'tab',
        'query': query,
    }
    data = urllib.parse.urlencode(params).encode('utf-8')
    req = urllib.request.Request(url, data)
    with urllib.request.urlopen(req) as f:
        response = f.read()

    anno = response.decode('utf-8').split()
    # Drop the header tokens if present (an empty answer has none).
    for header in ('To', 'From'):
        if header in anno:
            anno.remove(header)

    # Tokens alternate source id / target id; keep the first target per source.
    mapping = {}
    for uniprot_id, gene_id in zip(anno[0::2], anno[1::2]):
        mapping.setdefault(uniprot_id, gene_id)

    df['Entrez_GeneID'] = [[mapping.get(protein, '') for protein in group] for group in protein_groups]
    return df

<h1>get_IMPI_annotation_PTM</h1> (requires file import)

In [5]:
def get_IMPI_annotation_PTM(df):
    """Add an ``IMPI_new`` column with the IMPI class of every Ensembl gene id.

    The IMPI table ``IMPI_2021_Q4pre_Mus_Musculus.csv`` is read from the
    current working directory. For each row, ``df['ESNG']`` (an iterable of
    Ensembl gene ids) is translated into a list of ``IMPI Class`` values of
    the same length; ids missing from the table become ``'NA'``. When an id
    occurs several times in the table, its first row is used.

    ``df`` is modified in place and returned.
    """
    # abspath('__file__') resolves the literal name against the working
    # directory, so its dirname is the directory the notebook runs in.
    base_dir = os.path.dirname(os.path.abspath('__file__'))
    # os.path.join instead of a hard-coded '\\' keeps the path portable.
    path_IMPI_file = os.path.join(base_dir, 'IMPI_2021_Q4pre_Mus_Musculus.csv')
    df_IMPI = pd.read_csv(path_IMPI_file)

    # One pass over the table replaces a full column scan per gene id.
    class_by_gene = {}
    for gene_id, impi_class in zip(df_IMPI['Ensembl Gene ID Mus Musculus'], df_IMPI['IMPI Class']):
        class_by_gene.setdefault(gene_id, impi_class)

    df['IMPI_new'] = [
        [class_by_gene.get(gene_id, 'NA') for gene_id in gene_ids]
        for gene_ids in df['ESNG']
    ]
    return df

<h1>get_MitoCharta_annotation</h1> (requires file import)

In [6]:
def get_MitoCharta_annotation(df):
    """Add ``SubMitoLocalization`` and ``Pathways`` columns from MitoCarta 3.0.

    ``Mouse_MitoCarta3_0.xls`` is read from the current working directory;
    only its second sheet (index 1) is used. For each row, every non-empty id
    in ``df['Entrez_GeneID']`` is converted with ``int`` and looked up in the
    ``MouseGeneID`` column (first matching row wins):

    * ``SubMitoLocalization``: unique localization values of the matched ids.
    * ``Pathways``: unique ``|``-separated entries of the matched ids'
      ``MitoCarta3.0_MitoPathways`` value; non-string values (such as the
      placeholder ``0``) are skipped.

    Unique values keep their order of first appearance. ``df`` is modified in
    place and returned.
    """
    # abspath('__file__') resolves the literal name against the working
    # directory, so its dirname is the directory the notebook runs in.
    base_dir = os.path.dirname(os.path.abspath('__file__'))
    # Only sheet index 1 is consulted below, so only that sheet is loaded.
    mitocarta = pd.read_excel(os.path.join(base_dir, 'Mouse_MitoCarta3_0.xls'), sheet_name=1)

    # Build both lookups in one pass instead of scanning the table per gene id.
    localization_by_gene = {}
    pathways_by_gene = {}
    for gene_id, localization, pathways in zip(
        mitocarta['MouseGeneID'],
        mitocarta['MitoCarta3.0_SubMitoLocalization'],
        mitocarta['MitoCarta3.0_MitoPathways'],
    ):
        localization_by_gene.setdefault(gene_id, localization)
        pathways_by_gene.setdefault(gene_id, pathways)

    SubMitoLocalization_new = []
    Pathways_new = []
    for gene_ids in df['Entrez_GeneID']:
        localizations = []
        row_pathways = []
        for gene_id in gene_ids:
            if gene_id == '':
                continue
            key = int(gene_id)
            if key in localization_by_gene:
                localizations.append(localization_by_gene[key])
            pathway_value = pathways_by_gene.get(key)
            if isinstance(pathway_value, str):
                row_pathways.extend(pathway_value.split('|'))
        # dict.fromkeys de-duplicates while keeping a reproducible order.
        SubMitoLocalization_new.append(list(dict.fromkeys(localizations)))
        Pathways_new.append(list(dict.fromkeys(row_pathways)))

    df['SubMitoLocalization'] = SubMitoLocalization_new
    df['Pathways'] = Pathways_new
    return df

<h1>filter_Mito</h1>

In [7]:
def filter_Mito(df):
    """Keep the rows that carry mitochondrial evidence.

    A row is kept when ``'Verified mitochondrial'`` is contained in its
    ``IMPI_new`` value or when its ``SubMitoLocalization`` value is not an
    empty list. 'Predicted mitochondrial' is not used as a criterion.
    The result has a fresh 0..n-1 index.
    """
    kept_rows = []
    for position in range(len(df)):
        has_impi_evidence = 'Verified mitochondrial' in df['IMPI_new'][position]
        if has_impi_evidence or df['SubMitoLocalization'][position] != []:
            kept_rows.append(position)
    return df.iloc[kept_rows, :].reset_index(drop=True)

<h1>filter_PTM</h1>

In [8]:
def filter_PTM(df,samples=['B','BAT','H','K','L','S','SKM']):
    """Log-transform, count valid values, normalize and z-score intensities.

    Columns are assigned to a sample when their name starts with
    ``'Intensity wt'`` and ends with ``'_' + sample``. Steps:

    1. Per sample: zeros become NaN and the sample's intensity columns are
       replaced by their log2 values (this modifies the passed ``df``).
       ``Valid values_<sample>`` is the fraction of those columns with a
       log2 value > 0 (NaN when the sample has no columns) and
       ``Values_Tissue_<sample>`` is the number of columns.
    2. ``TissueID``: number of samples with a non-zero valid-value fraction.
    3. ``Norm_<col>``: every ``'Intensity wt'`` column minus its column median.
    4. ``Zscore_<col>``: per-column z-score of the ``Norm_`` columns,
       ignoring NaN.
    5. ``Median_Z-score_<sample>``: row-wise median of the sample's z-scores.

    No valid-value cutoff is applied here. Any number of samples is supported.
    Because step 1 log-transforms in place, calling the function twice on the
    same frame transforms the intensities twice.

    Returns the extended frame with a fresh 0..n-1 index.
    """
    def _columns_of(columns, prefix, sample):
        # Names that start with `prefix` and end with '_<sample>'.
        suffix = '_' + sample
        return [name for name in columns if name.startswith(prefix) and name.endswith(suffix)]

    # Step 1: log2 transform and valid-value statistics per sample.
    for sample in samples:
        intensity_cols = _columns_of(df.columns, 'Intensity wt', sample)
        if intensity_cols:
            df[intensity_cols] = np.log2(df[intensity_cols].replace(0, np.nan))
            # A value counts as valid when its log2 intensity is > 0.
            valid_fraction = (df[intensity_cols] > 0).sum(axis=1) / len(intensity_cols)
        else:
            valid_fraction = np.nan
        df['Valid values_' + sample] = valid_fraction
        df['Values_Tissue_' + sample] = len(intensity_cols)

    # Step 2: in how many samples was the peptide seen at all.
    valid_cols = ['Valid values_' + sample for sample in samples]
    df['TissueID'] = df[valid_cols].replace(0, np.nan).count(axis=1)

    df = df.reset_index(drop=True)

    # Step 3: median normalization - subtract each column's median log2 value.
    intensity_all = [name for name in df.columns if name.startswith('Intensity wt')]
    normalized = df[intensity_all] - df[intensity_all].median()
    normalized.columns = ['Norm_' + name for name in intensity_all]
    df = df.join(normalized)

    # Step 4: z-score per normalized column. The result is wrapped explicitly
    # so this does not depend on stats.zscore returning a DataFrame.
    norm_cols = [name for name in df.columns if name.startswith('Norm_')]
    if norm_cols:
        z_values = np.asarray(
            stats.zscore(df[norm_cols].to_numpy(dtype=float), nan_policy='omit'),
            dtype=float,
        )
        z_scores = pd.DataFrame(
            z_values,
            columns=['Zscore_' + name for name in norm_cols],
            index=df.index,
        )
        df = df.join(z_scores)

    # Step 5: median z-score of each sample group.
    for sample in samples:
        z_cols = _columns_of(df.columns, 'Zscore_', sample)
        df['Median_Z-score_' + sample] = df[z_cols].median(axis=1)

    return df

<h1>formating</h1>

In [9]:
def formating(df):
    """Add display columns derived from ``Gene Names`` and ``Proteins``.

    * ``Gene name``: first ``;``-separated gene name ('' for non-string values)
    * ``UniProt ID single``: first ``;``-separated protein id ('' for
      non-string values)
    * ``Protein_Identifier``: ``<Gene name>_<Proteins>``

    ``df`` is modified in place and returned.
    """
    def first_entry(value):
        # Only plain strings are split; anything else yields ''.
        if type(value) == str:
            return value.split(';')[0]
        return ''

    positions = range(len(df))
    df['Gene name'] = [first_entry(df['Gene Names'][pos]) for pos in positions]
    df['UniProt ID single'] = [first_entry(df['Proteins'][pos]) for pos in positions]
    df['Protein_Identifier'] = [
        '_'.join((str(df['Gene name'][pos]), str(df['Proteins'][pos])))
        for pos in positions
    ]
    return df

In [10]:
# Pipeline: load the MaxQuant modification-specific peptide table, keep
# phosphorylated peptides, annotate them (Ensembl / Entrez ids, IMPI,
# MitoCarta), keep mitochondrial entries, compute the intensity statistics
# and write the prepared table.

# abspath('__file__') resolves the literal name against the working directory,
# so base_dir is the directory the notebook runs in.
base_dir = os.path.dirname(os.path.abspath('__file__'))
# os.path.join keeps the paths portable; the trailing '' keeps the trailing
# separator that `path` and `path_saving` carried before.
path = os.path.join(base_dir, 'MQ_output', '')  ## might be adjusted

df = pd.read_csv(os.path.join(path, 'modificationSpecificPeptides.txt'), sep='\t', low_memory=False)
# Drop decoys and contaminants, keep peptides with at least one phospho site.
df = df[(df['Reverse'] != '+') & (df['Potential contaminant'] != '+') & (df['Phospho (STY)'] > 0)].reset_index(drop=True)

# Split every protein group once; reused for the query list and both lookups.
protein_groups = [proteins.split(';') for proteins in df['Proteins']]
flat_list = sorted({protein for group in protein_groups for protein in group})

# sorted(...) instead of list(set(...)) makes the written lists reproducible.
ensemble_dict = map_UniProtID(flat_list, 'UniProtKB_AC-ID', 'Ensembl')
df['ESNG'] = [
    sorted({
        gene_id
        for protein in group if protein in ensemble_dict
        for gene_id in re.findall(r"ENSMUSG\d+", ensemble_dict[protein])
    })
    for group in protein_groups
]

entrez_dict = map_UniProtID(flat_list, 'UniProtKB_AC-ID', 'GeneID')
df['Entrez_GeneID'] = [
    sorted({entrez_dict[protein] for protein in group if protein in entrez_dict})
    for group in protein_groups
]

df = get_IMPI_annotation_PTM(df)
df = get_MitoCharta_annotation(df)
df = filter_Mito(df)
df = filter_PTM(df, samples=['B', 'BAT', 'H', 'K', 'L', 'S', 'SKM'])

path_saving = os.path.join(base_dir, 'Prepared_tables', '')
# Create the output folder on first use instead of failing in to_csv.
os.makedirs(path_saving, exist_ok=True)
df.to_csv(os.path.join(path_saving, 'Prep_ModPeptides.csv'), index=False)

Retrying in 3s
Retrying in 3s
Retrying in 3s
Retrying in 3s
