*System requirements: 60 GB (48 GB for the processing, but 60 GB needed for the pickling).*

In [1]:
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import pickle
from scipy import sparse
import math
import itertools
import random

from toolbox import *

%matplotlib inline
sns.set()
sns.set_context("notebook")

In [2]:
# Override the default matplotlib figure size (14 x 6) for every plot drawn
# later in the notebook, through seaborn's rc settings.
sns.set(rc={'figure.figsize':(14,6)})

In [3]:
# Project configuration; used below as a mapping from keys such as
# 'outputPreprocessingHPA' or 'rawDataUniProt' to directories.
# NOTE(review): load_cfg / load_LogVersions are not defined in this notebook,
# presumably they come from `from toolbox import *` -- confirm.
cfg = load_cfg()

# Version tags of the upstream datasets; used below to build input file names.
logVersions = load_LogVersions()

In [4]:
def createLongDF(ids):
    """Build the long-format table of all unordered pairs of identifiers.

    For ``ids = [a, b, c]`` the result holds the rows (a, b), (a, c), (b, c):
    every pair ``(ids[i], ids[j])`` with ``i < j``, sorted by ``i`` and then
    by ``j``. This is the row-major order of the strict upper triangle of the
    ``len(ids) x len(ids)`` pair matrix.

    Parameters
    ----------
    ids : iterable
        Scalar identifiers (e.g. UniProt accession strings). Any iterable is
        accepted (list, tuple, pandas Series, ...); it is materialised once.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``uniprotID_A`` and ``uniprotID_B``, and
        ``n * (n - 1) / 2`` rows for ``n`` identifiers (zero rows when fewer
        than two identifiers are given).
    """
    # Materialise once so that a Series or generator behaves like a list.
    ids = list(ids)

    print(" - Create IDs to keep")
    # Positions (i, j) with i < j, in row-major order. Computing them directly
    # avoids building the two n*n Python lists and the Python list of flat
    # indices that were previously needed only to be filtered down again.
    rowsA, rowsB = np.triu_indices(len(ids), k=1)

    # Object dtype keeps the original Python objects untouched.
    idArray = np.asarray(ids, dtype=object)

    print(" - Create ID_A")
    # Back to plain lists so that pandas infers the column dtype exactly as it
    # would from a hand-built list of identifiers.
    idsA = idArray[rowsA].tolist()

    print(" - Create ID_B")
    idsB = idArray[rowsB].tolist()

    print(' - Create DataFrame')
    df_features = pd.DataFrame({
        'uniprotID_A' : idsA,
        'uniprotID_B' : idsB
    })

    return(df_features)

In [5]:
def addCosineSimilarity(df):
    """Return the pairwise cosine similarities between the rows of ``df``.

    Only the strict upper triangle of the similarity matrix is returned, in
    row-major order: the values for the row pairs (0, 1), (0, 2), ...,
    (0, n-1), (1, 2), ... This is the same pair order that createLongDF
    produces for the corresponding identifiers, so the result can be assigned
    positionally as a column of that table.

    Parameters
    ----------
    df : array-like accepted by ``cosine_similarity``
        One row per item, one column per feature dimension.

    Returns
    -------
    numpy.ndarray
        1-D array with ``n * (n - 1) / 2`` values for ``n`` rows.
    """
    print(" - Computing cosine similarity")
    cosMat = cosine_similarity(df, df)

    print(" - Filter out useless values")
    n = len(df)
    # Boolean mask of the strict upper triangle (row < column): `np.tri` marks
    # the lower triangle including the diagonal, and `~` inverts it. A boolean
    # mask selects in row-major order, so the output order matches the previous
    # flatten-then-index approach without the flattened copy of the matrix or
    # the Python list of ~n*n/2 integer positions.
    upperTriangle = ~np.tri(n, dtype=bool)
    cosMat = cosMat[upperTriangle]

    return cosMat

In [6]:
# Catalogue of the features to compute, organised as group -> feature -> details.
# Each feature records the preprocessed file it is read from ('path') and the
# similarity measure to apply ('method'). Features of one group are processed
# together by addAllFeatures.
def _cosineFeature(outputKey, fileName):
    """Describe one cosine-similarity feature stored in the folder cfg[outputKey]."""
    return {
        'path': os.path.join(cfg[outputKey], fileName),
        'method': 'cosine similarity',
    }

# Version tags that appear in the input file names.
_versionsHPA = logVersions['HPA']['preprocessed']
_versionsUniprot = (logVersions['UniProt']['rawData'], logVersions['UniProt']['preprocessed'])

featuresDict = {
    'HPA2': {
        'RNAseqHPA': _cosineFeature(
            'outputPreprocessingHPA',
            "consensusRNAseq_v{}.pkl".format(_versionsHPA['consensusRNAseq'])
        ),
    },
    'HPA1': {
        'tissueHPA': _cosineFeature(
            'outputPreprocessingHPA',
            "tissueIHC_tissueOnly_v{}.pkl".format(_versionsHPA['tissueIHC_tissueOnly'])
        ),
        'tissueCellHPA': _cosineFeature(
            'outputPreprocessingHPA',
            "tissueIHC_tissueCell_v{}.pkl".format(_versionsHPA['tissueIHC_tissueCell'])
        ),
    },
    'HPA3': {
        'subcellularLocationHPA': _cosineFeature(
            'outputPreprocessingHPA',
            "subcellularLocation_v{}.pkl".format(_versionsHPA['subcellularLocation'])
        ),
    },
    'uniprot': {
        'bioProcessUniprot': _cosineFeature(
            'outputPreprocessingUniprot',
            "bioProcessUniprot_v{}--{}.pkl".format(*_versionsUniprot)
        ),
        'cellCompUniprot': _cosineFeature(
            'outputPreprocessingUniprot',
            "cellCompUniprot_v{}--{}.pkl".format(*_versionsUniprot)
        ),
        'molFuncUniprot': _cosineFeature(
            'outputPreprocessingUniprot',
            "molFuncUniprot_v{}--{}.pkl".format(*_versionsUniprot)
        ),
        'domainUniprot': _cosineFeature(
            'outputPreprocessingUniprot',
            "domainFT_v{}--{}.pkl".format(*_versionsUniprot)
        ),
        'motifUniprot': _cosineFeature(
            'outputPreprocessingUniprot',
            "motif_v{}--{}.pkl".format(*_versionsUniprot)
        ),
    },
    'Bgee': {
        'Bgee': _cosineFeature(
            'outputPreprocessingBgee',
            "Bgee_processed_v{}.pkl".format(logVersions['Bgee']['preprocessed'])
        ),
    },
}

In [7]:
# Show the feature groups in the order in which they will be processed.
for group in featuresDict:
    print(group)

HPA2
HPA1
HPA3
uniprot
Bgee


In [8]:
# featuresDict = {
#     'uniprot': {
#         'domainUniprot': {
#             'path': os.path.join(
#                 cfg['outputPreprocessingUniprot'], 
#                 "domainFT_v{}--{}.pkl".format(logVersions['UniProt']['rawData'], logVersions['UniProt']['preprocessed'])
#             ),
#             'method': 'cosine similarity',
#         },
#     }
# }

In [9]:
def addAllFeatures(featuresDict):
    """Build the table of pairwise similarity features for all protein pairs.

    The result has one row per unordered pair of the UniProt IDs listed in the
    reference file (built with createLongDF) and one column per feature found
    in ``featuresDict``.

    Groups are handled in two ways:

    * group ``'uniprot'``: every feature file must list its ``uniprotID``
      values exactly like the reference list (asserted). The similarities are
      then written positionally, straight into the result table.
    * any other group: the pair table is built from the IDs of the first
      feature file of the group, the remaining files of the group must list
      the same IDs (asserted), and the group's columns are left-merged into
      the result on (``uniprotID_A``, ``uniprotID_B``). Pairs that the group
      does not cover are therefore left without a value for its features.

    Parameters
    ----------
    featuresDict : dict
        ``{group: {feature: {'path': ..., 'method': ...}}}``. ``'path'`` is a
        pickled DataFrame with a ``uniprotID`` column; only the method
        ``'cosine similarity'`` is implemented. Any other method prints a
        message and the feature column is not created.

    Returns
    -------
    pandas.DataFrame
        Columns ``uniprotID_A``, ``uniprotID_B`` plus one column per feature.

    Notes
    -----
    Reads the module-level ``cfg`` and ``logVersions`` to locate the reference
    ID file.
    """
    
    print('Create initial idx DF')
    # Reference list of IDs, one per line without header.
    # NOTE(review): the file name ends in .pkl but it is parsed with read_csv --
    # confirm that it really is a plain-text file.
    uniprotIDs = pd.read_csv(
        os.path.join(cfg['rawDataUniProt'], 
                     "uniprot_allProteins_Human_v{}.pkl".format(logVersions['UniProt']['rawData'])),
        header=None,
        names=['uniprotID']
    )
    df_features = createLongDF(uniprotIDs.uniprotID.to_list())
    print()
    
    for group, group_details in featuresDict.items():
        
        print('# Starting {}'.format(group))
        
        if group == 'uniprot':
            # No per-group pair table is built here: new_idx is the result table
            # itself (same object), so the columns added below land directly in
            # df_features, and the IDs to check against are the reference ones.
            # NOTE(review): the positional assignment relies on df_features
            # still being in createLongDF order after earlier left merges --
            # confirm.
            isFirst = False # means it's not first of this group
            new_idx = df_features
            newIDs = uniprotIDs.uniprotID.copy()
        else:
            # The first feature of the group defines the group's ID list and
            # pair table.
            isFirst = True # first of its group 
        
        for feature, details in group_details.items():
            
            print(feature)
            
            df = pd.read_pickle(details['path'])
            
            if isFirst:
                print(' - Create new idx dataframe')
                newIDs = df.uniprotID.copy()
                new_idx = createLongDF(list(df.uniprotID))
                isFirst = False
            else:
                # check that the IDs are in the right order
                # (the similarities are assigned by position, not by ID)
                assert df.uniprotID.equals(newIDs)
                
            # Keep only the feature columns as input to the similarity.
            df.set_index('uniprotID', inplace=True)
                
            if details['method'] == 'cosine similarity':
                new_idx[feature] = addCosineSimilarity(df)
                print(' - df_features: ', df_features.shape)
            else:
                # Unknown method: reported, but no column is added and
                # processing continues.
                print('--> wrong method')
        
        if group != 'uniprot':
            # Attach the group's columns to the full pair table; a left join
            # keeps every pair of the reference list.
            print(' - Merging to df_features')
            df_features = df_features.merge(
                new_idx,
                how = 'left',
                on = ['uniprotID_A','uniprotID_B']
            )
            print(' - df_features: ', df_features.shape)
        print()
            
    return df_features

In [10]:
# Build the full pairwise table: one row per protein pair and one column per
# feature declared in featuresDict.
df_features = addAllFeatures(featuresDict)

Create initial idx DF
 - Create ID_A
 - Create ID_B
 - Create IDs to keep
 - Filter out ID_A
 - Filter out ID_B
 - Create DataFrame

# Starting HPA2
RNAseqHPA
 - Create new idx dataframe
 - Create ID_A
 - Create ID_B
 - Create IDs to keep
 - Filter out ID_A
 - Filter out ID_B
 - Create DataFrame
 - Computing cosine similarity
 - Flatten the matrix
 - Create IDs to keep
 - Filter out useless values
 - df_features:  (207784305, 2)
 - Merging to df_features
 - df_features:  (207784305, 3)

# Starting HPA1
tissueHPA
 - Create new idx dataframe
 - Create ID_A
 - Create ID_B
 - Create IDs to keep
 - Filter out ID_A
 - Filter out ID_B
 - Create DataFrame
 - Computing cosine similarity
 - Flatten the matrix
 - Create IDs to keep
 - Filter out useless values
 - df_features:  (207784305, 3)
tissueCellHPA
 - Computing cosine similarity
 - Flatten the matrix
 - Create IDs to keep
 - Filter out useless values
 - df_features:  (207784305, 3)
 - Merging to df_features
 - df_features:  (207784305, 5)


In [11]:
# Quick look at the result.
# NOTE(review): `glance` is not defined in this notebook, presumably it comes
# from `from toolbox import *` -- confirm.
glance(df_features)

DataFrame: 207,784,305 rows 	 12 columns


Unnamed: 0,uniprotID_A,uniprotID_B,RNAseqHPA,tissueHPA,tissueCellHPA,subcellularLocationHPA,bioProcessUniprot,cellCompUniprot,molFuncUniprot,domainUniprot,motifUniprot,Bgee
0,A0A024RBG1,A0A075B6H7,,,,,0.0,0.0,0.0,0.0,0.0,0.133874
1,A0A024RBG1,A0A075B6H8,,,,,0.0,0.0,0.0,0.0,0.0,0.081341
2,A0A024RBG1,A0A075B6H9,,,,,0.0,0.0,0.0,0.0,0.0,0.384274
3,A0A024RBG1,A0A075B6I0,,,,,0.0,0.0,0.0,0.0,0.0,0.35827
4,A0A024RBG1,A0A075B6I1,,,,,0.0,0.0,0.0,0.0,0.0,-0.007139


# Sanity checks

In [12]:
# Spot check: for every feature, draw one random pair whose stored similarity
# lies strictly between 0 and 1, recompute the cosine similarity from the
# source file and compare it with the stored value.
for group, group_details in featuresDict.items():
    for feature, details in group_details.items():
        print(feature)

        stored_values = df_features[feature]
        candidates = df_features.loc[(stored_values>0)&(stored_values<1)]
        picked_pair = candidates.iloc[random.randrange(len(candidates))]

        source = pd.read_pickle(details['path']).set_index('uniprotID')
        pair_vectors = source.loc[[picked_pair.uniprotID_A, picked_pair.uniprotID_B]]

        # Off-diagonal entry of the 2 x 2 matrix = similarity between the two proteins.
        recomputed = cosine_similarity(pair_vectors, pair_vectors)[0,1]
        stored = picked_pair[feature]

        assert math.isclose(recomputed, stored, rel_tol=1e-6)

RNAseqHPA
tissueHPA
tissueCellHPA
subcellularLocationHPA
bioProcessUniprot
cellCompUniprot
molFuncUniprot
domainUniprot
motifUniprot
Bgee


In [13]:
# Every protein pair must appear exactly once.
# `not` is used instead of `~`: bitwise inversion of a plain Python bool gives
# -1 / -2, which are both truthy, so the check would silently always pass if
# `.any()` ever returned a built-in bool.
assert not df_features.duplicated(subset=["uniprotID_A","uniprotID_B"]).any()

In [14]:
# The table must hold every unordered pair of reference IDs: n*(n-1)/2 rows.
referenceFile = os.path.join(
    cfg['rawDataUniProt'],
    "uniprot_allProteins_Human_v{}.pkl".format(logVersions['UniProt']['rawData'])
)
uniprotIDs = pd.read_csv(referenceFile, header=None, names=['uniprotID'])

nProteins = len(uniprotIDs)
expectedPairs = nProteins*(nProteins-1)/2
assert len(df_features) == expectedPairs

# Export

- v3.0: uses new cleaned data and cosine similarity for all of them (24/08/2020)
- v3.1: uses new cleaned data and cosine similarity for all of them (09/11/2021)

In [15]:
# Version tag of this export; it is also used in the output file name.
versionFE = '3-1'

# Create the 'featuresEngineering' section on first use, so the cell also runs
# against a version log that has never recorded a features-engineering step
# (this used to require enabling an initialisation line by hand).
if 'featuresEngineering' not in logVersions:
    logVersions['featuresEngineering'] = dict()
logVersions['featuresEngineering']['similarityMeasure']=versionFE

# Persist the updated version log.
dump_LogVersions(logVersions)

In [16]:
# Write the full pairwise feature table to disk; the version tag is part of
# the file name.
exportPath = os.path.join(
    cfg['outputFeaturesEngineering'],
    "similarityMeasures_v{}.pkl".format(versionFE)
)
df_features.to_pickle(exportPath)