---
**Calculate similarity measures**

---

In [1]:
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random
import pickle
from scipy import sparse
import math
import itertools
import random

from toolbox import *

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme, then select the "notebook" plotting context.
sns.set()
sns.set_context("notebook")

In [2]:
# Re-apply the seaborn theme with a wider default figure size (matplotlib rc 'figure.figsize' = 14 x 6).
sns.set(rc={'figure.figsize':(14,6)})

In [3]:
# Project configuration; indexed by string keys further down (e.g. cfg['outputPreprocessingUniprot']).
# load_cfg / load_LogVersions are presumably provided by `from toolbox import *` -- TODO confirm.
cfg = load_cfg()

# Nested mapping of dataset versions, read as logVersions['UniProt']['yeast'][...] below.
logVersions = load_LogVersions()

In [4]:
def createLongDF(ids):
    """Build the long-format table of all unordered pairs of IDs.

    Each pair ``(ids[i], ids[j])`` with ``i < j`` appears exactly once, ordered
    by ``i`` and then by ``j`` (a row-major walk over the strict upper triangle
    of the ``len(ids) x len(ids)`` pair matrix).

    The pairs are generated directly instead of materialising the full
    ``n * n`` cross product and filtering it afterwards, so peak memory is
    proportional to the ``n * (n - 1) / 2`` rows that are actually returned.

    Parameters
    ----------
    ids : sequence
        Identifiers to pair up. Fewer than two identifiers yields an empty table.

    Returns
    -------
    pandas.DataFrame
        Columns ``uniprotID_A`` and ``uniprotID_B``, one row per pair.
    """
    ids = list(ids)
    n = len(ids)

    print(" - Create ID_A")
    # ids[i] is the left-hand member of the n-1-i pairs (i, j) with j > i.
    idsA = list(itertools.chain.from_iterable(
        itertools.repeat(x, n - 1 - i) for i, x in enumerate(ids)
    ))

    print(" - Create ID_B")
    # For a given i, the right-hand members are simply everything after position i.
    idsB = list(itertools.chain.from_iterable(ids[i + 1:] for i in range(n)))

    print(' - Create DataFrame')
    df_features = pd.DataFrame({
        'uniprotID_A' : idsA,
        'uniprotID_B' : idsB
    })

    return df_features

In [5]:
def addCosineSimilarity(df):
    """Return the pairwise cosine similarities between the rows of ``df``.

    The full ``n x n`` similarity matrix (``n = len(df)``) is computed, then
    only the strict upper triangle is kept: one value per unordered pair of
    rows ``(i, j)`` with ``i < j``, ordered by ``i`` and then by ``j``. This is
    the same pair ordering that createLongDF produces, so the result can be
    assigned directly as a column of that table.

    Parameters
    ----------
    df : array-like accepted by ``cosine_similarity``
        One feature vector per row.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n * (n - 1) / 2``.
    """
    print(" - Computing cosine similarity")
    cosMat = cosine_similarity(df, df)

    print(" - Flatten the matrix")
    cosMat = cosMat.ravel(order='C')

    print(" - Create IDs to keep")
    n = len(df)
    # Flat (row-major) positions of the strict upper triangle, computed in
    # numpy rather than by growing a Python list of ~n^2/2 integers.
    rows, cols = np.triu_indices(n, k=1)
    idx2keep = rows * n + cols

    print(" - Filter out useless values")
    cosMat = cosMat[idx2keep]

    return cosMat

In [10]:
# Feature catalogue: group name -> feature name -> {'path', 'method'}.
# Every UniProt feature lives in cfg['outputPreprocessingUniprot'] under
# "<file prefix>_yeast_v<rawData>--<preprocessed>.pkl" and is compared with
# cosine similarity; only the file prefix differs from one feature to the next.
featuresDict = {
    'uniprot': {
        feature: {
            'path': os.path.join(
                cfg['outputPreprocessingUniprot'],
                "{}_yeast_v{}--{}.pkl".format(
                    filePrefix,
                    logVersions['UniProt']['yeast']['rawData'],
                    logVersions['UniProt']['yeast']['preprocessed'],
                ),
            ),
            'method': 'cosine similarity',
        }
        for feature, filePrefix in (
            ('bioProcessUniprot', 'bioProcessUniprot'),
            ('cellCompUniprot', 'cellCompUniprot'),
            ('molFuncUniprot', 'molFuncUniprot'),
            ('domainUniprot', 'domainFT'),
            ('motifUniprot', 'motif'),
        )
    },
}

In [11]:
# Show which feature groups are configured.
for group in featuresDict.keys():
    print(group)

uniprot


In [15]:
def addAllFeatures(featuresDict):
    """Compute every configured similarity feature for all protein pairs.

    The list of protein IDs is read from ``cfg['rawDataUniProt']`` and expanded
    into one row per unordered pair (see createLongDF). Then, for each feature
    of each group in ``featuresDict``, the pickled feature table at
    ``details['path']`` is loaded and its pairwise cosine similarity is stored
    in a column named after the feature.

    * Group ``'uniprot'``: the feature tables must list the same ``uniprotID``
      values, in the same order, as the protein ID file; the columns are
      written straight into the result.
    * Any other group: the first feature table of the group defines the ID
      list and a separate pair table, later tables of the group must match it,
      and the group's pair table is left-merged into the result on
      ``['uniprotID_A', 'uniprotID_B']``.

    Reads the notebook-level ``cfg`` and ``logVersions``.

    Parameters
    ----------
    featuresDict : dict
        ``{group: {feature: {'path': str, 'method': str}}}``. The only
        implemented method is ``'cosine similarity'``.

    Returns
    -------
    pandas.DataFrame
        ``uniprotID_A``, ``uniprotID_B`` and one column per feature.

    Raises
    ------
    ValueError
        If any feature requests a method other than ``'cosine similarity'``.
        This is checked before any data is loaded so a configuration mistake
        does not surface only after the expensive computations (or, worse,
        as a silently missing column).
    AssertionError
        If a feature table's ``uniprotID`` column does not match the reference
        IDs of its group.
    """
    # Fail fast on configuration errors, before any heavy work.
    for group, group_details in featuresDict.items():
        for feature, details in group_details.items():
            if details['method'] != 'cosine similarity':
                raise ValueError(
                    "Unsupported method {!r} for feature {!r} in group {!r}; "
                    "only 'cosine similarity' is implemented".format(
                        details['method'], feature, group
                    )
                )

    print('Create initial idx DF')
    # NOTE(review): the file carries a .pkl extension but is parsed as a
    # header-less, single-column CSV -- confirm the on-disk format.
    uniprotIDs = pd.read_csv(
        os.path.join(cfg['rawDataUniProt'],
                     "uniprot_allProteins_yeast_v{}.pkl".format(logVersions['UniProt']['yeast']['rawData'])),
        header=None,
        names=['uniprotID']
    )
    df_features = createLongDF(uniprotIDs.uniprotID.to_list())
    print()

    for group, group_details in featuresDict.items():

        print('# Starting {}'.format(group))

        if group == 'uniprot':
            isFirst = False # means it's not first of this group
            # Alias on purpose: columns are added directly to the result.
            new_idx = df_features
            newIDs = uniprotIDs.uniprotID.copy()
        else:
            isFirst = True # first of its group
            # Built from the group's first feature table; stays None for an
            # empty group so that nothing stale gets merged below.
            new_idx = None

        for feature, details in group_details.items():

            print(feature)

            df = pd.read_pickle(details['path'])

            if isFirst:
                print(' - Create new idx dataframe')
                newIDs = df.uniprotID.copy()
                new_idx = createLongDF(list(df.uniprotID))
                isFirst = False
            else:
                # The similarity values are aligned with the pair table purely
                # by position, so the IDs must be identical and in the same order.
                assert df.uniprotID.equals(newIDs), (
                    "uniprotID column of {!r} does not match the reference IDs "
                    "of group {!r}".format(feature, group)
                )

            df = df.set_index('uniprotID')

            new_idx[feature] = addCosineSimilarity(df)
            print(' - df_features: ', df_features.shape)

        if group != 'uniprot' and new_idx is not None:
            print(' - Merging to df_features')
            df_features = df_features.merge(
                new_idx,
                how = 'left',
                on = ['uniprotID_A','uniprotID_B']
            )
            print(' - df_features: ', df_features.shape)
        print()

    return df_features

In [16]:
# Build the pairwise feature table for every feature listed in featuresDict (prints progress as it goes).
df_features = addAllFeatures(featuresDict)

Create initial idx DF
 - Create ID_A
 - Create ID_B
 - Create IDs to keep
 - Filter out ID_A
 - Filter out ID_B
 - Create DataFrame

# Starting uniprot
bioProcessUniprot
 - Computing cosine similarity
 - Flatten the matrix
 - Create IDs to keep
 - Filter out useless values
 - df_features:  (22582560, 3)
cellCompUniprot
 - Computing cosine similarity
 - Flatten the matrix
 - Create IDs to keep
 - Filter out useless values
 - df_features:  (22582560, 4)
molFuncUniprot
 - Computing cosine similarity
 - Flatten the matrix
 - Create IDs to keep
 - Filter out useless values
 - df_features:  (22582560, 5)
domainUniprot
 - Computing cosine similarity
 - Flatten the matrix
 - Create IDs to keep
 - Filter out useless values
 - df_features:  (22582560, 6)
motifUniprot
 - Computing cosine similarity
 - Flatten the matrix
 - Create IDs to keep
 - Filter out useless values
 - df_features:  (22582560, 7)



In [17]:
# Quick overview of the result; `glance` is presumably a helper from `toolbox` -- TODO confirm.
glance(df_features)

DataFrame: 22,582,560 rows 	 7 columns


Unnamed: 0,uniprotID_A,uniprotID_B,bioProcessUniprot,cellCompUniprot,molFuncUniprot,domainUniprot,motifUniprot
0,A0A023PXA5,A0A023PXB0,0.0,0.0,0.0,0.0,0.0
1,A0A023PXA5,A0A023PXB5,0.0,0.0,0.0,0.0,0.0
2,A0A023PXA5,A0A023PXB9,0.0,0.0,0.0,0.0,0.0
3,A0A023PXA5,A0A023PXC2,0.0,0.0,0.0,0.0,0.0
4,A0A023PXA5,A0A023PXC7,0.0,0.0,0.0,0.0,0.0


In [26]:
# Exploratory check: pairs whose cellCompUniprot similarity is not zero.
df_features.loc[df_features.cellCompUniprot != 0]

Unnamed: 0,uniprotID_A,uniprotID_B,bioProcessUniprot,cellCompUniprot,molFuncUniprot,domainUniprot,motifUniprot
13439,A0A023PXB5,A0A023PXB9,0.0,1.00000,0.0,0.0,0.0
13440,A0A023PXB5,A0A023PXC2,0.0,1.00000,0.0,0.0,0.0
13441,A0A023PXB5,A0A023PXC7,0.0,1.00000,0.0,0.0,0.0
13443,A0A023PXB5,A0A023PXD5,0.0,1.00000,0.0,0.0,0.0
13444,A0A023PXB5,A0A023PXD9,0.0,1.00000,0.0,0.0,0.0
...,...,...,...,...,...,...,...
22582555,Q9ZZX1,Q9ZZX8,0.0,0.57735,0.0,0.0,0.0
22582556,Q9ZZX1,Q9ZZX9,0.0,0.57735,0.0,0.0,0.0
22582557,Q9ZZX7,Q9ZZX8,0.0,1.00000,0.0,0.0,0.0
22582558,Q9ZZX7,Q9ZZX9,0.0,1.00000,0.0,0.0,0.0


# Sanity checks

In [18]:
# Spot check: for each feature, pick one random pair whose similarity is
# strictly between 0 and 1, recompute the cosine similarity of just those two
# proteins from the source table, and compare it with the stored value.
for group, group_details in featuresDict.items():
    for feature, details in group_details.items():
        print(feature)
        foo = df_features.loc[(df_features[feature]>0)&(df_features[feature]<1)]
        if foo.empty:
            # random.randrange(0) would raise an unhelpful ValueError here.
            print(' - no value strictly between 0 and 1, nothing to check')
            continue
        foo = foo.iloc[random.randrange(len(foo))]

        df = pd.read_pickle(details['path'])
        df = df.set_index('uniprotID')

        df = df.loc[[foo.uniprotID_A, foo.uniprotID_B]]

        bar = cosine_similarity(df, df)[0,1]
        baar = foo[feature]

        assert math.isclose(bar,baar, rel_tol=1e-6), (
            "{}: recomputed similarity {} != stored {} for pair ({}, {})".format(
                feature, bar, baar, foo.uniprotID_A, foo.uniprotID_B
            )
        )

bioProcessUniprot
cellCompUniprot
molFuncUniprot
domainUniprot
motifUniprot


In [19]:
# Each (uniprotID_A, uniprotID_B) pair must appear only once.
# Use `not`, not `~`: bitwise inversion of a Python bool gives -1 / -2, both
# truthy, so `assert ~flag` is only correct while `.any()` returns numpy.bool_.
assert not df_features.duplicated(subset=["uniprotID_A","uniprotID_B"]).any()

In [20]:
# Re-read the protein ID list and check the table has exactly one row per
# unordered pair of proteins, i.e. n * (n - 1) / 2 rows.
# NOTE(review): the file has a .pkl extension but is parsed as a header-less CSV -- confirm the format.
uniprotIDs = pd.read_csv(
    os.path.join(cfg['rawDataUniProt'], 
                 "uniprot_allProteins_yeast_v{}.pkl".format(logVersions['UniProt']['yeast']['rawData'])),
    header=None,
    names=['uniprotID']
)
assert len(df_features) == len(uniprotIDs)*(len(uniprotIDs)-1)/2

# Export

- v1.0: uses the newly cleaned data and cosine similarity for all features (06/12/2021)

In [21]:
# Version tag of this export; also used in the output file name below.
versionFE = '1-0'

# Record the version in the shared version log.
# NOTE(review): assigning a fresh dict() replaces everything already stored under
# ['featuresEngineering']['yeast'] -- confirm that no other entry needs to be kept.
# NOTE(review): this raises KeyError if 'featuresEngineering' is not yet in logVersions.
logVersions['featuresEngineering']['yeast'] = dict()
logVersions['featuresEngineering']['yeast']['similarityMeasure']=versionFE

# Persist the updated log (dump_LogVersions is presumably provided by `toolbox`).
dump_LogVersions(logVersions)

In [22]:
# Write the pairwise similarity table to cfg['outputFeaturesEngineering'] as
# "similarityMeasures_yeast_v<versionFE>.pkl".
df_features.to_pickle(
    os.path.join(
        cfg['outputFeaturesEngineering'],
        "similarityMeasures_yeast_v{}.pkl".format(versionFE)
    )
)